sp · committed 1 year ago · 6 changed files with 385 additions and 0 deletions
- all_positions_v2.pickle (BIN)
- evaluate.py (+118)
- init.png (BIN)
- install.sh (+9)
- query_sample_factory_checkpoint.py (+69)
- rom_evaluate.py (+189)
evaluate.py
@@ -0,0 +1,118 @@
import time, re, sys, csv, os
import gym
from PIL import Image
from copy import deepcopy
from dataclasses import dataclass, field
import numpy as np

from matplotlib import pyplot as plt
import readchar


def string_to_action(action):
    if action == "left":
        return 2
    if action == "right":
        return 1
    if action == "noop":
        return 0
    return 0


scheduler_file = "x80_y128_pos8.sched"

def convert(tuples):
    return dict(tuples)

@dataclass(frozen=True)
class State:
    x: int
    y: int
    ski_position: int


def parse_scheduler(scheduler_file):
    scheduler = dict()
    try:
        with open(scheduler_file, "r") as f:
            file_content = f.readlines()
        for line in file_content:
            if not "move=0" in line: continue
            stateMapping = convert(re.findall(r"([a-zA-Z_]*[a-zA-Z])=(\d+)?", line))
            #print("stateMapping", stateMapping)
            choice = re.findall(r"{(left|right|noop)}", line)
            if choice: choice = choice[0]
            #print("choice", choice)
            state = State(int(stateMapping["x"]), int(stateMapping["y"]), int(stateMapping["ski_position"]))
            scheduler[state] = choice
        return scheduler

    except EnvironmentError:
        print("TODO file not available. Exiting.")
        sys.exit(1)


env = gym.make("ALE/Skiing-v5")#, render_mode="human")
#env = gym.wrappers.ResizeObservation(env, (84, 84))
#env = gym.wrappers.GrayScaleObservation(env)


observation, info = env.reset()
y = 40
standstillcounter = 0

def update_y(y, ski_position):
    y_update = 0
    global standstillcounter
    if ski_position in [6,7, 8,9]:
        standstillcounter = 0
        y_update = 16
    elif ski_position in [4,5, 10,11]:
        standstillcounter = 0
        y_update = 12
    elif ski_position in [2,3, 12,13]:
        standstillcounter = 0
        y_update = 8
    elif ski_position in [1, 14] and standstillcounter >= 5:
        if standstillcounter >= 8:
            print("!!!!!!!!!! no more x updates!!!!!!!!!!!")
        y_update = 0
    elif ski_position in [1, 14]:
        y_update = 4

    if ski_position in [1, 14]:
        standstillcounter += 1
    return y_update


def update_ski_position(ski_position, action):
    if action == 0:
        return ski_position
    elif action == 1:
        return min(ski_position+1, 14)
    elif action == 2:
        return max(ski_position-1, 1)


approx_x_coordinate = 80
ski_position = 8

#scheduler = parse_scheduler(scheduler_file)
j = 0
for _ in range(1000000):
    j += 1
    #action = env.action_space.sample()  # agent policy that uses the observation and info
    #action = int(repr(readchar.readchar())[1])
    #action = string_to_action(scheduler.get(State(approx_x_coordinate, y, ski_position), "noop"))
    action = 0
    #ski_position = update_ski_position(ski_position, action)
    #y_update = update_y(y, ski_position)
    #y += y_update if y_update else 0

    #old_x = deepcopy(approx_x_coordinate)
    #approx_x_coordinate = int(np.mean(np.where(observation[:,:,1] == 92)[1]))
    #print(f"Action: {action},\tski position: {ski_position},\ty_update: {y_update},\ty: {y},\tx: {approx_x_coordinate},\tx_update:{approx_x_coordinate - old_x}")
    observation, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        observation, info = env.reset()
        break

    img = Image.fromarray(observation)
    img.save(f"images/{j:05}.png")
    #observation, reward, terminated, truncated, info = env.step(0)
    #observation, reward, terminated, truncated, info = env.step(0)
    #observation, reward, terminated, truncated, info = env.step(0)
    #observation, reward, terminated, truncated, info = env.step(0)
env.close()
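Note: the loop above currently drives the environment with a fixed NOOP (action = 0) and leaves the scheduler-driven path commented out. For orientation, a minimal sketch of that variant is given below. It reuses parse_scheduler, State, string_to_action, update_ski_position and update_y from this file and assumes a scheduler file in the "x=... y=... ski_position=... move=0 ... {left|right|noop}" format that parse_scheduler expects; the x-coordinate estimate from green-channel value 92 is the same heuristic as in the commented code above, not something additional.

    # Sketch only: scheduler-driven control loop, assuming the helpers defined in evaluate.py are in scope.
    env = gym.make("ALE/Skiing-v5")
    observation, info = env.reset()
    scheduler = parse_scheduler(scheduler_file)
    approx_x_coordinate, y, ski_position = 80, 40, 8

    for _ in range(1000):
        # Look up the scheduler's choice for the current abstract state; fall back to "noop".
        choice = scheduler.get(State(approx_x_coordinate, y, ski_position), "noop")
        action = string_to_action(choice)
        ski_position = update_ski_position(ski_position, action)
        y += update_y(y, ski_position)
        # Same pixel heuristic as above: mean column of green-channel pixels with value 92.
        approx_x_coordinate = int(np.mean(np.where(observation[:, :, 1] == 92)[1]))
        observation, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            break
    env.close()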
init.png (binary image added) — Width: 160 px | Height: 210 px | Size: 1.0 KiB
install.sh
@@ -0,0 +1,9 @@
#!/bin/bash

# aptitude dependencies
sudo apt install python3.8-venv python3-tk
python3 -m pip install --user virtualenv
python3 -m venv env

source env/bin/activate
which python3
query_sample_factory_checkpoint.py
@@ -0,0 +1,69 @@
import time
from collections import deque
from typing import Dict, Tuple

import gymnasium as gym
import numpy as np
import torch
from torch import Tensor

from sample_factory.algo.learning.learner import Learner
from sample_factory.algo.sampling.batched_sampling import preprocess_actions
from sample_factory.algo.utils.action_distributions import argmax_actions
from sample_factory.algo.utils.env_info import extract_env_info
from sample_factory.algo.utils.make_env import make_env_func_batched
from sample_factory.algo.utils.misc import ExperimentStatus
from sample_factory.algo.utils.rl_utils import make_dones, prepare_and_normalize_obs
from sample_factory.algo.utils.tensor_utils import unsqueeze_tensor
from sample_factory.cfg.arguments import load_from_checkpoint
from sample_factory.huggingface.huggingface_utils import generate_model_card, generate_replay_video, push_to_hf
from sample_factory.model.actor_critic import create_actor_critic
from sample_factory.model.model_utils import get_rnn_size
from sample_factory.utils.attr_dict import AttrDict
from sample_factory.utils.typing import Config, StatusCode
from sample_factory.utils.utils import debug_log_every_n, experiment_dir, log

from sf_examples.atari.train_atari import parse_atari_args, register_atari_components


class SampleFactoryNNQueryWrapper:
    def setup(self):
        register_atari_components()
        cfg = parse_atari_args()
        actor_critic = create_actor_critic(cfg, gym.spaces.Dict({"obs": gym.spaces.Box(0, 255, (4, 84, 84), np.uint8)}), gym.spaces.Discrete(3)) # TODO
        actor_critic.eval()

        device = torch.device("cpu") # ("cpu" if cfg.device == "cpu" else "cuda")
        actor_critic.model_to_device(device)

        policy_id = 0 #cfg.policy_index
        #name_prefix = dict(latest="checkpoint", best="best")[cfg.load_checkpoint_kind]
        name_prefix = "best"
        checkpoints = Learner.get_checkpoints(Learner.checkpoint_dir(cfg, policy_id), f"{name_prefix}_*")
        checkpoint_dict = Learner.load_checkpoint(checkpoints, device) # torch.load(...)
        actor_critic.load_state_dict(checkpoint_dict["model"])

        rnn_states = torch.zeros([1, get_rnn_size(cfg)], dtype=torch.float32, device=device)

        self.rnn_states = rnn_states
        self.actor_critic = actor_critic

    def __init__(self):
        self.setup()

    def query(self, obs):
        with torch.no_grad():
            normalized_obs = prepare_and_normalize_obs(self.actor_critic, obs)
            policy_outputs = self.actor_critic(normalized_obs, self.rnn_states)

            # sample actions from the distribution by default
            actions = policy_outputs["actions"]

            action_distribution = self.actor_critic.action_distribution()
            actions = argmax_actions(action_distribution)

            if actions.ndim == 1:
                actions = unsqueeze_tensor(actions, dim=-1)

            rnn_states = policy_outputs["new_rnn_states"]
            return actions[0][0].item()
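Note: rom_evaluate.py below queries this wrapper with a TensorDict holding the four most recent 84x84 grayscale frames, matching the (4, 84, 84) observation space hard-coded in setup(). A minimal usage sketch along those lines (it assumes a trained Sample Factory "best_*" checkpoint exists under the experiment directory that parse_atari_args resolves; the zero frame is a placeholder for a real observation):

    # Sketch only: rolling 4-frame stack queried against the wrapper, as rom_evaluate.py does.
    import numpy as np
    from sample_factory.algo.utils.tensor_dict import TensorDict
    from query_sample_factory_checkpoint import SampleFactoryNNQueryWrapper

    nn_wrapper = SampleFactoryNNQueryWrapper()  # loads the "best_*" checkpoint on CPU

    frames = []
    for step in range(10):
        frame = np.zeros((84, 84), dtype=np.uint8)  # placeholder; use a real 84x84 grayscale frame
        frames.append(frame)
        if len(frames) >= 4:
            obs = TensorDict({"obs": np.array(frames[-4:])})  # shape (4, 84, 84)
            action = nn_wrapper.query(obs)  # int in range(3); rom_evaluate.py maps 0/1/2 to NOOP/RIGHT/LEFT
            print(step, action)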
rom_evaluate.py
@@ -0,0 +1,189 @@
import sys
from random import randrange
from ale_py import ALEInterface, SDL_SUPPORT, Action
from colors import *
from PIL import Image
from matplotlib import pyplot as plt
import cv2
import pickle
import queue

from copy import deepcopy

import numpy as np

import readchar

from sample_factory.algo.utils.tensor_dict import TensorDict
from query_sample_factory_checkpoint import SampleFactoryNNQueryWrapper

import time


def input_to_action(char):
    if char == "0":
        return Action.NOOP
    if char == "1":
        return Action.RIGHT
    if char == "2":
        return Action.LEFT
    if char == "3":
        return "reset"
    if char == "4":
        return "set_x"
    if char == "5":
        return "set_vel"
    if char in ["w", "a", "s", "d"]:
        return char


ski_position_counter = {1: (Action.LEFT, 40), 2: (Action.LEFT, 35), 3: (Action.LEFT, 30), 4: (Action.LEFT, 10), 5: (Action.NOOP, 1), 6: (Action.RIGHT, 10), 7: (Action.RIGHT, 30), 8: (Action.RIGHT, 40)}

def run_single_test(ale, nn_wrapper, x, y, ski_position, duration=200):
    print(f"Running Test from x: {x:04}, y: {y:04}, ski_position: {ski_position}")
    for i, r in enumerate(ramDICT[y]):
        ale.setRAM(i, r)
    ski_position_setting = ski_position_counter[ski_position]
    for i in range(0, ski_position_setting[1]):
        ale.act(ski_position_setting[0])
    ale.setRAM(14, 0)
    ale.setRAM(25, x)
    ale.setRAM(14, 180)

    all_obs = list()
    for i in range(0, duration):
        resized_obs = cv2.resize(ale.getScreenGrayscale(), (84, 84), interpolation=cv2.INTER_AREA)
        all_obs.append(resized_obs)
        if len(all_obs) >= 4:
            stack_tensor = TensorDict({"obs": np.array(all_obs[-4:])})
            action = nn_wrapper.query(stack_tensor)
            ale.act(input_to_action(str(action)))
        else:
            ale.act(Action.NOOP)
        time.sleep(0.005)


ale = ALEInterface()

if SDL_SUPPORT:
    ale.setBool("sound", True)
    ale.setBool("display_screen", True)

# Load the ROM file
rom_file = "/home/spranger/research/Skiing/env/lib/python3.8/site-packages/AutoROM/roms/skiing.bin"
ale.loadROM(rom_file)

# Get the list of legal actions

with open('all_positions_v2.pickle', 'rb') as handle:
    ramDICT = pickle.load(handle)
#ramDICT = dict()
#for i,r in enumerate(ramDICT[235]):
#    ale.setRAM(i,r)

y_ram_setting = 60
x = 70


nn_wrapper = SampleFactoryNNQueryWrapper()
#run_single_test(ale, nn_wrapper, 70,61,5)
#input("")
run_single_test(ale, nn_wrapper, 30,61,5, duration=1000)
run_single_test(ale, nn_wrapper, 114,170,7)
run_single_test(ale, nn_wrapper, 124,170,5)
run_single_test(ale, nn_wrapper, 134,170,2)
run_single_test(ale, nn_wrapper, 120,185,1)
run_single_test(ale, nn_wrapper, 134,170,8)
run_single_test(ale, nn_wrapper, 85,195,8)

velocity_set = False
oldram = ale.getRAM()  # previous-frame RAM snapshot; initialized here so the first diff below does not fail
for episode in range(10):
    total_reward = 0
    j = 0
    while not ale.game_over():
        if not velocity_set: ale.setRAM(14, 0)
        j += 1
        a = input_to_action(repr(readchar.readchar())[1])
        #a = Action.NOOP

        if a == "w":
            y_ram_setting -= 1
            if y_ram_setting <= 61:
                y_ram_setting = 61
            for i, r in enumerate(ramDICT[y_ram_setting]):
                ale.setRAM(i, r)
            ale.setRAM(25, x)
            ale.act(Action.NOOP)
        elif a == "s":
            y_ram_setting += 1
            if y_ram_setting >= 1950:
                y_ram_setting = 1945
            for i, r in enumerate(ramDICT[y_ram_setting]):
                ale.setRAM(i, r)
            ale.setRAM(25, x)
            ale.act(Action.NOOP)
        elif a == "a":
            x -= 1
            if x <= 0:
                x = 0
            ale.setRAM(25, x)
            ale.act(Action.NOOP)
        elif a == "d":
            x += 1
            if x >= 144:
                x = 144
            ale.setRAM(25, x)
            ale.act(Action.NOOP)

        elif a == "reset":
            ram_pos = input("Ram Position:")
            for i, r in enumerate(ramDICT[int(ram_pos)]):
                ale.setRAM(i, r)
            ale.act(Action.NOOP)
        # Apply an action and get the resulting reward
        elif a == "set_x":
            x = int(input("X:"))
            ale.setRAM(25, x)
            ale.act(Action.NOOP)
        elif a == "set_vel":
            vel = input("Velocity:")
            ale.setRAM(14, int(vel))
            ale.act(Action.NOOP)
            velocity_set = True
        else:
            reward = ale.act(a)
            total_reward += reward  # accumulate the score printed at the end of the episode
        ram = ale.getRAM()
        #if j % 2 == 0:
        #    y_pixel = int(j*1/2) + 55
        #    ramDICT[y_pixel] = ram
        #    print(f"saving to {y_pixel:04}")
        #    if y_pixel == 126 or y_pixel == 235:
        #        input("")

        int_old_ram = list(map(int, oldram))
        int_ram = list(map(int, ram))
        difference = list()
        for o, r in zip(int_old_ram, int_ram):
            difference.append(r - o)

        oldram = deepcopy(ram)
        #print(f"player_x: {ram[25]},\tclock_m: {ram[104]},\tclock_s: {ram[105]},\tclock_ms: {ram[106]},\tscore: {ram[107]}")
        print(f"player_x: {ram[25]},\tplayer_y: {y_ram_setting}")
        #print(f"y_0: {ram[86]}, y_1: {ram[87]}, y_2: {ram[88]}, y_3: {ram[89]}, y_4: {ram[90]}, y_5: {ram[91]}, y_6: {ram[92]}, y_7: {ram[93]}, y_8: {ram[94]}")

        #for i, r in enumerate(ram):
        #    print('{:03}:{:02x} '.format(i,r), end="")
        #    if i % 16 == 15: print("")
        #print("")
        #for i, r in enumerate(difference):
        #    string = '{:02}:{:03} '.format(i%100,r)
        #    if r != 0:
        #        print(color(string, fg='red'), end="")
        #    else:
        #        print(string, end="")
        #    if i % 16 == 15: print("")
    print("Episode %d ended with score: %d" % (episode, total_reward))
    input("")

    with open('all_positions_v2.pickle', 'wb') as handle:
        pickle.dump(ramDICT, handle, protocol=pickle.HIGHEST_PROTOCOL)
    ale.reset_game()
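Note: for readability, the RAM locations this script pokes and reads are summarized below. The names are inferred from how the script itself uses them (they are not taken from official documentation), so treat them as this commit's working assumptions.

    # Inferred meaning of the Skiing RAM addresses used above.
    SKIING_RAM_ADDRESSES = {
        14: "downhill velocity: set to 0 to hold the skier in place, 180 before a test run, or via 'set_vel'",
        25: "player x position (clamped to 0..144): written by 'a'/'d'/'set_x' and printed as player_x",
    }
    # Whole-RAM snapshots for a given vertical position are kept in ramDICT (all_positions_v2.pickle),
    # keyed by the y value tracked in y_ram_setting, and replayed with ale.setRAM(i, r).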