
basic action embedding

refactoring
Thomas Knoll committed 1 year ago
parent commit cf18349819
  1. examples/shields/rl/11_minigridrl.py (255)
  2. examples/shields/rl/12_basic_training.py (134)
  3. examples/shields/rl/MaskEnvironments.py (91)
  4. examples/shields/rl/MaskModels.py (81)
  5. examples/shields/rl/Wrapper.py (152)

examples/shields/rl/11_minigridrl.py (255)

@@ -0,0 +1,255 @@
from typing import Dict, Optional, Union
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.evaluation import RolloutWorker
from ray.rllib.evaluation.episode import Episode
from ray.rllib.evaluation.episode_v2 import EpisodeV2
from ray.rllib.policy import Policy
from ray.rllib.utils.typing import PolicyID
import stormpy
import stormpy.core
import stormpy.simulator
import stormpy.shields
import stormpy.logic
import stormpy.examples
import stormpy.examples.files
import os
import gymnasium as gym
import minigrid
import numpy as np
import ray
from ray.tune import register_env
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.test_utils import check_learning_achieved, framework_iterator
from ray import tune, air
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.tune.logger import pretty_print
from ray.rllib.algorithms import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.torch_utils import FLOAT_MIN
from ray.rllib.models.preprocessors import get_preprocessor
from MaskEnvironments import ParametricActionsMiniGridEnv
from MaskModels import TorchActionMaskModel
from Wrapper import OneHotWrapper, MiniGridEnvWrapper, ImgObsWrapper
import matplotlib.pyplot as plt
import argparse

class MyCallbacks(DefaultCallbacks):
    def on_episode_start(self, *, worker: RolloutWorker, base_env: BaseEnv, policies: Dict[PolicyID, Policy], episode: Episode | EpisodeV2, env_index: int | None = None, **kwargs) -> None:
        # print(F"Episode started. Environment: {base_env.get_sub_environments()}")
        env = base_env.get_sub_environments()[0]
        episode.user_data["count"] = 0
        # print(env.printGrid())
        # print(env.action_space.n)
        # print(env.actions)
        # print(env.mission)
        # print(env.observation_space)
        # img = env.get_frame()
        # plt.imshow(img)
        # plt.show()

    def on_episode_step(self, *, worker: RolloutWorker, base_env: BaseEnv, policies: Dict[PolicyID, Policy] | None = None, episode: Episode | EpisodeV2, env_index: int | None = None, **kwargs) -> None:
        episode.user_data["count"] = episode.user_data["count"] + 1
        env = base_env.get_sub_environments()[0]
        print(env.env.env.printGrid())

    def on_episode_end(self, *, worker: RolloutWorker, base_env: BaseEnv, policies: Dict[PolicyID, Policy], episode: Episode | EpisodeV2 | Exception, env_index: int | None = None, **kwargs) -> None:
        # print(F"Episode ended. Environment: {base_env.get_sub_environments()}")
        env = base_env.get_sub_environments()[0]
        # print(env.env.env.printGrid())
        # print(episode.user_data["count"])


def parse_arguments(argparse):
    parser = argparse.ArgumentParser()
    # parser.add_argument("--env", help="gym environment to load", default="MiniGrid-Empty-8x8-v0")
    parser.add_argument("--env", help="gym environment to load", default="MiniGrid-LavaCrossingS9N1-v0")
    parser.add_argument("--seed", type=int, help="seed for environment", default=1)
    parser.add_argument("--tile_size", type=int, help="size at which to render tiles", default=32)
    parser.add_argument("--agent_view", default=False, action="store_true", help="draw what the agent sees")
    parser.add_argument("--grid_path", default="Grid.txt")
    parser.add_argument("--prism_path", default="Grid.PRISM")
    args = parser.parse_args()
    return args


def env_creater_custom(config):
    # name = config.get("name", "MiniGrid-Empty-8x8-v0")
    # env = ParametricActionsMiniGridEnv(config)
    name = config.get("name", "MiniGrid-LavaCrossingS9N1-v0")
    framestack = config.get("framestack", 4)
    env = gym.make(name)
    env = MiniGridEnvWrapper(env)
    # env = minigrid.wrappers.ImgObsWrapper(env)
    # env = ImgObsWrapper(env)
    env = OneHotWrapper(env,
                        config.vector_index if hasattr(config, "vector_index") else 0,
                        framestack=framestack)
    obs = env.observation_space.sample()
    obs2, infos = env.reset(seed=None, options={})
    print(F"Obs is {obs} before reset. After reset: {obs2}")
    # env = minigrid.wrappers.RGBImgPartialObsWrapper(env)
    print(F"Created custom MiniGrid environment: {env}")
    return env


def env_creater_cart(config):
    return gym.make("CartPole-v1")


def env_creater(config):
    name = config.get("name", "MiniGrid-LavaCrossingS9N1-v0")
    # name = config.get("name", "MiniGrid-Empty-8x8-v0")
    framestack = config.get("framestack", 4)
    env = gym.make(name)
    # env = minigrid.wrappers.RGBImgPartialObsWrapper(env)
    env = minigrid.wrappers.ImgObsWrapper(env)
    env = OneHotWrapper(env,
                        config.vector_index if hasattr(config, "vector_index") else 0,
                        framestack=framestack)
    print(F"Created MiniGrid environment: {env}")
    return env


def create_shield(grid_file, prism_path):
    # Machine-specific path to the grid-to-PRISM converter.
    os.system(F"/home/tknoll/Documents/main -v 'agent' -i {grid_file} -o {prism_path}")
    f = open(prism_path, "a")
    f.write("label \"AgentIsInLava\" = AgentIsInLava;")
    f.close()
    program = stormpy.parse_prism_program(prism_path)
    formula_str = "Pmax=? [G !\"AgentIsInLavaAndNotDone\"]"
    formulas = stormpy.parse_properties_for_prism_program(formula_str, program)
    options = stormpy.BuilderOptions([p.raw_formula for p in formulas])
    options.set_build_state_valuations(True)
    options.set_build_choice_labels(True)
    options.set_build_all_labels()
    model = stormpy.build_sparse_model_with_options(program, options)
    shield_specification = stormpy.logic.ShieldExpression(stormpy.logic.ShieldingType.PRE_SAFETY, stormpy.logic.ShieldComparison.RELATIVE, 0.1)
    result = stormpy.model_checking(model, formulas[0], extract_scheduler=True, shield_expression=shield_specification)
    assert result.has_scheduler
    assert result.has_shield
    shield = result.shield
    stormpy.shields.export_shield(model, shield, "Grid.shield")
    return shield.construct(), model


def export_grid_to_text(env, grid_file):
    f = open(grid_file, "w")
    # print(env)
    f.write(env.printGrid(init=True))
    # f.write(env.pprint_grid())
    f.close()


def create_environment(args):
    env_id = args.env
    env = gym.make(env_id)
    env.reset()
    return env


def main():
    args = parse_arguments(argparse)
    env = create_environment(args)
    ray.init(num_cpus=3)
    # print(env.pprint_grid())
    # print(env.printGrid(init=False))
    grid_file = args.grid_path
    export_grid_to_text(env, grid_file)
    prism_path = args.prism_path
    shield, model = create_shield(grid_file, prism_path)
    shield_dict = {state.id: shield.get_choice(state).choice_map for state in model.states}
    print(shield_dict)
    for state in model.states:
        choices = shield.get_choice(state)
        print(F"Allowed choices in state {state} are {choices.choice_map}")
    env_name = "mini-grid"
    register_env(env_name, env_creater_custom)
    ModelCatalog.register_custom_model(
        "pa_model",
        TorchActionMaskModel
    )
    config = (PPOConfig()
              .rollouts(num_rollout_workers=1)
              .resources(num_gpus=0)
              .environment(env="mini-grid")
              .framework("torch")
              .experimental(_disable_preprocessor_api=False)
              .callbacks(MyCallbacks)
              .rl_module(_enable_rl_module_api=False)
              .training(_enable_learner_api=False, model={
                  "custom_model": "pa_model",
                  "custom_model_config": {"shield": shield_dict, "no_masking": True},
                  # "fcnet_hiddens": [256,256],
                  # "fcnet_activation": "relu",
              }))
    algo = config.build()

    episode_reward = 0
    terminated = truncated = False
    obs, info = env.reset()
    # while not terminated and not truncated:
    #     action = algo.compute_single_action(obs)
    #     obs, reward, terminated, truncated = env.step(action)
    for i in range(30):
        result = algo.train()
        print(pretty_print(result))
        if i % 5 == 0:
            checkpoint_dir = algo.save()
            print(f"Checkpoint saved in directory {checkpoint_dir}")
    ray.shutdown()


if __name__ == '__main__':
    main()
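
Review note: the rollout loop in main() is left commented out. A minimal sketch of how a saved checkpoint could be evaluated afterwards, assuming the standard RLlib Algorithm.from_checkpoint / compute_single_action APIs; evaluate_checkpoint is a hypothetical helper, not part of this commit:

# Hypothetical evaluation helper (not part of this commit): restore the latest
# checkpoint and roll out one episode with the trained policy.
from ray.rllib.algorithms.algorithm import Algorithm

def evaluate_checkpoint(checkpoint_dir, env_config=None):
    algo = Algorithm.from_checkpoint(checkpoint_dir)
    env = env_creater_custom(env_config or {})
    obs, info = env.reset()
    episode_reward, terminated, truncated = 0.0, False, False
    while not terminated and not truncated:
        action = algo.compute_single_action(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
    return episode_reward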

examples/shields/12_basic_training.py → examples/shields/rl/12_basic_training.py (134)

@@ -33,11 +33,81 @@ from ray.tune.logger import pretty_print
from ray.rllib.utils.numpy import one_hot
from ray.rllib.algorithms import ppo
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.utils.torch_utils import FLOAT_MIN
from ray.rllib.models.preprocessors import get_preprocessor
import matplotlib.pyplot as plt
import argparse
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from Wrapper import OneHotWrapper

torch, nn = try_import_torch()


class TorchActionMaskModel(TorchModelV2, nn.Module):
    """PyTorch version of above ActionMaskingModel."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        **kwargs,
    ):
        orig_space = getattr(obs_space, "original_space", obs_space)
        assert (
            isinstance(orig_space, Dict)
            and "action_mask" in orig_space.spaces
            and "observations" in orig_space.spaces
        )

        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kwargs
        )
        nn.Module.__init__(self)

        self.internal_model = TorchFC(
            orig_space["observations"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

        # disable action masking --> will likely lead to invalid actions
        self.no_masking = False
        if "no_masking" in model_config["custom_model_config"]:
            self.no_masking = model_config["custom_model_config"]["no_masking"]

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["observations"]})

        # If action masking is disabled, directly return unmasked logits
        if self.no_masking:
            return logits, state

        # Convert action_mask into a [0.0 || -inf]-type mask.
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
        masked_logits = logits + inf_mask

        # Return masked logits.
        return masked_logits, state

    def value_function(self):
        return self.internal_model.value_function()
class MyCallbacks(DefaultCallbacks):
@@ -66,69 +136,7 @@ class MyCallbacks(DefaultCallbacks):
        # print(episode.user_data["count"])
class OneHotWrapper(gym.core.ObservationWrapper):
    def __init__(self, env, vector_index, framestack):
        super().__init__(env)
        self.framestack = framestack
        # 49=7x7 field of vision; 11=object types; 6=colors; 3=state types.
        # +4: Direction.
        self.single_frame_dim = 49 * (11 + 6 + 3) + 4
        self.init_x = None
        self.init_y = None
        self.x_positions = []
        self.y_positions = []
        self.x_y_delta_buffer = deque(maxlen=100)
        self.vector_index = vector_index
        self.frame_buffer = deque(maxlen=self.framestack)
        for _ in range(self.framestack):
            self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, shape=(self.single_frame_dim * self.framestack,), dtype=np.float32
        )

    def observation(self, obs):
        # Debug output: max-x/y positions to watch exploration progress.
        if self.step_count == 0:
            for _ in range(self.framestack):
                self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
            if self.vector_index == 0:
                if self.x_positions:
                    max_diff = max(
                        np.sqrt(
                            (np.array(self.x_positions) - self.init_x) ** 2
                            + (np.array(self.y_positions) - self.init_y) ** 2
                        )
                    )
                    self.x_y_delta_buffer.append(max_diff)
                    print(
                        "100-average dist travelled={}".format(
                            np.mean(self.x_y_delta_buffer)
                        )
                    )
                    self.x_positions = []
                    self.y_positions = []
                self.init_x = self.agent_pos[0]
                self.init_y = self.agent_pos[1]
            self.x_positions.append(self.agent_pos[0])
            self.y_positions.append(self.agent_pos[1])

        # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
        objects = one_hot(obs[:, :, 0], depth=11)
        colors = one_hot(obs[:, :, 1], depth=6)
        states = one_hot(obs[:, :, 2], depth=3)
        all_ = np.concatenate([objects, colors, states], -1)
        all_flat = np.reshape(all_, (-1,))
        direction = one_hot(np.array(self.agent_dir), depth=4).astype(np.float32)
        single_frame = np.concatenate([all_flat, direction])
        self.frame_buffer.append(single_frame)
        return np.concatenate(self.frame_buffer)
def parse_arguments(argparse):
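
Review note: the mask conversion in TorchActionMaskModel.forward relies on log(1) = 0 and log(0) = -inf, clamped to FLOAT_MIN so the addition stays finite. A standalone illustration (my example, not part of this commit):

# Illustration of the [0.0 || -inf]-style mask used in forward().
import torch
from ray.rllib.utils.torch_utils import FLOAT_MIN

logits = torch.tensor([1.0, 2.0, 3.0])
action_mask = torch.tensor([1.0, 0.0, 1.0])  # action 1 is forbidden
inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)  # [0, FLOAT_MIN, 0]
masked_logits = logits + inf_mask
# softmax assigns (numerically) zero probability to the masked action:
print(torch.softmax(masked_logits, dim=-1))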

examples/shields/rl/MaskEnvironments.py (91)

@@ -0,0 +1,91 @@
import random
import minigrid
import gymnasium as gym
import numpy as np
from gymnasium.spaces import Box, Dict, Discrete
from Wrapper import OneHotWrapper

class ParametricActionsMiniGridEnv(gym.Env):
    """Parametric action version of MiniGrid."""

    def __init__(self, config):
        name = config.get("name", "MiniGrid-LavaCrossingS9N1-v0")
        self.left_action_embed = np.random.randn(2)
        self.right_action_embed = np.random.randn(2)
        framestack = config.get("framestack", 4)
        # env = gym.make(name)
        # env = minigrid.wrappers.ImgObsWrapper(env)
        # env = OneHotWrapper(env,
        #                     config.vector_index if hasattr(config, "vector_index") else 0,
        #                     framestack=framestack)
        self.wrapped = gym.make(name)
        # self.observation_space = Dict(
        #     {
        #         "action_mask": None,
        #         "avail_actions": None,
        #         "cart": self.wrapped.observation_space,
        #     }
        # )
        print(F"Wrapped environment is {self.wrapped}")
        self.step_count = 0
        self.action_space = self.wrapped.action_space
        self.observation_space = self.wrapped.observation_space

    def update_avail_actions(self):
        self.action_assignments = np.array(
            [[0.0, 0.0]] * self.action_space.n, dtype=np.float32
        )
        self.action_mask = np.array([0.0] * self.action_space.n, dtype=np.int8)
        self.left_idx, self.right_idx = random.sample(range(self.action_space.n), 2)
        self.action_assignments[self.left_idx] = self.left_action_embed
        self.action_assignments[self.right_idx] = self.right_action_embed
        self.action_mask[self.left_idx] = 1
        self.action_mask[self.right_idx] = 1

    def reset(self, *, seed=None, options=None):
        self.update_avail_actions()
        obs, infos = self.wrapped.reset()
        return obs, infos
        # Unreachable dict-observation variant, kept for reference:
        # return {
        #     "action_mask": self.action_mask,
        #     "avail_actions": self.action_assignments,
        #     "cart": obs,
        # }, infos

    def step(self, action):
        if action == self.left_idx:
            actual_action = 0
        elif action == self.right_idx:
            actual_action = 1
        else:
            actual_action = 0
            # raise ValueError(
            #     "Chosen action was not one of the non-zero action embeddings",
            #     action,
            #     self.action_assignments,
            #     self.action_mask,
            #     self.left_idx,
            #     self.right_idx,
            # )
        orig_obs, rew, done, truncated, info = self.wrapped.step(actual_action)
        self.update_avail_actions()
        self.action_mask = self.action_mask.astype(np.int8)
        print(F"Info is {info}")
        info["Hello"] = "I have no clue here"
        return orig_obs, rew, done, truncated, info
        # Unreachable dict-observation variant, kept for reference:
        # obs = {
        #     "action_mask": self.action_mask,
        #     "avail_actions": self.action_assignments,
        #     "cart": orig_obs,
        # }
        # return obs, rew, done, truncated, info
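
Review note: a quick smoke test for the wrapper (my sketch, not part of the commit), assuming gymnasium and minigrid are installed:

# Minimal smoke test for ParametricActionsMiniGridEnv.
env = ParametricActionsMiniGridEnv({"name": "MiniGrid-LavaCrossingS9N1-v0"})
obs, info = env.reset()
for _ in range(5):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)
    if done or truncated:
        obs, info = env.reset()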

examples/shields/rl/MaskModels.py (81)

@@ -0,0 +1,81 @@
from typing import Dict, Optional, Union
from ray.rllib.algorithms.dqn.dqn_torch_model import DQNTorchModel
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MIN, FLOAT_MAX
torch, nn = try_import_torch()

class TorchActionMaskModel(TorchModelV2, nn.Module):
    """PyTorch version of above ActionMaskingModel."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        **kwargs,
    ):
        orig_space = getattr(obs_space, "original_space", obs_space)
        custom_config = model_config["custom_model_config"]
        print(F"Original space is: {orig_space}")
        # print(model_config)
        print(F"Observation space in model: {obs_space}")

        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kwargs
        )
        nn.Module.__init__(self)

        assert "shield" in custom_config
        self.shield = custom_config["shield"]

        self.internal_model = TorchFC(
            orig_space["data"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

        # disable action masking --> will likely lead to invalid actions
        self.no_masking = False
        if "no_masking" in model_config["custom_model_config"]:
            self.no_masking = model_config["custom_model_config"]["no_masking"]

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        # print(F"Input dict is {input_dict} at obs: {input_dict['obs']}")
        # print(F"State is {state}")
        action_mask = []
        # print(input_dict["env"])

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["data"]})

        # If action masking is disabled, directly return unmasked logits
        if self.no_masking:
            return logits, state

        # Shield-based masking is not wired up yet in this commit.
        assert False, "shield-based action masking is not implemented yet"
        # Convert action_mask into a [0.0 || -inf]-type mask.
        # inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
        # masked_logits = logits + inf_mask
        # # Return masked logits.
        # return masked_logits, state
        return logits, state

    def value_function(self):
        return self.internal_model.value_function()
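
Review note: the shield lookup itself is not wired into forward() yet (hence the assert). One hypothetical way to turn a shield_dict entry into a 0/1 mask, assuming the current state id is known and that iterating a choice_map entry yields allowed action indices; the helper name and these shape assumptions are mine, not this commit's API:

# Hypothetical sketch: build an action mask from one shield_dict entry.
import numpy as np

def mask_from_shield(shield_dict, state_id, num_actions):
    allowed = shield_dict.get(state_id, [])
    mask = np.zeros(num_actions, dtype=np.float32)
    for choice in allowed:
        mask[choice] = 1.0  # assumes choice indexes directly into the action space
    return mask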

examples/shields/rl/Wrapper.py (152)

@@ -0,0 +1,152 @@
import gymnasium as gym
import numpy as np
from gymnasium.spaces import Dict, Box
from collections import deque
from ray.rllib.utils.numpy import one_hot

class OneHotWrapper(gym.core.ObservationWrapper):
    def __init__(self, env, vector_index, framestack):
        super().__init__(env)
        self.framestack = framestack
        # 49=7x7 field of vision; 11=object types; 6=colors; 3=state types.
        # +4: Direction.
        self.single_frame_dim = 49 * (11 + 6 + 3) + 4
        self.init_x = None
        self.init_y = None
        self.x_positions = []
        self.y_positions = []
        self.x_y_delta_buffer = deque(maxlen=100)
        self.vector_index = vector_index
        self.frame_buffer = deque(maxlen=self.framestack)
        for _ in range(self.framestack):
            self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
        self.observation_space = Dict(
            {
                "data": gym.spaces.Box(0.0, 1.0, shape=(self.single_frame_dim * self.framestack,), dtype=np.float32),
                "avail_actions": gym.spaces.Box(0, 10, shape=(10,), dtype=int),
            }
        )
        print(F"Set observation space to {self.observation_space}")

    def observation(self, obs):
        # Debug output: max-x/y positions to watch exploration progress.
        # print(F"Initial observation in Wrapper {obs}")
        if self.step_count == 0:
            for _ in range(self.framestack):
                self.frame_buffer.append(np.zeros((self.single_frame_dim,)))
            if self.vector_index == 0:
                if self.x_positions:
                    max_diff = max(
                        np.sqrt(
                            (np.array(self.x_positions) - self.init_x) ** 2
                            + (np.array(self.y_positions) - self.init_y) ** 2
                        )
                    )
                    self.x_y_delta_buffer.append(max_diff)
                    print(
                        "100-average dist travelled={}".format(
                            np.mean(self.x_y_delta_buffer)
                        )
                    )
                    self.x_positions = []
                    self.y_positions = []
                self.init_x = self.agent_pos[0]
                self.init_y = self.agent_pos[1]
            self.x_positions.append(self.agent_pos[0])
            self.y_positions.append(self.agent_pos[1])

        image = obs["data"]
        # One-hot the last dim into 11, 6, 3 one-hot vectors, then flatten.
        objects = one_hot(image[:, :, 0], depth=11)
        colors = one_hot(image[:, :, 1], depth=6)
        states = one_hot(image[:, :, 2], depth=3)
        all_ = np.concatenate([objects, colors, states], -1)
        all_flat = np.reshape(all_, (-1,))
        direction = one_hot(np.array(self.agent_dir), depth=4).astype(np.float32)
        single_frame = np.concatenate([all_flat, direction])
        self.frame_buffer.append(single_frame)
        # obs["one-hot"] = np.concatenate(self.frame_buffer)
        return {"data": np.concatenate(self.frame_buffer), "avail_actions": obs["avail_actions"]}


class MiniGridEnvWrapper(gym.core.Wrapper):
    def __init__(self, env):
        super(MiniGridEnvWrapper, self).__init__(env)
        self.observation_space = Dict(
            {
                "data": env.observation_space.spaces["image"],
                "avail_actions": Box(0, 10, shape=(10,), dtype=np.int8),
            }
        )

    def test(self):
        print("Testing some stuff")

    def reset(self, *, seed=None, options=None):
        # Forward seed and options to the wrapped environment.
        obs, infos = self.env.reset(seed=seed, options=options)
        return {
            "data": obs["image"],
            "avail_actions": np.array([0.0] * 10, dtype=np.int8)
        }, infos

    def step(self, action):
        orig_obs, rew, done, truncated, info = self.env.step(action)
        self.test()
        # print(F"Original observation is {orig_obs}")
        obs = {
            "data": orig_obs["image"],
            "avail_actions": np.array([0.0] * 10, dtype=np.int8),
        }
        # print(F"Info is {info}")
        return obs, rew, done, truncated, info


class ImgObsWrapper(gym.core.ObservationWrapper):
    """
    Use the image as the only observation output, no language/mission.

    Example:
        >>> import gymnasium as gym
        >>> from minigrid.wrappers import ImgObsWrapper
        >>> env = gym.make("MiniGrid-Empty-5x5-v0")
        >>> obs, _ = env.reset()
        >>> obs.keys()
        dict_keys(['image', 'direction', 'mission'])
        >>> env = ImgObsWrapper(env)
        >>> obs, _ = env.reset()
        >>> obs.shape
        (7, 7, 3)
    """

    def __init__(self, env):
        """A wrapper that makes image the only observation.

        Args:
            env: The environment to apply the wrapper
        """
        super().__init__(env)
        self.observation_space = env.observation_space.spaces["image"]
        print(F"Set observation space to {self.observation_space}")

    def observation(self, obs):
        # print(F"obs in img obs wrapper {obs}")
        # tmp = {"data": obs["image"], "Test": obs["Test"]}  # broken: "Test" key does not exist
        return obs["image"]
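
Review note: the wrappers are meant to be composed as in env_creater_custom from 11_minigridrl.py; a short usage sketch (mine, not part of the commit):

# Usage sketch: compose MiniGridEnvWrapper and OneHotWrapper.
import gymnasium as gym

env = gym.make("MiniGrid-LavaCrossingS9N1-v0")
env = MiniGridEnvWrapper(env)  # dict obs: {"data", "avail_actions"}
env = OneHotWrapper(env, vector_index=0, framestack=4)
obs, info = env.reset()
print(obs["data"].shape)  # (single_frame_dim * framestack,) = (984 * 4,) = (3936,)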