You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

163 lines
6.1 KiB

1 year ago
  1. import gymnasium as gym
  2. import minigrid
  3. import ray
  4. from ray.tune import register_env
  5. from ray.tune.experiment.trial import Trial
  6. from ray import tune, air
  7. from ray.rllib.algorithms.ppo import PPOConfig
  8. from ray.tune.logger import UnifiedLogger
  9. from ray.rllib.models import ModelCatalog
  10. from ray.tune.logger import pretty_print, UnifiedLogger, CSVLogger
  11. from ray.rllib.algorithms.algorithm import Algorithm
  12. from ray.air import session
  13. from torch_action_mask_model import TorchActionMaskModel
  14. from wrappers import OneHotShieldingWrapper, MiniGridShieldingWrapper
  15. from helpers import parse_arguments, create_log_dir, ShieldingConfig, test_name
  16. from shieldhandlers import MiniGridShieldHandler, create_shield_query
  17. from torch.utils.tensorboard import SummaryWriter
  18. from callbacks import MyCallbacks
  19. def shielding_env_creater(config):
  20. name = config.get("name", "MiniGrid-LavaCrossingS9N3-v0")
  21. framestack = config.get("framestack", 4)
  22. args = config.get("args", None)
  23. args.grid_path = F"{args.expname}_{args.grid_path}_{config.worker_index}.txt"
  24. args.prism_path = F"{args.expname}_{args.prism_path}_{config.worker_index}.prism"
  25. shielding = config.get("shielding", False)
  26. shield_creator = MiniGridShieldHandler(grid_file=args.grid_path,
  27. grid_to_prism_path=args.grid_to_prism_binary_path,
  28. prism_path=args.prism_path,
  29. formula=args.formula)
  30. env = gym.make(name, randomize_start=True)
  31. env = MiniGridShieldingWrapper(env, shield_creator=shield_creator, shield_query_creator=create_shield_query ,mask_actions=shielding != ShieldingConfig.Disabled)
  32. env = OneHotShieldingWrapper(env,
  33. config.vector_index if hasattr(config, "vector_index") else 0,
  34. framestack=framestack
  35. )
  36. return env
  37. def register_minigrid_shielding_env(args):
  38. env_name = "mini-grid-shielding"
  39. register_env(env_name, shielding_env_creater)
  40. ModelCatalog.register_custom_model(
  41. "shielding_model",
  42. TorchActionMaskModel
  43. )
  44. def trial_name_creator(trial : Trial):
  45. return "trial"
  46. def ppo(args):
  47. register_minigrid_shielding_env(args)
  48. logdir = args.log_dir
  49. config = (PPOConfig()
  50. .rollouts(num_rollout_workers=args.workers)
  51. .resources(num_gpus=0)
  52. .environment( env="mini-grid-shielding",
  53. env_config={"name": args.env,
  54. "args": args,
  55. "shielding": args.shielding is ShieldingConfig.Full or args.shielding is ShieldingConfig.Training,
  56. },)
  57. .framework("torch")
  58. .callbacks(MyCallbacks)
  59. .evaluation(evaluation_config={
  60. "evaluation_interval": 1,
  61. "evaluation_duration": 10,
  62. "evaluation_num_workers":1,
  63. "env": "mini-grid-shielding",
  64. "env_config": {"name": args.env,
  65. "args": args,
  66. "shielding": args.shielding is ShieldingConfig.Full or args.shielding is ShieldingConfig.Evaluation}})
  67. .rl_module(_enable_rl_module_api = False)
  68. .debugging(logger_config={
  69. "type": UnifiedLogger,
  70. "logdir": logdir
  71. })
  72. .training(_enable_learner_api=False ,model={
  73. "custom_model": "shielding_model"
  74. }))
  75. tuner = tune.Tuner("PPO",
  76. tune_config=tune.TuneConfig(
  77. metric="episode_reward_mean",
  78. mode="max",
  79. num_samples=1,
  80. trial_name_creator=trial_name_creator,
  81. ),
  82. run_config=air.RunConfig(
  83. stop = {"episode_reward_mean": 94,
  84. "timesteps_total": args.steps,},
  85. checkpoint_config=air.CheckpointConfig(checkpoint_at_end=True,
  86. num_to_keep=1,
  87. checkpoint_score_attribute="episode_reward_mean",
  88. ),
  89. storage_path=F"{logdir}",
  90. name=test_name(args),
  91. )
  92. ,
  93. param_space=config,)
  94. results = tuner.fit()
  95. best_result = results.get_best_result()
  96. import pprint
  97. metrics_to_print = [
  98. "episode_reward_mean",
  99. "episode_reward_max",
  100. "episode_reward_min",
  101. "episode_len_mean",
  102. ]
  103. pprint.pprint({k: v for k, v in best_result.metrics.items() if k in metrics_to_print})
  104. # algo = Algorithm.from_checkpoint(best_result.checkpoint)
  105. # eval_log_dir = F"{logdir}-eval"
  106. # writer = SummaryWriter(log_dir=eval_log_dir)
  107. # csv_logger = CSVLogger(config=config, logdir=eval_log_dir)
  108. # for i in range(args.evaluations):
  109. # eval_result = algo.evaluate()
  110. # print(pretty_print(eval_result))
  111. # print(eval_result)
  112. # # logger.on_result(eval_result)
  113. # csv_logger.on_result(eval_result)
  114. # evaluation = eval_result['evaluation']
  115. # epsiode_reward_mean = evaluation['episode_reward_mean']
  116. # episode_len_mean = evaluation['episode_len_mean']
  117. # print(epsiode_reward_mean)
  118. # writer.add_scalar("evaluation/episode_reward_mean", epsiode_reward_mean, i)
  119. # writer.add_scalar("evaluation/episode_len_mean", episode_len_mean, i)
  120. def main():
  121. ray.init(num_cpus=3)
  122. import argparse
  123. args = parse_arguments(argparse)
  124. ppo(args)
  125. ray.shutdown()
  126. if __name__ == '__main__':
  127. main()