o
    ci#                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZ dd
lmZ edddZejdd ejdeddd ejdddd dddZg dg dg dgZdefdd ZG d!d" d"eZed#kre  Z!ee!j"# $ej%d$d%ee!j&rd&nd ied'j'd(d)d* d+j(d,d-d.d/j)edd0d1Z*ee!j+d2d3ee!j,iZ-ee*e!e-d2d3id4 d5S d5S )6aX
  Example of using an env-task curriculum by implementing a custom callback.

This example:
    - demonstrates how to define your own curriculum-capable environments using
    gymnasium's FrozenLake env.
    - defines a custom callback that gets called once per iteration and - if necessary -
    changes the maps used by FrozenLake on all EnvRunners to a new task (by moving the
    goal position further and further away from the starting position).
    - also demonstrates an alternative approach via reloading/recreating an entirely new
    env inside all EnvRunners.
    - uses Tune and RLlib to curriculum-learn the env described above and compares 2
    algorithms, one that does use curriculum learning vs one that does not.

We use a FrozenLake (sparse reward) environment with a map size of 8x8 and a time step
limit of 16 to make it almost impossible for a non-curriculum policy to learn.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack`

Use the `--no-curriculum` flag to disable curriculum learning and force your policy
to be trained on the hardest task right away. With this option, the algorithm should NOT
succeed.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
In the console output, you can see that only PPO policy that uses a curriculum can
actually learn, whereas the one that is thrown into the toughest task right from the
start never learns anything.

Policy using the curriculum:
+-------------------------------+------------+-----------------+--------+
| Trial name                    | status     | loc             |   iter |
|-------------------------------+------------+-----------------+--------+
| PPO_FrozenLake-v1_93ca4_00000 | TERMINATED | 127.0.0.1:73318 |     41 |
+-------------------------------+------------+-----------------+--------+
+------------------+--------+----------+--------------------+
|   total time (s) |     ts |   reward |   episode_len_mean |
|------------------+--------+----------+--------------------|
|           97.652 | 164000 |        1 |            14.0348 |
+------------------+--------+----------+--------------------+

Policy NOT using the curriculum (trying to solve the hardest task right away):
[DOES NOT LEARN AT ALL]
    )partial)TRAINING_ITERATION)	Algorithm)RLlibCallbackFlattenObservations)DefaultModelConfig)ENV_RUNNER_RESULTSEPISODE_RETURN_MEANNUM_ENV_STEPS_SAMPLED_LIFETIME)add_rllib_example_script_args#run_rllib_example_script_experiment)get_trainable_clsd   i'	 )default_itersdefault_timestepsT)enable_new_api_stackz--upgrade-task-thresholdgGz?zLThe mean episode return, upon reaching of which we increase the task by one.)typedefaulthelpz--no-curriculum
store_truezaWhether to NOT use curriculum learning (and instead trying to solve the hardest task right away).)actionr   F   )is_slipperymax_episode_steps)SFFHFFFHFFFHFFFFFFGFFFFFFFFFFFFFHFFFFFFFHHFFFFHFFFFFFHHFFHFFFFFF)r   r   r   r   r   HHFFGFHFr!   r"   )r   r   r   r   r   r    r!   FHFFFFFGnew_taskc                 C   s"   | j jdt| id |   d S )Ndesc
env_config)configenvironmentENV_MAPSmake_env)
env_runnerr%    r.   e/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/examples/curriculum/curriculum_learning.py
_remote_fn   s   r0   c                   @   s,   e Zd ZdZdddededdfddZdS )	EnvTaskCallbackzMCustom callback implementing `on_train_result()` for changing the envs' maps.N)metrics_logger	algorithmresultreturnc                K   s   t jrd|jd< |jd }d|d< |t t }|t jkrM|dk rA|d }td| d| d |jjt	t
|d	d
 ||jd< d S |dkrKd|d< d S d S |dkrk|dkrmtd |jjt	t
dd	d
 d|jd< d S d S d S )N   current_env_taskg        task_solved   z)Switching task/map on all EnvRunners to #z (0=easiest, 2=hardest), b/c R=z on current task.)r%   )func      ?r   zOEmergency brake: Our policy seemed to have collapsed -> Setting task back to 0.)argsno_curriculum	_countersr	   r
   upgrade_task_thresholdprintenv_runner_groupforeach_env_runnerr   r0   )selfr3   r2   r4   kwargscurrent_taskcurrent_returnr%   r.   r.   r/   on_train_result   s:   





zEnvTaskCallback.on_train_result)__name__
__module____qualname____doc__r   dictrG   r.   r.   r.   r/   r1      s    r1   __main__zFrozenLake-v1r&   r6   r'      c                 C   s   t  S )Nr   )envspacesdevicer.   r.   r/   <lambda>   s    rR   )num_envs_per_env_runnerenv_to_module_connector   g{Gz?g-C6*?)
num_epochsvf_loss_coefflr)vf_share_layers)model_configr8   r;   )stopsuccess_metricN).rK   	functoolsr   ray.tune.resultr   ray.rllib.algorithms.algorithmr   ray.rllib.callbacks.callbacksr   "ray.rllib.connectors.env_to_moduler   -ray.rllib.core.rl_module.default_model_configr   ray.rllib.utils.metricsr	   r
   r   ray.rllib.utils.test_utilsr   r   ray.tune.registryr   parserset_defaultsadd_argumentfloatENV_OPTIONSr+   intr0   r1   rH   
parse_argsr<   algoget_default_config	callbacksr*   r=   env_runnerstraining	rl_modulebase_config
stop_itersstop_timestepsr[   r.   r.   r.   r/   <module>   s    8	
+2


