o
    ci&                  
   @   s  d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZ eddddZejddd ejdedd ejdedd ejdedd dede
deeeef fddZedkre Zd e_ eej!" j#ed!ej$id"j%d d#j&ej'rd$ned%d&ej(rd'ndej(ej)d!ej$d% id(d)d*Z*eej+e d+e d+e ej,eej-iZ.ee*ee.e d+e d+e ej,id, d$S d$S )-a  Example of customizing the evaluation procedure for an RLlib Algorithm.

Note, that you should only choose to provide a custom eval function, in case the already
built-in eval options are not sufficient. Normally, though, RLlib's eval utilities
that come with each Algorithm are enough to properly evaluate the learning progress
of your Algorithm.

This script uses the SimpleCorridor environment, a simple 1D gridworld, in which
the agent can only walk left (action=0) or right (action=1). The goal state is located
at the end of the (1D) corridor. The env exposes an API to change the length of the
corridor on-the-fly. We use this API here to extend the size of the corridor for the
evaluation runs.

For demonstration purposes only, we define a simple custom evaluation method that does
the following:
- It changes the corridor length of all environments used on the evaluation EnvRunners.
- It runs a defined number of episodes for evaluation purposes.
- It collects the metrics from those runs, summarizes these metrics and returns them.


How to run this script
----------------------
`python [script file name].py --enable-new-api-stack

You can switch off custom evaluation (and use RLlib's default evaluation procedure)
with the `--no-custom-eval` flag.

You can switch on parallel evaluation to training using the
`--evaluation-parallel-to-training` flag. See this example script here:
https://github.com/ray-project/ray/blob/master/rllib/examples/evaluation/evaluation_parallel_to_training.py  # noqa
for more details on running evaluation parallel to training.

For debugging, use the following additional command line options
`--no-tune --num-env-runners=0`
which should allow you to set breakpoints anywhere in the RLlib code and
have the execution stop there for inspection and debugging.

For logging to your WandB account, use:
`--wandb-key=[your WandB API key] --wandb-project=[some project name]
--wandb-run-name=[optional: WandB run name (within the defined project)]`


Results to expect
-----------------
You should see the following (or very similar) console output when running this script.
Note that for each iteration, due to the definition of our custom evaluation function,
we run 3 evaluation rounds per single training round.

...
Training iteration 1 -> evaluation round 0
Training iteration 1 -> evaluation round 1
Training iteration 1 -> evaluation round 2
...
...
+--------------------------------+------------+-----------------+--------+
| Trial name                     | status     | loc             |   iter |
|--------------------------------+------------+-----------------+--------+
| PPO_SimpleCorridor_06582_00000 | TERMINATED | 127.0.0.1:69905 |      4 |
+--------------------------------+------------+-----------------+--------+
+------------------+-------+----------+--------------------+
|   total time (s) |    ts |   reward |   episode_len_mean |
|------------------+-------+----------+--------------------|
|          26.1973 | 16000 | 0.872034 |            13.7966 |
+------------------+-------+----------+--------------------+
    )Tuple)TRAINING_ITERATION)	Algorithm)AlgorithmConfig)EnvRunnerGroup)SimpleCorridor)ENV_RUNNER_RESULTSEVALUATION_RESULTSEPISODE_RETURN_MEANNUM_ENV_STEPS_SAMPLED_LIFETIME)add_rllib_example_script_args#run_rllib_example_script_experiment)
ResultDict)get_trainable_cls2   gffffff?iP  )default_itersdefault_rewarddefault_timestepsz--no-custom-eval
store_true)actionz--corridor-length-training
   )typedefaultz--corridor-length-eval-worker-1   z--corridor-length-eval-worker-2   	algorithmeval_workersreturnc           	      C   s   |j dd d g }g }tdD ]*}td| j d|  |j dd dd	}|d
d |D  |dd |D  q| jj|ttfd | j	ttf}t
dd |D }t
dd |D }|||fS )zExample of a custom evaluation function.

    Args:
        algorithm: Algorithm class to evaluate.
        eval_workers: Evaluation EnvRunnerGroup.

    Returns:
        metrics: Evaluation metrics dict.
    c                    s    fdd j jjD S )Nc                 3   s.    | ]}|j  jd krtjntjV  qdS    N)	unwrappedset_corridor_lengthworker_indexargscorridor_length_eval_worker_1corridor_length_eval_worker_2).0envworker c/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/examples/evaluation/custom_evaluation.py	<genexpr>r   s    

z9custom_eval_function.<locals>.<lambda>.<locals>.<genexpr>)r'   r    envsr(   r*   r(   r+   <lambda>r   s   
 z&custom_eval_function.<locals>.<lambda>)func   zTraining iteration z -> evaluation round c                 S   s   |   |  fS N)sampleget_metricsr(   r*   r*   r+   r.      s    F)r/   local_env_runnerc                 s   s"    | ]}|d  D ]}|V  qqdS )r   Nr*   )r&   eps_and_mtrcsepsr*   r*   r+   r,      s    z'custom_eval_function.<locals>.<genexpr>c                 s   s    | ]}|d  V  qdS r   r*   )r&   r5   r*   r*   r+   r,      s    
)keyc                 s       | ]}|  V  qd S r1   )	env_stepsr&   r6   r*   r*   r+   r,          c                 s   r8   r1   )agent_stepsr:   r*   r*   r+   r,      r;   )foreach_env_runnerrangeprint	iterationextendmetrics	aggregater	   r   peeksum)	r   r   env_runner_metricssampled_episodesi$episodes_and_metrics_all_env_runnerseval_resultsr9   r<   r*   r*   r+   custom_eval_function`   s0   



rK   __main__Tcorridor_length)
env_config)create_env_on_local_workerN   r   auto   )rN   "metrics_num_episodes_for_smoothing)custom_evaluation_functionevaluation_num_env_runnersevaluation_intervalevaluation_durationevaluation_parallel_to_trainingevaluation_config/)stopsuccess_metric)/__doc__typingr   ray.tune.resultr   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   ray.rllib.env.env_runner_groupr   /ray.rllib.examples.envs.classes.simple_corridorr   ray.rllib.utils.metricsr   r	   r
   r   ray.rllib.utils.test_utilsr   r   ray.rllib.utils.typingr   ray.tune.registryr   parseradd_argumentintrK   __name__
parse_argsr#   
local_modealgoget_default_configenvironmentcorridor_length_trainingenv_runners
evaluationno_custom_evalrX   	overridesbase_config
stop_itersstop_rewardstop_timestepsr[   r*   r*   r*   r+   <module>   s|    A
I	)
