o
    ci                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	m
Z
mZmZmZmZmZmZ d dlZd dlmZmZmZmZ d dlmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlm Z m!Z! d dl"m#Z#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< e	rd dl=m>Z>m?Z? d dl@mAZA e+ \ZBZCe, \ZDZEZFe- \ZGZCeHeIZJ				dedee jK deLdeMdeMde jKf
ddZNdfdd ZO	dgd!d"ZPdhd$eMfd%d&ZQde0 d'fd(d)d*eLd+eeR d,eSfd-d.ZT	/did0e6d1eLd2eLdeeL fd3d4ZUd5e6ddfd6d7ZVe)d5e6fd8d9ZW	djd:eeSe
f d0e6d;eRdeRfd<d=ZX	dkddddddddd>d?d@dAee jY d:ee dBee dCee dDee dEeRd;eRdee6ejZj[f fdFdGZ\dldHdIZ]dJdKdLedM dNd@dOeeSe
f dPeMddf
dQdRZ^dmdSeMddTfdUdVZ_G dWdX dXZ`dYdMdZejadejafd[d\Zb	]			dndYeSd^d@d_eRd`eRdaeeeS  dbeRfdcddZcdS )o    N)TYPE_CHECKINGAnyDictListOptionalTupleTypeUnion)BoxDiscreteMultiDiscreteMultiBinary)r   )r   )tune)WandbLoggerCallbackWANDB_ENV_VAR)DEFAULT_MODULE_IDColumns)is_atariwrap_deepmind)OldAPIStack)try_import_jaxtry_import_tftry_import_torch)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYENV_RUNNER_RESULTSEPISODE_RETURN_MEANEVALUATION_RESULTSNUM_ENV_STEPS_TRAINEDNUM_ENV_STEPS_SAMPLED_LIFETIME)
ResultDict)UnsupportedSpaceException)CLIReporter)TRAINING_ITERATION)	AlgorithmAlgorithmConfig)DatasetReader      Y@   順 parserdefault_rewarddefault_itersdefault_timestepsreturnc                 C   s  | du rt  } | jdtddd | jdddd	 | jd
g dddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jddd dd d | jd!td"d"d#gd$d% | jd&dd'd	 | jd(tdd)d | jd*tdg d+d,d% | jd-dd.d	 | jd/td0d1d | jd2tdd3d | jd4td5d6d | jd7tdd8d | jd9dd:d	 | jd;tdd<d | jd=tdd>d | jd?tdd@d | jdAt|dBd | jdCt|dDd | jdEt|dFd | jdGddHd	 | jdIddJd	 | jdKtddLd | jdMtddNd | jdOtddPd | jdQtddRd | jdStddT | jdUddVd	 | jdWtddXd | S )Yac  Adds RLlib-typical (and common) examples scripts command line args to a parser.

    TODO (sven): This function should be used by most of our examples scripts, which
     already mostly have this logic in them (but written out).

    Args:
        parser: The parser to add the arguments to. If None, create a new one.
        default_reward: The default value for the --stop-reward option.
        default_iters: The default value for the --stop-iters option.
        default_timesteps: The default value for the --stop-timesteps option.

    Returns:
        The altered (or newly created) parser object.
    Nz--algoPPOz&The RLlib-registered algorithm to use.)typedefaulthelpz--enable-new-api-stack
store_truezAWhether to use the `enable_rl_module_and_learner` config setting.)actionr1   z--framework)tftf2torchr6   zThe DL framework specifier.)choicesr0   r1   z--envz2The gym.Env identifier to run the experiment with.z--num-env-runnersz<The number of (remote) EnvRunners to use for the experiment.z--num-envs-per-env-runnerzThe number of (vectorized) environments per EnvRunner. Note that this is identical to the batch size for (inference) action computations.z--num-agentsr   a  If 0 (default), will run as single-agent. If > 0, will run as multi-agent with the environment simply cloned n times and each agent acting independently at every single timestep. The overall reward for this experiment is then the sum over all individual agents' rewards.z--evaluation-num-env-runnerszGThe number of evaluation (remote) EnvRunners to use for the experiment.z--evaluation-intervalz`Every how many iterations to run one round of evaluation. Use 0 (default) to disable evaluation.z--evaluation-durationc                 S   s   | dkr| S t | S )Nauto)int)v r;   N/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/utils/test_utils.py<lambda>       z/add_rllib_example_script_args.<locals>.<lambda>
   zThe number of evaluation units to run each evaluation round. Use `--evaluation-duration-unit` to count either in 'episodes' or 'timesteps'. If 'auto', will run as many as possible during train pass (`--evaluation-parallel-to-training` must be set then).z--evaluation-duration-unitepisodes	timestepszThe evaluation duration unit to count by. One of 'episodes' or 'timesteps'. This unit will be run `--evaluation-duration` times in each evaluation round. If `--evaluation-duration=auto`, this setting does not matter.)r/   r0   r7   r1   z!--evaluation-parallel-to-trainingzWhether to run evaluation parallel to training. This might help speed up your overall iteration time. Be aware that when using this option, your reported evaluation results are referring to one iteration before the current one.z--outputz\The output directory to write trajectories to, which are collected by the algo's EnvRunners.z--log-level)INFODEBUGWARNERRORz-The log-level to be used by the RLlib logger.z	--no-tunezWhether to NOT use tune.Tuner(), but rather a simple for-loop calling `algo.train()` repeatedly until one of the stop criteria is met.z--num-samples   zMHow many (tune.Tuner.fit()) experiments to execute - if possible in parallel.z--max-concurrent-trialsz1How many (tune.Tuner) trials to run concurrently.z	--verbose   zBThe verbosity level for the `tune.Tuner()` running the experiment.z--checkpoint-freqzThe frequency (in training iterations) with which to create checkpoints. Note that if --wandb-key is provided, all checkpoints will automatically be uploaded to WandB.z--checkpoint-at-endzWhether to create a checkpoint at the very end of the experiment. Note that if --wandb-key is provided, all checkpoints will automatically be uploaded to WandB.z--wandb-keyz/The WandB API key to use for uploading results.z--wandb-projectzThe WandB project name to use.z--wandb-run-namezThe WandB run name to use.z--stop-rewardz0Reward at which the script should stop training.z--stop-itersz"The number of iterations to train.z--stop-timestepsz8The number of (environment sampling) timesteps to train.z	--as-testzWhether this script should be run as a test. If set, --stop-reward must be achieved within --stop-timesteps AND --stop-iters, otherwise this script will throw an exception at the end.z--as-release-testzWhether this script should be run as a release test. If set, all that applies to the --as-test option is true, plus, a short JSON summary will be written into a results file whose location is given by the ENV variable `TEST_OUTPUT_JSON`.z--num-learnerszLThe number of Learners to use. If `None`, use the algorithm's default value.z--num-cpus-per-learnerzTThe number of CPUs per Learner to use. If `None`, use the algorithm's default value.z--num-gpus-per-learnerzThe number of GPUs per Learner to use. If `None` and there are enough GPUs for all required Learners (--num-learners), use a value of 1, otherwise 0.z#--num-aggregator-actors-per-learnerzaThe number of Aggregator actors to use per Learner. If `None`, use the algorithm's default value.z
--num-cpus)r/   r0   z--local-modez,Init Ray in local mode for easier debugging.z
--num-gpusz6The number of GPUs to use (only on the old API stack).)argparseArgumentParseradd_argumentstrr9   float)r)   r*   r+   r,   r;   r;   r<   add_rllib_example_script_args:   s  	

	

	rM      Fc              
   C   s  t | trJt |tsJ dt|  }|  D ]#\}}||v s*J d| d| t||| ||||d || q|rHJ dt||dS t | t	tfrt |t	tfs\J dt
|t
| kspJ dt
|t
| t| D ]\}	}t|||	 ||||d qtdS t | tjtfr|du rt| t|usJ d	|  d
| ddS t| t|u sJ d	|  d| ddS | du s|du st | tst | trt |tr|du r| |ksJ d	|  d| ddS | |ksJ d	|  d| ddS t| dr| jtkst| jdst | trKztj| | |du r-J d	|  d| dW dS  tyJ }
 z|du r?|
W Y d}
~
dS d}
~
ww tdurt |tjtjfrht rd| }ntdt | tjtjfrt r||  } n$t  }|!| } t| |||||dW  d   S 1 sw   Y  t"durt | t"jr| # $  } t |t"jr|# $  }ddl%m&} t | |r| ' } t ||r|' }|du r |du r ztjj(| ||d W n ty }
 z|du r|
W Y d}
~
dS d}
~
ww |du rJ d	|  d| ddS |du r'd}|du r.d}ztjj)| |||d W n tyU }
 z|du rJ|
W Y d}
~
dS d}
~
ww |du rfJ d	|  d| ddS )ao  
    Checks two structures (dict, tuple, list,
    np.array, float, int, etc..) for (almost) numeric identity.
    All numbers in the two structures have to match up to `decimal` digits
    after the floating point. Uses assertions.

    Args:
        x: The value to be compared (to the expectation: `y`). This
            may be a Tensor.
        y: The expected value to be compared to `x`. This must not
            be a tf-Tensor, but may be a tf/torch-Tensor.
        decimals: The number of digits after the floating point up to
            which all numeric values have to match.
        atol: Absolute tolerance of the difference between x and y
            (overrides `decimals` if given).
        rtol: Relative tolerance of the difference between x and y
            (overrides `decimals` if given).
        false: Whether to check that x and y are NOT the same.
    z2ERROR: If x is dict, y needs to be a dict as well!z ERROR: y does not have x's key='z'! y=)decimalsatolrtolfalsez3ERROR: y contains keys ({}) that are not in x! y={}z>ERROR: If x is tuple/list, y needs to be a tuple/list as well!z7ERROR: y does not have the same length as x ({} vs {})!Tz
ERROR: x (z) is y ()!z) is not y (Nz) is the same as y (z) is not the same as y (dtypez<UFzD`y` (expected value) must not be a Tensor. Use numpy.ndarray insteadr   )Stats)decimalgHz>)rP   rQ   )*
isinstancedictsetkeysitemscheckremoveformatlisttuplelen	enumeratenpbool_boolrK   r9   hasattrrT   object
startswithbytestestingassert_array_equalAssertionErrortf1TensorVariabler4   executing_eagerlynumpy
ValueErrorSessionrunr6   detachcpuray.rllib.utils.metrics.statsrU   peekassert_almost_equalassert_allclose)xyrO   rP   rQ   rR   y_keyskeyvalueiesessrU   r;   r;   r<   r\   N  s   


**""








"






r\   c                    s  ddl m}m  |zttj W n ty&   j	Y nw j
j fdd}fD ]}|u ritdd}|sNJ |jsXj}n
|jjdd d	}t|d
|}nj}dg|u rudgng  D ]e}dD ]`}	|u rddgndgD ]R}
tdd}dD ]G}|rdgng dD ];}td td|  td|  td|	  td|
  td|  td|  |||||
|	||| qqqq|qxq>dS )a  Tests different combinations of args for algorithm.compute_single_action.

    Args:
        algorithm: The Algorithm object to test.
        include_state: Whether to include the initial state of the Policy's
            Model in the `compute_single_action` call.
        include_prev_action_reward: Whether to include the prev-action and
            -reward in the `compute_single_action` call.

    Raises:
        ValueError: If anything unexpected happens.
    r   )DEFAULT_POLICY_IDSampleBatchc              	      sT  i }| u r||d< |d< |  }	t|trt|	dd}	d }
rN }
|
sNg }
d}d| jv rN|
jd|  j   |d7 }d| jv s4rT  nd }rZdnd }|dkr| u sfJ  j	|	i}rw|| j
< || j< |
r| jd	d
r|
|d< nt|
D ]\}}||d| < q tdd |}jd|||d|}t|d trt|d |d |d f}tdd |}zjd|||d|}|sjdst|| W n ty   Y nw | j|	|
f||||||d|}d }|
s|s| u r|\}}}|r;tt|
t|D ]\}}t|r.|j }nt|j}t||j q|d u rE| jd }|d u rO| jd }|dkr| kr|sx|sk|skttsxtd| d|  d ttr|s| jdrtt |dkrtd| d|  dd S d S d S d S d S d S )N
full_fetch	policy_id            ?r   	state_in_rF   
input_dictenable_rl_module_and_learnerFstate_inc                 S   s   t | dS Nr   )rc   expand_dimssr;   r;   r<   r=   %  s    z<check_compute_single_action.<locals>._test.<locals>.<lambda>)r   exploretimesteprG   c                 S   s   | d S r   r;   r   r;   r;   r<   r=   2  s    noisy)prev_actionprev_rewardr   r   unsquash_actionclip_actionnormalize_actionsclip_actionssinglezReturned action (z) of algorithm/policy z not in Env's action_space g      .@zC should be in normalized space, but seems too large/small for that!r;   )!samplerW   r
   rc   clipget_initial_stateview_requirementsappendspaceOBSPREV_ACTIONSPREV_REWARDSconfiggetrb   treemap_structurecompute_actions_from_input_dictr_   arraycompute_single_actionr\   	TypeErrorzipflattenr4   	is_tensorshapeas_listcontainsrr   anyabs)whatmethod_to_test	obs_spacer   r   r   unsquashr   call_kwargsobsr   r   	action_in	reward_inr   r   input_dict_batchedr3   action2	state_out_sisosi_shaper   action_space	algorithminclude_prev_action_rewardinclude_statemodelpidpolr;   r<   _test  s   




	
	







z*check_compute_single_action.<locals>._testenv_runner_groupNc                 S   s   | j S N)observation_space)pr;   r;   r<   r=     s    z-check_compute_single_action.<locals>.<lambda>)r   original_spacer   r   )TFFTr(   )TFNzP--------------------------------------------------------------------------------zwhat=zmethod_to_test=zexplore=zfull_fetch=z	unsquash=zclip=)ray.rllib.policy.sample_batchr   r   nextiter
env_runnerget_policies_to_train
get_policyAttributeErrorpolicyr   r   getattrlocal_env_runnerr   
for_policyrandomrandintprint)r   r   r   r   r   r   
worker_setr   r   r   r   r   r   r   r;   r   r<   check_compute_single_action  sf   
~r   d   	max_stepsc              
   C   s   ddl m} t|}t|r!t|| jd d | jd dd}| \}}d\}}}	d}
|s^|	sb|
|k rd|| dd||||	|d}|	|d d \}}}}	}|
d	7 }
|s`|	sf|
|k s6d
S d
S d
S d
S d
S d
S )a[  Checks whether the given policy can infer actions from an env with connectors.

    Args:
        policy: The policy to check.
        env_name: Name of the environment to check
        max_steps: The maximum number of steps to run the environment for.

    Raises:
        ValueError: If the policy cannot infer actions from the environment.
    r   )local_policy_inferencer   dim
framestack)r   r   )        FF)env_idagent_idr   reward
terminated	truncatedinforF   N)
ray.rllib.utils.policyr   gymmaker   r   r   r   resetstep)r   env_namer   r   envr   r   r   r   r   ts
action_outr;   r;   r<   check_inference_w_connectors  s2   


(r   z/episode_return_meantune_resultsztune.ResultGrid	min_value
evaluationmetricc                 C   s   g }|    D ]'\}}|s|du r(t d| |v r(||t d|   q|||  qt|}||k rCtd| d| dtd| d| d dS )ak  Throws an error if `min_reward` is not reached within tune_results.

    Checks the last iteration found in tune_results for its
    "episode_return_mean" value and compares it to `min_reward`.

    Args:
        tune_results: The tune.Tuner().fit() returned results object.
        min_reward: The min reward that must be reached.
        evaluation: If True, use `evaluation/env_runners/[metric]`, if False, use
            `env_runners/[metric]`, if None, use evaluation sampler results if
            available otherwise, use train sampler results.

    Raises:
        ValueError: If `min_reward` not reached.
    N/`` of  not reached! reached! ok)get_dataframeiterrowsr   r   maxrr   r   )r   r   r   r   recorded_valuesr   row
best_valuer;   r;   r<   check_learning_achieved  s   r  r   resultsupper_limitlower_limitc                 C   sr   ddl m} ddlm} | d | }||vrdS || t }||  kr(|ks7n td| d| d| d	|S )
ay  Verifies that the off-policy'ness of some update is within some range.

    Off-policy'ness is defined as the average (across n workers) diff
    between the number of gradient updates performed on the policy used
    for sampling vs the number of gradient updates that have been performed
    on the trained policy (usually the one on the local worker).

    Uses the published DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICY metric inside
    a training results dict and compares to the given bounds.

    Note: Only works with single-agent results thus far.

    Args:
        results: The training results dict.
        upper_limit: The upper limit to for the off_policy_ness value.
        lower_limit: The lower limit to for the off_policy_ness value.

    Returns:
        The off-policy'ness value (described above).

    Raises:
        AssertionError: If the value is out of bounds.
    r   r   LEARNER_INFOr   Nz`off_policy_ness` (z) is outside the given bounds (z - rS   )r   r   $ray.rllib.utils.metrics.learner_infor  r   rl   )r  r  r	  r   r  learner_infooff_policy_nessr;   r;   r<   check_off_policyness  s"   r  train_resultsc                 C   s  ddl m}m}m}m} ||||tdfD ]}|| v s%J d| d|  dqt| d ts1J dddlm	} | 
d	| d d	 ij}| | }|s_t|dks_t|v s_J dt d
| d| D ])\}	}
|	dkrlqc|
 D ]\}}|ds~|drt|sJ d| dqpqc| S )Checks proper structure of a Algorithm.train() returned dict.

    Args:
        train_results: The train results dict to check.

    Raises:
        AssertionError: If `train_results` doesn't have the proper structure or
            data in it.
    r   )r   FAULT_TOLERANCE_STATSLEARNER_RESULTSTIMERSr   ' ' not found in `train_results` (rS   &`config` in results not a python dict!r$   policiesz' not found in train_results['z']!__all_modules___min_max'key' value not a scalar ()ray.rllib.utils.metricsr   r  r  r  r"   rW   rX   %ray.rllib.algorithms.algorithm_configr$   update_from_dictis_multi_agentra   r   r[   endswithrc   isscalar)r  r   r  r  r  r~   r$   r"  learner_results	module_idmodule_metricsr   r;   r;   r<   !check_train_results_new_api_stack(  sJ   	
	r(  c                 C   s  ddl m} ddlm}m} ddtdddd	d
dddtfD ]}|| v s-J d| d|  dqdD ]}|| t v sEJ d| d| t  dq0t| d tsQJ dddl	m
} | d| d d ij}| d }||v stJ d| dd|v st|v sJ d| d|| }|st|dks||v sJ d| d| d| D ]9\}	}
|	dkrq|	dkrq||
v r|
| }n|
}| D ]\}}|ds|drt|sJ d| dqq| S )r  r   r
  )r  LEARNER_STATS_KEYr   custom_metricsr   iterations_since_restorenum_healthy_workersperftime_since_restoretime_this_iter_stimerstime_total_sr  r  rS   )	episode_len_meanepisode_reward_maxepisode_reward_meanepisode_reward_min
hist_statspolicy_reward_maxpolicy_reward_meanpolicy_reward_minsampler_perfz4' not found in `train_results[ENV_RUNNER_RESULTS]` (r  r  r  z)'learner' not in train_results['infos'] (num_steps_trainedz:'num_(env_)?steps_trained' not in train_results['infos'] (z2' not found in train_results['infos']['learner'] (batch_count__all__min_max_r  )r   r   r  r  r)  r   r"   rW   rX   r   r$   r!  r"  r   ra   r[   rh   rc   r$  )r  r   r  r)  r~   r$   r"  r   r  r   policy_statslearner_statsr   r;   r;   r<   check_train_resultsn  sx   


rB  stopkeep_ray_upc              	   C   s   |   D ]P\}}|}|dD ]}| }t|tr#||v r#|| }qd} |du r+qz t|sJ||krJtd| d| d |sFt	  W  dS W q t
yT   Y qw dS )a  Checks stopping criteria on `ResultDict`

    Args:
        stop: Dictionary of stopping criteria. Each criterium is a mapping of
            a metric in the `ResultDict` of the algorithm to a certain criterium.
        results: An RLlib `ResultDict` containing all results from a training step.
        keep_ray_up: Optionally shutting down the runnin Ray instance.

    Returns: True, if any stopping criterium is fulfilled. Otherwise, False.
    r   NzStop criterion (=) fulfilled!TF)r[   splitstriprW   rX   rc   isnanr   rayshutdownr   )rC  r  rD  r~   	thresholdvalkr;   r;   r<   should_stop  s*   
rO  )rC  success_metric	trainabletune_callbackskeep_configrD  	schedulerprogress_reporterbase_configr$   argsrP  rQ  rR  rS  c          &         s`  |du rt  }
|
 }|jrd|_tj|jpd|jdd |du r6t dt	 |j
t dt |jt|ji}| }|sG||j |jdurQ|jdu rQ||j |js[|jddd |jdurg|j|jd |jdurs|j|jd |jr|jdur|jd	krtd
t dd	}|jdur|jn|jpd}|jpd	| }|jdur|jnd| }|jd	d |jdur|j|jd |j dur|j|j d |jdu r||kr|jdd n"|jd	d n||k rtd|j d|j d| d|j|jd |j!dur
|j|j!d n|jdur|j|jd |j"d	kr-|j#|j$|j"|j%|j&|j'd |j(dur:|j)|j(d |j*durG|j+|j*d |j,r|jsS|jrUJ |- }t.|t|jD ]}|/ }t|v r|t t	t0j1}t2d| d| dd t3|v rt|t3 v r|t3 t t	 }t2d| dd t2  |4 D ]K\}}|}|5dD ]}z|| }W q t6y   d}Y  nw |durt07|s||krt2d| d| d  |st8  |    S qqb|st8  |S |pg }t9|d!rK|j:dust;t<j=v rK|j:pt<j=t; }|j>p3|j?@ d" tABd#d"tC|j@  }|DtEdE||dd$|jFrFd%|jFini  |	du rr|jGd	krrtHi td&d'd(td)t dt	 d*id+d, |jID d-}	d.t<j=d/< tJJ }tKjL|p|jM|tKjN||jO|tKjP|jQ|jRd0|	d1tKjS|jT|jU|d2d3V }tJJ | }|st8  |jWrtXd4d5d6 |jWD  d}|jr|du rt3 dt dt	 t dt	 fD ]}||v r||| i} nq|du rt dt	 |j
i}tYtZ|4 \}t[fd7d8|\ ] D } | |kr'd}t2d9 d:| d; |jr|j^j_d	 }!|!j`  ad<d tb||!jcgtbtJJ  |g| g|sTtC|!dini d=}"tdt<j=d>d?d@5}#ztef|"|# W n$ tgy   d	dAlhmi}$  fdBd,|$jjD }%|%|"dC< tef|"|# Y nw W d   n	1 sw   Y  |std9 d:| dD|S )Fav  Given an algorithm config and some command line args, runs an experiment.

    There are some constraints on what properties must be defined in `args`.
    It should ideally be generated via calling
    `args = add_rllib_example_script_args()`, which can be found in this very module
    here.

    The function sets up an Algorithm object from the given config (altered by the
    contents of `args`), then runs the Algorithm via Tune (or manually, if
    `args.no_tune` is set to True) using the stopping criteria in `stop`.

    At the end of the experiment, if `args.as_test` is True, checks, whether the
    Algorithm reached the `success_metric` (if None, use `env_runners/
    episode_return_mean` with a minimum value of `args.stop_reward`).

    See https://github.com/ray-project/ray/tree/master/rllib/examples for an overview
    of all supported command line options.

    Args:
        base_config: The AlgorithmConfig object to use for this experiment. This base
            config will be automatically "extended" based on some of the provided
            `args`. For example, `args.num_env_runners` is used to set
            `config.num_env_runners`, etc..
        args: A argparse.Namespace object, ideally returned by calling
            `args = add_rllib_example_script_args()`. It must have the following
            properties defined: `stop_iters`, `stop_reward`, `stop_timesteps`,
            `no_tune`, `verbose`, `checkpoint_freq`, `as_test`. Optionally, for WandB
            logging: `wandb_key`, `wandb_project`, `wandb_run_name`.
        stop: An optional dict mapping ResultDict key strings (using "/" in case of
            nesting, e.g. "env_runners/episode_return_mean" for referring to
            `result_dict['env_runners']['episode_return_mean']` to minimum
            values, reaching of which will stop the experiment). Default is:
            {
            "env_runners/episode_return_mean": args.stop_reward,
            "training_iteration": args.stop_iters,
            "num_env_steps_sampled_lifetime": args.stop_timesteps,
            }
        success_metric: Only relevant if `args.as_test` is True.
            A dict mapping a single(!) ResultDict key string (using "/" in
            case of nesting, e.g. "env_runners/episode_return_mean" for referring
            to `result_dict['env_runners']['episode_return_mean']` to a single(!)
            minimum value to be reached in order for the experiment to count as
            successful. If `args.as_test` is True AND this `success_metric` is not
            reached with the bounds defined by `stop`, will raise an Exception.
        trainable: The Trainable sub-class to run in the tune.Tuner. If None (default),
            use the registered RLlib Algorithm class specified by args.algo.
        tune_callbacks: A list of Tune callbacks to configure with the tune.Tuner.
            In case `args.wandb_key` is provided, appends a WandB logger to this
            list.
        keep_config: Set this to True, if you don't want this utility to change the
            given `base_config` in any way and leave it as-is. This is helpful
            for those example scripts which demonstrate how to set config settings
            that are otherwise taken care of automatically in this function (e.g.
            `num_env_runners`).

    Returns:
        The last ResultDict from a --no-tune run OR the tune.Tuner.fit()
        results.
    NT)num_cpus
local_modeignore_reinit_errorr   F)r   "enable_env_runner_and_connector_v2)num_env_runners)num_envs_per_env_runnerr   z--num-gpus is not supported on the new API stack! To train on GPUs, use the command line options `--num-gpus-per-learner=1` and `--num-learners=[your number of available GPUs]`, instead.GPUrF   num_gpus)num_learners)!num_aggregator_actors_per_learnernum_gpus_per_learnerz0You are running your script with --num-learners=z and --num-gpus-per-learner=z, but your cluster only has z GPUs!)num_cpus_per_learner)evaluation_num_env_runnersevaluation_intervalevaluation_durationevaluation_duration_unitevaluation_parallel_to_training)	log_level)outputziter=z R= )endz	 R(eval)=zStop criterium (rE  rF  	wandb_key-z\W+)api_keyprojectupload_checkpointsnamer   r1  ztotal time (s)r   zcombined returnc                 S   s"   i | ]}t  d | d| qS )z/module_episode_returns_mean/zreturn )r   ).0r   r;   r;   r<   
<dictcomp>  s
    z7run_rllib_example_script_experiment.<locals>.<dictcomp>)metric_columns0RAY_AIR_NEW_OUTPUT)checkpoint_frequencycheckpoint_at_end)rC  verbose	callbackscheckpoint_configrU  )num_samplesmax_concurrent_trialsrT  )param_space
run_configtune_configz;Running the example script resulted in one or more errors! c                 S   s   g | ]
}|j d  j d qS r   rG   )rW  )ru  r   r;   r;   r<   
<listcomp>J  s    z7run_rllib_example_script_experiment.<locals>.<listcomp>c                 3   s    | ]	\}}|  V  qd S r   r;   )ru  r   r  )success_metric_keyr;   r<   	<genexpr>a  s    
z6run_rllib_example_script_experiment.<locals>.<genexpr>r   r   r   r   )
time_takentrial_stateslast_updatestatspassed
not_passedfailuresTEST_OUTPUT_JSONz/tmp/learning_test.jsonwt)r#   c                    s   i | ]}| v r| | qS r;   r;   )ru  rN  )r  r;   r<   rv    s    r  r   r;   )krM   
parse_argsas_release_testas_testrJ  initrX  rY  r   r   stop_rewardr   stop_timestepsr"   
stop_iters	frameworkr   environmentenable_new_api_stack	api_stackr\  env_runnersr]  r   r`  rr   cluster_resourcesr   ra  rd  	resourceslearnersrb  re  rg  r   rf  rh  ri  rj  rk  	debuggingrl  offline_datano_tunebuildrangetrainrc   nanr   r   r[   rG  KeyErrorrI  rK  rf   ro  r   osenvironwandb_projectalgolowerresubrK   r   r   wandb_run_name
num_agentsr!   r  timer   Tuner
algo_class	RunConfigr|  CheckpointConfigcheckpoint_freqr{  
TuneConfigr  r  fiterrorsRuntimeErrorr   r   r  r   r  _experiment_analysistrialslast_resultpoprL   statusopenjsondump	Exceptionray.rllib.algorithms.algorithmr#   _progress_metrics)&rV  rW  rC  rP  rQ  rR  rS  rD  rT  rU  r)   r   num_gpus_availablenum_actual_learnersnum_gpus_requestednum_gpus_needed_if_availabler  r   r  mean_returnRevalr~   rL  rM  rN  ro  rr  
start_timer  test_passedtry_itsuccess_metric_valuer  trialjson_summaryfr#   simplified_statsr;   )r  r  r<   #run_rllib_example_script_experiment	  s  H






		


 
$









	
r  c           
      C   s   ddl m}m} t| t|u s J dtt| tt|d
dd}t| |u r2|| | dS t| |u rs| j|jks@J t }| j	 D ]\}}|||j| | |
| qHt|j }||}	|	rqJ d|	 ddS td	tt|  )a  Check if both batches are (almost) identical.

    For MultiAgentBatches, the step count and individual policy's
    SampleBatches are checked for identity. For SampleBatches, identity is
    checked as the almost numerical key-value-pair identity between batches
    with ray.rllib.utils.test_utils.check(). unroll_id is compared only if
    both batches have an unroll_id.

    Args:
        batch1: Batch to compare against batch2
        batch2: Batch to compare against batch1
    r   )MultiAgentBatchr   z.Input batches are of different types {} and {}Nc           
      S   s   |  dd }| dd }|d ur|d ur||ksJ t }|  D ]\}}|dkr*q!t|||  || q!t| }|d ||}	|rU|	rSJ d||	d S |	r^J d|	d S )N	unroll_idz]SampleBatches for policy with ID {} don't share information on the following information: 
{}zGSampleBatches don't share information on the following information: 
{})	r   rY   r[   r\   addrZ   discardsymmetric_differencer^   )
_batch1_batch2
_policy_idunroll_id_1unroll_id_2batch1_keysrN  r:   batch2_keys_differencer;   r;   r<   check_sample_batches  s,   

z.check_same_batch.<locals>.check_sample_batchesz:MultiAgentBatches don't share the following information: 
.zUnsupported batch type r   )r   r  r   r/   r^   rK   countrY   policy_batchesr[   r  rZ   r  rr   )
batch1batch2r  r   r  
batch1_idsr   policy_batch
batch2_ids
differencer;   r;   r<   check_same_batch  s4   
#

r  rF   )training_iterationr  r#   algo_config	fw_kwargsr  c          
      C   sd  ddl m} ddlm} t|i}dD ]}|jddj|ddjtt	j
d	d
djtt	j
d	d
d}td| j d| d td t|  tj| | tj|ddd }| j}tj| | tj|ddd }	|	 j}	t|t d |	t d  |jrt|d | | |	d | |  qt|d | | d |	d | | d  qdS )a  Check if the algorithm is reproducible across different testing conditions:

        frameworks: all input frameworks
        num_gpus: int(os.environ.get("RLLIB_NUM_GPUS", "0"))
        num_workers: 0 (only local workers) or
                     4 ((1) local workers + (4) remote workers)
        num_envs_per_env_runner: 2

    Args:
        algo_class: Algorithm class to test.
        algo_config: Base config to use for the algorithm.
        fw_kwargs: Framework iterator keyword arguments.
        training_iteration: Number of training iterations to run.

    Returns:
        None

    Raises:
        It raises an AssertionError if the algorithm is not reproducible.
    r   r
  r  r  *   )seedrG   )r\  r]  RLLIB_NUM_GPUSrx  rc  r_  zTesting reproducibility of z with z workersz
/// configrF   )rC  r|  )r  r  r6  r   rA  N)r   r   r  r  r"   r  r  r  r9   r  r  r   r  r   __name__pprintto_dictr   r  r  r  get_best_resultmetricsr\   r   r   )
r  r  r  r  r   r  	stop_dictnum_workersresults1results2r;   r;   r<   check_reproducibilty  sf   




r  
batch_sizer%   c                 C   sz   ddl m} ddlm} ddlm}m} d}d|d}|| jd|d	\}}|| j| d
jdddd}	|||	}
|
S )zReturns a DatasetReader for the cartpole dataset.
    Args:
        batch_size: The batch size to use for the reader.
    Returns:
        A rllib DatasetReader for the cartpole dataset.
    r   r  )	IOContext)r%   get_dataset_and_shardsztests/data/cartpole/large.jsonr  )r^   pathsdataset)input_input_config)train_batch_sizeT)actions_in_input_normalized)r   worker_index)	ray.rllib.algorithmsr$   ray.rllib.offliner   ray.rllib.offline.dataset_readerr%   r  r  training)r  r$   r  r%   r  pathr  r  r   ioctxreaderr;   r;   r<   get_cartpole_dataset_reader7  s$   

r  c                   @   s4   e Zd ZdZdd Zddedefd	d
Zdd ZdS )ModelCheckera  Helper class to compare architecturally identical Models across frameworks.

    Holds a ModelConfig, such that individual models can be added simply via their
    framework string (by building them with config.build(framework=...).
    A call to `check()` forces all added models to be compared in terms of their
    number of trainable and non-trainable parameters, as well as, their
    computation results given a common weights structure and values and identical
    inputs to the models.
    c                 C   s,   || _ i | _i | _tjdd| _i | _d S )Ng{Gzg{Gz?)r   param_countsoutput_valuesrc   r   uniformrandom_fill_input_valuemodels)selfr   r;   r;   r<   __init__a  s
   
zModelChecker.__init__r6   TFr  r-   c           	      C   s   | j j|d }| j|< tdg|rdgng  t| j j | j}|r(tj	|i}|r5t
dd ||tj< |dkrCddlm} ||}||}|j| jfd ||}| | j|< |dkrkt
d	d || j|< |S t
d
d || j|< |S )z+Builds a new Model for the given framework.)r  rF   c                 S   s   t jdgt|  dS )NrF   )r   )rc   zerosr_   r   r;   r;   r<   r=   ~  s    z"ModelChecker.add.<locals>.<lambda>r6   r   )convert_to_torch_tensor)value_sequencec                 S   s   | d ur
|    S d S r   )ru   rq   r   r;   r;   r<   r=     s    c                 S   s   | d ur|   S d S r   )rq   r   r;   r;   r<   r=     r>   )r   r  r  rc   fullr_   
input_dimsr  r   r   r   r   STATE_INray.rllib.utils.torch_utilsr  _set_to_dummy_weightsget_num_parametersr  r  )	r  r  r   stater   inputsr  outputscomparable_outputsr;   r;   r<   r  q  s8   



zModelChecker.addc                 C   sZ   t t| j }| j D ]
}t|| j|  q| j D ]}t|| j| dd qdS )zECompares all added Models with each other and possibly raises errors.gMb@?)rP   N)r   r   r  rZ   r  valuesr\   r  )r  main_keycr:   r;   r;   r<   r\     s   zModelChecker.checkN)r6   TF)	r  
__module____qualname____doc__r  rK   r   r  r\   r;   r;   r;   r<   r  V  s
    
.r  algr   c                 C   s0   g }t dD ]}|t| | qt|S )aQ  Returns the mean action computed by the given algorithm.

    Note: This makes calls to `Algorithm.compute_single_action`

    Args:
        alg: The constructed algorithm to run inference on.
        obs: The observation to compute the action for.

    Returns:
        The mean action computed by the algorithm over 5000 samples.

    i  )r  r   rL   r   rc   mean)r*  r   outr   r;   r;   r<   _get_mean_action_from_algorithm  s   
r-  Tr   r  check_bounds
frameworksuse_gpuc                    s  ddl m} ddlm  ddlm ddlm tdt	ddd	t
jd
t	dddt
jd
tg dttdtdt	ddd	t
jd
gttdt	dddt
jd
tdttdtdgiddtg dtdt	ddd	t
jd
t	dddt
jd
t	dddt
jd
ttdt	ddd	t
jd
gttdt	ddd	t
jd
ddg dddgd }}d|d< ||d<  f	dd }	|sd!}t|	}
|
j|rd"ndd#}
 D ]}|}t|
| ||| qو D ]}|}t|
| ||| qd$S )%aD  Checks whether the given algorithm supports different action and obs spaces.

        Performs the checks by constructing an rllib algorithm from the config and
        checking to see that the model inside the policy is the correct one given
        the action and obs spaces. For example if the action space is discrete and
        the obs space is an image, then the model should be a vision network with
        a categorical action distribution.

    Args:
        alg: The name of the algorithm to test.
        config: The config to use for the algorithm.
        train: Whether to train the algorithm for a few iterations.
        check_bounds: Whether to check the bounds of the action space.
        frameworks: The frameworks to test the algorithm with.
        use_gpu: Whether to check support for training on a gpu.


    r   )	RandomEnv)ComplexInputNetwork)FullyConnectedNetwork)VisionNetworkrN   r   r   )rN   )rT      )rG   r5  )rF   rG   r5     rG   rF   a)action_choice
parametersyet_another_nested_dict)discrete
continuousint_actionsmultidiscreter`   rX   )r5  r?   r?   )rN   rN   )T   r@  rF   r?   )taskposition)multi_binaryr<  r=  vector2dimager`   rX   )rC  r<  r=  rE  r`   rX   r<  r=  rE   rk  r   c                    s  |  }|  |jr'|vrtd| d S |vr'td| d S |d }| }| }td| ||| t }|t	t	||t
dddtjdddd	 d
}	z| }
W n? tjjy } z't|jdkr{t|jd tr{d}	nt|jd jd trd}	n W Y d }~nKd }~w ty   d}	Y n=w | dvr|dv rt|
 jsJ n|dkrt|
 jsJ n|dkrt|
 j fsJ r|
  |
  td|	t |  d S )Nz1Skipping PPO test with RLModules for obs space {}z4Skipping PPO test with RLModules for action space {}r  z7=== Testing {} (fw={}) action_space={} obs_space={} ===r   r;   )r   rT   )r   r   reward_spacep_terminatedcheck_action_bounds)
env_configokrG   unsupportedr   )SACr.   )atarirE  r=  rD  zTest: {}, ran in {}s)copyvalidater   loggerwarningr^   r   r  r!  rX   r
   rc   float32r  rJ  
exceptionsRayActorErrorra   rW  rW   r    r   r   r  rC  )r*  r   a_nameo_nameconfig_copyfwr   r   t0statr  r   	TorchComplexNet
TorchFCNetTorchVisionNetaction_spaces_to_testr.  observation_spaces_to_test rlmodule_supported_action_spaces%rlmodule_supported_observation_spacesr  r;   r<   	_do_check  sz   z)check_supported_spaces.<locals>._do_check)r5   r4   r6   rF   r_  N)*ray.rllib.examples.envs.classes.random_envr1  (ray.rllib.models.torch.complex_input_netr2  ray.rllib.models.torch.fcnetr3   ray.rllib.models.torch.visionnetr4  r   r
   rc   rR  int32r   GymTupleGymDictr   rJ  remoteoptionsrZ   r   )r*  r   r  r.  r/  r0  r1  default_observation_spacedefault_action_spacerc  _do_check_remoterU  rV  r;   r[  r<   check_supported_spaces  sb   


I
rp  )Nr&   r'   r(   )rN   NNF)FF)r   )r   )Fr   )r-   Nr7  )TFNF)drH   r  loggingr  r  r   r  r  typingr   r   r   r   r   r   r   r	   	gymnasiumr   gymnasium.spacesr
   r   r   r   rj  ri  rq   rc   r   rJ  r   ray.air.integrations.wandbr   r   ray.rllib.corer   r   %ray.rllib.env.wrappers.atari_wrappersr   r   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   r   r   r  r   r   r   r   r   r   ray.rllib.utils.typingr   ray.rllib.utils.errorr    ray.tuner!   ray.tune.resultr"   r  r#   r$   r	  r%   jaxr   rm   r4   tfvr6   	getLoggerr  rP  rI   rL   r9   rM   r\   r   r   re   rK   r  r  r(  rB  rO  	Namespaceresult_grid
ResultGridr  r  r  r  r  ndarrayr-  rp  r;   r;   r;   r<   <module>   s8   ( 



  
 
 H.
(
3Fj

3	
   
S

\X
