o
    ciO                     @   s  d dl mZ d dlmZ d dlZd dlZd dlZd dlmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZ d d
lmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZN d dlOmPZP d dlQmRZRmSZSmTZT d dlUmVZVmWZW d dlXmYZY eZdZ[eYddG dd de)e4Z\dS )     )defaultdict)partialN)
CollectionDefaultDictListOptionalUnion)DictInfoToList)AlgorithmConfig)RLlibCallback)make_callback)!COMPONENT_ENV_TO_MODULE_CONNECTOR!COMPONENT_MODULE_TO_ENV_CONNECTORCOMPONENT_RL_MODULEDEFAULT_AGENT_IDDEFAULT_MODULE_ID)Columns)RLModuleRLModuleSpec)INPUT_ENV_SPACESINPUT_ENV_SINGLE_SPACES)
EnvContext)	EnvRunnerENV_STEP_FAILURE)SingleAgentEpisode)_gym_env_creator)
force_list)override)Checkpointable)
Deprecated)
get_device)ENV_TO_MODULE_CONNECTOREPISODE_DURATION_SEC_MEANEPISODE_LEN_MAXEPISODE_LEN_MEANEPISODE_LEN_MINEPISODE_RETURN_MAXEPISODE_RETURN_MEANEPISODE_RETURN_MINMODULE_TO_ENV_CONNECTORNUM_AGENT_STEPS_SAMPLED NUM_AGENT_STEPS_SAMPLED_LIFETIMENUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIMENUM_EPISODESNUM_EPISODES_LIFETIMENUM_MODULE_STEPS_SAMPLED!NUM_MODULE_STEPS_SAMPLED_LIFETIMERLMODULE_INFERENCE_TIMERSAMPLE_TIMERTIME_BETWEEN_SAMPLINGWEIGHTS_SEQ_NO)unbatch)	EpisodeID
ResultDict	StateDict)ENV_CREATOR_global_registry)	PublicAPIz	ray.rllibalpha)	stabilityc                       s  e Zd ZdZeedef fddZeeddddddded	ed
e	de	de	de
e fddZddddddee d	ee d
e	de	de	de
e fddZeedd ZeedefddZee	dBdddeeeee f  deeeee f  defddZeededdfddZeedd  Zeed!d" Zeed#d$ Zeed%d& ZeedCd'd(Zeed)d* Zeed+d, Zd-d. Z dBd/d0Z!d1ed2ed3e
e fd4d5Z"d6d7 Z#d8d9 Z$e%d:d;d<d=d> Z&e%d?d;d<d@dA Z'  Z(S )DSingleAgentEnvRunnerz9The generic environment runner for the single agent case.configc                   s&  t  jdd|i| |d| _|di | _dd t| jjD | _t	| j| j
s,dn| jj| _d| _d| _| j
du sL| j
dksL| jjsL| jjdkrP|   | jj| j| j| jd| _d| _d| _|   | jj| j| jd	| _d
| _dd t| jD | _d| _g | _tt| _ d| _!d| _"dS )zInitializes a SingleAgentEnvRunner instance.

        Args:
            config: An `AlgorithmConfig` object containing all settings needed to
                build this `EnvRunner` class.
        r@   tune_trial_idspacesc                 S   s   g | ]}| qS  rC   ).0clsrC   rC   Y/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/env/single_agent_env_runner.py
<listcomp>U   s    z1SingleAgentEnvRunner.__init__.<locals>.<listcomp>r   N)envrB   device)rH   rB   Tc                 S      g | ]}d qS NrC   rD   _rC   rC   rF   rG      s    rC   )#super__init__getrA   rB   r   r@   callbacks_class
_callbacksr    worker_indexnum_gpus_per_env_runner_devicerH   num_envscreate_env_on_local_workernum_env_runnersmake_envbuild_env_to_module_connector_env_to_module_cached_to_modulemodulemake_modulebuild_module_to_env_connector_module_to_env_needs_initial_resetrange	_episodes_shared_data_done_episodes_for_metricsr   list_ongoing_episodes_for_metrics_weights_seq_no_time_after_sampling)selfr@   kwargs	__class__rC   rF   rO   G   sJ   



zSingleAgentEnvRunner.__init__NFnum_timestepsnum_episodesexplorerandom_actionsforce_resetro   rp   rq   rr   rs   returnc             
   C   sH  | j du rt|  d|dur|durJ | jdur(| jjtt | j d | jjt| j	dd | j
t] |du rA| jj}|du rY|du rY| jjdkrY| j| j| j }|durg| j||||d}n|durt| j|||d}n	| j| j||d}td	| j| jjt| | j|d
d W d   n1 sw   Y  t | _|S )a  Runs and returns a sample (n timesteps or m episodes) on the env(s).

        Args:
            num_timesteps: The number of timesteps to sample during this call.
                Note that only one of `num_timetseps` or `num_episodes` may be provided.
            num_episodes: The number of episodes to sample during this call.
                Note that only one of `num_timetseps` or `num_episodes` may be provided.
            explore: If True, will use the RLModule's `forward_exploration()`
                method to compute actions. If False, will use the RLModule's
                `forward_inference()` method. If None (default), will use the `explore`
                boolean setting from `self.config` passed into this EnvRunner's
                constructor. You can change this setting in your config via
                `config.env_runners(explore=True|False)`.
            random_actions: If True, actions will be sampled randomly (from the action
                space of the environment). If False (default), actions or action
                distribution parameters are computed by the RLModule.
            force_reset: Whether to force-reset all (vector) environments before
                sampling. Useful if you would like to collect a clean slate of new
                episodes via this call. Note that when sampling n episodes
                (`num_episodes != None`), this is fixed to True.

        Returns:
            A list of `SingleAgentEpisode` instances, carrying the sampled data.
        Nz2 doesn't have an env! Can't call `sample()` on it.)keyvalue   )ru   rv   windowtruncate_episodes)ro   rq   rr   rs   )rp   rq   rr   on_sample_end)
env_runnermetrics_loggersamplescallbacks_objectscallbacks_functionsrk   )rH   
ValueErrorri   metrics	log_valuer4   timeperf_counterr5   rh   log_timer3   r@   rq   
batch_modeget_rollout_fragment_lengthrS   rV   _sampler   rR   callbacks_on_sample_enddict)rj   ro   rp   rq   rr   rs   r}   rC   rC   rF   sample   sp   
"

3zSingleAgentEnvRunner.sample)ro   rp   rr   rs   c             
      s  g }|s|dusj r(dd tjD  }_i  }_||| d_ nj}j}|dur5d_ d}	d}
|durC|	|k rn|
|k r|rTtjjj	
 i}nhj}|dus]J d_|rjjtdd|	 jjppd }jt jj||d	}W d   n1 sw   Y  njt j|}W d   n1 sw   Y  jj||||jtfd
}|tj}|tj|}|}|tkr܈j||||ddS |\}}}}}t|t|}}t }tjD ]J  fdd| D }j |t!< j  j"s |  j#|  |  d |$  q|	d7 }	|  j%|  |  |  |  |  |  |d qjdurTj&||j|jt'fd_tjD ]K  |v rh(d | n(d | |  j)r|
d7 }
(d | jj*r|+|  ,  n|+|   |
|kr n- | qY|dur|	|k sHn|
|k sHj./| g }|durfddjD }jD ]+}
|
j0dkrאq|
1  j2|
j3 +|
 jj*r|+|
,  q|+|
 q|_4|	t5| || S )z2Helper method to sample n timesteps or m episodes.Nc                 S   rJ   rK   rC   rL   rC   rC   rF   rG     s    z0SingleAgentEnvRunner._sample.<locals>.<listcomp>FTr   defaultrw   )t)	rl_modulebatchepisodesrq   shared_datar   metrics_prefix_keyrn   c                    s   i | ]	\}}||  qS rC   rC   )rD   kv)	env_indexrC   rF   
<dictcomp>_  s    z0SingleAgentEnvRunner._sample.<locals>.<dictcomp>observationinfos)r   actionrewardr   
terminated	truncatedextra_model_outputs)r   rq   r   r   r   r   on_episode_starton_episode_stepon_episode_endc                    s   g | ]
}|j  jjd qS ))len_lookback_buffer)cutr@   episode_lookback_horizon)rD   epsrj   rC   rF   rG     s    )6ra   rb   rV   rc   rd   _reset_envsr   ACTIONSrH   action_spacer   r\   r   peekr-   r@   rX   r   r2   r]   forward_explorationforward_inferencer`   r)   popACTIONS_FOR_ENV_try_env_stepr   r   r6   setitemsrh   r5   is_resetadd_env_resetaddadd_env_stepr[   r!   _make_on_episode_callbackis_doneepisodes_to_numpyappendto_numpy_new_episodere   extendr   validaterg   id__increase_sampled_metricslen)rj   ro   rp   rq   rr   rs   done_episodes_to_returnr   r   tsr   to_env	to_moduleglobal_env_steps_lifetimeactionsactions_for_envresultsobservationsrewardsterminateds
truncatedsr   call_on_episode_startextra_model_outputongoing_episodes_to_returnongoing_episodes_continuationsrC   )r   rj   rF   r      s   



	


  



zSingleAgentEnvRunner._samplec                 C   sD   | j d u r| jS t| j j| j jft| j j| j jft| j	j| j jfiS rK   )
rH   rB   r   observation_spacer   r   single_observation_spacesingle_action_spacer   r[   r   rC   rC   rF   
get_spaces  s   
zSingleAgentEnvRunner.get_spacesc                 C   s   | j D ]@}|js
J t|}| }| }|j| jv r<| j|j D ]}|t|7 }|| 7 }|| 7 }q"| j|j= | ||| q| j   | j	
 S rK   )re   r   r   
get_returnget_duration_sr   rg   _log_episode_metricsclearr   reduce)rj   r   episode_lengthepisode_returnepisode_duration_seps2rC   rC   rF   get_metrics  s    




z SingleAgentEnvRunner.get_metrics)not_components
componentsr   c                K   s   t | jjt ddi}| t||r,| jjd| t|| t|d||t< | j|t	< | t
||r:| j |t
< | t||rH| j |t< |S )Nr   r   )r   r   rC   )r-   r   r   _check_componentr   r]   	get_state_get_subcomponentsrh   r5   r   r[   r   r`   )rj   r   r   rk   staterC   rC   rF   r     s,   	


zSingleAgentEnvRunner.get_stater   c                 C   s   t |v r| j|t   t|v r| j|t  t|v rT|td}|dks+| j|k rM|t }t	|t
jr:t
|}t	|trGt|v rG|t }| j| |dkrT|| _t|v rf| jjt|t ddd d S d S )Nr   sumT)ru   rv   r   with_throughput)r   r[   	set_stater   r`   r   rP   r5   rh   
isinstanceray	ObjectRefr   r   r]   r-   r   	set_value)rj   r   weights_seq_norl_module_staterC   rC   rF   r      s2   

zSingleAgentEnvRunner.set_statec                 C   s   dd| j ifS )NrC   r@   )r@   r   rC   rC   rF   get_ctor_args_and_kwargsG  s   z-SingleAgentEnvRunner.get_ctor_args_and_kwargsc                 C   s   t | }|i  |S rK   )r   get_metadataupdate)rj   metadatarC   rC   rF   r   N  s
   
z!SingleAgentEnvRunner.get_metadatac                 C   s   t | jft| jft| jfgS rK   )r   r]   r   r[   r   r`   r   rC   rC   rF   get_checkpointable_componentsX  s   z2SingleAgentEnvRunner.get_checkpointable_componentsc                 C   s   | j rt| ds
J dS )a  Checks that self.__init__() has been completed properly.

        Ensures that the instances has a `MultiRLModule` and an
        environment defined.

        Raises:
            AssertionError: If the EnvRunner Actor has NOT been properly initialized.
        r]   N)rH   hasattrr   rC   rC   rF   assert_healthy`  s   z#SingleAgentEnvRunner.assert_healthyc              
   C   sv  | j dur*z| j   W n ty) } ztd|jd   W Y d}~nd}~ww | jj}t|t	s?t	|| j
| j| jjd}| jj sGtdt| jj trbtt| jj rbttt| jj |}n	tt| jj |d}tjd|d | jj}ttjd| jjt|tjjjr|ntjj| d	| _ | j j| _| j| jjksJ d
| _ t!d| j"| jj#t$| | j%| j j&|dd dS )a  Creates a vectorized gymnasium env and stores it in `self.env`.

        Note that users can change the EnvRunner's config (e.g. change
        `self.config.env_config`) and then call this method to create new environments
        with the updated configuration.
        Nz7Tried closing the existing env, but failed with error: r   )rS   num_workersremotez`config.env` is not provided! You should provide a valid environment to your config through `config.environment([env descriptor e.g. 'CartPole-v1'])`.)env_descriptorenv_contextzrllib-single-agent-env-v0)entry_point)rV   vectorization_modeTon_environment_created)r{   r|   rH   r   r~   )'rH   close	Exceptionloggerwarningargsr@   
env_configr   r   rS   r   remote_worker_envsr   strr;   containsr:   r   rP   r   gymregistergym_env_vectorize_moder	   make_vecnum_envs_per_env_runnerenvsregistrationVectorizeModelowerrV   ra   r   rR    callbacks_on_environment_createdr   r   	unwrapped)rj   eenv_ctxr   vectorize_moderC   rC   rF   rY   m  sx   




zSingleAgentEnvRunner.make_envc                 C   sf   | j d ur	| j jnd }z| jj||  dd}| | _| j| j W d S  t	y2   d | _Y d S w )NT)rH   rB   inference_only)
rH   r  r@   get_rl_module_specr   buildr]   torU   NotImplementedError)rj   rH   module_specrC   rC   rF   r^     s   

z SingleAgentEnvRunner.make_modulec                 C   s   | j d ur| j   d S d S rK   )rH   r   r   rC   rC   rF   stop  s   
zSingleAgentEnvRunner.stopc                 C   s   t | jD ]}| || q| j  | j| jr| jnd d d\}}t|}t | jD ]}|| j	|| || d q+d | _
| jrP| j| j|||| jtfd| _
t | jD ]	}| d|| qUd S )N)seedoptionsr   )r   r   rq   r   r   r   r   )rb   rV   r   rg   r   _try_env_resetra   _seedr6   r   r\   r]   r[   r   r!   r   )rj   r   r   rq   r   r   r   rC   rC   rF   r     s4   


z SingleAgentEnvRunner._reset_envsc                 C   s<   |d ur|n| j }t| jj| jjd||< | d|| d S )N)r   r   on_episode_created)rc   r   rH   r   r   r   )rj   r   r   rC   rC   rF   r     s   
z!SingleAgentEnvRunner._new_episodewhichidxr   c                 C   s`   t || | | j| jj| j|d}|dkr| j|| j |d< t|| jt	| j
d| |d d S )N)episoder{   r|   rH   r   r   r   prev_episode_chunks
callbacks_r~   )r   r   rH   r  r]   rg   r   r   rR   getattrr@   )rj   r!  r"  r   rk   rC   rC   rF   r     s$   
z.SingleAgentEnvRunner._make_on_episode_callbackc                 C   s   | j jt|ddd | j jttf|ddd | j jttf|ddd | j jt|ddd | j jt|ddd | j jt	tf|dd | j jt
tf|dd | j jt|dd |S )Nr   T)r   clear_on_reduce)r   r   )r   )r   r   r,   r*   r   r0   r   r.   r-   r+   r1   r/   )rj   	num_stepsnum_episodes_completedrC   rC   rF   r   &  sV   z.SingleAgentEnvRunner._increase_sampled_metricsc                 C   s   t dtt| jj| jjpd }| jjt	||d | jjt
||d | jjt||d | jjdtf||d | jjdtf||d | jjt|d|d | jjt|d|d | jjt|d|d | jjt|d|d d S )Nrw   )rx   agent_episode_return_meanmodule_episode_return_meanmin)r   rx   max)r-  intmathceilr@   "metrics_num_episodes_for_smoothingrX   r   r   r$   r'   r"   r   r   r%   r(   r#   r&   )rj   lengthretsecwinrC   rC   rF   r   U  s.   
	

z)SingleAgentEnvRunner._log_episode_metricsz6SingleAgentEnvRunner.get_state(components='rl_module')T)newerrorc                 O      d S rK   rC   rj   r  rk   rC   rC   rF   get_weightsu  s   z SingleAgentEnvRunner.get_weightsz SingleAgentEnvRunner.set_state()c                 O   r8  rK   rC   r9  rC   rC   rF   set_weights|  s   z SingleAgentEnvRunner.set_weightsrK   )rt   N))__name__
__module____qualname____doc__r   r   r
   rO   r.  boolr   r   r   r   r   r   r8   r   r   r   r  r   r9   r   r   r   r   r   r   rY   r^   r  r   r   r   r   r   r   r:  r;  __classcell__rC   rC   rl   rF   r?   C   s    Jp
 U
!&

	

S


+
/ 

r?   )]collectionsr   	functoolsr   loggingr/  r   typingr   r   r   r   r   	gymnasiumr  r   gymnasium.wrappers.vectorr	   %ray.rllib.algorithms.algorithm_configr
   ray.rllib.callbacks.callbacksr   ray.rllib.callbacks.utilsr   ray.rllib.corer   r   r   r   r   ray.rllib.core.columnsr   "ray.rllib.core.rl_module.rl_moduler   r   ray.rllib.envr   r   ray.rllib.env.env_contextr   ray.rllib.env.env_runnerr   r   "ray.rllib.env.single_agent_episoder   ray.rllib.env.utilsr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   ray.rllib.utils.checkpointsr   ray.rllib.utils.deprecationr   ray.rllib.utils.frameworkr    ray.rllib.utils.metricsr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   "ray.rllib.utils.spaces.space_utilsr6   ray.rllib.utils.typingr7   r8   r9   ray.tune.registryr:   r;   ray.util.annotationsr<   	getLoggerr   r?   rC   rC   rC   rF   <module>   sB    \
