o
    `۷i7                     @   s  d dl mZ d dlmZmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZmZ edeejef defd	d
Zedd ZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd  d ejZ e Z!e	!	"		#d.dejd$e"d%e"d&ee" d'edejfd(d)Z#ed/d,d-Z$dS )0    )deque)OptionalUnionN)spaces)	PublicAPI)resizergb2grayenvreturnc                 C   sV   t | ts!t| jdr| jjdurt| jjdkrdS dt| v S | dp*| dS )a  Returns, whether a given env object or env descriptor (str) is an Atari env.

    Args:
        env: The gym.Env object or a string descriptor of the env (for example,
        "ale_py:ALE/Pong-v5").

    Returns:
        Whether `env` is an Atari environment.
    shapeN   FzAtariEnv<ALEzALE/zale_py:)
isinstancestrhasattrobservation_spacer   len
startswith)r	    r   [/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/env/wrappers/atari_wrappers.pyis_atari   s   

r   c                 C   s.   | }	 t ||r
|S t |tjr|j}ndS q)z8Returns the gym env wrapper of the given class, or None.TN)r   gymWrapperr	   )r	   cls
currentenvr   r   r   get_wrapper_by_cls&   s   
r   c                   @      e Zd Zdd Zdd ZdS )ClipRewardEnvc                 C   s   t j| | d S N)r   RewardWrapper__init__selfr	   r   r   r   r   5      zClipRewardEnv.__init__c                 C   s
   t |S )z&Bin reward to {+1, 0, -1} by its sign.)npsign)r!   rewardr   r   r   r%   8   s   
zClipRewardEnv.rewardN)__name__
__module____qualname__r   r%   r   r   r   r   r   3   s    r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )EpisodicLifeEnvc                 C   s   t j| | d| _d| _dS )zMake end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        r   TN)r   r   r   liveswas_real_terminatedr    r   r   r   r   ?   s   
zEpisodicLifeEnv.__init__c                 C   sT   | j |\}}}}}|| _| j jj }|| jk r |dkr d}|| _|||||fS )Nr   T)r	   stepr,   	unwrappedaler+   )r!   actionobsr%   
terminated	truncatedinfor+   r   r   r   r-   G   s   zEpisodicLifeEnv.stepc                 K   sL   | j r| jjdi |\}}n| jd\}}}}}| jjj | _||fS )zReset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        r   Nr   )r,   r	   resetr-   r.   r/   r+   )r!   kwargsr1   r4   _r   r   r   r5   U   s
   zEpisodicLifeEnv.resetNr&   r'   r(   r   r-   r5   r   r   r   r   r*   =   s    r*   c                   @   r)   )FireResetEnvc                 C   s>   t j| | |j d dksJ t|j dksJ dS )zLTake action on reset.

        For environments that are fixed until firing.   FIRE   N)r   r   r   r.   get_action_meaningsr   r    r   r   r   r   e   s   zFireResetEnv.__init__c                 K   sz   | j jdi | | j d\}}}}}|s|r!| j jdi | | j d\}}}}}|s0|r9| j jdi | ||fS )Nr:   r   r   )r	   r5   r-   )r!   r6   r1   r7   r2   r3   r4   r   r   r   r5   m   s   zFireResetEnv.resetc                 C      | j |S r   r	   r-   r!   acr   r   r   r-   w      zFireResetEnv.stepNr&   r'   r(   r   r5   r-   r   r   r   r   r9   c   s    
r9   c                   @   s4   e Zd Zdd ZdddddZdd Zd	d
 ZdS )
FrameStackc                 C   s|   t j| | || _tg |d| _|jj}tj	t
j|jj|ddt
j|jj|dd|d |d |d | f|jjd| _dS )	zStack k last frames.)maxlen)repeatsaxisr   r:   r   lowhighr   dtypeN)r   r   r   kr   framesr   r   r   Boxr#   repeatrJ   rK   rL   )r!   r	   rM   shpr   r   r   r   }   s   zFrameStack.__init__Nseedoptionsc                C   s<   | j j||d\}}t| jD ]}| j| q|  |fS )NrR   )r	   r5   rangerM   rN   append_get_ob)r!   rS   rT   obinfosr7   r   r   r   r5      s   zFrameStack.resetc                 C   s4   | j |\}}}}}| j| |  ||||fS r   )r	   r-   rN   rV   rW   )r!   r0   rX   r%   r2   r3   r4   r   r   r   r-      s   zFrameStack.stepc                 C   s$   t | j| jks
J tj| jddS )Nr   rH   )r   rN   rM   r#   concatenater!   r   r   r   rW      s   zFrameStack._get_ob)r&   r'   r(   r   r5   r-   rW   r   r   r   r   rD   {   s
    rD   c                   @   r   )FrameStackTrajectoryViewc                 C   sN   t j| | |jj}|d dksJ tjdd|d |d f|jjd| _dS )z4No stacking. Trajectory View API takes care of this.r   r:   r      rI   N)r   r   r   r   r   r   rO   rL   )r!   r	   rQ   r   r   r   r      s   z!FrameStackTrajectoryView.__init__c                 C   s   t j|ddS )NrF   rZ   )r#   squeezer!   observationr   r   r   ra      s   z$FrameStackTrajectoryView.observationN)r&   r'   r(   r   ra   r   r   r   r   r]      s    	r]   c                   @   &   e Zd Zd	ddZdd Zdd ZdS )
MaxAndSkipEnv   c                 C   s4   t j| | tjd|jj |jjd| _|| _	dS )z!Return only every `skip`-th frame)r   )rL   N)
r   r   r   r#   zerosr   r   rL   _obs_buffer_skip)r!   r	   skipr   r   r   r      s
   
zMaxAndSkipEnv.__init__c           
      C   s   d}d } }}t | jD ]/}| j|\}}}}}|| jd kr&|| jd< || jd kr2|| jd< ||7 }|s:|r< nq| jjdd}	|	||||fS )z:Repeat action, sum reward, and max over last observations.g        Nr   r   r:   rZ   )rU   rg   r	   r-   rf   max)
r!   r0   total_rewardr2   r3   r4   ir1   r%   	max_framer   r   r   r-      s   

zMaxAndSkipEnv.stepc                 K   s   | j jdi |S )Nr   )r	   r5   )r!   r6   r   r   r   r5      r"   zMaxAndSkipEnv.resetN)rd   r8   r   r   r   r   rc      s    
	rc   c                   @   sF   e Zd ZdddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dS )
MonitorEnvNc                 C   s<   t j| | d| _d| _d| _g | _g | _d| _d| _	dS )z4Record episodes stats prior to EpisodicLifeEnv, etc.Nr   )
r   r   r   _current_reward
_num_steps_total_steps_episode_rewards_episode_lengths_num_episodes_num_returnedr    r   r   r   r      s   
zMonitorEnv.__init__c                 K   st   | j jdi |\}}| jd u rt| j| _| jd ur0| j| j | j| j |  j	d7  _	d| _d| _||fS )Nr:   r   r   )
r	   r5   rp   sumrr   rn   rq   rV   ro   rs   )r!   r6   r1   r4   r   r   r   r5      s   

zMonitorEnv.resetc                 C   sN   | j |\}}}}}|  j|7  _|  jd7  _|  jd7  _|||||fS )Nr:   )r	   r-   rn   ro   rp   )r!   r0   r1   rewr2   r3   r4   r   r   r   r-      s
   zMonitorEnv.stepc                 C      | j S r   )rq   r\   r   r   r   get_episode_rewards      zMonitorEnv.get_episode_rewardsc                 C   rw   r   )rr   r\   r   r   r   get_episode_lengths   ry   zMonitorEnv.get_episode_lengthsc                 C   rw   r   )rp   r\   r   r   r   get_total_steps   ry   zMonitorEnv.get_total_stepsc                 c   s@    t | jt| jD ]}| j| | j| fV  q
t| j| _d S r   )rU   rt   r   rq   rr   )r!   rk   r   r   r   next_episode_results   s   zMonitorEnv.next_episode_resultsr   )
r&   r'   r(   r   r5   r-   rx   rz   r{   r|   r   r   r   r   rm      s    
rm   c                   @   rb   )
NoopResetEnv   c                 C   s:   t j| | || _d| _d| _|j d dksJ dS )zsSample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        Nr   NOOP)r   r   r   noop_maxoverride_num_noopsnoop_actionr.   r=   )r!   r	   r   r   r   r   r      s
   zNoopResetEnv.__init__c                 K   s   | j jdi | | jdur| j}n"z| jjd| jd }W n ty3   | jjd| jd }Y nw |dks:J d}t	|D ]}| j 
| j\}}}}}|sR|r]| j jdi |\}}q@||fS )z7Do no-op action for a number of steps in [1, noop_max].Nr:   r   r   )r	   r5   r   r.   	np_randomintegersr   AttributeErrorrandintrU   r-   r   )r!   r6   noopsr1   r7   r2   r3   r4   r   r   r   r5   
  s    
zNoopResetEnv.resetc                 C   r>   r   r?   r@   r   r   r   r-     rB   zNoopResetEnv.stepN)r~   rC   r   r   r   r   r}      s    

r}   c                       s$   e Zd Z fddZdd Z  ZS )NormalizedImageEnvc                    s2   t  j|i | tjjdd| jjtjd| _d S )Ng            ?)r   rL   )	superr   r   r   rO   r   r   r#   float32)r!   argsr6   	__class__r   r   r   %  s   zNormalizedImageEnv.__init__c                 C   s   | tjd d S )Ng      `@r   )astyper#   r   r`   r   r   r   ra   0  s   zNormalizedImageEnv.observation)r&   r'   r(   r   ra   __classcell__r   r   r   r   r   #  s    r   c                   @   s$   e Zd ZddefddZdd ZdS )	GrayScaleAndResizeT	grayscalec                 C   sL   t j| | || _|| _|| _tjdd| j| j|rdndftj	d| _
dS )z.Warp frames to the specified size (dim x dim).r   r^   r:   r<   rI   N)r   ObservationWrapperr   widthheightr   r   rO   r#   uint8r   )r!   r	   dimr   r   r   r   r   6  s   zGrayScaleAndResize.__init__c                 C   sH   | j rt|}t|| j| jd}|d d d d d f S t|| j| jdS )N)r   r   )r   r   r   r   r   )r!   framer   r   r   ra   C  s
   zGrayScaleAndResize.observationN)T)r&   r'   r(   boolr   ra   r   r   r   r   r   4  s    r   @   rd   Tr   	frameskip
framestackr   c                 C   s   t jj| dd} t| ||d} t| } |dkr$| jdusJ t| |d} t| dd} t| } d	| j	
 v r9t| } |rAt| |d
} | S )ae  Wraps `env` for new-API-stack-friendly RLlib Atari experiments.

    Note that we assume reward clipping is done outside the wrapper.

    Args:
        env: The env object to wrap.
        dim: Dimension to resize observations to (dim x dim).
        frameskip: Whether to skip n frames and max over them (keep brightest pixels).
        framestack: Whether to stack the last n (grayscaled) frames. Note that this
            step happens after(!) a possible frameskip step, meaning that if
            frameskip=4 and framestack=2, we would perform the following over this
            trajectory:
            actual env timesteps: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 -> ...
            frameskip:            ( max ) ( max ) ( max   ) ( max     )
            framestack:           ( stack       ) (stack              )

    Returns:
        The wrapped gym.Env.
    i )max_episode_steps)r   r   r:   Nrh   r~   r   r;   )rM   )r   wrappers	TimeLimit	WarpFramer   specrc   r}   r*   r.   r=   r9   rD   )r	   r   r   r   r   r   r   r   wrap_atari_for_new_api_stackO  s   r   T   Fc                 C   sp   t | } t| dd} | jdur|du rt| dd} t| } d| j v r(t| } t| |} |du r6t	| d} | S )a   Configure environment for DeepMind-style Atari.

    Note that we assume reward clipping is done outside the wrapper.

    Args:
        env: The env object to wrap.
        dim: Dimension to resize observations to (dim x dim).
        framestack: Whether to framestack observations.
    r~   r   NTrd   r   r;   )
rm   r}   r   rc   r*   r.   r=   r9   r   rD   )r	   r   r   noframeskipr   r   r   wrap_deepmind  s   

r   )r   rd   NT)r   TF)%collectionsr   typingr   r   	gymnasiumr   numpyr#   r   ray.rllib.utils.annotationsr   ray.rllib.utils.imagesr   r   Envr   r   r   r   r   r   r   r*   r9   rD   r   r]   rc   rm   r}   r   r   r   intr   r   r   r   r   r   <module>   sh    
	%!2$5