o
    ci;6                     @   s  d dl mZ d dlZd dlmZ d dlZd dlmZm	Z	 d dl
mZ d dlmZmZ ede	ejef defd	d
Zedd ZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd dejZeG dd  d ejZ e	!	"	d-dejd#e!d$e!d%ee! dejf
d&d'Z"ed.d+d,Z#dS )/    )dequeN)spaces)OptionalUnion)	PublicAPI)rgb2grayresizeenvreturnc                 C   sV   t | ts!t| jdr| jjdurt| jjdkrdS dt| v S | dp*| dS )a  Returns, whether a given env object or env descriptor (str) is an Atari env.

    Args:
        env: The gym.Env object or a string descriptor of the env (for example,
        "ale_py:ALE/Pong-v5").

    Returns:
        Whether `env` is an Atari environment.
    shapeN   FzAtariEnv<ALEzALE/zale_py:)
isinstancestrhasattrobservation_spacer   len
startswith)r	    r   Y/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/env/wrappers/atari_wrappers.pyis_atari   s   

r   c                 C   s.   | }	 t ||r
|S t |tjr|j}ndS q)z8Returns the gym env wrapper of the given class, or None.TN)r   gymWrapperr	   )r	   cls
currentenvr   r   r   get_wrapper_by_cls%   s   
r   c                   @      e Zd Zdd Zdd ZdS )ClipRewardEnvc                 C   s   t j| | d S N)r   RewardWrapper__init__selfr	   r   r   r   r   4      zClipRewardEnv.__init__c                 C   s
   t |S )z&Bin reward to {+1, 0, -1} by its sign.)npsign)r!   rewardr   r   r   r%   7   s   
zClipRewardEnv.rewardN)__name__
__module____qualname__r   r%   r   r   r   r   r   2   s    r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )EpisodicLifeEnvc                 C   s   t j| | d| _d| _dS )zMake end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        r   TN)r   r   r   liveswas_real_terminatedr    r   r   r   r   >   s   
zEpisodicLifeEnv.__init__c                 C   sT   | j |\}}}}}|| _| j jj }|| jk r |dkr d}|| _|||||fS )Nr   T)r	   stepr,   	unwrappedaler+   )r!   actionobsr%   
terminated	truncatedinfor+   r   r   r   r-   F   s   zEpisodicLifeEnv.stepc                 K   sL   | j r| jjdi |\}}n| jd\}}}}}| jjj | _||fS )zReset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        r   Nr   )r,   r	   resetr-   r.   r/   r+   )r!   kwargsr1   r4   _r   r   r   r5   T   s
   zEpisodicLifeEnv.resetNr&   r'   r(   r   r-   r5   r   r   r   r   r*   <   s    r*   c                   @   r)   )FireResetEnvc                 C   s>   t j| | |j d dksJ t|j dksJ dS )zLTake action on reset.

        For environments that are fixed until firing.   FIRE   N)r   r   r   r.   get_action_meaningsr   r    r   r   r   r   d   s   zFireResetEnv.__init__c                 K   sz   | j jdi | | j d\}}}}}|s|r!| j jdi | | j d\}}}}}|s0|r9| j jdi | ||fS )Nr:   r   r   )r	   r5   r-   )r!   r6   r1   r7   r2   r3   r4   r   r   r   r5   l   s   zFireResetEnv.resetc                 C      | j |S r   r	   r-   r!   acr   r   r   r-   v      zFireResetEnv.stepNr&   r'   r(   r   r5   r-   r   r   r   r   r9   b   s    
r9   c                   @   s4   e Zd Zdd ZdddddZdd Zd	d
 ZdS )
FrameStackc                 C   s|   t j| | || _tg |d| _|jj}tj	t
j|jj|ddt
j|jj|dd|d |d |d | f|jjd| _dS )	zStack k last frames.)maxlen)repeatsaxisr   r:   r   lowhighr   dtypeN)r   r   r   kr   framesr   r   r   Boxr#   repeatrJ   rK   rL   )r!   r	   rM   shpr   r   r   r   |   s   zFrameStack.__init__Nseedoptionsc                C   s<   | j j||d\}}t| jD ]}| j| q|  |fS )NrR   )r	   r5   rangerM   rN   append_get_ob)r!   rS   rT   obinfosr7   r   r   r   r5      s   zFrameStack.resetc                 C   s4   | j |\}}}}}| j| |  ||||fS r   )r	   r-   rN   rV   rW   )r!   r0   rX   r%   r2   r3   r4   r   r   r   r-      s   zFrameStack.stepc                 C   s$   t | j| jks
J tj| jddS )Nr   rH   )r   rN   rM   r#   concatenater!   r   r   r   rW      s   zFrameStack._get_ob)r&   r'   r(   r   r5   r-   rW   r   r   r   r   rD   z   s
    rD   c                   @   r   )FrameStackTrajectoryViewc                 C   sN   t j| | |jj}|d dksJ tjdd|d |d f|jjd| _dS )z4No stacking. Trajectory View API takes care of this.r   r:   r      rI   N)r   r   r   r   r   r   rO   rL   )r!   r	   rQ   r   r   r   r      s   z!FrameStackTrajectoryView.__init__c                 C   s   t j|ddS )NrF   rZ   )r#   squeezer!   observationr   r   r   ra      s   z$FrameStackTrajectoryView.observationNr&   r'   r(   r   ra   r   r   r   r   r]          	r]   c                   @   &   e Zd Zd	ddZdd Zdd ZdS )
MaxAndSkipEnv   c                 C   s4   t j| | tjd|jj |jjd| _|| _	dS )z!Return only every `skip`-th frame)r   )rL   N)
r   r   r   r#   zerosr   r   rL   _obs_buffer_skip)r!   r	   skipr   r   r   r      s
   
zMaxAndSkipEnv.__init__c           
      C   s   d}d } }}t | jD ]/}| j|\}}}}}|| jd kr&|| jd< || jd kr2|| jd< ||7 }|s:|r< nq| jjdd}	|	||||fS )z:Repeat action, sum reward, and max over last observations.g        Nr   r   r:   rZ   )rU   ri   r	   r-   rh   max)
r!   r0   total_rewardr2   r3   r4   ir1   r%   	max_framer   r   r   r-      s   

zMaxAndSkipEnv.stepc                 K   s   | j jdi |S )Nr   )r	   r5   )r!   r6   r   r   r   r5      r"   zMaxAndSkipEnv.resetN)rf   r8   r   r   r   r   re      s    
	re   c                   @   sF   e Zd ZdddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dS )
MonitorEnvNc                 C   s<   t j| | d| _d| _d| _g | _g | _d| _d| _	dS )z4Record episodes stats prior to EpisodicLifeEnv, etc.Nr   )
r   r   r   _current_reward
_num_steps_total_steps_episode_rewards_episode_lengths_num_episodes_num_returnedr    r   r   r   r      s   
zMonitorEnv.__init__c                 K   st   | j jdi |\}}| jd u rt| j| _| jd ur0| j| j | j| j |  j	d7  _	d| _d| _||fS )Nr:   r   r   )
r	   r5   rr   sumrt   rp   rs   rV   rq   ru   )r!   r6   r1   r4   r   r   r   r5      s   

zMonitorEnv.resetc                 C   sN   | j |\}}}}}|  j|7  _|  jd7  _|  jd7  _|||||fS )Nr:   )r	   r-   rp   rq   rr   )r!   r0   r1   rewr2   r3   r4   r   r   r   r-      s
   zMonitorEnv.stepc                 C      | j S r   )rs   r\   r   r   r   get_episode_rewards      zMonitorEnv.get_episode_rewardsc                 C   ry   r   )rt   r\   r   r   r   get_episode_lengths   r{   zMonitorEnv.get_episode_lengthsc                 C   ry   r   )rr   r\   r   r   r   get_total_steps   r{   zMonitorEnv.get_total_stepsc                 c   s@    t | jt| jD ]}| j| | j| fV  q
t| j| _d S r   )rU   rv   r   rs   rt   )r!   rm   r   r   r   next_episode_results   s   zMonitorEnv.next_episode_resultsr   )
r&   r'   r(   r   r5   r-   rz   r|   r}   r~   r   r   r   r   ro      s    
ro   c                   @   rd   )
NoopResetEnv   c                 C   s:   t j| | || _d| _d| _|j d dksJ dS )zsSample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        Nr   NOOP)r   r   r   noop_maxoverride_num_noopsnoop_actionr.   r=   )r!   r	   r   r   r   r   r      s
   zNoopResetEnv.__init__c                 K   s   | j jdi | | jdur| j}n"z| jjd| jd }W n ty3   | jjd| jd }Y nw |dks:J d}t	|D ]}| j 
| j\}}}}}|sR|r]| j jdi |\}}q@||fS )z7Do no-op action for a number of steps in [1, noop_max].Nr:   r   r   )r	   r5   r   r.   	np_randomintegersr   AttributeErrorrandintrU   r-   r   )r!   r6   noopsr1   r7   r2   r3   r4   r   r   r   r5   	  s    
zNoopResetEnv.resetc                 C   r>   r   r?   r@   r   r   r   r-     rB   zNoopResetEnv.stepN)r   rC   r   r   r   r   r      s    

r   c                       s$   e Zd Z fddZdd Z  ZS )NormalizedImageEnvc                    s2   t  j|i | tjjdd| jjtjd| _d S )Ng            ?)r   rL   )	superr   r   r   rO   r   r   r#   float32)r!   argsr6   	__class__r   r   r   $  s   zNormalizedImageEnv.__init__c                 C   s   | tjd d S )Ng      `@r   )astyper#   r   r`   r   r   r   ra   /  s   zNormalizedImageEnv.observation)r&   r'   r(   r   ra   __classcell__r   r   r   r   r   "  s    r   c                   @   r   )	WarpFramec                 C   s>   t j| | || _|| _tjdd| j| jdftjd| _	dS )z.Warp frames to the specified size (dim x dim).r   r^   r:   rI   N)
r   ObservationWrapperr   widthheightr   rO   r#   uint8r   )r!   r	   dimr   r   r   r   5  s   zWarpFrame.__init__c                 C   s0   t |}t|| j| jd}|d d d d d f S )N)r   r   )r   r   r   r   )r!   framer   r   r   ra   >  s   zWarpFrame.observationNrb   r   r   r   r   r   3  rc   r   @   rf   r   	frameskip
framestackc                 C   s   t jj| dd} t| |d} t| } |dkr#| jdusJ t| |d} t| dd} t| } d	| j	
 v r8t| } |r@t| |d
} | S )ae  Wraps `env` for new-API-stack-friendly RLlib Atari experiments.

    Note that we assume reward clipping is done outside the wrapper.

    Args:
        env: The env object to wrap.
        dim: Dimension to resize observations to (dim x dim).
        frameskip: Whether to skip n frames and max over them (keep brightest pixels).
        framestack: Whether to stack the last n (grayscaled) frames. Note that this
            step happens after(!) a possible frameskip step, meaning that if
            frameskip=4 and framestack=2, we would perform the following over this
            trajectory:
            actual env timesteps: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 -> ...
            frameskip:            ( max ) ( max ) ( max   ) ( max     )
            framestack:           ( stack       ) (stack              )

    Returns:
        The wrapped gym.Env.
    i )max_episode_steps)r   r:   Nrj   r   r   r;   )rM   )r   wrappers	TimeLimitr   r   specre   r   r*   r.   r=   r9   rD   )r	   r   r   r   r   r   r   wrap_atari_for_new_api_stackD  s   r   T   TFc                 C   sp   t | } t| dd} | jdur|du rt| dd} t| } d| j v r(t| } t| |} |du r6t	| d} | S )a   Configure environment for DeepMind-style Atari.

    Note that we assume reward clipping is done outside the wrapper.

    Args:
        env: The env object to wrap.
        dim: Dimension to resize observations to (dim x dim).
        framestack: Whether to framestack observations.
    r   r   NTrf   r   r;   )
ro   r   r   re   r*   r.   r=   r9   r   rD   )r	   r   r   noframeskipr   r   r   wrap_deepmindy  s   

r   )r   rf   N)r   TF)$collectionsr   	gymnasiumr   r   numpyr#   typingr   r   ray.rllib.utils.annotationsr   ray.rllib.utils.imagesr   r   Envr   boolr   r   r   r   r   r*   r9   rD   r   r]   re   ro   r   r   r   intr   r   r   r   r   r   <module>   s`    
	%!2$4