o
    ciIC                     @   s  d dl mZmZ d dlZd dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& errd dl'm(Z( e \Z)Z*Z+e \Z,Z-eG dd deZ.dS )    )BoxDiscreteN)OptionalTYPE_CHECKINGUnion)BaseEnv)ActionDistribution)ModelV2)CategoricalDeterministic)TorchCategoricalTorchDeterministic)SampleBatch)OldAPIStackoverride)Exploration)get_variabletry_import_tftry_import_torch)from_config)softmaxSMALL_NUMBER)
TensorType)Policyc                       s  e Zd ZdZdddddededed	ed
ede	e f fddZ
eeddddde	e de	e de	d fddZeededeeef deeef fddZeedddddddedede	d fddZdd  Zeeddddd!d"Zee	d>ddd#ede	d fd$d%Zdd&d'd(Zd)d* Zdd+d,d-d.Zdd&d/d0Zd1d2 Zdd&d3d4Zd5d6 Zeed>d7d8Z eed>d9ed:e	d d;dfd<d=Z!  Z"S )?ParameterNoisea  An exploration that changes a Model's parameters.

    Implemented based on:
    [1] https://openai.com/research/better-exploration-with-parameter-noise
    [2] https://arxiv.org/pdf/1706.01905.pdf

    At the beginning of an episode, Gaussian noise is added to all weights
    of the model. At the end of the episode, the noise is undone and an action
    diff (pi-delta) is calculated, from which we determine the changes in the
    noise's stddev for the next episode.
          ?i'  N)initial_stddevrandom_timestepssub_exploration	frameworkpolicy_configmodelr   r   r   c             
      s  |dusJ t  j|f|||d| t|| jdd| _|| _dd | jjdd D | _	g | _
| j	D ]'}	|	jrD|	jd	d
 d nd}
| j
ttj|	jtjd| j|
d| jd q5| jdkrt s|  | _|  | _|  | _t| jg |  }W d   n1 sw   Y  t|g t | _W d   n1 sw   Y  d| _|du rt | j!t"rddd|d df|d dfgddd}nt | j!t#rd|d}nt$t%t&|f| j| j!| j'| jd|| _(d| _)dS )af  Initializes a ParameterNoise Exploration object.

        Args:
            initial_stddev: The initial stddev to use for the noise.
            random_timesteps: The number of timesteps to act completely
                randomly (see [1]).
            sub_exploration: Optional sub-exploration config.
                None for auto-detection/setup.
        N)r    r!   r   stddev)r   tf_namec                 S   s   g | ]
\}}d |vr|qS )	LayerNorm ).0kvr%   r%   _/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/utils/exploration/parameter_noise.py
<listcomp>P   s
    z+ParameterNoise.__init__.<locals>.<listcomp>T)as_dict:r   _noisy )dtype)r   r#   torch_tensordevicetfFEpsilonGreedyPiecewiseSchedule)r   r      r      g{Gz?)type	endpointsoutside_value)r7   epsilon_scheduleOrnsteinUhlenbeckNoise)r7   r   )r   action_spacer    r!   )*super__init__r   r   r"   
stddev_valr!   trainable_variablesitemsmodel_variablesnoisenamesplitappendnpzerosshapefloat32r1   r2   executing_eagerly_tf_sample_new_noise_optf_sample_new_noise_op_tf_add_stored_noise_optf_add_stored_noise_op_tf_remove_noise_optf_remove_noise_optf1control_dependenciesno_optf_sample_new_noise_and_add_opweights_are_currently_noisy
isinstancer<   r   r   NotImplementedErrorr   r   r    r   episode_started)selfr<   r   r    r!   r   r   r   kwargsvarname_add_op	__class__r%   r)   r>   *   s   







zParameterNoise.__init__)timestepexploretf_sessra   rb   rc   z
tf.Sessionc                C   sd   |d ur|n| j d }| jr| || |r!| js!| j|d d S |s.| jr0| j|d d S d S d S )Nrb   rc   )r    rY   _delayed_on_episode_startrV   _add_stored_noise_remove_noise)rZ   ra   rb   rc   r%   r%   r)   before_compute_actions   s   

z%ParameterNoise.before_compute_actionsaction_distributionc                C   s   | j j|||dS )N)ri   ra   rb   )r   get_exploration_action)rZ   ri   ra   rb   r%   r%   r)   rj      s   
z%ParameterNoise.get_exploration_action)environmentepisoderc   policyr   rk   rl   c                C   s
   d| _ d S )NT)rY   rZ   rm   rk   rl   rc   r%   r%   r)   on_episode_start   s   
zParameterNoise.on_episode_startc                 C   s*   |r
| j |dd n| j|d d| _d S )NTrc   r   rd   F)_sample_new_noise_and_add_sample_new_noiserY   )rZ   rb   rc   r%   r%   r)   re      s   
z(ParameterNoise._delayed_on_episode_startc                C   s   | j r| j|d d S d S )Nrd   )rV   rg   rn   r%   r%   r)   on_episode_end   s   zParameterNoise.on_episode_endsample_batchc              
   C   s  d  }}|j || jd\}}}t|jttfrt|tj }nt|jt	t
fr-|tj }nt| jr5|}n|}|j || j d\}}}t|jttfrSt|tj }nt|jt	t
fr`|tj }|d u rg|}n|}d  }	}
t|jttfrtt|t||t   d}
| jj|dd }td| || jj   }	n't|jt	t
frttt|| }
| jj|dd }t| jdd| }	|
|	kr|  jd9  _n|  jd  _| j|  |d |S )	N)
input_dictrb   r5   )sesscur_epsilon	cur_scaleou_sigmag?g)\(?)compute_actions_from_input_dictrV   
issubclass
dist_classr
   r   r   r   ACTION_DIST_INPUTSr   r   rX   rG   nanmeansumlogr   r   	get_stater<   nsqrtmeansquaregetattrr?   	set_state)rZ   rm   rt   rc   noisy_action_distnoise_free_action_dist_fetchesaction_distdeltadistancecurrent_epsiloncurrent_scaler%   r%   r)   postprocess_trajectory   sb   

	z%ParameterNoise.postprocess_trajectoryrd   c                C   sv   | j dkr|| j dS | j dkr|   dS tt| jD ]}tjt	| j| 
 | jd| j| j|< qdS )z0Samples new noise and stores it in `self.noise`.r2   tf2)r   stdN)r   runrM   rL   rangelenrC   torchnormalrH   sizer"   tor1   )rZ   rc   ir%   r%   r)   rr   2  s   


z ParameterNoise._sample_new_noisec                 C   s@   g }| j D ]}|t|tjj|j| jtj	d qtj
| S )N)rI   r"   r/   )rC   rF   rR   assignr2   randomr   rI   r"   rJ   group)rZ   added_noisesrC   r%   r%   r)   rL   >  s   

z&ParameterNoise._tf_sample_new_noise_opFrp   c                C   sZ   | j dkr|r| jr|| j || j n|r | jr |   |   |   d| _d S )Nr2   T)r   rV   r   rQ   rU   rg   rr   rf   )rZ   rc   r   r%   r%   r)   rq   K  s   



z(ParameterNoise._sample_new_noise_and_addc                C   sr   | j du sJ | jdkr|| j n!| jdkr|   nt| j| jD ]\}}d|_|	| d|_q$d| _ dS )a  Adds the stored `self.noise` to the model's parameters.

        Note: No new sampling of noise here.

        Args:
            tf_sess (Optional[tf.Session]): The tf-session to use to add the
                stored noise to the (currently noise-free) weights.
            override: If True, undo any currently applied noise first,
                then add the currently stored noise.
        Fr2   r   TN)
rV   r   r   rO   rN   ziprB   rC   requires_gradadd_rZ   rc   r\   rC   r%   r%   r)   rf   X  s   




z ParameterNoise._add_stored_noisec                 C   sv   t  }t| j| jD ]\}}|t|| q
tjt	| }t
|g t W  d   S 1 s4w   Y  dS )zGenerates tf-op that assigns the stored noise to weights.

        Also used by tf-eager.

        Returns:
            tf.op: The tf op to apply the already stored noise to the NN.
        Nlistr   rB   rC   rF   rR   
assign_addr2   r   tuplerS   rT   )rZ   add_noise_opsr\   rC   retr%   r%   r)   rN   t  s   $z&ParameterNoise._tf_add_stored_noise_opc                C   st   | j du sJ | jdkr|| j n"| jdkr|   nt| j| jD ]\}}d|_|	|  d|_q$d| _ dS )z
        Removes the current action noise from the model parameters.

        Args:
            tf_sess (Optional[tf.Session]): The tf-session to use to remove
                the noise from the (currently noisy) weights.
        Tr2   r   FN)
rV   r   r   rQ   rP   r   rB   rC   r   r   r   r%   r%   r)   rg     s   	



zParameterNoise._remove_noisec                 C   sx   t  }t| j| jD ]\}}|t||  q
tjt	| }t
|g t W  d   S 1 s5w   Y  dS )zGenerates a tf-op for removing noise from the model's weights.

        Also used by tf-eager.

        Returns:
            tf.op: The tf op to remve the currently stored noise from the NN.
        Nr   )rZ   remove_noise_opsr\   rC   r   r%   r%   r)   rP     s   $z"ParameterNoise._tf_remove_noise_opc                 C   s
   d| j iS )N
cur_stddev)r?   )rZ   rv   r%   r%   r)   r     s   
zParameterNoise.get_statestaterv   returnc                 C   sT   |d | _ | jdkr| jj| j |d d S t| jtr!| j | _d S | j| j  d S )Nr   r2   )session)r?   r   r"   loadrW   floatr   )rZ   r   rv   r%   r%   r)   r     s   

zParameterNoise.set_state)N)#__name__
__module____qualname____doc__strdictr	   r   intr   r>   r   r   boolrh   r   r   r   rj   r   ro   re   rs   r   r   rr   rL   rq   rf   rN   rg   rP   r   r   __classcell__r%   r%   r_   r)   r      s    	r

	S&r   )/gymnasium.spacesr   r   numpyrG   typingr   r   r   ray.rllib.env.base_envr   ray.rllib.models.action_distr   ray.rllib.models.modelv2r	   "ray.rllib.models.tf.tf_action_distr
   r   (ray.rllib.models.torch.torch_action_distr   r   ray.rllib.policy.sample_batchr   ray.rllib.utils.annotationsr   r   'ray.rllib.utils.exploration.explorationr   ray.rllib.utils.frameworkr   r   r   ray.rllib.utils.from_configr   ray.rllib.utils.numpyr   r   ray.rllib.utils.typingr   ray.rllib.policy.policyr   rR   r2   tfvr   r   r   r%   r%   r%   r)   <module>   s*    
