o
    $i!                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ e	 \ZZeG dd dZeG d	d
 d
ZeG dd dZeG dd dZeG dd dZdS )    )PolicyState)SampleBatch)TorchPolicy)OldAPIStack)try_import_torch)PiecewiseSchedulec                       s*   e Zd ZdZdddZ fddZ  ZS )LearningRateSchedulez9Mixin for TorchPolicy that adds a learning rate schedule.Nc                 C   s~   d | _ d | _|d u r|| _nt||d d d d| _ | j d| _|d u r*|| _d S t||d d d d| _| jd| _d S )Noutside_value	frameworkr   )_lr_schedule_lr2_schedulecur_lrr   valuecur_lr2)selflrlr_schedulelr2lr2_schedule r   Z/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/rllib/policy/torch_mixins.py__init__   s   
zLearningRateSchedule.__init__c                    s   t  | | jr#| j|d | _| jD ]}|jD ]}| j|d< qq| jrHt| jdks/J | j|d | _	| jd }|jD ]	}| j	|d< q@d S d S )Ntimestepr         )
superon_global_var_updater   r   r   _optimizersparam_groupsr   lenr   )r   global_varsoptp	__class__r   r   r   #   s   



z)LearningRateSchedule.on_global_var_update)NN__name__
__module____qualname____doc__r   r   __classcell__r   r   r%   r   r      s    
r   c                       s(   e Zd ZdZdd Z fddZ  ZS )EntropyCoeffSchedulez4Mixin for TorchPolicy that adds entropy coeff decay.c                 C   sj   d | _ |d u r|| _d S t|trt||d d d d| _ ntd|g|dggdd d| _ | j d| _d S )Nr	   r
   r           )_entropy_coeff_scheduleentropy_coeff
isinstancelistr   r   )r   r0   entropy_coeff_scheduler   r   r   r   6   s   



zEntropyCoeffSchedule.__init__c                    s4   t t| | | jd ur| j|d | _d S d S )Nr   )r   r-   r   r/   r   r0   )r   r"   r%   r   r   r   M   s   

z)EntropyCoeffSchedule.on_global_var_updater'   r   r   r%   r   r-   2   s    r-   c                       sL   e Zd ZdZdd Zdd Zdef fddZd	edd
f fddZ  Z	S )KLCoeffMixinzAssigns the `update_kl()` method to a TorchPolicy.

    This is used by Algorithms to update the KL coefficient
    after each learning step based on `config.kl_target` and
    the measured KL value (from the train_batch).
    c                 C   s   |d | _ |d | _d S )Nkl_coeff	kl_target)r5   r6   )r   configr   r   r   r   ^   s   
zKLCoeffMixin.__init__c                 C   sD   |d| j  kr|  jd9  _| jS |d| j  k r|  jd9  _| jS )Ng       @g      ?g      ?)r6   r5   )r   
sampled_klr   r   r   	update_kld   s   zKLCoeffMixin.update_klreturnc                    s   t   }| j|d< |S )Ncurrent_kl_coeff)r   	get_stater5   r   stater%   r   r   r<   m   s   

zKLCoeffMixin.get_stater>   Nc                    s$   | d| jd | _t | d S )Nr;   r5   )popr7   r5   r   	set_stater=   r%   r   r   r@   s   s   zKLCoeffMixin.set_state)
r(   r)   r*   r+   r   r9   r   r<   r@   r,   r   r   r%   r   r4   U   s    	r4   c                   @   s    e Zd ZdZdd Zdd ZdS )ValueNetworkMixina  Assigns the `_value()` method to a TorchPolicy.

    This way, Policy can call `_value()` to get the current VF estimate on a
    single(!) observation (as done in `postprocess_trajectory_fn`).
    Note: When doing this, an actual forward pass is being performed.
    This is different from only calling `model.value_function()`, where
    the result of the most recent forward pass is being used to return an
    already calculated tensor.
    c                    s4   | ds
| dr fdd}ndd }| _d S )Nuse_gaevtracec                     s2   t | }  | }  | \}} j d  S )Nr   )r   _lazy_tensor_dictmodelvalue_functionitem)
input_dict	model_out_r   r   r   r      s   
z)ValueNetworkMixin.__init__.<locals>.valuec                  _   s   dS )Nr.   r   )argskwargsr   r   r   r      s   )get_value)r   r7   r   r   rK   r   r      s   

zValueNetworkMixin.__init__c                 C   s   t j| iS )a  Defines extra fetches per action computation.

        Args:
            input_dict (Dict[str, TensorType]): The input dict used for the action
                computing forward pass.
            state_batches (List[TensorType]): List of state tensors (empty for
                non-RNNs).
            model (ModelV2): The Model object of the Policy.
            action_dist: The instantiated distribution
                object, resulting from the model's outputs and the given
                distribution class.

        Returns:
            Dict[str, TensorType]: Dict with extra tf fetches to perform per
                action computation.
        )r   VF_PREDSrF   )r   rH   state_batchesrE   action_distr   r   r   extra_action_out   s   
z"ValueNetworkMixin.extra_action_outN)r(   r)   r*   r+   r   rS   r   r   r   r   rA   z   s    
rA   c                   @   s*   e Zd ZdZdd Zd	ddZdd ZdS )
TargetNetworkMixina  Mixin class adding a method for (soft) target net(s) synchronizations.

    - Adds the `update_target` method to the policy.
      Calling `update_target` updates all target Q-networks' weights from their
      respective "main" Q-networks, based on tau (smooth, partial updating).
    c                 C   s   | j dd}| j|d d S )Ntau      ?)rU   )r7   rN   update_target)r   rU   r   r   r   r      s   zTargetNetworkMixin.__init__Nc                    sh   p| j dd| j  tt| j  } fdd| D  | j D ]}|	  q*d S )NrU   rV   c                    s*   i | ]\}}| |  d  |  qS )r   r   ).0kvmodel_state_dictrU   r   r   
<dictcomp>   s    z4TargetNetworkMixin.update_target.<locals>.<dictcomp>)
r7   rN   rE   
state_dictnextitertarget_modelsvaluesitemsload_state_dict)r   rU   target_state_dicttargetr   r[   r   rW      s   
z TargetNetworkMixin.update_targetc                 C   s   t | | |   d S N)r   set_weightsrW   )r   weightsr   r   r   rh      s   zTargetNetworkMixin.set_weightsrg   )r(   r)   r*   r+   r   rW   rh   r   r   r   r   rT      s
    
rT   N)ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.torch_policyr   ray.rllib.utils.annotationsr   ray.rllib.utils.frameworkr   ray.rllib.utils.schedulesr   torchnnr   r-   r4   rA   rT   r   r   r   r   <module>   s"    
&"$;