import logging
from typing import Dict, List

import numpy as np

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.policy.eager_tf_policy import EagerTFPolicy
from ray.rllib.policy.eager_tf_policy_v2 import EagerTFPolicyV2
from ray.rllib.policy.policy import PolicyState
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.utils.annotations import OldAPIStack
from ray.rllib.utils.framework import get_variable, try_import_tf
from ray.rllib.utils.schedules import PiecewiseSchedule
from ray.rllib.utils.tf_utils import make_tf_callable
from ray.rllib.utils.typing import (
    AlgorithmConfigDict,
    LocalOptimizer,
    ModelGradients,
    TensorType,
)

logger = logging.getLogger(__name__)

tf1, tf, tfv = try_import_tf()

@OldAPIStack
class LearningRateSchedule:
    """Mixin for TFPolicy that adds a learning rate schedule."""

    def __init__(self, lr, lr_schedule):
        self._lr_schedule = None
        if lr_schedule is None:
            self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False)
        else:
            self._lr_schedule = PiecewiseSchedule(
                lr_schedule, outside_value=lr_schedule[-1][-1], framework=None
            )
            self.cur_lr = tf1.get_variable(
                "lr", initializer=self._lr_schedule.value(0), trainable=False
            )
            if self.framework == "tf":
                self._lr_placeholder = tf1.placeholder(dtype=tf.float32, name="lr")
                self._lr_update = self.cur_lr.assign(
                    self._lr_placeholder, read_value=False
                )

    def on_global_var_update(self, global_vars):
        super().on_global_var_update(global_vars)
        if self._lr_schedule is not None:
            new_val = self._lr_schedule.value(global_vars["timestep"])
            if self.framework == "tf":
                self.get_session().run(
                    self._lr_update, feed_dict={self._lr_placeholder: new_val}
                )
            else:
                self.cur_lr.assign(new_val, read_value=False)
                # The `_optimizer` property is accessible on both TFPolicy and
                # the eager policies.
                self._optimizer.learning_rate.assign(self.cur_lr)

    def optimizer(self):
        if self.framework == "tf":
            return tf1.train.AdamOptimizer(learning_rate=self.cur_lr)
        else:
            return tf.keras.optimizers.Adam(self.cur_lr)

@OldAPIStack
class EntropyCoeffSchedule:
    """Mixin for TFPolicy that adds entropy coeff decay."""

    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self._entropy_coeff_schedule = None
        if entropy_coeff_schedule is None:
            self.entropy_coeff = get_variable(
                entropy_coeff, framework="tf", tf_name="entropy_coeff", trainable=False
            )
        else:
            # Allows for a custom schedule similar to the lr_schedule format.
            if isinstance(entropy_coeff_schedule, list):
                self._entropy_coeff_schedule = PiecewiseSchedule(
                    entropy_coeff_schedule,
                    outside_value=entropy_coeff_schedule[-1][-1],
                    framework=None,
                )
            else:
                # Single-number case: decay to 0.0 by the given timestep and
                # enforce outside_value=0.0 afterwards.
                self._entropy_coeff_schedule = PiecewiseSchedule(
                    [[0, entropy_coeff], [entropy_coeff_schedule, 0.0]],
                    outside_value=0.0,
                    framework=None,
                )
            self.entropy_coeff = get_variable(
                self._entropy_coeff_schedule.value(0),
                framework="tf",
                tf_name="entropy_coeff",
                trainable=False,
            )
            if self.framework == "tf":
                self._entropy_coeff_placeholder = tf1.placeholder(
                    dtype=tf.float32, name="entropy_coeff"
                )
                self._entropy_coeff_update = self.entropy_coeff.assign(
                    self._entropy_coeff_placeholder, read_value=False
                )

    def on_global_var_update(self, global_vars):
        super().on_global_var_update(global_vars)
        if self._entropy_coeff_schedule is not None:
            new_val = self._entropy_coeff_schedule.value(global_vars["timestep"])
            if self.framework == "tf":
                self.get_session().run(
                    self._entropy_coeff_update,
                    feed_dict={self._entropy_coeff_placeholder: new_val},
                )
            else:
                self.entropy_coeff.assign(new_val, read_value=False)


@OldAPIStack
class KLCoeffMixin:
    """Assigns the `update_kl()` and other KL-related methods to a TFPolicy.

    This is used in Algorithms to update the KL coefficient after each
    learning step based on `config.kl_target` and the measured KL value
    (from the train_batch).
    """

    def __init__(self, config: AlgorithmConfigDict):
        # The current KL value (as python float).
        self.kl_coeff_val = config["kl_coeff"]
        # The current KL value (as tf Variable for in-graph operations).
        self.kl_coeff = get_variable(
            float(self.kl_coeff_val),
            tf_name="kl_coeff",
            trainable=False,
            framework=config["framework"],
        )
        # Constant target value.
        self.kl_target = config["kl_target"]
        if self.framework == "tf":
            self._kl_coeff_placeholder = tf1.placeholder(
                dtype=tf.float32, name="kl_coeff"
            )
            self._kl_coeff_update = self.kl_coeff.assign(
                self._kl_coeff_placeholder, read_value=False
            )

    def update_kl(self, sampled_kl):
        # Increase.
        if sampled_kl > 2.0 * self.kl_target:
            self.kl_coeff_val *= 1.5
        # Decrease.
        elif sampled_kl < 0.5 * self.kl_target:
            self.kl_coeff_val *= 0.5
        # No change.
        else:
            return self.kl_coeff_val

        # Make sure the new value is also stored in the graph/tf variable.
        self._set_kl_coeff(self.kl_coeff_val)

        # Return the current KL value.
        return self.kl_coeff_val

    def _set_kl_coeff(self, new_kl_coeff):
        # Set the (off-graph) python value.
        self.kl_coeff_val = new_kl_coeff

        # Update the tf Variable (via session call for tf, or `assign`).
        if self.framework == "tf":
            self.get_session().run(
                self._kl_coeff_update,
                feed_dict={self._kl_coeff_placeholder: self.kl_coeff_val},
            )
        else:
            self.kl_coeff.assign(self.kl_coeff_val, read_value=False)

    def get_state(self) -> PolicyState:
        state = super().get_state()
        # Add current kl-coeff value.
        state["current_kl_coeff"] = self.kl_coeff_val
        return state

    def set_state(self, state: PolicyState) -> None:
        # Set current kl-coeff value first.
        self._set_kl_coeff(state.pop("current_kl_coeff", self.config["kl_coeff"]))
        # Call super's set_state with the rest of the state dict.
        super().set_state(state)

zKLCoeffMixin.get_staterd   Nc                    s(   |  |d| jd  t | d S )Nra   rV   )r\   poprU   r3   	set_staterc   r<   r,   r-   rf      s   zKLCoeffMixin.set_state)rE   rF   rG   rH   r   r.   r^   r\   r   rb   rf   rI   r,   r,   r<   r-   rT   z   s    rT   c                   @   s^   e Zd ZdZdd Zedd Zedd Zdd	ed
dfddZ	d
e
e fddZdd ZdS )TargetNetworkMixinzAssign the `update_target` method to the policy.

    The function is called every `target_network_update_freq` steps by the
    master learner.
    c                    sD   | j   | j t|   fdd}|| _| jdd d S )Nc                    s~   t j| t jd} g }t tksJ  ft D ]\}}||| | d|  |   td	| qt j
| S )N)r         ?zUpdate target op {})r   convert_to_tensorr&   lenzipappendr(   loggerdebugformatgroup)tauupdate_target_exprvar
var_target
model_varstarget_model_varsr,   r-   update_target_fn   s   
z5TargetNetworkMixin.__init__.<locals>.update_target_fnrh   )rq   )modeltrainable_variablestarget_modelr   r5   
_do_updateupdate_target)r*   rx   r,   ru   r-   r.      s   


zTargetNetworkMixin.__init__c                 C      t | ds| j | _| jS )N_q_func_vars)hasattrry   	variablesr   rC   r,   r,   r-   q_func_vars      
zTargetNetworkMixin.q_func_varsc                 C   r~   )N_target_q_func_vars)r   r{   r   r   rC   r,   r,   r-   target_q_func_vars   r   z%TargetNetworkMixin.target_q_func_varsNrq   r`   c                 C   s"   |  t|p| jdd d S Nrq   rh   )r|   npr&   rU   get)r*   rq   r,   r,   r-   r}      s   "z TargetNetworkMixin.update_targetc                 C   s
   | j  S N)ry   r   rC   r,   r,   r-   r      s   
zTargetNetworkMixin.variablesc                 C   s^   t | trt| | nt | trt| | nt | tr#t| | | | jdd d S r   )rO   r	   set_weightsr   r   r}   rU   r   )r*   weightsr,   r,   r-   r      s   


zTargetNetworkMixin.set_weightsr   )rE   rF   rG   rH   r.   propertyr   r   intr}   r   r   r   r   r,   r,   r,   r-   rg      s    

rg   c                       sL   e Zd ZdZdd Zdeeef f fddZdeeef fddZ	  Z
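
# Note (illustrative, not part of the reconstructed module): `update_target_fn`
# implements the standard Polyak averaging update
#
#   target_var <- tau * online_var + (1.0 - tau) * target_var
#
# so `tau=1.0` (used for the hard initial sync in `__init__`) copies the online
# weights verbatim, while a small tau tracks them slowly:
#
#   policy.update_target()           # config["tau"], or hard sync if unset
#   policy.update_target(tau=0.005)  # explicit soft sync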
S )	ValueNetworkMixina  Assigns the `_value()` method to a TFPolicy.

    This way, Policy can call `_value()` to get the current VF estimate on a
    single(!) observation (as done in `postprocess_trajectory_fn`).
    Note: When doing this, an actual forward pass is being performed.
    This is different from only calling `model.value_function()`, where
    the result of the most recent forward pass is being used to return an
    already calculated tensor.
    c                    s`   | ds
| drt   fdd}n
t  dd }| _|d dk _d  _d S )Nuse_gaevtracec                     sR   t | } t jtjjr | \}}}|t j d S  | \}} j d S )Nr   )r   rO   ry   r   r@   ModelVF_PREDSvalue_function)
input_dict_
extra_outs	model_outrC   r,   r-   r$     s   z)ValueNetworkMixin.__init__.<locals>.valuec                  _   s
   t dS )NrM   )r   constant)argskwargsr,   r,   r-   r$   $  s   
r   r   )r   r   r5   _value_should_cache_extra_action_cached_extra_action_fetches)r*   rU   r$   r,   rC   r-   r.     s   



zValueNetworkMixin.__init__r`   c                    s8   t   }t| jtjjr|S |tj	| j
 i |S r   )r3   extra_action_out_fnrO   ry   r   r@   r   updater   r   r   )r*   extra_action_outr<   r,   r-   _extra_action_out_impl,  s   
z(ValueNetworkMixin._extra_action_out_implc                 C   s.   | j s|  S | jd ur| jS |  | _| jS r   )r   r   r   rC   r,   r,   r-   r   <  s   

z%ValueNetworkMixin.extra_action_out_fn)rE   rF   rG   rH   r.   r   strr   r   r   rI   r,   r,   r<   r-   r     s
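
# Usage sketch (illustrative, not part of the reconstructed module; assumes the
# `SampleBatch.get_single_step_input_dict()` helper): `_value()` is typically
# called during GAE postprocessing to bootstrap the value of the last
# observation of a truncated rollout, along the lines of:
#
#   input_dict = sample_batch.get_single_step_input_dict(
#       policy.model.view_requirements, index="last"
#   )
#   last_r = policy._value(**input_dict)
#
# Each such call runs a full forward pass through the model.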
    

@OldAPIStack
class GradStatsMixin:
    def __init__(self):
        pass

    def grad_stats_fn(
        self, train_batch: SampleBatch, grads: ModelGradients
    ) -> Dict[str, TensorType]:
        # We have support for more than one loss (list of lists of grads).
        if self.config.get("_tf_policy_handles_more_than_one_loss"):
            grad_gnorm = [tf.linalg.global_norm(g) for g in grads]
        # Old case: We have a single list of grads (only one loss term and
        # optimizer).
        else:
            grad_gnorm = tf.linalg.global_norm(grads)

        return {
            "grad_gnorm": grad_gnorm,
        }

# Utility (not technically a mixin): compute and optionally clip gradients.
def compute_gradients(
    policy, optimizer: LocalOptimizer, loss: TensorType
) -> ModelGradients:
    # Compute the gradients over the model's trainable variables.
    variables = policy.model.trainable_variables
    if isinstance(policy.model, ModelV2):
        variables = variables()
    grads_and_vars = optimizer.compute_gradients(loss, variables)

    # Clip by global norm, if configured.
    if policy.config.get("grad_clip") is not None:
        # Defuse inf gradients (due to super large losses).
        grads = [g for (g, v) in grads_and_vars]
        grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
        # If the global norm is inf, all grads will be NaN: replace all NaNs
        # w/ zeros. `tf.where` cannot handle None entries, hence the explicit
        # loop.
        policy.grads = []
        for g in grads:
            if g is not None:
                policy.grads.append(tf.where(tf.math.is_nan(g), tf.zeros_like(g), g))
            else:
                policy.grads.append(None)
        clipped_grads_and_vars = list(zip(policy.grads, variables))
        return clipped_grads_and_vars
    else:
        return grads_and_vars