o
    ci                     @   s  d Z ddlZddlZddlZddlmZmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@ e- \ZAZBZCeDeEZFe&G dd deZGdS )zsEager mode TF policy built using build_tf_policy().

It supports both traced and non-traced eager execution modes.
    N)DictListOptionalTupleTypeUnion)convert_to_numpy)ModelCatalog)ModelV2)TFActionDistribution)_convert_to_tf_disallow_var_creation_OptimizerWrapper_traced_eager_policy)PolicyPolicyState)#pad_batch_to_sequences_of_same_size)SampleBatch)
force_list)is_overriddenOldAPIStackOverrideToImplementCustomLogic5OverrideToImplementCustomLogic_CallToSuperRecommendedoverride))ERR_MSG_TF_POLICY_CANNOT_SAVE_KERAS_MODEL)try_import_tf)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_AGENT_STEPS_TRAINEDNUM_GRAD_UPDATES_LIFETIME)LEARNER_STATS_KEY)normalize_action)get_gpu_devices)	with_lock)AlgorithmConfigDictLocalOptimizerModelGradients
TensorType)log_oncec                       s  e Zd ZdZdejjdejjdefddZe	dd Z
ed	ejjdejjdefd
dZeeedeedf dee dedeeee f fddZededeeef fddZedededeeef fddZedefddZededededefddZedddedd fd!d"Zeded#ed$ede eeeee f fd%d&Z!eded#ed$ede ee"ee f fd'd(Z#ede$fd)d*Z%e&deeef fd+d,Z'e&deeef fd-d.Z(eee&	/	/d|d0ed1e)e fd2d3Z*ededed f fd4d5Z+d6d7 Z,d8d9 Z-d:d; Z.ee	/	/	/d}d<eeef d=e/d>e)e$ de eee eeef f fd?d@Z0ee	/	/	/	/	/	/	/d~dAdBZ1e2ee	/	/	/	C	CddDeee ef d#eee ef d$e)ee  dEe)eee ef  dFe)eee ef  dGe/dHe/defdIdJZ3e2eedKdL Z4eedMede eeeef f fdNdOZ5eedPedd/fdQdRZ6eeddTdUZ7eedVdW Z8eedXdY Z9eedZd[ Z:eed\d] Z;eed^d_ Z<eee&de=f fd`daZ>eee&dbe=dd/f fdcddZ?eeddee)e$ dd/fdfdgZ@dhdi ZAdjdk ZBe2	/ddldmZCddndoZDdpdq ZEe2drds ZFdtdu ZGdvdw ZHdMefdxdyZIeJdzd{ ZK  ZLS )EagerTFPolicyV2zsA TF-eager / TF2 based tensorflow policy.

    This class is intended to be used and extended by sub-classing.
    observation_spaceaction_spaceconfigc                 K   s6  | dd| _tdt rdnd t| ||| d| _t	j
ddt	jd| _t	j
| jd	 dt	jd| _|  }|dkrLt }td
t| d d| _d| _d | _|  | _| jd d | _| ||| j |  | _|  | _|   |  | _| j | _ t| j dk| _!| j"d t#$ | _%d| _&d S )N	frameworktf2z'Creating TF-eager policy running on {}.GPUCPUFr   )	trainabledtypeexplorezFound z visible cuda devices.modelmax_seq_len)'getr,   loggerinfoformatr!   r   __init___is_trainingtfVariableint64global_timestepr+   boolr2   _get_num_gpus_for_policylen_loss_initialized_lossget_batch_divisibility_reqbatch_divisibility_req_max_seq_lenvalidate_spaces_init_dist_class
dist_class
make_modelr3   _init_view_requirements_create_explorationexplorationget_initial_state_state_inputs_is_recurrentassign	threadingRLock_lock_re_trace_counter)selfr)   r*   r+   kwargsnum_gpusgpu_ids rZ   W/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/policy/eager_tf_policy_v2.pyr9   A   s>   






zEagerTFPolicyV2.__init__c                   C   s    t rt  st   d S d S d S N)tf1executing_eagerlyenable_eager_executionrZ   rZ   rZ   r[   #enable_eager_execution_if_necessary   s   z3EagerTFPolicyV2.enable_eager_execution_if_necessary	obs_spacec                 C      i S r\   rZ   )rV   ra   r*   r+   rZ   rZ   r[   rG         zEagerTFPolicyV2.validate_spacesr3   ztf.keras.ModelrI   train_batchreturnc                 C   s   t )aB  Compute loss for this policy using model, dist_class and a train_batch.

        Args:
            model: The Model to calculate the loss for.
            dist_class: The action distr. class.
            train_batch: The training data.

        Returns:
            A single loss tensor or a list of loss tensors.
        )NotImplementedError)rV   r3   rI   rd   rZ   rZ   r[   loss      zEagerTFPolicyV2.lossc                 C   rb   )zStats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        rZ   )rV   rd   rZ   rZ   r[   stats_fn   s   
zEagerTFPolicyV2.stats_fngradsc                 C   rb   )zGradient stats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        rZ   )rV   rd   rj   rZ   rZ   r[   grad_stats_fn   s   zEagerTFPolicyV2.grad_stats_fnc                 C   s8   t | j| jd \}}t j| j| j|| jd | jdS )zoBuild underlying model for this Policy.

        Returns:
            The Model for the Policy to use.
        r3   )r,   )r	   get_action_distr*   r+   get_model_v2r)   r,   )rV   _	logit_dimrZ   rZ   r[   rJ      s   zEagerTFPolicyV2.make_modelpolicy	optimizerrg   c                 C      dS )a  Gradients computing function (from loss tensor, using local optimizer).

        Args:
            policy: The Policy object that generated the loss tensor and
                that holds the given local optimizer.
            optimizer: The tf (local) optimizer object to
                calculate the gradients with.
            loss: The loss tensor for which gradients should be
                calculated.

        Returns:
            ModelGradients: List of the possibly clipped gradients- and variable
                tuples.
        NrZ   )rV   rp   rq   rg   rZ   rZ   r[   compute_gradients_fn   rh   z$EagerTFPolicyV2.compute_gradients_fnztf.keras.optimizers.Optimizerztf.Operationc                 C   rr   )aY  Gradients computing function (from loss tensor, using local optimizer).

        Args:
            optimizer: The tf (local) optimizer object to
                calculate the gradients with.
            grads: The gradient tensor to be applied.

        Returns:
            "tf.Operation": TF operation that applies supplied gradients.
        NrZ   )rV   rq   rj   rZ   rZ   r[   apply_gradients_fn   s   z"EagerTFPolicyV2.apply_gradients_fn	obs_batchstate_batchesc                K   rr   )ae  Custom function for sampling new actions given policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Sampled action
            Log-likelihood
            Action distribution inputs
            Updated state
        )NNNNrZ   rV   r3   ru   rv   rW   rZ   rZ   r[   action_sampler_fn  s   z!EagerTFPolicyV2.action_sampler_fnc                K   rr   )aC  Action distribution function for this Policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Distribution input.
            ActionDistribution class.
            State outs.
        NNNrZ   rw   rZ   rZ   r[   action_distribution_fn  s   z&EagerTFPolicyV2.action_distribution_fnc                 C   rr   )zrGet batch divisibility request.

        Returns:
            Size N. A sample batch must be of size K*N.
           rZ   rV   rZ   rZ   r[   rD   2     z*EagerTFPolicyV2.get_batch_divisibility_reqc                 C   rb   )zExtra values to fetch and return from compute_actions().

        Returns:
             Dict[str, TensorType]: An extra fetch-dict to be passed to and
                returned from the compute_actions() call.
        rZ   r|   rZ   rZ   r[   extra_action_out_fn<  r}   z#EagerTFPolicyV2.extra_action_out_fnc                 C   rb   )zExtra stats to be reported after gradient computation.

        Returns:
             Dict[str, TensorType]: An extra fetch-dict.
        rZ   r|   rZ   rZ   r[   extra_learn_fetches_fnF  rc   z&EagerTFPolicyV2.extra_learn_fetches_fnNsample_batchother_agent_batchesc                 C   s   t  sJ t| |S )a  Post process trajectory in the format of a SampleBatch.

        Args:
            sample_batch: sample_batch: batch of experiences for the policy,
                which will contain at most one episode trajectory.
            other_agent_batches: In a multi-agent env, this contains a
                mapping of agent ids to (policy, agent_batch) tuples
                containing the policy and experiences of the other agents.
            episode: An optional multi-agent episode object to provide
                access to all of the internal episode state, which may
                be useful for model-based or multi-agent algorithms.

        Returns:
            The postprocessed sample batch.
        )r;   r^   r   postprocess_trajectory)rV   r   r   episoderZ   rZ   r[   r   O  s   z&EagerTFPolicyV2.postprocess_trajectoryc                 C   s   t jj| jd S )zTF optimizer to use for policy optimization.

        Returns:
            A local optimizer or a list of local optimizers to use for this
                Policy's Model.
        lr)r;   keras
optimizersAdamr+   r|   rZ   rZ   r[   rq   i  s   
zEagerTFPolicyV2.optimizerc                 C   sF   t | js
t | jrt | jstdd S t| j| jd \}}|S )NzT`make_model` is required if `action_sampler_fn` OR `action_distribution_fn` is givenr3   )	r   rx   rz   rJ   
ValueErrorr	   rl   r*   r+   )rV   rI   rn   rZ   rZ   r[   rH   u  s   
z EagerTFPolicyV2._init_dist_classc                 C   s:   |    | j| jj tj| jv rd| jtj _d S d S )NF)/_update_model_view_requirements_from_init_stateview_requirementsupdater3   r   INFOSused_for_trainingr|   rZ   rZ   r[   rK     s
   z'EagerTFPolicyV2._init_view_requirementsc                 C   sL   t |  }| jr| j|}|| _|r|d nd | _| jdd d| _d S )Nr   T)auto_remove_unneeded_view_reqs)r   rq   rM   get_exploration_optimizer_optimizers
_optimizer!_initialize_loss_from_dummy_batchrB   )rV   r   rZ   rZ   r[   #maybe_initialize_optimizer_and_loss  s   
z3EagerTFPolicyV2.maybe_initialize_optimizer_and_loss
input_dictr2   timestepc                    s   d| _ |d ur	|n| j}|d ur|n| j}t|tjr!t| }|    	d  fdd 
 D }|| _tt| jdk| _| jrR| jj|||  d |  || jd r]d n|||}| jt|d d j d  t|S )NFc                    s$   g | ]}d |dd v r | qS )state_inN   rZ   ).0kr   rZ   r[   
<listcomp>  s    zCEagerTFPolicyV2.compute_actions_from_input_dict.<locals>.<listcomp>r   )r   r2   tf_sesseager_tracing)r:   r2   r>   
isinstancer;   Tensorintnumpy_lazy_tensor_dictset_trainingkeys	_state_inrA   treeflattenrP   rM   before_compute_actionsget_session_compute_actions_helperr+   
assign_addshapeas_listr   )rV   r   r2   r   episodesrW   rv   retrZ   r   r[   compute_actions_from_input_dict  s2   	



$	z/EagerTFPolicyV2.compute_actions_from_input_dictc	                 K   s   t t j|itdd}
|d urt|D ]}||
d< q|d ur$||
t j< |d ur-||
t j< |d ur6||
t j< | jd|
|||d|	S )NFr:   zstate_in_{i})r   r2   r   r   rZ   )	r   CUR_OBSr;   constant	enumeratePREV_ACTIONSPREV_REWARDSr   r   )rV   ru   rv   prev_action_batchprev_reward_batch
info_batchr   r2   r   rW   r   srZ   rZ   r[   compute_actions  s,   



zEagerTFPolicyV2.compute_actionsTactionsr   r   actions_normalizedin_trainingc                 C   s  t | jrt | jstdtjt|tjd}ttj	t
|tj|idd}	|d ur3t
||	tj< |d ur?t
||	tj< | jrI| jjdd t | jre| j| | j|	ddd\}
| _}| |
| j}n| |	||\}
}| |
| j}|s| jd rt|| j}||}|S )NzfCannot compute log-prob/likelihood w/o an `action_distribution_fn` and a provided `action_sampler_fn`!r1   Fr   )r2   )r2   is_trainingnormalize_actions)r   rx   rz   r   r;   onesrA   int32r   r   convert_to_tensorACTIONSr   r   rM   r   r3   rI   r+   r    action_space_structlogp)rV   r   ru   rv   r   r   r   r   seq_lensinput_batchdist_inputsrn   action_distlog_likelihoodsrZ   rZ   r[   compute_log_likelihoods  sD   



z'EagerTFPolicyV2.compute_log_likelihoodsc                 C   s   i }| j j| ||d t|| jd| j| jd d| _| |}|d | 	|}|  j
d7  _
|d|t|jt| j
t| j
d |j
pCd i t|S )N)rp   rd   resultF)r4   shufflerE   r   Tr{   custom_metricsr   )	callbackson_learn_on_batchr   rF   rE   r   r:   r   r   _learn_on_batch_helpernum_grad_updatesr   r   countr   r   r   )rV   postprocessed_batchlearn_statsstatsrZ   rZ   r[   learn_on_batch5  s:   


zEagerTFPolicyV2.learn_on_batchr   c                 C   sN   t |d| j| j| jd d| _| | |d | |\}}}t||fS )NF)r   r4   rE   r   T)	r   rF   rE   r   r:   r   r   _compute_gradients_helperr   )rV   r   grads_and_varsrj   r   rZ   rZ   r[   compute_gradients\  s   


z!EagerTFPolicyV2.compute_gradients	gradientsc                 C   s(   |  ttdd |D | j  d S )Nc                 S   s"   g | ]}|d urt |nd qS r\   )r;   r   )r   grZ   rZ   r[   r   v  s    z3EagerTFPolicyV2.apply_gradients.<locals>.<listcomp>)_apply_gradients_helperlistzipr3   trainable_variables)rV   r   rZ   rZ   r[   apply_gradientsq  s   zEagerTFPolicyV2.apply_gradientsFc                 C   s(   |   }|rdd |D S dd |D S )Nc                 S   s   i | ]}|j | qS rZ   )namer   r   vrZ   rZ   r[   
<dictcomp>  s    z/EagerTFPolicyV2.get_weights.<locals>.<dictcomp>c                 S   s   g | ]}|  qS rZ   )r   r   rZ   rZ   r[   r         z/EagerTFPolicyV2.get_weights.<locals>.<listcomp>)	variables)rV   as_dictr   rZ   rZ   r[   get_weights  s   zEagerTFPolicyV2.get_weightsc                 C   sN   |   }t|t|ksJ t|t|ft||D ]	\}}|| qd S r\   )r   rA   r   rQ   )rV   weightsr   r   wrZ   rZ   r[   set_weights  s
   $zEagerTFPolicyV2.set_weightsc                 C   s   t | j S r\   )r   rM   	get_stater|   rZ   rZ   r[   get_exploration_state  s   z%EagerTFPolicyV2.get_exploration_statec                 C      | j S r\   )rP   r|   rZ   rZ   r[   is_recurrent  s   zEagerTFPolicyV2.is_recurrentc                 C   s
   t | jS r\   )rA   rO   r|   rZ   rZ   r[   num_state_tensors  s   
z!EagerTFPolicyV2.num_state_tensorsc                 C   s   t | dr
| j S g S )Nr3   )hasattrr3   rN   r|   rZ   rZ   r[   rN     s   

z!EagerTFPolicyV2.get_initial_statec                    s`   t   }|d  |d< g |d< | jr$t| j dkr$| j |d< | jr.| j |d< |S )Nr>   _optimizer_variablesr   _exploration_state)superr   r   r   rA   r   rM   )rV   state	__class__rZ   r[   r     s   
zEagerTFPolicyV2.get_stater   c                    s   | dd }|r0| j r0t| jdstdrtd t	| j |D ]	\}}|
| q&t| drBd|v rB| jj|d d | j
|d  t | d S )	Nr   _traced+set_state_optimizer_vars_tf_eager_policy_v2zCannot restore an optimizer's state for tf eager! Keras is not able to save the v1.x optimizers (from tf.compat.v1.train) since they aren't compatible with checkpoints.rM   r   )r   r>   )r5   r   r   type__name__endswithr'   r6   warningr   rQ   r   rM   	set_stater>   r   )rV   r   optimizer_varsopt_varvaluer   rZ   r[   r    s   zEagerTFPolicyV2.set_stateonnxc              
   C   s   |r-zdd l }W n ty } ztd|d }~ww |jj| jjtj	|dd\}}d S t
| dr]t
| jdr]t| jjtjjr]z| jjj|dd W d S  ty\   tt Y d S w tt d S )	Nr   zmConverting a TensorFlow model to ONNX requires `tf2onnx` to be installed. Install with `pip install tf2onnx`.z
model.onnx)output_pathr3   
base_modelr;   )save_format)tf2onnxImportErrorRuntimeErrorconvert
from_kerasr3   r  ospathjoinr   r   r;   r   Modelsave	Exceptionr6   r   r   )rV   
export_dirr  r	  emodel_protoexternal_tensor_storagerZ   rZ   r[   export_model  s6   
zEagerTFPolicyV2.export_modelc                 C   s"   t | jtjjr| jjS | j S )z9Return the list of all savable variables for this policy.)r   r3   r;   r   r  r   r|   rZ   rZ   r[   r     s   
zEagerTFPolicyV2.variablesc                 C   r   r\   )rB   r|   rZ   rZ   r[   loss_initialized  s   z EagerTFPolicyV2.loss_initializedc              
   C   s  |  j d7  _ tj|v r|tj }nt|tj d jd }|r)tj|tj	dnd }i }	t
t t| jrM| j| j|tj |||||d\}
}}}nat| jrg| j| j|tj ||||dd\}| _}n5t| jtjjr|rd|vrt|D ]\}}||d| < qy| | | |\}}}	n	| |||\}}| || j}| jj|||d	\}
}W d    n1 sw   Y  |d urt||	tj< ||	tj< |d ur||	tj< |	|   |
||	fS )
Nr{   r   r   )ru   rv   r   r2   r   r   F)ru   rv   r   r2   r   r   
state_in_0	state_in_)action_distributionr   r2   )rU   r   SEQ_LENSr   r   OBSr   r;   r   r   variable_creator_scoper   r   rx   r3   rz   rI   r   r   r  r   r   rM   get_exploration_actionexpACTION_PROBACTION_LOGPACTION_DIST_INPUTSr   r~   )rV   r   rv   r   r2   r   _ray_trace_ctxr   
batch_sizeextra_fetchesr   r   r   	state_outir   r   rZ   rZ   r[   r     sn   





1


z'EagerTFPolicyV2._compute_actions_helperc                 C   sV   |  j d7  _ tt | |\}}}W d    n1 sw   Y  | | |S )Nr{   )rU   r;   r  r   r   r   )rV   samplesr%  r   rn   r   rZ   rZ   r[   r   P  s   
z&EagerTFPolicyV2._learn_on_batch_helperc                 C   s   t | jS r\   )r;   r   r:   r|   rZ   rZ   r[   _get_is_training_placeholder\  s   z,EagerTFPolicyV2._get_is_training_placeholderc           
         sZ  |  j d7  _ t| jtjjr| jjn| j tjt| j	d | 
| j| j|}W d   n1 s6w   Y  t|}t| j	rct }| jd rY| 	|gt| |}n| 	||d g}n
 fdd|D }tdr|D ]}|D ]\}}|durtd	|j  qwqs| jd rd
d |D }n|d }dd |D }| ||}	|||	fS )z,Computes and returns grads as eager tensors.r{   )
persistentN%_tf_policy_handles_more_than_one_lossr   c                    s"   g | ]}t t |qS rZ   )r   r   gradient)r   rg   taper   rZ   r[   r     s    z=EagerTFPolicyV2._compute_gradients_helper.<locals>.<listcomp>	grad_varszOptimizing variable c                 S   s   g | ]	}d d |D qS )c                 S      g | ]\}}|qS rZ   rZ   r   r   rn   rZ   rZ   r[   r     r   zHEagerTFPolicyV2._compute_gradients_helper.<locals>.<listcomp>.<listcomp>rZ   )r   g_and_vrZ   rZ   r[   r     s    c                 S   r2  rZ   rZ   r3  rZ   rZ   r[   r     r   )rU   r   r3   r;   r   r  r   GradientTaper   rs   rg   rI   r   r   r+   rA   r'   r6   r7   r   _stats)
rV   r*  lossesrq   r   r4  r   r   rj   r   rZ   r/  r[   r   _  sD   





z)EagerTFPolicyV2._compute_gradients_helperc                 C   s   |  j d7  _ t| jr#| jd r| | j| d S | | j| d S | jd r@t| jD ]\}}|dd || D  q-d S | jdd |D  d S )Nr{   r-  c                 S       g | ]\}}|d ur||fqS r\   rZ   r   r   r   rZ   rZ   r[   r          z;EagerTFPolicyV2._apply_gradients_helper.<locals>.<listcomp>c                 S   r8  r\   rZ   r9  rZ   rZ   r[   r     r:  )rU   r   rt   r+   r   r   r   r   )rV   r   r)  orZ   rZ   r[   r     s   


z'EagerTFPolicyV2._apply_gradients_helperc                 C   sV   i }t | jrt| ||t< ni |t< |t|   |t| || |S r\   )r   ri   dictr   r   r   rk   )rV   r*  rj   fetchesrZ   rZ   r[   r6    s   
zEagerTFPolicyV2._statsc                 C   s    t |ts	t|}|t |S r\   )r   r   set_get_interceptorr   )rV   r   rZ   rZ   r[   r     s   

z!EagerTFPolicyV2._lazy_tensor_dictc                 C   s   t | S r\   )r   )clsrZ   rZ   r[   with_tracing  s   zEagerTFPolicyV2.with_tracing)NNry   )NNNNNNN)NNNTT)Fr\   )Mr   
__module____qualname____doc__gymspacesSpacer#   r9   staticmethodr`   r   rG   r   r   r   r
   r   r   r   r&   r   rg   r   strri   r%   rk   rJ   r$   rs   rt   r   rx   r   rz   r   rD   r   r~   r   r   r   rq   rH   rK   r   r?   r   r   r"   r   r   r   r   r   r   r   r   r   rN   r   r   r  r  r   r  r   r   r+  r   r   r6  r   classmethodr@  __classcell__rZ   rZ   r   r[   r(   :   s   
K


		

/&
	<%





Y
=r(   )HrC  loggingr  rR   typingr   r   r   r   r   r   	gymnasiumrD  r   ray.rllib.utils.numpyr   ray.rllib.models.catalogr	   ray.rllib.models.modelv2r
   "ray.rllib.models.tf.tf_action_distr    ray.rllib.policy.eager_tf_policyr   r   r   r   ray.rllib.policy.policyr   r   ray.rllib.policy.rnn_sequencingr   ray.rllib.policy.sample_batchr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   r   r   r   ray.rllib.utils.errorr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   r   $ray.rllib.utils.metrics.learner_infor   "ray.rllib.utils.spaces.space_utilsr    ray.rllib.utils.tf_utilsr!   ray.rllib.utils.threadingr"   ray.rllib.utils.typingr#   r$   r%   r&   ray.util.debugr'   r]   r;   tfv	getLoggerr   r6   r(   rZ   rZ   rZ   r[   <module>   s<     
