o
    ci                     @   sr  d dl mZ d dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9 e) \Z:Z;Z<e=e>Z?e!G dd deZ@dS )    )OrderedDictN)DictListOptionalTupleTypeUnion)ModelCatalog)ModelV2)TFActionDistributionTFMultiGPUTowerStack)Policy)SampleBatch)TFPolicy)ViewRequirement)
force_list)OldAPIStackOverrideToImplementCustomLogic5OverrideToImplementCustomLogic_CallToSuperRecommendedis_overriddenoverride)	summarize)try_import_tf)'DIFF_NUM_GRAD_UPDATES_VS_SAMPLER_POLICYNUM_GRAD_UPDATES_LIFETIME)LEARNER_STATS_KEY)get_dummy_batch_for_space)get_placeholder)AlgorithmConfigDictLocalOptimizerModelGradients
TensorType)log_oncec                       s  e Zd ZdZddddejjdejjdedee	e
df  d	ee f
 fd
dZedd ZedejjdejjdefddZeeedeedf dee dedeeee f fddZedede	e
ef fddZededede	e
ef fddZedefddZedededefd d!Zedd"dedd#fd$d%Z eded&ed'ede!eeeee f fd(d)Z"eded&ed'ede!ee#ee f fd*d+Z$ede%fd,d-Z&ee'e(de	e
ef f fd.d/Z)e(de	e
ef fd0d1Z*ee'd2d3 Z+eee(		dbd4ed5ee fd6d7Z,ee'eded"ed" f f fd8d9Z-d:d; Z.d<d= Z/de	e
df fd>d?Z0de	e
df de!ee%ef ee1ef f fd@dAZ2dBdC Z3dDee%ef dEee1ef de!eeee#e	e
ef f fdFdGZ4dHdI Z5dJdK Z6ee	LdcdMe1ddfdNdOZ7defdPdQZ8ee'dee!e
df  de'fdRdSZ9eedee fdTdUZ:ee	VdddWedXe%de%fdYdZZ;eedddXe%de%fd[d\Z<eeded]e%dXe%fd^d_Z=ee' fd`daZ>  Z?S )fDynamicTFPolicyV2zA TFPolicy that auto-defines placeholders dynamically at runtime.

    This class is intended to be used and extended by sub-classing.
    Nexisting_inputsexisting_model	obs_spaceaction_spaceconfigr&   ztf1.placeholderr'   c                   s  || _ || _|| _d| _d | _|d u| _| ||| |  | _|rEt	|t
rE|d | _tdt|D ]}t| || d || d  q3n|  | _|   | | |   | |\}}| ||\}	}
}| _t p{tjtjdi | jd d}|  }tj| jjv r| jtj nd }tj| jjv r| jtj nd }t  j!di d|d|d|d	|d
| jtj" d| jtj# d|	d|
d|d| jdd dg d| jd| j$d| j%d|d|d| jd|d &ddd|d|d| d S )Ntfr      tf_session_args)r*   observation_spacer)   r*   sess	obs_inputaction_inputsampled_actionsampled_action_logpdist_inputs
dist_classlossloss_inputsmodelstate_inputsstate_outputsprev_action_inputprev_reward_inputseq_lensmax_seq_len   batch_divisibility_reqexploretimestep )'r.   r)   r*   	framework	_seq_lens	_is_towervalidate_spaces_init_dist_classr5   
isinstancelistr8   rangelensetattr
make_model/_update_model_view_requirements_from_init_state_init_state_inputs_init_view_requirements _init_input_dict_and_dummy_batch_init_action_fetches_policy_extra_action_fetchestf1get_default_sessionSessionConfigProtoget_batch_divisibility_reqr   PREV_ACTIONS_input_dictaccessed_keysPREV_REWARDSsuper__init__OBSACTIONS_state_inputs
_state_outget)selfr(   r)   r*   r&   r'   irB   rA   r2   r3   r4   r/   r@   r;   r<   	__class__rC   Y/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/policy/dynamic_tf_policy_v2.pyr_   5   s   	





	
zDynamicTFPolicyV2.__init__c                   C   s   d S NrC   rC   rC   rC   ri   #enable_eager_execution_if_necessary   s   z5DynamicTFPolicyV2.enable_eager_execution_if_necessaryc                 C      i S rj   rC   )re   r(   r)   r*   rC   rC   ri   rG         z!DynamicTFPolicyV2.validate_spacesr8   ztf.keras.Modelr5   train_batchreturnc                 C   s   t )a1  Constructs loss computation graph for this TF1 policy.

        Args:
            model: The Model to calculate the loss for.
            dist_class: The action distr. class.
            train_batch: The training data.

        Returns:
            A single loss tensor or a list of loss tensors.
        )NotImplementedError)re   r8   r5   rn   rC   rC   ri   r6         zDynamicTFPolicyV2.lossc                 C   rl   )zStats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        rC   )re   rn   rC   rC   ri   stats_fn   s   
zDynamicTFPolicyV2.stats_fngradsc                 C   rl   )zGradient stats function. Returns a dict of statistics.

        Args:
            train_batch: The SampleBatch (already) used for training.

        Returns:
            The stats dict.
        rC   )re   rn   rs   rC   rC   ri   grad_stats_fn   s   zDynamicTFPolicyV2.grad_stats_fnc                 C   s6   t | j| jd \}}t j| j| j|| jd ddS )zoBuild underlying model for this Policy.

        Returns:
            The Model for the Policy to use.
        r8   r+   )r(   r)   num_outputsmodel_configrD   )r	   get_action_distr)   r*   get_model_v2r.   )re   _	logit_dimrC   rC   ri   rN      s   zDynamicTFPolicyV2.make_model	optimizerr6   c                 C      dS )a  Gradients computing function (from loss tensor, using local optimizer).

        Args:
            policy: The Policy object that generated the loss tensor and
                that holds the given local optimizer.
            optimizer: The tf (local) optimizer object to
                calculate the gradients with.
            loss: The loss tensor for which gradients should be
                calculated.

        Returns:
            ModelGradients: List of the possibly clipped gradients- and variable
                tuples.
        NrC   )re   r{   r6   rC   rC   ri   compute_gradients_fn   rq   z&DynamicTFPolicyV2.compute_gradients_fnztf.keras.optimizers.Optimizerztf.Operationc                 C   r|   )aY  Gradients computing function (from loss tensor, using local optimizer).

        Args:
            optimizer: The tf (local) optimizer object to
                calculate the gradients with.
            grads: The gradient tensor to be applied.

        Returns:
            "tf.Operation": TF operation that applies supplied gradients.
        NrC   )re   r{   rs   rC   rC   ri   apply_gradients_fn   s   z$DynamicTFPolicyV2.apply_gradients_fn	obs_batchstate_batchesc                K   r|   )ae  Custom function for sampling new actions given policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Sampled action
            Log-likelihood
            Action distribution inputs
            Updated state
        )NNNNrC   re   r8   r   r   kwargsrC   rC   ri   action_sampler_fn   s   z#DynamicTFPolicyV2.action_sampler_fnc                K   r|   )aC  Action distribution function for this Policy.

        Args:
            model: Underlying model.
            obs_batch: Observation tensor batch.
            state_batches: Action sampling state batch.

        Returns:
            Distribution input.
            ActionDistribution class.
            State outs.
        )NNNrC   r   rC   rC   ri   action_distribution_fn  s   z(DynamicTFPolicyV2.action_distribution_fnc                 C   r|   )zrGet batch divisibility request.

        Returns:
            Size N. A sample batch must be of size K*N.
        r,   rC   re   rC   rC   ri   rY   -  s   z,DynamicTFPolicyV2.get_batch_divisibility_reqc                    s   t   }|| j |S )zExtra values to fetch and return from compute_actions().

        Returns:
             Dict[str, TensorType]: An extra fetch-dict to be passed to and
                returned from the compute_actions() call.
        )r^   extra_action_out_fnupdaterT   )re   extra_action_fetchesrg   rC   ri   r   7  s   
	z%DynamicTFPolicyV2.extra_action_out_fnc                 C   rl   )zExtra stats to be reported after gradient computation.

        Returns:
             Dict[str, TensorType]: An extra fetch-dict.
        rC   r   rC   rC   ri   extra_learn_fetches_fnD  rm   z(DynamicTFPolicyV2.extra_learn_fetches_fnc                 C   s   t ti ifi |  S rj   )dictr   r   r   rC   rC   ri   extra_compute_grad_fetchesM  s   z,DynamicTFPolicyV2.extra_compute_grad_fetchessample_batchother_agent_batchesc                 C   s   t | |S )a  Post process trajectory in the format of a SampleBatch.

        Args:
            sample_batch: sample_batch: batch of experiences for the policy,
                which will contain at most one episode trajectory.
            other_agent_batches: In a multi-agent env, this contains a
                mapping of agent ids to (policy, agent_batch) tuples
                containing the policy and experiences of the other agents.
            episode: An optional multi-agent episode object to provide
                access to all of the internal episode state, which may
                be useful for model-based or multi-agent algorithms.

        Returns:
            The postprocessed sample batch.
        )r   postprocess_trajectory)re   r   r   episoderC   rC   ri   r   Q  s   z(DynamicTFPolicyV2.postprocess_trajectoryc                    s
   t   S )zTF optimizer to use for policy optimization.

        Returns:
            A local optimizer or a list of local optimizers to use for this
                Policy's Model.
        )r^   r{   r   rg   rC   ri   r{   j  s   
zDynamicTFPolicyV2.optimizerc                 C   sF   t | js
t | jrt | jstdd S t| j| jd \}}|S )NzT`make_model` is required if `action_sampler_fn` OR `action_distribution_fn` is givenr8   )	r   r   r   rN   
ValueErrorr	   rw   r)   r*   )re   r5   ry   rC   rC   ri   rH   w  s   
z"DynamicTFPolicyV2._init_dist_classc                 C   sL   t | dd rd S |  | _| j| jj tj| jv r$d| jtj _d S d S )Nview_requirementsF)getattr_get_default_view_requirementsr   r   r8   r   INFOSused_for_trainingr   rC   rC   ri   rQ     s   
z)DynamicTFPolicyV2._init_view_requirementsc                 C   sn   |rdd |  D | _| jr|tj | _dS dS dd | jj  D | _| jr5tjt	j
dgdd| _dS dS )zjInitialize input placeholders.

        Args:
            existing_inputs: existing placeholders.
        c                 S   s   g | ]\}}| d r|qS )	state_in_)
startswith.0kvrC   rC   ri   
<listcomp>  s
    z8DynamicTFPolicyV2._init_state_inputs.<locals>.<listcomp>c                 S   s4   g | ]\}}| d rt|jt|jt |dqS )r   )space	time_axisname)r   r   r   rI   shiftint)r   r   vrrC   rC   ri   r     s    Nr=   )dtypeshaper   )itemsrb   r   SEQ_LENSrE   r8   r   rU   placeholderr+   int32)re   r&   rC   rC   ri   rP     s   


z$DynamicTFPolicyV2._init_state_inputsc                 C   s   | j r|dus	J |d }d}| | j|\| _| _n!tjtjdtj	dddd}tjdddd}| | ji \| _| _| j
|   ||fS )	aQ  Initialized input_dict and dummy_batch data.

        Args:
            existing_inputs: When copying a policy, this specifies an existing
                dict of placeholders to use instead of defining new ones.

        Returns:
            timestep: training timestep.
            explore: whether this policy should explore.
        NrB   FrC   )r   )r   Tis_exploring)rF   "_create_input_dict_and_dummy_batchr   r[   _dummy_batchrU   placeholder_with_defaultr+   zerosint64set_training_get_is_training_placeholder)re   r&   rB   rA   rC   rC   ri   rR     s(   z2DynamicTFPolicyV2._init_input_dict_and_dummy_batchc           
      C   s   i }|  D ]j\}}td|}|dur!| jt|d ||< q|dr'q|tjkr-q||v r8|| ||< qt	|j
t }|jrp| jdrS|tjtjfv rSd}n|tjtjfv rc| jd rcd}nd}t|j|||d	||< q| jd
d}	t|| jd|	fS )a4  Creates input_dict and dummy_batch for loss initialization.

        Used for managing the Policy's input placeholders and for loss
        initialization.
        Input_dict: Str -> tf.placeholders, dummy_batch: str -> np.arrays.

        Args:
            view_requirements: The view requirements dict.
            existing_inputs (Dict[str, tf.placeholder]): A dict of already
                existing placeholders.

        Returns:
            Tuple[Dict[str, tf.placeholder], Dict[str, np.ndarray]]: The
                input_dict/dummy_batch tuple.
        zstate_in_(\d+)Nr,   
state_out__disable_action_flatteningF_disable_preprocessor_apiT)r   r   r   flatten    
batch_size)r=   )r   rematchrb   r   groupr   r   ACTION_DIST_INPUTSrI   r   r   r*   rd   ra   rZ   r`   NEXT_OBSr   r   '_get_dummy_batch_from_view_requirementsrE   )
re   r   r&   
input_dictview_colview_reqmor   r   dummy_batchrC   rC   ri   r     s@   


z4DynamicTFPolicyV2._create_input_dict_and_dummy_batchrB   rA   c           	   
   C   sd  d}d}d}i }d| _ | js|  | _t| jr>| j| j| jtj	 | j
| j| jtj| jtj|| jjd\}}}| _ nOt| jr_| j}| j| j|tj	 | j
| j|||jd\}| _| _ nt| jtjjrr| | j\}| _ }n	| | j\}| _ | || j}| jj|||d\}}|dur||tj< |dur||tj< tt|tj|tj< ||||fS )zECreate action related fields for base Policy and loss initialization.N)r   r   r=   prev_action_batchprev_reward_batchrA   is_training)r   r   r=   rA   rB   r   )action_distributionrB   rA   )rc   rF   _create_explorationexplorationr   r   r8   r[   r   r`   rb   rE   rd   rZ   r]   r   r   r5   rI   r+   kerasModelget_exploration_actionr   ACTION_LOGPexpcastfloat32ACTION_PROB)	re   rB   rA   r2   r3   r4   r   in_dictaction_distrC   rC   ri   rS     s|   






z&DynamicTFPolicyV2._init_action_fetchesc                 C   s:   t |  }| jr| j|}|sd S || _|d | _d S )Nr   )r   r{   r   get_exploration_optimizer_optimizers
_optimizer)re   
optimizersrC   rC   ri   _init_optimizerss  s   z"DynamicTFPolicyV2._init_optimizersc                    s    j r  t  d S     jdd t jdks)t	dd  jD rRtj
dtjd  fdd	t jd
dD  _W d    n1 sMw   Y    t  d S )NT)auto_remove_unneeded_view_reqsr,   c                 s   s    | ]}d |v V  qdS )gpuNrC   )r   drC   rC   ri   	<genexpr>  s    zHDynamicTFPolicyV2.maybe_initialize_optimizer_and_loss.<locals>.<genexpr> )reusec                    s   g | ]}t  d qS ))policyr   )r   ry   r   rC   ri   r     s    zIDynamicTFPolicyV2.maybe_initialize_optimizer_and_loss.<locals>.<listcomp>num_multi_gpu_tower_stacks)rF   get_sessionrunrU   global_variables_initializerr   !_initialize_loss_from_dummy_batchrL   devicesanyvariable_scope
AUTO_REUSErK   r*   rd   multi_gpu_tower_stacksr   rC   r   ri   #maybe_initialize_optimizer_and_loss  s   "

z5DynamicTFPolicyV2.maybe_initialize_optimizer_and_lossTr   c                    sF    t  j D ]\}}|ds |jjvr d|_	q
  D ]Q\}}ttjjdd|j dd  |jjdtjdj|< t||dj|< |jvrxtd	| ttjjdd|j dd  |jjddd
j|< q'j}td j|   |}|jD ]2}|jvrt|| |dj|< |jvrttjjdd|| jdd  || jddd
j|< qttjfi jdd}j rj!|tj"< j#tj"|tj" i j#t| t$drt%dt&| '|}|j|jB |jB t(j)j* B  t+,| fdd| D tj"|v r2tj"|tj" fgng   djv r@jd= j-#.|j/ |r|j|jB  |jD ]=}||jvr|j)jvr|tj0tj1tj2tj3tj4tj5tj6tj7tj8f	vr|jv rdj| _9|jv rj|= qWt:j* D ]I}| vr|tj0tj1tj2tj3tj4tj5tj6tj7fvr|j)jvr||j;v rt<d| nj=d d u rڈj|= |jv rj|= qt:j* D ]&}j| }	|	j>d ur|	j>jvr|	j>|jv }
t|	j?|
dj|	j>< qfddj D _@d S )Nr   Fg      g      ?r,   )r   r   r   )valuer   z,Adding extra-action-fetch `{}` to view-reqs.)r   used_for_compute_actionsz0Testing `postprocess_trajectory` w/ dummy batch.T)_is_training	loss_initz1Initializing loss function with dummy input:

{}
c                    s    g | ]\}}| v r||fqS rC   rC   r   )all_accessed_keysrC   ri   r     s     zGDynamicTFPolicyV2._initialize_loss_from_dummy_batch.<locals>.<listcomp>r   zSampleBatch key '{}' was deleted manually in postprocessing function! RLlib will automatically remove non-used items from the data stream. Remove the `del` from your postprocessing function.output)r   r   c                    s*   i | ]\}}| j vr| jkr||qS rC   )rb   rE   r   r   rC   ri   
<dictcomp>Q  s
    zGDynamicTFPolicyV2._initialize_loss_from_dummy_batch.<locals>.<dictcomp>)Ar   r   rU   r   r   r   r   r[   r\   r   r   r   gymspacesBoxr   as_listr   r   rL   r   r   loggerinfoformatr   r   r   
added_keysr   r   _loss_input_dictrb   rE   r   r   r#   debugr   _do_loss_initsetr8   keysr   _initialize_loss_stats_fetchesrt   _gradsEPS_IDAGENT_INDEX	UNROLL_IDTERMINATEDS
TRUNCATEDSREWARDSr   T
OBS_EMBEDSr   rJ   deleted_keyswarningr*   data_colr   _loss_input_dict_no_rnn)re   r   keyr   r   r   ry   rn   lossesr   r   rC   )r   re   ri   r     s  
	








	

z3DynamicTFPolicyV2._initialize_loss_from_dummy_batchc                 C   sR   |  | j| j|}t|}| j| | g | _t| jt	j
js'| j | _|S rj   )r6   r8   r5   r   r   r   rr   _update_opsrI   r+   r   r   
update_ops)re   rn   r  rC   rC   ri   r   W  s   zDynamicTFPolicyV2._do_loss_initc                    s  t | j}t | j}t|t|krtd| j| j|t|D ]\}}|j	 || j	 kr<td||j|| jq!g }t
t| jD ]}|d||t||  f qF|re|tj|d f t | j|dt|  td| jfd| jfg fdd	t| j D  | }| j| j| j| j|| jd
t| d
dfdt| ddfgd}||_|t|}	 fdd	t| j D }
t||	|
 |j|||j  |S )z9Creates a copy of self using existing input placeholders.zTensor list mismatchzTensor shape mismatchzstate_in_{}Nr   rB   c                       g | ]
\}}| | fqS rC   rC   r   rf   r   existing_inputs_unflattenedrC   ri   r         
z*DynamicTFPolicyV2.copy.<locals>.<listcomp>target_q_modeltarget_modelr%   c                    r  rC   rC   r  r  rC   ri   r     r  )!treer   r   r  rL   r   rb   	enumerater   r   rK   appendr   r   r   unflatten_asr   _is_exploring	_timestepr   rh   r.   r)   r*   r8   r   r   r   r   r   r   rt   r   )re   r&   flat_loss_inputsflat_loss_inputs_no_rnnrf   r   
rnn_inputsr   instancer  r7   rC   r  ri   copya  sr   
	
zDynamicTFPolicyV2.copyc                 C   s   | j r| j  S g S rj   )r8   get_initial_stater   rC   rC   ri   r$    s   
z#DynamicTFPolicyV2.get_initial_stater   batchbuffer_indexc                    s   | d t| jdkr | jd dkr |dksJ || _t|S | j|dd t| j}| jr8| j| j	g }ng } fdd|D } fd	d|D }| j
| j|  |||jd
S )NTr,   r   /cpu:0F)shufflec                       g | ]} | qS rC   rC   r   r   r   rC   ri   r         z<DynamicTFPolicyV2.load_batch_into_buffer.<locals>.<listcomp>c                    r)  rC   rC   r*  r+  rC   ri   r     r,  )r/   inputsr9   num_grad_updates)r   rL   r   _loaded_single_cpu_batch_get_loss_inputs_dictr  r   r  rb   rE   r   	load_datar   r.  )re   r%  r&  	data_keys
state_keysr-  r9   rC   r+  ri   load_batch_into_buffer  s$   

z(DynamicTFPolicyV2.load_batch_into_bufferc                 C   sL   t | jdkr | jd dkr |dksJ | jd urt | jS dS | j| jS )Nr,   r   r'  )rL   r   r/  r   num_tuples_loaded)re   r&  rC   rC   ri   "get_num_samples_loaded_into_buffer  s   
z4DynamicTFPolicyV2.get_num_samples_loaded_into_bufferoffsetc                 C   s   t | jdkrK| jd dkrK|dksJ | jd u rtd| jd}|d u r1| jd| jd }|t | jkr<| j}n
| jj||| d}| |S | j| }|	| 
 |}|  jd7  _|t| jt| jd |jpmd i |S )	Nr,   r   r'  zPMust call Policy.load_batch_into_buffer() before Policy.learn_on_loaded_batch()!minibatch_sizesgd_minibatch_sizetrain_batch_size)startend)rL   r   r/  r   r*   rd   slicelearn_on_batchr   optimizer   r.  r   r   r   )re   r7  r&  r   sliced_batchtower_stackresultsrC   rC   ri   learn_on_loaded_batch  s6   




z'DynamicTFPolicyV2.learn_on_loaded_batchc                    sR   t |}t |}t| jr"| jd r| ||S | |d |d S t ||S )N%_tf_policy_handles_more_than_one_lossr   )r   r   r}   r*   r^   	gradients)re   r{   r6   r   r  rg   rC   ri   rE  	  s   

zDynamicTFPolicyV2.gradients)NN)T)r   )r   r   )@__name__
__module____qualname____doc__r   r   Spacer   r   r   strr
   r_   staticmethodrk   r   rG   r   r   r   r   r   r   r"   r   r6   rr   r!   rt   rN   r    r}   r~   r   r   typer   r   rY   r   r   r   r   r   r   r{   rH   rQ   rP   boolrR   r   rS   r   r   r   r   r#  r$  r4  r6  rC  rE  __classcell__rC   rC   rg   ri   r$   .   s2   S


	


)>


W :
 G*r$   )Acollectionsr   	gymnasiumr   loggingr   r  typingr   r   r   r   r   r   ray.rllib.models.catalogr	   ray.rllib.models.modelv2r
   "ray.rllib.models.tf.tf_action_distr   "ray.rllib.policy.dynamic_tf_policyr   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.policy.tf_policyr   !ray.rllib.policy.view_requirementr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   r   r   r   r   ray.rllib.utils.debugr   ray.rllib.utils.frameworkr   ray.rllib.utils.metricsr   r   $ray.rllib.utils.metrics.learner_infor   "ray.rllib.utils.spaces.space_utilsr   ray.rllib.utils.tf_utilsr   ray.rllib.utils.typingr   r    r!   r"   ray.util.debugr#   rU   r+   tfv	getLoggerrF  r   r$   rC   rC   rC   ri   <module>   s8     
