o
    ci                      @   s   d dl Z d dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlm Z  d dl!m"Z"m#Z# G dd de
Z$e$Z%dS )    N)AnyDictOptional)
APPOConfig)CircularBuffer)IMPALALearner)Learner)update_target_networkTargetNetworkAPIValueFunctionAPI)MultiRLModuleSpec)RLModuleSpec)override)LambdaDefaultDict)LAST_TARGET_UPDATE_TSNUM_ENV_STEPS_TRAINED_LIFETIMENUM_MODULE_STEPS_TRAINEDNUM_TARGET_UPDATES)	Scheduler)ModuleIDShouldModuleBeUpdatedFnc                       s   e Zd ZdZee fddZeeddddede	de
e d	e
e d
ef
 fddZeeded
ef fddZeedeeef d
df fddZeeed
ee fddZejdeded
dfddZ  ZS )APPOLearnerzAdds KL coeff updates via `after_gradient_based_update()` to IMPALA logic.

    Framework-specific subclasses must override `_update_module_kl_coeff()`.
    c                    sF   t  jj jjd _t    jdd  t	 fdd _
d S )N)num_batchesiterations_per_batchc                 S   s   t |tr	| S d S N)
isinstancer   make_target_networks)midmod r    Z/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/algorithms/appo/appo_learner.py<lambda>)   s   
z#APPOLearner.build.<locals>.<lambda>c                    s      j| jS r   )_get_tensor_variableconfigget_config_for_modulekl_coeff)	module_idselfr    r!   r"   5   s    )r   r$   circular_buffer_num_batches$circular_buffer_iterations_per_batch_learner_thread_in_queuesuperbuildmoduleforeach_moduler   curr_kl_coeffs_per_moduler(   	__class__r(   r!   r.      s   

zAPPOLearner.buildN)config_overridesnew_should_module_be_updatedr'   module_specr4   r5   returnc                   s>   t  j||||d}t| j|  tr| j|    |S )N)r'   r6   r4   r5   )r-   
add_moduler   r/   	unwrappedr   r   )r)   r'   r6   r4   r5   	marl_specr2   r    r!   r8   :   s   	zAPPOLearner.add_modulec                    s   t  |}| j| |S r   )r-   remove_moduler1   pop)r)   r'   r:   r2   r    r!   r;   N   s   zAPPOLearner.remove_module	timestepsc          	         s   t  j|d |td}| jj D ]h\}}| j|}|t	f}t
| tre|| jj|dd |j|j |j |j kre|  D ]\}}t|||jd qD| jj|tfddd | jj||dd |jr{| jj|tfdddkr{| j||d	 qd
S )zUpdates the target Q Networks.)r=   r   )default)main_net
target_nettau   sum)reduce)window)r'   r$   N)r-   after_gradient_based_updategetr   r/   _rl_modulesitemsr$   r%   r   r   r9   r   metricspeektarget_network_update_freqr*   r+   train_batch_size_per_learnerget_target_network_pairsr	   rA   	log_valuer   use_kl_lossr   _update_module_kl_coeff)	r)   r=   curr_timestepr'   r/   r$   last_update_ts_keyr?   r@   r2   r    r!   rF   T   sD   
z'APPOLearner.after_gradient_based_updatec                 C   s   t tgS r   r
   )clsr    r    r!   rl_module_required_apis~   s   z#APPOLearner.rl_module_required_apisr$   c                 C   s   dS )a  Dynamically update the KL loss coefficients of each module.

        The update is completed using the mean KL divergence between the action
        distributions current policy and old policy of each module. That action
        distribution is computed during the most recent update/call to `compute_loss`.

        Args:
            module_id: The module whose KL loss coefficient to update.
            config: The AlgorithmConfig specific to the given `module_id`.
        Nr    )r)   r'   r$   r    r    r!   rQ      s    z#APPOLearner._update_module_kl_coeff)__name__
__module____qualname____doc__r   r   r.   r   r   r   r   r   r   r   r8   strr;   r   rF   classmethodlisttyperU   abcabstractmethodr   rQ   __classcell__r    r    r2   r!   r      s6     ) r   )&r^   typingr   r   r   ray.rllib.algorithms.appo.appor   ray.rllib.algorithms.appo.utilsr   *ray.rllib.algorithms.impala.impala_learnerr   ray.rllib.core.learner.learnerr   ray.rllib.core.learner.utilsr	   ray.rllib.core.rl_module.apisr   r   (ray.rllib.core.rl_module.multi_rl_moduler   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.utils.annotationsr   "ray.rllib.utils.lambda_defaultdictr   ray.rllib.utils.metricsr   r   r   r   #ray.rllib.utils.schedules.schedulerr   ray.rllib.utils.typingr   r   r   AppoLearnerr    r    r    r!   <module>   s"    {