o
    `۷i^8                     @   st  d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9 e' \Z:Z;Z<e( Z=e >e?Z@G dd deZAG dd deZBdS )    N)OptionalTypeUnion)Self)DEPRECATED_VALUEdeprecation_warning)AlgorithmConfigNotProvided)CQLTFPolicy)CQLTorchPolicy)SAC	SACConfig)"AddObservationsFromEpisodesToBatch)+AddNextObservationsFromEpisodesToTrainBatch)Learner)RLModuleSpec)synchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)OldAPIStackoverride)try_import_tftry_import_tfp)LAST_TARGET_UPDATE_TSLEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLEDNUM_AGENT_STEPS_TRAINEDNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_TRAINEDNUM_TARGET_UPDATESOFFLINE_SAMPLING_TIMERSAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTARGET_NET_UPDATE_TIMERTIMERS)
ResultDictRLModuleSpecTypec                       s  e Zd ZdZd fdd	Zeeeeeeeeeddee	 dee
 dee	 d	ee d
ee
 dee
 dee def fddZeedef fddZeedeed ef fddZee	d fdd	Zeed fddZeedefddZe fddZ  ZS )	CQLConfiga  Defines a configuration class from which a CQL can be built.

    .. testcode::
        :skipif: True

        from ray.rllib.algorithms.cql import CQLConfig
        config = CQLConfig().training(gamma=0.9, lr=0.01)
        config = config.resources(num_gpus=0)
        config = config.env_runners(num_env_runners=4)
        print(config.to_dict())
        # Build a Algorithm object from the config and run 1 training iteration.
        algo = config.build(env="CartPole-v1")
        algo.train()
    Nc              	      s   t  j|ptd d| _d| _d| _d| _d| _d| _d| _	d| _
d	| _d
| _d
| _ddtddddddd| _d| _d| _t| _d S )N)
algo_classi N  g      ?
   Fg      @Tga2U0*3?g-C6?gMbP?!MultiAgentPrioritizedReplayBufferg    .Ag333333?g?gư>)_enable_replay_buffer_apitypecapacityprioritized_replayprioritized_replay_alphaprioritized_replay_betaprioritized_replay_epsworker_side_prioritizationr   d   )super__init__CQLbc_iterstemperaturenum_actions
lagrangianlagrangian_threshmin_q_weightdeterministic_backuplractor_lr	critic_lralpha_lrintreplay_buffer_config"min_sample_timesteps_per_iteration!min_train_timesteps_per_iterationr   timesteps_per_iteration)selfr*   	__class__ R/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/algorithms/cql/cql.pyr7   I   s0   
zCQLConfig.__init__)r9   r:   r;   r<   r=   r>   r?   r9   r:   r;   r<   r=   r>   r?   returnc          	         sx   t  jdi | |tur|| _|tur|| _|tur|| _|tur%|| _|tur,|| _|tur3|| _|tur:|| _	| S )a  Sets the training-related configuration.

        Args:
            bc_iters: Number of iterations with Behavior Cloning pretraining.
            temperature: CQL loss temperature.
            num_actions: Number of actions to sample for CQL loss
            lagrangian: Whether to use the Lagrangian for Alpha Prime (in CQL loss).
            lagrangian_thresh: Lagrangian threshold.
            min_q_weight: in Q weight multiplier.
            deterministic_backup: If the target in the Bellman update should have an
                entropy backup. Defaults to `True`.

        Returns:
            This updated AlgorithmConfig object.
        NrL   )
r6   trainingr	   r9   r:   r;   r<   r=   r>   r?   )	rI   r9   r:   r;   r<   r=   r>   r?   kwargsrJ   rL   rM   rO   u   s    zCQLConfig.trainingc                    sP   t  jdi | d|v r&ddlm} t|d|s&td|d d| S )Nprelearner_classr   )OfflinePreLearnerz`prelearner_class` z is not a subclass of `OfflinePreLearner`. Any class passed to `prelearner_class` needs to implement the interface given by `OfflinePreLearner`.rL   )r6   offline_dataray.rllib.offline.offline_datarR   
issubclassget
ValueError)rI   rP   rR   rJ   rL   rM   rS      s   zCQLConfig.offline_datar   c                 C   s,   | j dkrddlm} |S td| j  d)Ntorchr   )CQLTorchLearnerThe framework z) is not supported. Use `'torch'` instead.)framework_str0ray.rllib.algorithms.cql.torch.cql_torch_learnerrY   rW   )rI   rY   rL   rL   rM   get_default_learner_class   s   
z#CQLConfig.get_default_learner_classc                    s$   t  j|||d}|tt  |S )N)input_observation_spaceinput_action_spacedevice)r6   build_learner_connectorinsert_afterr   r   )rI   r^   r_   r`   pipelinerJ   rL   rM   ra      s   z!CQLConfig.build_learner_connectorc                    s   | j tkrtdddd t   | jdur| jdkrd| _| jdv r:td u r:t	dt
r/t
jnd  d tdd	 | jd
krL| jsN| jrP| d d S d S d S d S )NrH   rG   T)oldnewerrorrX   )tftf2zYou need `tensorflow_probability` in order to run CQL! Install it via `pip install tensorflow_probability`. Your tf.__version__=z5.Trying to import tfp results in the following error:)rf   r   zWhen using a single local learner the number of iterations per learner, `dataset_num_iters_per_learner` has to be defined. Set this hyperparameter in the `AlgorithmConfig.offline_data`.)rH   r   r   r6   validatesimple_optimizerr[   tfploggerwarningrg   __version__r   num_learnersdataset_num_iters_per_learnerenable_rl_module_and_learner_value_errorrI   rJ   rL   rM   ri      s2   



zCQLConfig.validatec                 C   s2   | j dkrddlm} t|dS td| j  d)NrX   r   )DefaultCQLTorchRLModule)module_classrZ   z is not supported. Use `torch`.)r[   :ray.rllib.algorithms.cql.torch.default_cql_torch_rl_modulert   r   rW   )rI   rt   rL   rL   rM   get_default_rl_module_spec  s   

z$CQLConfig.get_default_rl_module_specc                    s   t  jd| jiB S )Nr;   )r6   _model_config_auto_includesr;   rs   rJ   rL   rM   rx     s   z%CQLConfig._model_config_auto_includesNrN   N)__name__
__module____qualname____doc__r7   r   r   r	   r   rD   floatboolr   rO   r   rS   r   r   strr]   ra   ri   r(   rw   propertyrx   __classcell__rL   rL   rJ   rM   r)   9   sR    ,	/)r)   c                   @   st   e Zd ZdZeeedefddZeeede	de
ee  fddZeedd	d
ZedefddZdS )r8   zCQL (derived from SAC).rN   c                 C   s   t  S ry   )r)   )clsrL   rL   rM   get_default_config  s   zCQL.get_default_configconfigc                 C   s   |d dkrt S tS )N	frameworkrX   )r   r
   )r   r   rL   rL   rM   get_default_policy_class!  s   zCQL.get_default_policy_classNc                 C   s   | j js|  S | jttf# | j jdkp| j jdk }}| j	j
| j j| j j|d}W d    n1 s6w   Y  | jttf | jj|| j j| j jd}| jj|td W d    d S 1 sdw   Y  d S )Nr      )num_samples
num_shardsreturn_iterator)data_iteratorsminibatch_size	num_iters)key)r   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer&   r"   ro   rp   rS   sampletrain_batch_size_per_learnerr   learner_groupupdate	aggregater   )rI   r   batch_or_iteratorlearner_resultsrL   rL   rM   training_step+  s(   
"zCQL.training_stepc                    s  | j t  t| jd}W d    n1 sw   Y  | }| jt  | 7  < | jt  |	 7  < | j
dp>dd }||| j| j
}| j
ddu rUt| |}nt| |}| j| j
jdkrdtnt }| jt }|| | j
jkr| j t  | j  | j fdd W d    n1 sw   Y  | jt  d	7  < || jt< | j d
kr| j t  | jjt| d W d    |S 1 sw   Y  |S )N)
worker_setbefore_learn_on_batchc                 W   s   | S ry   rL   )barL   rL   rM   <lambda>X  s    z2CQL._training_step_old_api_stack.<locals>.<lambda>rj   Tagent_stepsc                    s   | v o|   S ry   )update_target)ppid	to_updaterL   rM   r   n  s    r   r   )policies)_timersr#   r   env_runner_groupas_multi_agent	_countersr   r   r   	env_stepsr   rV   r   r   count_steps_byr   r    r   target_network_update_freqr%   
env_runnerget_policies_to_trainforeach_policy_to_trainr!   num_remote_workersr$   sync_weightslistkeys)rI   train_batchpost_fntrain_resultscur_tslast_updaterL   r   rM   r   N  sB   





z CQL._training_step_old_api_stackrz   )r{   r|   r}   r~   classmethodr   r   r)   r   r   r   r   r   r   r   r   r'   r   rL   rL   rL   rM   r8     s     
"r8   )Cloggingtypingr   r   r   typing_extensionsr   ray._common.deprecationr   r   %ray.rllib.algorithms.algorithm_configr   r	   &ray.rllib.algorithms.cql.cql_tf_policyr
   )ray.rllib.algorithms.cql.cql_torch_policyr   ray.rllib.algorithms.sac.sacr   r   Cray.rllib.connectors.common.add_observations_from_episodes_to_batchr   Oray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batchr   ray.rllib.core.learner.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.frameworkr   r   ray.rllib.utils.metricsr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   ray.rllib.utils.typingr'   r(   tf1rg   tfvrk   	getLoggerr{   rl   r)   r8   rL   rL   rL   rM   <module>   s2    <
 a