o
    ci8                     @   sh  d dl Z d dlmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZ d dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7 e% \Z8Z9Z:e& Z;e <e=Z>G dd deZ?G dd deZ@dS )    N)OptionalTypeUnion)AlgorithmConfigNotProvided)CQLTFPolicy)CQLTorchPolicy)SAC	SACConfig)"AddObservationsFromEpisodesToBatch)+AddNextObservationsFromEpisodesToTrainBatch)Learner)RLModuleSpec)synchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)OldAPIStackoverride)DEPRECATED_VALUEdeprecation_warning)try_import_tftry_import_tfp)LEARNER_RESULTSLEARNER_UPDATE_TIMERLAST_TARGET_UPDATE_TSNUM_AGENT_STEPS_SAMPLEDNUM_AGENT_STEPS_TRAINEDNUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_TRAINEDNUM_TARGET_UPDATESOFFLINE_SAMPLING_TIMERTARGET_NET_UPDATE_TIMERSYNCH_WORKER_WEIGHTS_TIMERSAMPLE_TIMERTIMERS)
ResultDictRLModuleSpecTypec                       s  e Zd ZdZd fdd	Zeeeeeeeeeddee	 dee
 dee	 d	ee d
ee
 dee
 dee dd f fddZeed fddZeedeed ef fddZee	d fdd	Zeed fddZeedefddZe fddZ  ZS ) 	CQLConfiga  Defines a configuration class from which a CQL can be built.

    .. testcode::
        :skipif: True

        from ray.rllib.algorithms.cql import CQLConfig
        config = CQLConfig().training(gamma=0.9, lr=0.01)
        config = config.resources(num_gpus=0)
        config = config.env_runners(num_env_runners=4)
        print(config.to_dict())
        # Build a Algorithm object from the config and run 1 training iteration.
        algo = config.build(env="CartPole-v1")
        algo.train()
    Nc              	      s   t  j|ptd d| _d| _d| _d| _d| _d| _d| _	d| _
d	| _d
| _d
| _ddtddddddd| _d| _d| _t| _d S )N)
algo_classi N  g      ?
   Fg      @Tga2U0*3?g-C6?gMbP?!MultiAgentPrioritizedReplayBufferg    .Ag333333?g?gư>)_enable_replay_buffer_apitypecapacityprioritized_replayprioritized_replay_alphaprioritized_replay_betaprioritized_replay_epsworker_side_prioritizationr   d   )super__init__CQLbc_iterstemperaturenum_actions
lagrangianlagrangian_threshmin_q_weightdeterministic_backuplractor_lr	critic_lralpha_lrintreplay_buffer_config"min_sample_timesteps_per_iteration!min_train_timesteps_per_iterationr   timesteps_per_iteration)selfr)   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/algorithms/cql/cql.pyr6   G   s0   
zCQLConfig.__init__)r8   r9   r:   r;   r<   r=   r>   r8   r9   r:   r;   r<   r=   r>   returnc          	         sx   t  jdi | |tur|| _|tur|| _|tur|| _|tur%|| _|tur,|| _|tur3|| _|tur:|| _	| S )a  Sets the training-related configuration.

        Args:
            bc_iters: Number of iterations with Behavior Cloning pretraining.
            temperature: CQL loss temperature.
            num_actions: Number of actions to sample for CQL loss
            lagrangian: Whether to use the Lagrangian for Alpha Prime (in CQL loss).
            lagrangian_thresh: Lagrangian threshold.
            min_q_weight: in Q weight multiplier.
            deterministic_backup: If the target in the Bellman update should have an
                entropy backup. Defaults to `True`.

        Returns:
            This updated AlgorithmConfig object.
        NrK   )
r5   trainingr   r8   r9   r:   r;   r<   r=   r>   )	rH   r8   r9   r:   r;   r<   r=   r>   kwargsrI   rK   rL   rN   s   s    zCQLConfig.trainingc                    sP   t  jdi | d|v r&ddlm} t|d|s&td|d d| S )Nprelearner_classr   )OfflinePreLearnerz`prelearner_class` z is not a subclass of `OfflinePreLearner`. Any class passed to `prelearner_class` needs to implement the interface given by `OfflinePreLearner`.rK   )r5   offline_dataray.rllib.offline.offline_datarQ   
issubclassget
ValueError)rH   rO   rQ   rI   rK   rL   rR      s   zCQLConfig.offline_datar   c                 C   s,   | j dkrddlm} |S td| j  d)Ntorchr   )CQLTorchLearnerThe framework z) is not supported. Use `'torch'` instead.)framework_str0ray.rllib.algorithms.cql.torch.cql_torch_learnerrX   rV   )rH   rX   rK   rK   rL   get_default_learner_class   s   
z#CQLConfig.get_default_learner_classc                    sH   t  j|||d}|tt  | jdks| jr"| jdkr"|d |S )N)input_observation_spaceinput_action_spacedevicer      NumpyToTensor)r5   build_learner_connectorinsert_afterr   r   num_gpus_per_learnerdataset_num_iters_per_learnerremove)rH   r]   r^   r_   pipelinerI   rK   rL   rb      s   


z!CQLConfig.build_learner_connectorc                    s   | j tkrtdddd t   | jdur| jdkrd| _| jdv r:td u r:t	dt
r/t
jnd  d tdd	 | jd
krL| jsN| jrP| d d S d S d S d S )NrG   rF   T)oldnewerrorrW   )tftf2zYou need `tensorflow_probability` in order to run CQL! Install it via `pip install tensorflow_probability`. Your tf.__version__=z5.Trying to import tfp results in the following error:)rj   r   zWhen using a single local learner the number of iterations per learner, `dataset_num_iters_per_learner` has to be defined. Set this hyperparameter in the `AlgorithmConfig.offline_data`.)rG   r   r   r5   validatesimple_optimizerrZ   tfploggerwarningrk   __version__r   num_learnersre   enable_rl_module_and_learner_value_errorrH   rI   rK   rL   rm      s2   



zCQLConfig.validatec                 C   s2   | j dkrddlm} t|dS td| j  d)NrW   r   )DefaultCQLTorchRLModule)module_classrY   z is not supported. Use `torch`.)rZ   :ray.rllib.algorithms.cql.torch.default_cql_torch_rl_modulerw   r   rV   )rH   rw   rK   rK   rL   get_default_rl_module_spec  s   

z$CQLConfig.get_default_rl_module_specc                    s   t  jd| jiB S )Nr:   )r5   _model_config_auto_includesr:   rv   rI   rK   rL   r{     s   z%CQLConfig._model_config_auto_includesN)rM   r(   rM   N)__name__
__module____qualname____doc__r6   r   r
   r   r   rC   floatboolrN   r   rR   r   r   strr\   rb   rm   r'   rz   propertyr{   __classcell__rK   rK   rI   rL   r(   7   sR    ,	/)r(   c                   @   st   e Zd ZdZeeedefddZeeedede	e
e  fddZeedd	d
ZedefddZdS )r7   zCQL (derived from SAC).rM   c                 C   s   t  S r|   )r(   )clsrK   rK   rL   get_default_config"  s   zCQL.get_default_configconfigc                 C   s   |d dkrt S tS )N	frameworkrW   )r   r   )r   r   rK   rK   rL   get_default_policy_class'  s   zCQL.get_default_policy_classNc                 C   s   | j js|  S | jttf | jj| j j	| j j
| j jr$| j jdkndd}W d    n1 s2w   Y  | jttf | jj|| j j	| j jd}| jj|td W d    d S 1 s`w   Y  d S )Nr`   T)num_samples
num_shardsreturn_iterator)data_iteratorsminibatch_size	num_iters)key)r   "enable_env_runner_and_connector_v2_training_step_old_api_stackmetricslog_timer%   r!   rR   sampletrain_batch_size_per_learnerrs   re   r   learner_groupupdate	aggregater   )rH   batch_or_iteratorlearner_resultsrK   rK   rL   training_step1  s&   "zCQL.training_stepc                    s  | j t  t| jd}W d    n1 sw   Y  | }| jt  | 7  < | jt  |	 7  < | j
dp>dd }||| j| j
}| j
ddu rUt| |}nt| |}| j| j
jdkrdtnt }| jt }|| | j
jkr| j t  | j  | j fdd W d    n1 sw   Y  | jt  d	7  < || jt< | j d
kr| j t  | jjt| d W d    |S 1 sw   Y  |S )N)
worker_setbefore_learn_on_batchc                 W   s   | S r|   rK   )barK   rK   rL   <lambda>Y  s    z2CQL._training_step_old_api_stack.<locals>.<lambda>rn   Tagent_stepsc                    s   | v o|   S r|   )update_target)ppid	to_updaterK   rL   r   o  s    r`   r   )policies)_timersr$   r   env_runner_groupas_multi_agent	_countersr   r   r   	env_stepsr   rU   r   r   count_steps_byr   r   r   target_network_update_freqr"   
env_runnerget_policies_to_trainforeach_policy_to_trainr    num_remote_workersr#   sync_weightslistkeys)rH   train_batchpost_fntrain_resultscur_tslast_updaterK   r   rL   r   O  sB   





z CQL._training_step_old_api_stackr}   )r~   r   r   r   classmethodr   r	   r   r   r   r   r   r   r   r   r&   r   rK   rK   rK   rL   r7     s     
r7   )Aloggingtypingr   r   r   %ray.rllib.algorithms.algorithm_configr   r   &ray.rllib.algorithms.cql.cql_tf_policyr   )ray.rllib.algorithms.cql.cql_torch_policyr   ray.rllib.algorithms.sac.sacr	   r
   Cray.rllib.connectors.common.add_observations_from_episodes_to_batchr   Oray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batchr   ray.rllib.core.learner.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.utils.annotationsr   r   ray.rllib.utils.deprecationr   r   ray.rllib.utils.frameworkr   r   ray.rllib.utils.metricsr   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   ray.rllib.utils.typingr&   r'   tf1rk   tfvro   	getLoggerr~   rp   r(   r7   rK   rK   rK   rL   <module>   s0    <
 i