o
    `۷i                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG ddlHmIZImJZJmKZKmLZL eMeNZOG dd deZPdedeeQ fddZRG dd deZSdS )ap  
Deep Q-Networks (DQN, Rainbow, Parametric DQN)
==============================================

This file defines the distributed Algorithm class for the Deep Q-Networks
algorithm. See `dqn_[tf|torch]_policy.py` for the definition of the policies.

Detailed documentation:
https://docs.ray.io/en/master/rllib-algorithms.html#deep-q-networks-dqn-rainbow-parametric-dqn
    N)defaultdict)AnyCallableDictListOptionalTupleTypeUnion)Self)DEPRECATED_VALUE)	Algorithm)AlgorithmConfigNotProvided)DQNTFPolicy)DQNTorchPolicy)Learner)RLModuleSpec)synchronous_parallel_sample)multi_gpu_train_one_steptrain_one_step)Policy)MultiAgentBatch)deep_update)override)ALL_MODULESENV_RUNNER_RESULTSENV_RUNNER_SAMPLING_TIMERLAST_TARGET_UPDATE_TSLEARNER_RESULTSLEARNER_UPDATE_TIMERNUM_AGENT_STEPS_SAMPLED NUM_AGENT_STEPS_SAMPLED_LIFETIMENUM_ENV_STEPS_SAMPLEDNUM_ENV_STEPS_SAMPLED_LIFETIMENUM_TARGET_UPDATESREPLAY_BUFFER_ADD_DATA_TIMERREPLAY_BUFFER_RESULTSREPLAY_BUFFER_SAMPLE_TIMER REPLAY_BUFFER_UPDATE_PRIOS_TIMERSAMPLE_TIMERSYNCH_WORKER_WEIGHTS_TIMERTD_ERROR_KEYTIMERS)convert_to_numpy)sample_min_n_steps_from_buffer*update_priorities_in_episode_replay_buffer"update_priorities_in_replay_buffervalidate_buffer_config)LearningRateOrSchedule
ResultDictRLModuleSpecTypeSampleBatchTypec                2       s  e Zd ZdZd- fdd	Zeeeeeeeeeeeeeeeeeeeeeeeeeddee	 dee
 dee d	eeeee	ef    d
ee dee dee	 dee	 dee dee	 dee dee dee dee dee dee	 dee deee	ee	e	f f  deee eee  ee	 gee f dee dee dee dee	 def0 fddZeed. fd d!Zeed/d#e	de	fd$d%Zeedefd&d'Zeeedeeef f fd(d)Zeedeed* ef fd+d,Z   Z!S )0	DQNConfiga/  Defines a configuration class from which a DQN Algorithm can be built.

    .. testcode::

        from ray.rllib.algorithms.dqn.dqn import DQNConfig

        config = (
            DQNConfig()
            .environment("CartPole-v1")
            .training(replay_buffer_config={
                "type": "PrioritizedEpisodeReplayBuffer",
                "capacity": 60000,
                "alpha": 0.5,
                "beta": 0.5,
            })
            .env_runners(num_env_runners=1)
        )
        algo = config.build()
        algo.train()
        algo.stop()

    .. testcode::

        from ray.rllib.algorithms.dqn.dqn import DQNConfig
        from ray import tune

        config = (
            DQNConfig()
            .environment("CartPole-v1")
            .training(
                num_atoms=tune.grid_search([1,])
            )
        )
        tune.Tuner(
            "DQN",
            run_config=tune.RunConfig(stop={"training_iteration":1}),
            param_space=config,
        ).fit()

    .. testoutput::
        :hide:

        ...


    Nc                    s(  ddddd| _ t j|ptd d| _dd	g| _d
| _d| _d| _d| _	| j
tjddd d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _dg| _d| _d| _d| _d| _d| _d| _ d| _!dddd d!| _"d| _#t$| _%t$| _&t$| _'t$| _(d| _)t$| _*t$| _+t$| _,dS )"z!Initializes a DQNConfig instance.EpsilonGreedy      ?g{Gz?'  )typeinitial_epsilonfinal_epsilonepsilon_timesteps)
algo_classauto)r   r9   )r:   g?g      D@global_normgMb@?    F)explore)evaluation_configNi  i  g:0yE>   g      $g      $@g      ?T   huberr   PrioritizedEpisodeReplayBufferiP  g333333?g?)r;   capacityalphabeta)-exploration_configsuper__init__DQNrollout_fragment_lengthepsilon	grad_clipgrad_clip_bylrtrain_batch_size
evaluationr   	overridesmin_time_s_per_iteration"min_sample_timesteps_per_iterationtarget_network_update_freq(num_steps_sampled_before_learning_startsstore_buffer_in_checkpointsadam_epsilontau	num_atomsv_minv_maxnoisysigma0duelinghiddensdouble_qn_stepbefore_learn_on_batchtraining_intensitytd_error_loss_fn$categorical_distribution_temperatureburn_in_lenreplay_buffer_configlr_scheduler   buffer_sizeprioritized_replaylearning_startsreplay_batch_sizereplay_sequence_lengthprioritized_replay_alphaprioritized_replay_betaprioritized_replay_eps)selfr?   	__class__ R/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/algorithms/dqn/dqn.pyrN   {   s`   

zDQNConfig.__init__)rZ   rm   r\   rn   rQ   r]   rR   r[   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rZ   rm   r\   rn   rQ   r]   rR   r[   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   returnc                   sz  t  jdi | |tur|| _|tur(td| jid|iddgdg}|d | _|tur/|| _|tur6|| _|tur=|| _|turD|| _	|turK|| _
|turR|| _|	turY|	| _|
tur`|
| _|turg|| _|turn|| _|turu|| _|tur||| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _| S )uW  Sets the training related configuration.

        Args:
            target_network_update_freq: Update the target network every
                `target_network_update_freq` sample steps.
            replay_buffer_config: Replay buffer config.
                Examples:
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentReplayBuffer",
                "capacity": 50000,
                "replay_sequence_length": 1,
                }
                - OR -
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentPrioritizedReplayBuffer",
                "capacity": 50000,
                "prioritized_replay_alpha": 0.6,
                "prioritized_replay_beta": 0.4,
                "prioritized_replay_eps": 1e-6,
                "replay_sequence_length": 1,
                }
                - Where -
                prioritized_replay_alpha: Alpha parameter controls the degree of
                prioritization in the buffer. In other words, when a buffer sample has
                a higher temporal-difference error, with how much more probability
                should it drawn to use to update the parametrized Q-network. 0.0
                corresponds to uniform probability. Setting much above 1.0 may quickly
                result as the sampling distribution could become heavily “pointy” with
                low entropy.
                prioritized_replay_beta: Beta parameter controls the degree of
                importance sampling which suppresses the influence of gradient updates
                from samples that have higher probability of being sampled via alpha
                parameter and the temporal-difference error.
                prioritized_replay_eps: Epsilon parameter sets the baseline probability
                for sampling so that when the temporal-difference error of a sample is
                zero, there is still a chance of drawing the sample.
            store_buffer_in_checkpoints: Set this to True, if you want the contents of
                your buffer(s) to be stored in any saved checkpoints as well.
                Warnings will be created if:
                - This is True AND restoring from a checkpoint that contains no buffer
                data.
                - This is False AND restoring from a checkpoint that does contain
                buffer data.
            epsilon: Epsilon exploration schedule. In the format of [[timestep, value],
                [timestep, value], ...]. A schedule must start from
                timestep 0.
            adam_epsilon: Adam optimizer's epsilon hyper parameter.
            grad_clip: If not None, clip gradients during optimization at this value.
            num_steps_sampled_before_learning_starts: Number of timesteps to collect
                from rollout workers before we start sampling from replay buffers for
                learning. Whether we count this in agent steps or environment steps
                depends on config.multi_agent(count_steps_by=..).
            tau: Update the target by 	au * policy + (1-	au) * target_policy.
            num_atoms: Number of atoms for representing the distribution of return.
                When this is greater than 1, distributional Q-learning is used.
            v_min: Minimum value estimation
            v_max: Maximum value estimation
            noisy: Whether to use noisy network to aid exploration. This adds parametric
                noise to the model weights.
            sigma0: Control the initial parameter noise for noisy nets.
            dueling: Whether to use dueling DQN.
            hiddens: Dense-layer setup for each the advantage branch and the value
                branch
            double_q: Whether to use double DQN.
            n_step: N-step target updates. If >1, sars' tuples in trajectories will be
                postprocessed to become sa[discounted sum of R][s t+n] tuples. An
                integer will be interpreted as a fixed n-step value. If a tuple of 2
                ints is provided here, the n-step value will be drawn for each sample(!)
                in the train batch from a uniform distribution over the closed interval
                defined by `[n_step[0], n_step[1]]`.
            before_learn_on_batch: Callback to run before learning on a multi-agent
                batch of experiences.
            training_intensity: The intensity with which to update the model (vs
                collecting samples from the env).
                If None, uses "natural" values of:
                `train_batch_size` / (`rollout_fragment_length` x `num_env_runners` x
                `num_envs_per_env_runner`).
                If not None, will make sure that the ratio between timesteps inserted
                into and sampled from the buffer matches the given values.
                Example:
                training_intensity=1000.0
                train_batch_size=250
                rollout_fragment_length=1
                num_env_runners=1 (or 0)
                num_envs_per_env_runner=1
                -> natural value = 250 / 1 = 250.0
                -> will make sure that replay+train op will be executed 4x asoften as
                rollout+insert op (4 * 250 = 1000).
                See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further
                details.
            td_error_loss_fn: "huber" or "mse". loss function for calculating TD error
                when num_atoms is 1. Note that if num_atoms is > 1, this parameter
                is simply ignored, and softmax cross entropy loss will be used.
            categorical_distribution_temperature: Set the temperature parameter used
                by Categorical action distribution. A valid temperature is in the range
                of [0, 1]. Note that this mostly affects evaluation since TD error uses
                argmax for return calculation.
            burn_in_len: The burn-in period for a stateful RLModule. It allows the
                Learner to utilize the initial `burn_in_len` steps in a replay sequence
                solely for unrolling the network and establishing a typical starting
                state. The network is then updated on the remaining steps of the
                sequence. This process helps mitigate issues stemming from a poor
                initial state - zero or an outdated recorded state. Consider setting
                this parameter to a positive integer if your stateful RLModule faces
                convergence challenges or exhibits signs of catastrophic forgetting.

        Returns:
            This updated AlgorithmConfig object.
        rm   FNrz   )rM   trainingr   rZ   r   rm   r\   rn   rQ   r]   rR   r[   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   )rw   rZ   rm   r\   rn   rQ   r]   rR   r[   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   kwargsnew_replay_buffer_configrx   rz   r{   r}      st    
zDQNConfig.trainingc              	      s  t    | jr| jd ur| d n | jst|  | jd dkr3| jdkr+| d | j	r3| d | j
dvr=| d | js]| jd	kr]| j| jk r]| d
| j d| j d| j d d| jv rd| jd   k rp| jkrn ntd| j d| jd  dddlm} | jrt| jd tst| jd |s| d d S | jst| jd trd| jd v st| jd |r| d d S d S d S )NzW`lr_schedule` is deprecated and must be None! Use the `lr` setting to setup a schedule.r;   ParameterNoisecomplete_episodeszParameterNoise Exploration requires `batch_mode` to be 'complete_episodes'. Try setting `config.env_runners(batch_mode='complete_episodes')`.zOParameterNoise Exploration and `noisy` network cannot be used at the same time!)rG   msez,`td_error_loss_fn` must be 'huber' or 'mse'!r@   z Your `rollout_fragment_length` (z) is smaller than `n_step` (z:)! Try setting config.env_runners(rollout_fragment_length=z).max_seq_lenr   zYour defined `burn_in_len`=z" is larger or equal `max_seq_len`=zC! Either decrease the `burn_in_len` or increase your `max_seq_len`.)EpisodeReplayBufferz[When using the new `EnvRunner API` the replay buffer must be of type `EpisodeReplayBuffer`.Episodead  When using the old API stack the replay buffer must not be of type `EpisodeReplayBuffer`! We suggest you use the following config to run DQN on the old API stack: `config.training(replay_buffer_config={'type': 'MultiAgentPrioritizedReplayBuffer', 'prioritized_replay_alpha': [alpha], 'prioritized_replay_beta': [beta], 'prioritized_replay_eps': [eps], })`.)rM   validateenable_rl_module_and_learnerrn   _value_errorin_evaluationr2   rL   
batch_moderb   rj   rP   rg   model_configrl   
ValueError4ray.rllib.utils.replay_buffers.episode_replay_bufferr   "enable_env_runner_and_connector_v2
isinstancerm   str
issubclass)rw   r   rx   rz   r{   r     sv   







	 
	zDQNConfig.validater   worker_indexc                 C   s0   | j dkrt| jttfr| jd S | jS | j S )Nr@   rE   )rP   r   rg   tuplelist)rw   r   rz   rz   r{   get_rollout_fragment_length   s   

z%DQNConfig.get_rollout_fragment_lengthc                 C   s6   | j dkrddlm} t|| jdS td| j  d)Ntorchr   )DefaultDQNTorchRLModule)module_classr   The framework ; is not supported! Use `config.framework('torch')` instead.)framework_str:ray.rllib.algorithms.dqn.torch.default_dqn_torch_rl_moduler   r   r   r   )rw   r   rz   rz   r{   get_default_rl_module_spec  s   
z$DQNConfig.get_default_rl_module_specc              	      s*   t  j| j| j| j| j| j| j| jdB S )N)rf   rd   rQ   r_   std_initra   r`   )	rM   _model_config_auto_includesrf   rd   rQ   r_   rc   ra   r`   rw   rx   rz   r{   r     s   z%DQNConfig._model_config_auto_includesr   c                 C   s,   | j dkrddlm} |S td| j  d)Nr   r   )DQNTorchLearnerr   r   )r   0ray.rllib.algorithms.dqn.torch.dqn_torch_learnerr   r   )rw   r   rz   rz   r{   get_default_learner_class)  s   
z#DQNConfig.get_default_learner_classNr|   N)r   )"__name__
__module____qualname____doc__rN   r   r   r   r   intdictboolr   r
   floatr3   r   r   r	   r   r   r   r   r}   r   r   r5   r   propertyr   r   r   r   __classcell__rz   rz   rx   r{   r7   K   s    /Y	
 P[
$r7   configr|   c                 C   sl   | j sddgS | j|  | j t| jd d  }| j | }|dk r-ttd| dgS dtt|gS )zACalculate the round robin weights for the rollout and train stepsrE   )	ri   total_train_batch_sizer   num_envs_per_env_runnermaxnum_env_runnersr   npround)r   native_ratiosample_and_train_weightrz   rz   r{   calculate_rr_weights8  s   

r   c                       s   e Zd ZeeedefddZeeedede	e
e  fddZeededdf fdd	Zeedd
dZdd ZdefddZ  ZS )rO   r|   c                 C   s   t  S r   )r7   )clsrz   rz   r{   get_default_configT  s   zDQN.get_default_configr   c                 C   s   |d dkrt S tS )N	frameworkr   )r   r   )r   r   rz   rz   r{   get_default_policy_classY  s   zDQN.get_default_policy_classNc                    s`   t  | | jjr,| jr.| jd u r#| jjdd dgddd | _d S | jj	 | _d S d S d S )Nc                 S   s
   | j  S r   )moduleis_stateful)errz   rz   r{   <lambda>j  s   
 zDQN.setup.<locals>.<lambda>rE   F)remote_worker_idslocal_env_runnerr   )
rM   setupr   r   env_runner_group
env_runnerforeach_env_runner_module_is_statefulr   r   )rw   r   rx   rz   r{   r   c  s   

z	DQN.setupc                 C   s   | j js|  S |  S )a-  DQN training iteration function.

        Each training iteration, we:
        - Sample (MultiAgentBatch) from workers.
        - Store new samples in replay buffer.
        - Sample training batch (MultiAgentBatch) from replay buffer.
        - Learn on training batch.
        - Update remote workers' new policy weights.
        - Update target network every `target_network_update_freq` sample steps.
        - Return all collected metrics for the iteration.

        Returns:
            The results dict from executing the training iteration.
        )r   r   _training_step_old_api_stack_training_step_new_api_stackr   rz   rz   r{   training_stepq  s   zDQN.training_stepc                 C   s2  t | j\}}t|D ]N}| jttf t| jd| jj	ddd\}}W d    n1 s.w   Y  | jj
|td | jttf | j| W d    n1 sTw   Y  q| jjdkrot| jjttfi d }n
| jjttfdd}|| jjkrt|D ]}| jttfE | jj| jj| jj| j| jjdd t| jt| jdr| jjnd| jj | jj!d	dd
}| j" }| jj
|gt#d W d    n1 sw   Y  | jtt$fX | j%j&|t| jttft| jttfid}t't(}	|D ] }
|
) D ]\}}t*|v r|	| +t,|-t*  qqdd |	) D }	| jj
|t.d W d    n	1 s?w   Y  | jtt/f t0| j|	d W d    n	1 s_w   Y  q| jtt1f t2|d 3 t4h }| jj5| j%|d dd W d    d S 1 sw   Y  d S d S )NT)
worker_setconcatsample_timeout_s_uses_new_env_runners_return_metrics)keyagent_steps)defaultr   r   rl   rK   )	num_itemsrg   batch_length_Tlookbackmin_batch_length_TgammarK   sample_episodes)episodes	timestepsc                 S   s$   i | ]\}}|t tj|d diqS )r   )axis)r,   r   concatenate).0	module_idsrz   rz   r{   
<dictcomp>  s    z4DQN._training_step_new_api_stack.<locals>.<dictcomp>)replay_buffer	td_errors)from_worker_or_learner_grouppoliciesglobal_varsinference_only)6r   r   rangemetricslog_timer-   r   r   r   r   	aggregater   r&   local_replay_bufferaddcount_steps_bysumpeekr"   valuesr$   r[   r(   sampler   rg   r   r   getr   hasattrrl   r   rm   get_metricsr'   r    learner_groupupdater   r   itemsr,   extendr.   popr   r)   r0   r+   setkeysr   sync_weights)rw   store_weightr   _r   env_runner_results
current_tsreplay_buffer_resultslearner_resultsr   resr   module_resultsmodules_to_updaterz   rz   r{   r     s   




%$z DQN._training_step_new_api_stackc              	   C   s  i }t | j\}}t|D ]C}| jt  t| jd| jjd}W d   n1 s)w   Y  |s4i   S | jt	  |
 7  < | jt  | 7  < | j| qd| jt i}| j| jjdkrbt	nt }|| jjkrt|D ]}t| j| jj| jjdkd}| jdpdd	 }	|	|| j| j}| jd
du rt| |}nt| |}t| j| j|| | jt }
||
 | jjkr| j }| j|fdd	 | jt  d7  < || jt< | jt  | jj|d W d   n1 sw   Y  qo|S )zvTraining step for the old API stack.

        More specifically this training step relies on `RolloutWorker`.
        T)r   r   r   Ntimestepr   )count_by_agent_stepsrh   c                 W   s   | S r   rz   )barz   rz   r{   r   7  s    z2DQN._training_step_old_api_stack.<locals>.<lambda>simple_optimizerc                 S   s   ||v o|   S r   )update_target)ppid	to_updaterz   rz   r{   r   N  s   rE   )r   )r   r   r   _timersr*   r   r   r   	_countersr!   r   r#   	env_stepsr   r   r   r[   r/   r   r   r   r   r1   r   rZ   r   get_policies_to_trainforeach_policy_to_trainr%   r+   r   )rw   train_resultsr   r   r   new_sample_batchr   cur_tstrain_batchpost_fnlast_updater  rz   rz   r{   r     sj   






z DQN._training_step_old_api_stackr   )r   r   r   classmethodr   r   r7   r   r   r   r	   r   r   r   r   r   r4   r   r   rz   rz   rx   r{   rO   S  s"    
zrO   )Tr   loggingcollectionsr   typingr   r   r   r   r   r   r	   r
   numpyr   typing_extensionsr   ray._common.deprecationr   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   r   &ray.rllib.algorithms.dqn.dqn_tf_policyr   )ray.rllib.algorithms.dqn.dqn_torch_policyr   ray.rllib.core.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.execution.rollout_opsr   ray.rllib.execution.train_opsr   r   ray.rllib.policy.policyr   ray.rllib.policy.sample_batchr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   ray.rllib.utils.metricsr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   ray.rllib.utils.numpyr.   $ray.rllib.utils.replay_buffers.utilsr/   r0   r1   r2   ray.rllib.utils.typingr3   r4   r5   r6   	getLoggerr   loggerr7   r   r   rO   rz   rz   rz   r{   <module>   s<    (T
   p