o
    cil                     @   s   d dl Z d dlmZmZmZmZmZmZ d dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dl m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' e! \Z(Z)Z*e" Z+e ,e-Z.G dd de	Z/G dd deZ0dS )    N)AnyDictOptionalTupleTypeUnion)AlgorithmConfigNotProvided)DQN)SACTFPolicy)"AddObservationsFromEpisodesToBatch)+AddNextObservationsFromEpisodesToTrainBatch)Learner)RLModuleSpec)Policy)deep_update)override)DEPRECATED_VALUEdeprecation_warning)try_import_tftry_import_tfp)EpisodeReplayBuffer)LearningRateOrScheduleRLModuleSpecTypec                ,       s  e Zd ZdZd, fdd	Zeeeeeeeeeeeeeeeeeeeeeeddee	 dee
eef  dee
eef  d	ee d
ee deeeef  deeeeeef f  dee	 dee
eef  dee dee	 dee dee
eef  dee dee dee dee dee	 dee	 dee dd f* fddZeed- fddZeed.d edefd!d"Zeedefd#d$Zeedeed% ef fd&d'Zee	d, fd(d)	Ze fd*d+Z  ZS )/	SACConfiga   Defines a configuration class from which an SAC Algorithm can be built.

    .. testcode::

        config = (
            SACConfig()
            .environment("Pendulum-v1")
            .env_runners(num_env_runners=1)
            .training(
                gamma=0.9,
                actor_lr=0.001,
                critic_lr=0.002,
                train_batch_size_per_learner=32,
            )
        )
        # Build the SAC algo object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()
    Nc                    s  ddi| _ t j|ptd d| _ddgdg d d i d| _ddgdg d d i d| _d| _d	| _d
| _	d| _
d| _dtdddd| _d| _d | _dddd| _d| _d| _d| _d | _d | _d| _d| _d| _d| _d| _d| _d| _d| _d| _t| _ t| _!d S )NtypeStochasticSampling)
algo_classT   relu)fcnet_hiddensfcnet_activationpost_fcnet_hiddenspost_fcnet_activationcustom_modelcustom_model_configFg{Gzt?g      ?auto   PrioritizedEpisodeReplayBufferg    .Ag333333?g?)r   capacityalphabetaga2U0*3?)actor_learning_ratecritic_learning_rateentropy_learning_rategiUMu>r   i  d   )"exploration_configsuper__init__SACtwin_qq_model_configpolicy_model_configclip_actionstauinitial_alphatarget_entropyn_stepintreplay_buffer_configstore_buffer_in_checkpointstraining_intensityoptimizationactor_lr	critic_lralpha_lrlr	grad_cliptarget_network_update_freqrollout_fragment_lengthtrain_batch_size_per_learnertrain_batch_size(num_steps_sampled_before_learning_startsmin_time_s_per_iteration"min_sample_timesteps_per_iteration_deterministic_loss_use_beta_distributionr   use_state_preprocessorworker_side_prioritization)selfr   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/ray/rllib/algorithms/sac/sac.pyr2   2   sd   
	

zSACConfig.__init__)r4   r5   r6   r8   r9   r:   r;   r>   r=   r?   r7   rE   optimization_configrA   rB   rC   rF   rM   rN   rJ   r4   r5   r6   r8   r9   r:   r;   r>   r=   r?   r7   rE   rV   rA   rB   rC   rF   rM   rN   rJ   returnc                   s\  t  jdi | |tur|| _|tur| j| |tur$| j| |tur+|| _|tur2|| _|tur9|| _	|tur@|| _
|turG|| _|	tur_td| jid|	iddgdg}|d | _|
turf|
| _|turm|| _|turt|| _|tur{|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _|tur|| _| S )u(  Sets the training related configuration.

        Args:
            twin_q: Use two Q-networks (instead of one) for action-value estimation.
                Note: Each Q-network will have its own target network.
            q_model_config: Model configs for the Q network(s). These will override
                MODEL_DEFAULTS. This is treated just as the top-level `model` dict in
                setting up the Q-network(s) (2 if twin_q=True).
                That means, you can do for different observation spaces:
                `obs=Box(1D)` -> `Tuple(Box(1D) + Action)` -> `concat` -> `post_fcnet`
                obs=Box(3D) -> Tuple(Box(3D) + Action) -> vision-net -> concat w/ action
                -> post_fcnet
                obs=Tuple(Box(1D), Box(3D)) -> Tuple(Box(1D), Box(3D), Action)
                -> vision-net -> concat w/ Box(1D) and action -> post_fcnet
                You can also have SAC use your custom_model as Q-model(s), by simply
                specifying the `custom_model` sub-key in below dict (just like you would
                do in the top-level `model` dict.
            policy_model_config: Model options for the policy function (see
                `q_model_config` above for details). The difference to `q_model_config`
                above is that no action concat'ing is performed before the post_fcnet
                stack.
            tau: Update the target by 	au * policy + (1-	au) * target_policy.
            initial_alpha: Initial value to use for the entropy weight alpha.
            target_entropy: Target entropy lower bound. If "auto", will be set
                to `-|A|` (e.g. -2.0 for Discrete(2), -3.0 for Box(shape=(3,))).
                This is the inverse of reward scale, and will be optimized
                automatically.
            n_step: N-step target updates. If >1, sars' tuples in trajectories will be
                postprocessed to become sa[discounted sum of R][s t+n] tuples. An
                integer will be interpreted as a fixed n-step value. If a tuple of 2
                ints is provided here, the n-step value will be drawn for each sample(!)
                in the train batch from a uniform distribution over the closed interval
                defined by `[n_step[0], n_step[1]]`.
            store_buffer_in_checkpoints: Set this to True, if you want the contents of
                your buffer(s) to be stored in any saved checkpoints as well.
                Warnings will be created if:
                - This is True AND restoring from a checkpoint that contains no buffer
                data.
                - This is False AND restoring from a checkpoint that does contain
                buffer data.
            replay_buffer_config: Replay buffer config.
                Examples:
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentReplayBuffer",
                "capacity": 50000,
                "replay_batch_size": 32,
                "replay_sequence_length": 1,
                }
                - OR -
                {
                "_enable_replay_buffer_api": True,
                "type": "MultiAgentPrioritizedReplayBuffer",
                "capacity": 50000,
                "prioritized_replay_alpha": 0.6,
                "prioritized_replay_beta": 0.4,
                "prioritized_replay_eps": 1e-6,
                "replay_sequence_length": 1,
                }
                - Where -
                prioritized_replay_alpha: Alpha parameter controls the degree of
                prioritization in the buffer. In other words, when a buffer sample has
                a higher temporal-difference error, with how much more probability
                should it drawn to use to update the parametrized Q-network. 0.0
                corresponds to uniform probability. Setting much above 1.0 may quickly
                result as the sampling distribution could become heavily “pointy” with
                low entropy.
                prioritized_replay_beta: Beta parameter controls the degree of
                importance sampling which suppresses the influence of gradient updates
                from samples that have higher probability of being sampled via alpha
                parameter and the temporal-difference error.
                prioritized_replay_eps: Epsilon parameter sets the baseline probability
                for sampling so that when the temporal-difference error of a sample is
                zero, there is still a chance of drawing the sample.
            training_intensity: The intensity with which to update the model (vs
                collecting samples from the env).
                If None, uses "natural" values of:
                `train_batch_size` / (`rollout_fragment_length` x `num_env_runners` x
                `num_envs_per_env_runner`).
                If not None, will make sure that the ratio between timesteps inserted
                into and sampled from th buffer matches the given values.
                Example:
                training_intensity=1000.0
                train_batch_size=250
                rollout_fragment_length=1
                num_env_runners=1 (or 0)
                num_envs_per_env_runner=1
                -> natural value = 250 / 1 = 250.0
                -> will make sure that replay+train op will be executed 4x asoften as
                rollout+insert op (4 * 250 = 1000).
                See: rllib/algorithms/dqn/dqn.py::calculate_rr_weights for further
                details.
            clip_actions: Whether to clip actions. If actions are already normalized,
                this should be set to False.
            grad_clip: If not None, clip gradients during optimization at this value.
            optimization_config: Config dict for optimization. Set the supported keys
                `actor_learning_rate`, `critic_learning_rate`, and
                `entropy_learning_rate` in here.
            actor_lr: The learning rate (float) or learning rate schedule for the
                policy in the format of
                [[timestep, lr-value], [timestep, lr-value], ...] In case of a
                schedule, intermediary timesteps will be assigned to linearly
                interpolated learning rate values. A schedule config's first entry
                must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: It is common practice (two-timescale approach) to use a smaller
                learning rate for the policy than for the critic to ensure that the
                critic gives adequate values for improving the policy.
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
                The default value is 3e-5, one decimal less than the respective
                learning rate of the critic (see `critic_lr`).
            critic_lr: The learning rate (float) or learning rate schedule for the
                critic in the format of
                [[timestep, lr-value], [timestep, lr-value], ...] In case of a
                schedule, intermediary timesteps will be assigned to linearly
                interpolated learning rate values. A schedule config's first entry
                must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: It is common practice (two-timescale approach) to use a smaller
                learning rate for the policy than for the critic to ensure that the
                critic gives adequate values for improving the policy.
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
                The default value is 3e-4, one decimal higher than the respective
                learning rate of the actor (policy) (see `actor_lr`).
            alpha_lr: The learning rate (float) or learning rate schedule for the
                hyperparameter alpha in the format of
                [[timestep, lr-value], [timestep, lr-value], ...] In case of a
                schedule, intermediary timesteps will be assigned to linearly
                interpolated learning rate values. A schedule config's first entry
                must start with timestep 0, i.e.: [[0, initial_value], [...]].
                Note: If you require a) more than one optimizer (per RLModule),
                b) optimizer types that are not Adam, c) a learning rate schedule that
                is not a linearly interpolated, piecewise schedule as described above,
                or d) specifying c'tor arguments of the optimizer that are not the
                learning rate (e.g. Adam's epsilon), then you must override your
                Learner's `configure_optimizer_for_module()` method and handle
                lr-scheduling yourself.
                The default value is 3e-4, identical to the critic learning rate (`lr`).
            target_network_update_freq: Update the target network every
                `target_network_update_freq` steps.
            _deterministic_loss: Whether the loss should be calculated deterministically
                (w/o the stochastic action sampling step). True only useful for
                continuous actions and for debugging.
            _use_beta_distribution: Use a Beta-distribution instead of a
                `SquashedGaussian` for bounded, continuous action spaces (not
                recommended; for debugging only).

        Returns:
            This updated AlgorithmConfig object.
        r=   FNrT   )r1   trainingr	   r4   r5   updater6   r8   r9   r:   r;   r>   r   r=   r?   r7   rE   r@   rA   rB   rC   rF   rM   rN   rJ   )rQ   r4   r5   r6   r8   r9   r:   r;   r>   r=   r?   r7   rE   rV   rA   rB   rC   rF   rM   rN   rJ   kwargsnew_replay_buffer_configrR   rT   rU   rX      sf    <
zSACConfig.trainingc              
      s  t    t| jtr| jd }n| j}| js8| jdkr8| j|k r8td| j d| j d| j d| jd  d	| jt	krFt
dd	d
 t	| _| jd urT| jdkrTtd| jdv rptd u rptdtretjnd  d tdd | jr| jd dvr| jrt| jtst| jtrt| jd tr| jdkr| jstd| jst| jd trd| jd v st| jd trt| jd trtd| jr| jd urtdtd d S d S )Nr'   r&   z Your `rollout_fragment_length` (z') is smaller than needed for `n_step` (zB)! If `n_step` is an integer try setting `rollout_fragment_length=z@`. If `n_step` is a tuple, try setting `rollout_fragment_length=z`.z config['use_state_preprocessor']F)olderrorg        z `grad_clip` value must be > 0.0!)tftf2zYou need `tensorflow_probability` in order to run SAC! Install it via `pip install tensorflow_probability`. Your tf.__version__=z5.Trying to import tfp results in the following error:T)r]   r   )r   r(   MultiAgentEpisodeReplayBuffer(MultiAgentPrioritizedEpisodeReplayBufferr   samplerz[When using the new `EnvRunner API` the replay buffer must be of type `EpisodeReplayBuffer`.Episodead  When using the old API stack the replay buffer must not be of type `EpisodeReplayBuffer`! We suggest you use the following config to run SAC on the old API stack: `config.training(replay_buffer_config={'type': 'MultiAgentPrioritizedReplayBuffer', 'prioritized_replay_alpha': [alpha], 'prioritized_replay_beta': [beta], 'prioritized_replay_eps': [eps], })`.zBasic learning rate parameter `lr` is not `None`. For SAC use the specific learning rate parameters `actor_lr`, `critic_lr` and `alpha_lr`, for the actor, critic, and the hyperparameter `alpha`, respectively and set `config.lr` to None.aa  You are running SAC on the new API stack! This is the new default behavior for this algorithm. If you don't want to use the new API stack, set `config.api_stack(enable_rl_module_and_learner=False, enable_env_runner_and_connector_v2=False)`. For a detailed migration guide, see here: https://docs.ray.io/en/master/rllib/new-api-stack-migration-guide.html)r1   validate
isinstancer;   tuplein_evaluationrG   
ValueErrorrO   r   r   rE   	frameworktfploggerwarningr^   __version__r   "enable_env_runner_and_connector_v2r=   input_strlistenable_rl_module_and_learnerr   
issubclassr   rD   )rQ   min_rollout_fragment_lengthrR   rT   rU   rd     s   









zSACConfig.validater   worker_indexc                 C   s0   | j dkrt| jttfr| jd S | jS | j S )Nr&   r'   )rG   re   r;   rf   rq   )rQ   ru   rT   rT   rU   get_rollout_fragment_length  s   

z%SACConfig.get_rollout_fragment_lengthc                 C   s2   | j dkrddlm} t|dS td| j  d)Ntorchr   )DefaultSACTorchRLModule)module_classThe framework  is not supported. Use `torch`.)framework_str:ray.rllib.algorithms.sac.torch.default_sac_torch_rl_modulerx   r   rh   )rQ   rx   rT   rT   rU   get_default_rl_module_spec  s   

z$SACConfig.get_default_rl_module_specr   c                 C   s,   | j dkrddlm} |S td| j  d)Nrw   r   )SACTorchLearnerrz   r{   )r|   0ray.rllib.algorithms.sac.torch.sac_torch_learnerr   rh   )rQ   r   rT   rT   rU   get_default_learner_class  s   
z#SACConfig.get_default_learner_classc                    s$   t  j|||d}|tt  |S )N)input_observation_spaceinput_action_spacedevice)r1   build_learner_connectorinsert_afterr   r   )rQ   r   r   r   pipelinerR   rT   rU   r     s   z!SACConfig.build_learner_connectorc                    s   t  jd| jiB S )Nr4   )r1   _model_config_auto_includesr4   )rQ   rR   rT   rU   r   (  s   z%SACConfig._model_config_auto_includesN)rW   N)r   )__name__
__module____qualname____doc__r2   r   r   r	   r   boolr   rp   r   floatr   r<   r   r   rX   rd   rv   r   r~   r   r   r   propertyr   __classcell__rT   rT   rR   rU   r      s    Z	
 so

r   c                       s`   e Zd ZdZ fddZeeedefddZ	eeedede
ee  fdd	Z  ZS )
r3   a0  Soft Actor Critic (SAC) Algorithm class.

    This file defines the distributed Algorithm class for the soft actor critic
    algorithm.
    See `sac_[tf|torch]_policy.py` for the definition of the policy loss.

    Detailed documentation:
    https://docs.ray.io/en/master/rllib-algorithms.html#sac
    c                    s(   |  j ddg7  _ t j|i | d S )Nr6   r5   )_allow_unknown_subkeysr1   r2   )rQ   argsrZ   rR   rT   rU   r2   8  s   zSAC.__init__rW   c                 C   s   t  S r   )r   )clsrT   rT   rU   get_default_config<  s   zSAC.get_default_configconfigc                 C   s    |d dkrddl m} |S tS )Nri   rw   r   )SACTorchPolicy))ray.rllib.algorithms.sac.sac_torch_policyr   r   )r   r   r   rT   rT   rU   get_default_policy_classA  s   zSAC.get_default_policy_class)r   r   r   r   r2   classmethodr   r
   r   r   r   r   r   r   r   rT   rT   rR   rU   r3   -  s    

r3   )1loggingtypingr   r   r   r   r   r   %ray.rllib.algorithms.algorithm_configr   r	   ray.rllib.algorithms.dqn.dqnr
   &ray.rllib.algorithms.sac.sac_tf_policyr   Cray.rllib.connectors.common.add_observations_from_episodes_to_batchr   Oray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batchr   ray.rllib.core.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.policy.policyr   ray.rllib.utilsr   ray.rllib.utils.annotationsr   ray.rllib.utils.deprecationr   r   ray.rllib.utils.frameworkr   r   4ray.rllib.utils.replay_buffers.episode_replay_bufferr   ray.rllib.utils.typingr   r   tf1r^   tfvrj   	getLoggerr   rk   r   r3   rT   rT   rT   rU   <module>   s2     
    