o
    `۷i
!                     @   s   d dl mZmZmZ d dlmZmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ G d
d de	ZG dd deZdS )    )OptionalTypeUnion)AlgorithmConfigNotProvided)MARWILMARWILConfig)"AddObservationsFromEpisodesToBatch)+AddNextObservationsFromEpisodesToTrainBatch)Learner)RLModuleSpec)override)LearningRateOrScheduleRLModuleSpecTypec                       s   e Zd ZdZd fdd	Zeeeeeeeeeddee	 dee
 dee d	ee d
ee dee dee
 dd f fddZeedeed ef fddZeedefddZee	d fdd	Zeed fddZe fddZ  ZS )	IQLConfiga  Defines a configuration class from which a new IQL Algorithm can be built

    .. testcode::
        :skipif: True

        from ray.rllib.algorithms.iql import IQLConfig
        # Run this from the ray directory root.
        config = IQLConfig().training(actor_lr=0.00001, gamma=0.99)
        config = config.offline_data(
            input_="./rllib/offline/tests/data/pendulum/pendulum-v1_enormous")

        # Build an Algorithm object from the config and run 1 training iteration.
        algo = config.build()
        algo.train()

    .. testcode::
        :skipif: True

        from ray.rllib.algorithms.iql import IQLConfig
        from ray import tune
        config = IQLConfig()
        # Print out some default values.
        print(config.beta)
        # Update the config object.
        config.training(
            lr=tune.grid_search([0.001, 0.0001]), beta=0.75
        )
        # Set the config object's data path.
        # Run this from the ray directory root.
        config.offline_data(
            input_="./rllib/offline/tests/data/pendulum/pendulum-v1_enormous"
        )
        # Set the config object's env, used for evaluation.
        config.environment(env="Pendulum-v1")
        # Use to_dict() to get the old-style python config dict
        # when running with tune.
        tune.Tuner(
            "IQL",
            param_space=config.to_dict(),
        ).fit()
    Nc                    sL   t  j|ptd d| _d| _d| _d| _d| _d | _d| _	d| _
d| _d S )N)
algo_classg?g?ga2U0*3?Tr         ?)super__init__IQLbeta	expectileactor_lr	critic_lrvalue_lrlrtwin_qtarget_network_update_freqtau)selfr   	__class__ R/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/algorithms/iql/iql.pyr   <   s   
zIQLConfig.__init__)r   r   r   r   r   r   r   r   r   r   r   r   r   r   returnc          	         sx   t  jdi | |tur|| _|tur|| _|tur|| _|tur%|| _|tur,|| _|tur3|| _|tur:|| _	| S )a  Sets the training related configuration.

        Args:
            beta: The temperature to scaling advantages in exponential terms.
                Must be >> 0.0. The higher this parameter the less greedy
                (exploitative) the policy becomes. It also means that the policy
                is fitting less to the best actions in the dataset.
            twin_q: If a twin-Q architecture should be used (advisable).
            expectile: The expectile to use in expectile regression for the value
                function. For high expectiles the value function tries to match
                the upper tail of the Q-value distribution.
            actor_lr: The learning rate for the actor network. Actor learning rates
                greater than critic learning rates work well in experiments.
            critic_lr: The learning rate for the Q-network. Critic learning rates
                greater than value function learning rates work well in experiments.
            value_lr: The learning rate for the value function network.
            target_network_update_freq: The number of timesteps in between the target
                Q-network is fixed. Note, too high values here could harm convergence.
                The target network is updated via Polyak-averaging.
            tau: The update parameter for Polyak-averaging of the target Q-network.
                The higher this value the faster the weights move towards the actual
                Q-network.

        Return:
            This updated `AlgorithmConfig` object.
        Nr"   )
r   trainingr   r   r   r   r   r   r   r   )	r   r   r   r   r   r   r   r   kwargsr    r"   r#   r%   Y   s    'zIQLConfig.trainingr   c                 C   s,   | j dkrddlm} |S td| j  d)Ntorchr   )IQLTorchLearnerThe framework z) is not supported. Use `'torch'` instead.)framework_str0ray.rllib.algorithms.iql.torch.iql_torch_learnerr(   
ValueError)r   r(   r"   r"   r#   get_default_learner_class   s   
z#IQLConfig.get_default_learner_classc                 C   s2   | j dkrddlm} t|dS td| j  d)Nr'   r   )DefaultIQLTorchRLModule)module_classr)   z' is not supported. Use `torch` instead.)r*   :ray.rllib.algorithms.iql.torch.default_iql_torch_rl_moduler.   r   r,   )r   r.   r"   r"   r#   get_default_rl_module_spec   s   

z$IQLConfig.get_default_rl_module_specc                    s8   t  j|||d}|d |d |tt  |S )N)input_observation_spaceinput_action_spacedeviceAddOneTsToEpisodesAndTruncateGeneralAdvantageEstimation)r   build_learner_connectorremoveinsert_afterr	   r
   )r   r2   r3   r4   pipeliner    r"   r#   r7      s   

z!IQLConfig.build_learner_connectorc                    sH   t    | jdkr| d d| j  k rdk s"n | d d S d S )Ng        zFFor meaningful results, `beta` (temperature) parameter must be >> 0.0!r   z@For meaningful results, `expectile` parameter must be in (0, 1).)r   validater   _value_errorr   r   r    r"   r#   r;      s   

zIQLConfig.validatec                    s   t  jd| jiB S )Nr   )r   _model_config_auto_includesr   r=   r    r"   r#   r>      s   z%IQLConfig._model_config_auto_includesN)r$   N)__name__
__module____qualname____doc__r   r   r   r   r   boolfloatr   intr%   r   r   strr-   r   r1   r7   r;   propertyr>   __classcell__r"   r"   r    r#   r      sN    *	9r   c                   @   s*   e Zd ZdZeeedefddZdS )r   zOImplicit Q-learning (derived from MARWIL).

    Uses MARWIL training step.
    r$   c                 C   s   t  S r?   )r   )clsr"   r"   r#   get_default_config   s   zIQL.get_default_configN)	r@   rA   rB   rC   classmethodr   r   r   rK   r"   r"   r"   r#   r      s
    r   N)typingr   r   r   %ray.rllib.algorithms.algorithm_configr   r   "ray.rllib.algorithms.marwil.marwilr   r   Cray.rllib.connectors.common.add_observations_from_episodes_to_batchr	   Oray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batchr
   ray.rllib.core.learner.learnerr   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.utils.annotationsr   ray.rllib.utils.typingr   r   r   r   r"   r"   r"   r#   <module>   s     K