o
    `۷i\                     @   s   d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ eeZG dd deZG dd deZdS )a4  
TQC (Truncated Quantile Critics) Algorithm.

Paper: https://arxiv.org/abs/2005.04269
"Controlling Overestimation Bias with Truncated Mixture of Continuous
Distributional Quantile Critics"

TQC extends SAC by using distributional RL with quantile regression to
control overestimation bias in the Q-function.
    N)OptionalTypeUnion)	Algorithm)AlgorithmConfigNotProvided)SAC	SACConfig)Learner)RLModuleSpec)override)RLModuleSpecTypec                	       s   e Zd ZdZd fdd	Zeeeeeddee	 dee	 dee	 f fd	d
Z
eed fddZeedefddZeedeed ef fddZeee fddZ  ZS )	TQCConfiga  Configuration for the TQC algorithm.

    TQC extends SAC with distributional critics using quantile regression.

    Example:
        >>> from ray.rllib.algorithms.tqc import TQCConfig
        >>> config = (
        ...     TQCConfig()
        ...     .environment("Pendulum-v1")
        ...     .training(
        ...         n_quantiles=25,
        ...         n_critics=2,
        ...         top_quantiles_to_drop_per_net=2,
        ...     )
        ... )
        >>> algo = config.build()
    Nc                    s(   t  j|ptd d| _d| _d| _dS )z!Initializes a TQCConfig instance.)
algo_class      N)super__init__TQCn_quantiles	n_criticstop_quantiles_to_drop_per_net)selfr   	__class__ R/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/rllib/algorithms/tqc/tqc.pyr   -   s   
zTQCConfig.__init__r   r   r   r   r   r   c                   s@   t  jdi | |tur|| _|tur|| _|tur|| _| S )a8  Sets the training-related configuration.

        Args:
            n_quantiles: Number of quantiles for each critic network.
                Default is 25.
            n_critics: Number of critic networks. Default is 2.
            top_quantiles_to_drop_per_net: Number of quantiles to drop per
                network when computing the target Q-value. This controls
                the overestimation bias. Default is 2.
            **kwargs: Additional arguments passed to SACConfig.training().

        Returns:
            This updated TQCConfig object.
        Nr   )r   trainingr   r   r   r   )r   r   r   r   kwargsr   r   r   r   6   s   zTQCConfig.trainingreturnc                    s   t    | jdk rtd| j | jdk rtd| j | jdk r,td| j | j| j }| j| j }||krGtd| d| dd	S )
z Validates the TQC configuration.   z `n_quantiles` must be >= 1, got z`n_critics` must be >= 1, got r   z2`top_quantiles_to_drop_per_net` must be >= 0, got zCannot drop z quantiles when only zp total quantiles are available. Reduce `top_quantiles_to_drop_per_net` or increase `n_quantiles` or `n_critics`.N)r   validater   
ValueErrorr   r   )r   total_quantilesquantiles_to_dropr   r   r   r"   X   s(   



zTQCConfig.validatec                 C   s2   | j dkrddlm} t|dS td| j  d)Ntorchr   )DefaultTQCTorchRLModule)module_classThe framework  is not supported. Use `torch`.)framework_str:ray.rllib.algorithms.tqc.torch.default_tqc_torch_rl_moduler'   r   r#   )r   r'   r   r   r   get_default_rl_module_specu   s   

z$TQCConfig.get_default_rl_module_specr
   c                 C   s,   | j dkrddlm} |S td| j  d)Nr&   r   )TQCTorchLearnerr)   r*   )r+   0ray.rllib.algorithms.tqc.torch.tqc_torch_learnerr.   r#   )r   r.   r   r   r   get_default_learner_class   s   
z#TQCConfig.get_default_learner_classc                    s   t  j| j| j| jdB S )Nr   )r   _model_config_auto_includesr   r   r   )r   r   r   r   r1      s
   z%TQCConfig._model_config_auto_includesN)r    N)__name__
__module____qualname____doc__r   r   r	   r   r   intr   r   r"   r   r-   r   r   strr0   propertyr1   __classcell__r   r   r   r   r      s.    	!r   c                   @   s*   e Zd ZdZeeedefddZdS )r   a  TQC (Truncated Quantile Critics) Algorithm.

    TQC extends SAC by using distributional critics with quantile regression
    and truncating the top quantiles to control overestimation bias.

    Key differences from SAC:
    - Uses multiple critic networks, each outputting multiple quantiles
    - Computes target Q-values by sorting and truncating top quantiles
    - Uses quantile Huber loss for critic training

    See the paper for more details:
    https://arxiv.org/abs/2005.04269
    r    c                 C   s   t  S r2   )r   )clsr   r   r   get_default_config   s   zTQC.get_default_configN)	r3   r4   r5   r6   classmethodr   r   r   r<   r   r   r   r   r      s
    r   )r6   loggingtypingr   r   r   ray.rllib.algorithms.algorithmr   %ray.rllib.algorithms.algorithm_configr   r   ray.rllib.algorithms.sac.sacr   r	   ray.rllib.core.learnerr
   "ray.rllib.core.rl_module.rl_moduler   ray.rllib.utils.annotationsr   ray.rllib.utils.typingr   	getLoggerr3   loggerr   r   r   r   r   r   <module>   s    
