o
    	TÃiâ  ã                   @   s<   d dl Z d dlmZmZ ddlmZ eG dd„ deƒƒZdS )é    N)Ú	dataclassÚfieldé   )ÚOnPolicyConfigc                   @   sB  e Zd ZU dZeej e¡dd… ddidZ	e
ed< eddd	idZe
ed
< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed < eddd!idZeed"< ed#dd$idZeed%< dS )&Ú
RLOOConfiga¼  
    Configuration class for the [`RLOOTrainer`].

    This class includes only the parameters that are specific to RLOO training. For a full list of training arguments,
    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
    values in this class may differ from those in [`~transformers.TrainingArguments`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.

    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        rloo_k (`int`, *optional*, defaults to `2`):
            REINFORCE Leave-One-Out (RLOO) number of online samples per prompt.
        normalize_reward (`bool`, *optional*, defaults to `False`):
            Whether to normalize rewards.
        reward_clip_range (`float`, *optional*, defaults to `10.0`):
            Clip range for rewards.
        normalize_advantage (`bool`, *optional*, defaults to `False`):
            Whether to normalize advantages.
        token_level_kl (`bool`, *optional*, defaults to `True`):
            Whether to use token-level KL penalty or sequence-level KL penalty.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
            improving generation speed. However, disabling this option allows training models that exceed the VRAM
            capacity of a single GPU, albeit at the cost of slower generation.
    NéýÿÿÿÚhelpzName of this experiment.)ÚdefaultÚmetadataÚexp_namezEleutherAI/pythia-160mzPath to the reward model.Úreward_model_pathé   zNumber of epochs to train.Únum_ppo_epochsFzWhether to whiten the rewards.Úwhiten_rewardsgš™™™™™©?zKL coefficient.Úkl_coefgš™™™™™É?zClip range.Ú	clipranger   zCREINFORCE Leave-One-Out (RLOO) number of online samples per prompt.Úrloo_kzWhether to normalize rewardsÚnormalize_rewardg      $@zClip range for rewardsÚreward_clip_rangezWhether to normalize advantagesÚnormalize_advantagezBWhether to use token-level KL penalty or sequence-level KL penaltyÚtoken_level_klTa  This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation.Úds3_gather_for_generation)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚosÚpathÚbasenameÚ__file__r   ÚstrÚ__annotations__r   r   Úintr   Úboolr   Úfloatr   r   r   r   r   r   r   © r%   r%   úK/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/rloo_config.pyr      sf   
 (þþþþþþþþþþþÿþr   )r   Údataclassesr   r   Útrainer.utilsr   r   r%   r%   r%   r&   Ú<module>   s
   