o
    	Ti                     @   s<   d dl Z d dlmZmZ ddlmZ eG dd deZdS )    N)	dataclassfield   )OnPolicyConfigc                   @   sB  e Zd ZU dZeejedd ddidZ	e
ed< eddd	idZe
ed
< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed< edddidZeed < eddd!idZeed"< ed#dd$idZeed%< dS )&
RLOOConfiga  
    Configuration class for the [`RLOOTrainer`].

    This class includes only the parameters that are specific to RLOO training. For a full list of training arguments,
    please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
    values in this class may differ from those in [`~transformers.TrainingArguments`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.

    Parameters:
        exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`):
            Name of this experiment.
        reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
            Path to the reward model.
        num_ppo_epochs (`int`, *optional*, defaults to `4`):
            Number of epochs to train.
        whiten_rewards (`bool`, *optional*, defaults to `False`):
            Whether to whiten the rewards.
        kl_coef (`float`, *optional*, defaults to `0.05`):
            KL coefficient.
        cliprange (`float`, *optional*, defaults to `0.2`):
            Clip range.
        rloo_k (`int`, *optional*, defaults to `2`):
            REINFORCE Leave-One-Out (RLOO) number of online samples per prompt.
        normalize_reward (`bool`, *optional*, defaults to `False`):
            Whether to normalize rewards.
        reward_clip_range (`float`, *optional*, defaults to `10.0`):
            Clip range for rewards.
        normalize_advantage (`bool`, *optional*, defaults to `False`):
            Whether to normalize advantages.
        token_level_kl (`bool`, *optional*, defaults to `True`):
            Whether to use token-level KL penalty or sequence-level KL penalty.
        ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
            This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
            improving generation speed. However, disabling this option allows training models that exceed the VRAM
            capacity of a single GPU, albeit at the cost of slower generation.
    NhelpzName of this experiment.)defaultmetadataexp_namezEleutherAI/pythia-160mzPath to the reward model.reward_model_path   zNumber of epochs to train.num_ppo_epochsFzWhether to whiten the rewards.whiten_rewardsg?zKL coefficient.kl_coefg?zClip range.	clipranger   zCREINFORCE Leave-One-Out (RLOO) number of online samples per prompt.rloo_kzWhether to normalize rewardsnormalize_rewardg      $@zClip range for rewardsreward_clip_rangezWhether to normalize advantagesnormalize_advantagezBWhether to use token-level KL penalty or sequence-level KL penaltytoken_level_klTa  This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, improving generation speed. However, disabling this option allows training models that exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation.ds3_gather_for_generation)__name__
__module____qualname____doc__r   ospathbasename__file__r   str__annotations__r   r   intr   boolr   floatr   r   r   r   r   r   r    r%   r%   K/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/rloo_config.pyr      sf   
 (r   )r   dataclassesr   r   trainer.utilsr   r   r%   r%   r%   r&   <module>   s
   