o
    	Ti                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZmZmZmZm Z  d d	lm!Z!m"Z" d d
lm#Z# d dl$m%Z& d dlm'Z'm(Z( d dlm)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<m=Z=m>Z> ddl?m@Z@mAZA ddlBmCZC ddlDmEZEmFZF ddlGmHZHmIZImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZVmWZWmXZX e9 rd dlYmZZZm[Z[ eE rd dl\m]Z] eF r"d dl^m_Z_m`Z` d dlambZb e3 r*d dlcZceede.eeeeegeeef f f ZgG dd  d e(Zhd!ejid"ejifd#d$Zjd%ekedeeji f d&eld"eeekedeeji f  fd'd(Zmd)ekedee
 f d"ekedee
 f fd*d+Znd!ejid"ejifd,d-Zod!ejid"ejifd.d/Zpd0d1 Zqd2ekedejif d"ekedeejieeeji f f fd3d4Zrd2ekedeejieeeji f f d"ekedejif fd5d6Zsd7ejid8ejid9eld:eeel d"etejiejif f
d;d<ZuG d=d> d>e1ZvdS )?    N)defaultdictdeque)SequenceSized)nullcontext)partial)Path)AnyCallableOptionalUnion)broadcast_object_listgathergather_objectis_peft_modelset_seed)DatasetIterableDataset)nn)FullyShardedDataParallel)
DataLoaderSampler)
AutoConfig"AutoModelForSequenceClassificationAutoProcessorAutoTokenizerGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackis_wandb_available)seed_worker)is_datasets_availableis_flash_attn_2_availableis_peft_availableis_rich_available   )apply_chat_templateis_conversationalmaybe_apply_chat_template)profiling_contextprofiling_decorator)
VLLMClient)is_liger_kernel_availableis_vllm_available)prepare_deepspeedprepare_fsdpunwrap_model_for_generation)_ForwardRedirection   )SyncRefModelCallback)
GRPOConfig)disable_dropout_in_modelentropy_from_logitsgenerate_model_cardget_comet_experiment_urlpadprint_prompt_completions_sampleselective_log_softmax)
PeftConfigget_peft_model)LigerFusedLinearGRPOLoss)LLMSamplingParams)GuidedDecodingParamsc                   @   sV   e Zd ZdZ				ddedededed	ed
ee fddZdd Z	defddZ
dS )RepeatSamplera7  
    Sampler that repeats the indices of a dataset in a structured manner.

    Args:
        data_source (`Sized`):
            Dataset to sample from.
        mini_repeat_count (`int`):
            Number of times to repeat each index per batch.
        batch_size (`int`, *optional*, defaults to `1`):
            Number of unique indices per batch.
        repeat_count (`int`, *optional*, defaults to `1`):
            Number of times to repeat the full sampling process.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the dataset.
        seed (`int` or `None`, *optional*, defaults to `None`):
            Random seed for reproducibility (only affects this sampler).

    Example:
    ```python
    >>> sampler = RepeatSampler(
    ...     ["a", "b", "c", "d", "e", "f", "g"], mini_repeat_count=2, batch_size=3, repeat_count=4
    ... )
    >>> list(sampler)
    [4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6]
    ```

    ```txt
    mini_repeat_count = 3
          -   -   -
         [0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11,      |
                                                                repeat_count = 2
          0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11, ...] |
          ---------   ---------   ---------   ---------
           ---------   ---------   ---------   ---------
            ---------   ---------   ---------   ---------
                         batch_size = 12
    ```
    r5   TNdata_sourcemini_repeat_count
batch_sizerepeat_countshuffleseedc                 C   s\   || _ || _|| _|| _t|| _|| _|| _|r*t	 | _
|d ur,| j
| d S d S d S N)rF   rG   rH   rI   lennum_samplesrJ   rK   torch	Generator	generatormanual_seed)selfrF   rG   rH   rI   rJ   rK    rT   L/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/grpo_trainer.py__init__   s   	

zRepeatSampler.__init__c                 #   s    j rtjjjd  nttj  fddtdt j	D  fdd D   D ]}tj
D ]}|D ]}tjD ]}|V  qEq>q:q3d S )N)rQ   c                    s   g | ]} ||j   qS rT   )rH   .0iindexesrS   rT   rU   
<listcomp>       z*RepeatSampler.__iter__.<locals>.<listcomp>r   c                    s   g | ]}t | jkr|qS rT   )rM   rH   )rX   chunkrS   rT   rU   r\      r]   )rJ   rO   randpermrN   rQ   tolistlistrangerM   rH   rI   rG   )rS   r^   _indexrT   rZ   rU   __iter__   s   "zRepeatSampler.__iter__returnc                 C   s   | j | j | j | j | j S rL   )rN   rH   rG   rI   r_   rT   rT   rU   __len__   s   zRepeatSampler.__len__)r5   r5   TN)__name__
__module____qualname____doc__r   intboolr   rV   rf   rh   rT   rT   rT   rU   rE   Z   s*    6
rE   tensorrg   c                 C   sH   t | t j| dd d }t t |  }|||d  9 }t |S )a%  
    Compute the standard deviation of a tensor, ignoring NaNs. This function only supports 1D tensors.

    Args:
        tensor (`torch.Tensor`):
            Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`:
            Standard deviation of the tensor, ignoring NaNs.
    T)keepdimr(   r5   )rO   nanmeansumisnansqrt)ro   variancecountrT   rT   rU   nanstd   s   
rw   tensor_dict
num_chunksc                    s<   t dd  D }|jd |   fddt|D S )a   
    Splits a dictionary of tensors along the first dimension into `num_chunks` equal parts.

    Example:
    ```python
    >>> x = torch.arange(12).reshape(6, 2)
    >>> y = torch.arange(6).reshape(6, 1)
    >>> tensor_dict = {"x": x, "y": y}
    >>> split_tensor_dict(tensor_dict, 3)
    [
        {"x": tensor([[0, 1], [2, 3]]), "y": tensor([[0], [1]])},
        {"x": tensor([[4, 5], [6, 7]]), "y": tensor([[2], [3]])},
        {"x": tensor([[ 8,  9], [10, 11]]), "y": tensor([[4], [5]])}
    ]
    ```
    c                 s       | ]	}|d ur|V  qd S rL   rT   )rX   ro   rT   rT   rU   	<genexpr>       z$split_tensor_dict.<locals>.<genexpr>r   c                    s$   g | ]  fd d  D qS )c                    s6   i | ]\}}||d ur|  d    nd qS Nr5   rT   )rX   keyro   )
chunk_sizerY   rT   rU   
<dictcomp>   s    $z0split_tensor_dict.<locals>.<listcomp>.<dictcomp>)itemsrX   r   rx   )rY   rU   r\      s    z%split_tensor_dict.<locals>.<listcomp>)nextvaluesshaperc   )rx   ry   first_tensorrT   r   rU   split_tensor_dict   s
   r   seq_dictc                    sX   t tdd |  D }t| dtt dtt f fddfdd|  D S )	a  
    Shuffles all sequence-like values in a dictionary along the first dimension in unison.

    Example:
    ```python
    >>> x = torch.arange(6).reshape(3, 2)
    >>> y = ["a", "b", "c"]
    >>> seq_dict = {"x": x, "y": y}
    >>> shuffle_sequence_dict(seq_dict)
    {'x': tensor([[2, 3],
                  [0, 1],
                  [4, 5]]),
     'y': ['b', 'a', 'c']}
    ```
    c                 s   rz   rL   rT   )rX   vrT   rT   rU   r{      r|   z(shuffle_sequence_dict.<locals>.<genexpr>r   rg   c                    s2    d u rd S t  tjr  S  fddD S )Nc                    s   g | ]} | qS rT   rT   rW   r   rT   rU   r\         z:shuffle_sequence_dict.<locals>.permute.<locals>.<listcomp>)
isinstancerO   Tensorr   )permutationr   rU   permute  s
   z&shuffle_sequence_dict.<locals>.permutec                    s   i | ]	\}}| |qS rT   rT   rX   r~   val)r   rT   rU   r         z)shuffle_sequence_dict.<locals>.<dictcomp>)rM   r   r   rO   r`   r   r   r   )r   rH   rT   )r   r   rU   shuffle_sequence_dict   s   
r   c                 C   <   t |  rt jtd| j| jdS t | t |   S )a&  
    Compute the minimum value of a tensor, ignoring NaNs. This function only supports 1D tensors.

    Args:
        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`: Minimum value of the tensor, ignoring NaNs. Returns NaN if all values are NaN.
    nandtypedevice)rO   rs   allro   floatr   r   minro   rT   rT   rU   nanmin     
r   c                 C   r   )a&  
    Compute the maximum value of a tensor, ignoring NaNs. This function only supports 1D tensors.

    Args:
        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`: Maximum value of the tensor, ignoring NaNs. Returns NaN if all values are NaN.
    r   r   )rO   rs   r   ro   r   r   r   maxr   rT   rT   rU   nanmax  r   r   c                 C   s   | S )z Do we really need docs for this?rT   xrT   rT   rU   identity)  s   r   batchc                 C   s   d| vsd| vr
| S | d j dd }| d }t||dkr0tdt| d|d ttj| d |dd}i | d|iS )z
    Splits `batch["pixel_values"]` into a list of tensors based on the product of each row in
    `batch["image_grid_thw"]`, while keeping other entries unchanged.
    image_grid_thwpixel_valuesr5   dimr   zMismatch: sum(lengths) = z != pixel_values.size(0) = )prodra   rr   size
ValueErrorrb   rO   split)r   lengthsr   split_valuesrT   rT   rU   split_pixel_values_by_grid.  s   r   c                 C   s6   |  d}t|trtj|dd}i | d|iS | S )z
    Opposite of `split_pixel_values_by_grid`. Merges a list of tensors in `batch["pixel_values"]`
    back into a single tensor along the first dimension.
    r   r   r   )getr   rb   rO   cat)r   r   mergedrT   rT   rU   unsplit_pixel_values_by_grid@  s
   

r   idsmasktarget_lengthprotected_tokensc           
         sp   t |  fdd}g }g }t| jd D ]}|| | || \}}	|| ||	 qt|t|fS )a  
    Truncate tensors to target length while preserving protected tokens.

    Args:
        ids (`torch.Tensor`):
            Input tensor of token IDs, shape (batch_size, sequence_length).
        mask (`torch.Tensor`):
            Input tensor of attention masks, shape (batch_size, sequence_length).
        target_length (`int`):
            Desired length of the output sequences.
        protected_tokens (`list[int]`):
            List of token IDs that should be preserved in the output.
    c           
         s   t  fdd| D }| }|  }| }|dk r+td d| d| dt |d }t |}|dkrF|| d  }d||< ||B }	| |	 ||	 fS )	Nc                    s   g | ]}|   v qS rT   itemrX   r   )protected_setrT   rU   r\   b      zLtruncate_with_protected_tokens.<locals>.process_sequence.<locals>.<listcomp>r   ztarget_length (z)) is too small for the protected tokens (z4 tokens). Please increase target length to at least z or disable truncation.T)rO   ro   rr   r   r   where
zeros_like)
r   r   is_protectedis_non_protectednum_protectednum_non_protected_needednon_protected_indiceskeep_non_protectedkeep_indices	keep_maskr   r   rT   rU   process_sequence`  s"   
z8truncate_with_protected_tokens.<locals>.process_sequencer   )setrc   r   appendrO   stack)
r   r   r   r   r   truncated_seqtruncated_maskrY   new_idsnew_maskrT   r   rU   truncate_with_protected_tokensN  s   
r   c                       s  e Zd ZdZddgZ								dQdeeef deee	e f de
e d	e
eeef  d
e
eeeeeeeef f f  de
eeef  de
eee	e f  de
e	e  dee
ejj e
ejjj f de
d f fddZdd Zdd ZdRde
e defddZdefddZdededefddZe				dSdd Z 	dRd!ej!d"ej!d#e"dej!fd$d%Z#e		&				dTdeee
ej! f fd'd(Z$dRd)e
e	e  fd*d+Z%dUd-e&j'd.efd/d0Z(d-e&j'fd1d2Z)ed3d4 Z*ed5eeeej!e+f f deeeej!e+f f fd6d7Z,e fd8d9Z-d:e	eeeej!e+f f  deeeej!e+f f f fd;d<Z.d=d> Z/edVd?d@Z0dAdB Z1dRdCe
e	e  fdDdEZ2dRdFeee"f dGe
e" ddf fdHdIZ3 fdJdKZ4			dWdLe
e dMe
e dNeee	e df fdOdPZ5  Z6S )XGRPOTrainera  
    Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
    paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
    Models](https://huggingface.co/papers/2402.03300).

    Example:

    ```python
    from datasets import load_dataset
    from trl import GRPOTrainer

    dataset = load_dataset("trl-lib/tldr", split="train")


    def reward_func(completions, **kwargs):
        # Dummy reward function that rewards completions with more unique letters.
        return [float(len(set(completion))) for completion in completions]


    trainer = GRPOTrainer(
        model="Qwen/Qwen2-0.5B-Instruct",
        reward_funcs=reward_func,
        train_dataset=dataset,
    )

    trainer.train()
    ```

    Args:
        model (`Union[str, PreTrainedModel]`):
            Model to be trained. Can be either:

            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
              path to a *directory* containing model weights saved using
              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
              `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
        reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
            functions with the prompts and completions and sum the rewards. Can be either:

            - A single reward function, such as:
                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
                path to a *directory* containing model weights saved using
                [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
                using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
                keyword arguments in `args.model_init_kwargs`.
                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
                - A custom reward function: The function is provided with the prompts and the generated completions,
                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
                  functions can also return `None` when the reward is not applicable to those samples. This is useful
                  for multi-task training where different reward functions apply to different types of samples. When a
                  reward function returns `None` for a sample, that reward function is excluded from the reward
                  calculation for that sample. For more details, see [Using a custom reward
                  function](#using-a-custom-reward-function).

                  The trainer's state is also passed to the reward function. The trainer's state is an instance of
                  [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
                  reward function's signature.
            - A list of reward functions, where each item can independently be any of the above types. Mixing different
            types within the list (e.g., a string model ID and a custom reward function) is allowed.
        args ([`GRPOConfig`], *optional*, defaults to `None`):
            Configuration for this trainer. If `None`, a default configuration is used.
        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset is
            ignored. The format of the samples can be either:

            - [Standard](dataset_formats#standard): Each sample contains plain text.
            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
              and content).
        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
        processing_class ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
            Processing class used to process the data. The padding side must be set to "left". If `None`, the
            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
            `tokenizer.eos_token` will be used as the default.
        reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

            - A single processing class: Used when `reward_funcs` contains only one reward function.
            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
            `None`, the tokenizer for the model is automatically loaded using
            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
            are ignored.
        callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
            in [here](https://huggingface.co/docs/transformers/main_classes/callback).

            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
            method.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
    trlgrpoNNNmodelreward_funcsargstrain_dataseteval_datasetprocessing_classreward_processing_classes	callbacks
optimizerspeft_configr?   c                    s	   d u rt |tr|n|jj}|dd }t| d  jp!i }t |trk|}|d}t |tj	s<|dks<|d u r=nt |trLt
t|}||d< ntd| dt|}t
t|jd }|j|fi |}n|jj} jd urxtd	t|d
st|jj n
t| jj _|
d urt stdt||
} jr| }|d u rt|jj}t |tr|j}nt |t r|}nt!d|j"d u r|j#|_"|j"_"|j$_$|j%_%t
|dd _&t
|dd _'t
|jdd _(t
|jdd _)t |t*s|g}g _+t,|D ];\}}t |tr%t-j|fddi|||< t || t.j/r?j+0|| jjdd  qj+0|| j1 q|_2 j3d urxt4 j3t4|krmtdt4 j3 dt4| dtj5 j3tj6d_3ntj7t4|tj6d_3|d u rd gt4| }nt |t*s|g}nt4|t4|krtdt,t8||D ],\}\}}t |t9r|d u rt:|jj}|j$d u r|j#|_"|j$|j_$|||< q|_; j<_< j=_= j>_> j?_? j@_@ jA_A jB_B jC_C jD_D jE_E jF_F jG_G jH_H jI_I jJ_J jK_K jL_L jM_M jN_NjIr8jNdk r8tOdjIrFjLdksFtOd jP_Pt |tQsht |tQsht |tRrltSdd |T D rltOd jU_U jV_W jXd ur} jXn jV_Xd_Yd _Zd|j[d < t\ j]| t^|||||	d!  j___j_d"krd _`n ta|rd _`nt|}t
t|jd }|j|fi |_` jbrtc| j`d urtcj` jIr td std#te _ftgj_jWjXj?j_d"kjJj=d$_htit*tit*d%_jd_k jl_l jm_m jn_nto jpd&to jpd&to jpd&ti fd'd(to jpd&d)_qtr jsdd* jEr*tt sKtd+jFd,kr}jujvr} jwd ur` jw}n
d- jx d. jy }tz| j{d/_|j|j}tj~ d0 njFd1krjujjH dkstd2jH d3juj d4jHdkrtjfd5d6tjujjH D \_}tjujtjd7< tjujtjd8< tjujtjd9< tjd:d;tjd:< tjd<d=tjd<< j<d urj=d urj<j= }nd }t|j jHjGjjjH jj |d>jujjH d?jjd@	_ j_d_ju  n<j=d|j$|j|j%j?j@jAjBjC jdA} jDrRdB|dC< dD|dE< dF|dG<  jd ur^| j tdLi |_dH_jj j`d urjrtj`ju_`njrtj`ju_`n
jujj`ddI_` jrtj`judJ t,j2D ]&\}}t |t9rӈjrt|juj2|< qjuj|dddKj2|< qd S )MN/z-GRPOtorch_dtypeautozInvalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .r   zYou passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. This argument can only be used when the `model` argument is a string.get_base_modelz>PEFT is required to use `peft_config`. Run `pip install peft`.zWThe `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`image_tokenimage_token_idvision_start_token_idvision_end_token_id
num_labelsr5   zNumber of reward weights (z)) must match number of reward functions ()r   zRThe number of reward processing classes must match the number of reward functions.      ?zOLiger Kernels don't currently support masking token positions based on entropy.tokenzwLiger Kernels currently only support token-level importance sampling. Please set`importance_sampling_level` to 'token'.c                 s   s    | ]}t |tV  qd S rL   )r   r   )rX   dsrT   rT   rU   r{     s    z'GRPOTrainer.__init__.<locals>.<genexpr>z^Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead.Testimate_tokens)r   r   data_collatorr   r   r   r   r           zWLiger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`.)betaepsilon_lowepsilon_hightemperatureuse_ref_model	loss_typemax_completion_length)trainevalmaxlenc                      s   t  jdS )Nr   )r   generation_batch_sizerT   )r   rT   rU   <lambda>  s    z&GRPOTrainer.__init__.<locals>.<lambda>)imageprompt
completionrewards
advantages)device_specificzkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.serverzhttp://:)base_urlconnection_timeoutr   colocatezvllm_tensor_parallel_size (z) must divide world size (z	) evenly.c                    s*   g | ]}t t| j |d   j qS )r5   )rb   rc   vllm_tensor_parallel_sizerW   r_   rT   rU   r\     s    z(GRPOTrainer.__init__.<locals>.<listcomp>RANK
LOCAL_RANK
WORLD_SIZEMASTER_ADDR	localhostMASTER_PORT12345external_launcheri   )	r   tensor_parallel_sizegpu_memory_utilizationmax_num_seqsmax_model_lendistributed_executor_backendrK   max_num_batched_tokens
model_impl)max_new_tokens	do_samplepad_token_idbos_token_ideos_token_idr   top_ptop_kmin_prepetition_penaltycache_implementationi   max_batch_tokensi   
num_blocks   
block_sizeF)evaluation_mode)	ref_modelaccelerator)r*  device_placementrT   )r   strconfig_name_or_pathr   r7   model_init_kwargsr   rO   r   getattrr   r   from_pretrainedtransformersarchitectureshasattrinspect	signatureforward
parameterskeysr   model_kwarg_keysr&   ImportErrorr@   gradient_checkpointing_enable_gradient_checkpointingr   r   	tokenizerr   	TypeError	pad_token	eos_tokenr  r   r   r   r   r   rb   reward_func_names	enumerater   r   Moduler   ri   r   reward_weightsrM   ro   float32oneszipr   r   r   max_prompt_lengthr   num_generationsr   r!  r"  r#  r$  use_transformers_pageduse_vllm	vllm_modevllm_gpu_memory_utilizationr  use_liger_lossr   scale_rewardsimportance_sampling_levelmask_truncated_completionstop_entropy_quantileNotImplementedErrorshuffle_datasetr   dictanyr   num_iterationsepsilonr   r   _step_buffered_inputswarnings_issuedsuperrV   r   r   r+  r   disable_dropoutr8   r/   r4   _forward_redirectionrA   liger_grpo_lossr   _metrics_total_train_tokenslog_completionswandb_log_unique_promptsnum_completions_to_printr   r   _logsr   rK   r0   r,  is_main_processvllm_server_base_urlvllm_server_hostvllm_server_portr.   vllm_server_timeoutvllm_clientinit_communicatorcudacurrent_devicenum_processesdistributednew_subgroups_by_enumerationrc   tp_groupprocess_indexosenvironlocal_process_indexrB   name_or_pathr   per_device_train_batch_sizesteps_per_generationvllm_model_implllmvllm_guided_decoding_regexguided_decoding_regex_last_loaded_stepwait_for_everyoner  r%  generation_kwargsupdater   generation_configmodel_accepts_loss_kwargsr   add_model_tags
_tag_namesis_deepspeed_enabledr1   is_fsdp_enabledr2   prepare_modelsync_ref_modeladd_callbackr6   )rS   r   r   r   r   r   r   r   r   r   r   
model_namer1  model_idr   r/  architecturer@  rY   reward_funcreward_processing_classr  rd   r  r  	__class__)r   rS   rU   rV     s&  







	




"













zGRPOTrainer.__init__c                 C   s   | j d u rddg| _ d S d S )Nr  r   )_signature_columnsr_   rT   rT   rU    _set_signature_columns_if_neededt  s   
z,GRPOTrainer._set_signature_columns_if_neededc                 C   s   | j d u r	td| j }| j}t r t|tjr | j|dd}n| j|dd}| j	| j
j || j
j| j
j| j
jd}t|tjjjsa|  |d< | j
j|d< tt| j
j| j
jd|d< | j
j|d	< | jt|fi |S )
Nz+Trainer: training requires a train_dataset.training)description)rH   
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_last)r  rankworker_init_fnprefetch_factor)r   r   r   r$   r   datasetsr   _remove_unused_columns"_get_collator_with_removed_columns_train_batch_sizer   r|  dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersrO   utilsdatar   _get_train_samplerdataloader_drop_lastr   r#   rv  dataloader_prefetch_factorr,  preparer   )rS   r   r   dataloader_paramsrT   rT   rU   get_train_dataloader  s*   

z GRPOTrainer.get_train_dataloaderdatasetrg   c                 C   s>   |d u r| j }t|| j| jj| j | j| jj | j| jjdS )N)rF   rG   rH   rI   rJ   rK   )	r   rE   rL  r   r   rZ  r|  rW  rK   )rS   r  rT   rT   rU   r    s   zGRPOTrainer._get_train_samplerc                 C   s   t || j| jjdS )N)rF   rG   rK   )rE   rL  r   rK   )rS   r   rT   rT   rU   _get_eval_sampler  s
   zGRPOTrainer._get_eval_samplerc                 C   sN   d|j _t|r|j  n|  |jpi }d|vp|d }|r%|  |S )z-Enables gradient checkpointing for the model.Fuse_reentrant)r/  	use_cacher   
base_modelgradient_checkpointing_enablegradient_checkpointing_kwargsenable_input_require_grads)rS   r   r   r  r  rT   rT   rU   r?    s   
z*GRPOTrainer._enable_gradient_checkpointingc	                 C   s   t |r|jj}||d}	|d ur|d ur||	d< |d ur!||	d< |d ur)||	d< |d ur1||	d< d| jv r<|d |	d< |jd	i |	j}
|
d d d dd d f }
|
d d | d d d f }
|
S )
N	input_idsattention_maskr   r   pixel_attention_maskimage_sizeslogits_to_keepr5   r   rT   )r   r  r   r<  last_hidden_state)rS   unwrapped_modelr  r  r  r   r   r  r  model_inputsr  rT   rT   rU   _get_last_hidden_state  s"   

z"GRPOTrainer._get_last_hidden_state	entropiesr   	thresholdc           
      C   sd   ||    }| dkrtj|tj dS | j|}t||}||  }||k}	|	|  @ S )a  
        Returns a binary mask identifying tokens whose entropy exceeds a given quantile threshold.

        Args:
            entropies (`torch.Tensor`):
                Tensor of shape (batch_size, seq_len) with per-token entropy values.
            mask (`torch.Tensor`):
                Binary mask of the same shape as `entropies`, where `1` indicates valid tokens and `0` padding.
            threshold (`float`):
                Quantile threshold between `0.0` and `1.0` to select high-entropy tokens.

        Returns:
            `torch.Tensor`:
                Boolean mask of shape (batch_size, seq_len), where `True` indicates tokens with entropy >= threshold and
                `False` otherwise.
        r   r   )rn   r   numelrO   r   r,  r   quantile)
rS   r  r   r  r,  non_pad_entropiesall_non_pad_entropiesentropy_thresholdmasked_entropiesentropy_maskrT   rT   rU   get_high_entropy_mask  s   z!GRPOTrainer.get_high_entropy_maskFc              	   C   s  |p| d}g }g }td| d|D ]}||||  }||||  }||d}|durb|durb||||  |d< |d| d  }|d||  d  }||| |d< n|durp||||  |d< |	dur~|	|||  |d< |
dur|
|||  |d< d	| jv r|d
 |d	< |di |j}|ddddddf }|dd| dddf }|| j }|dd| df }t||}|	| |rt
  t|}W d   n1 sw   Y  |	| qt
j|dd}|rt
j|ddnd}||fS )z<Compute log-probs and (optionally) entropies for each token.r   r  Nr   r   r   r  r  r  r5   r   rT   )r   rc   r   rr   r   r<  logitsr   r>   r   rO   no_gradr9   r   )rS   r   r  r  r  rH   compute_entropyr   r   r  r  	all_logpsall_entropiesstartinput_ids_batchattention_mask_batchr  start_pixel_idxend_pixel_idxr  completion_idslogpsr  rT   rT   rU   "_get_per_token_logps_and_entropies.  sH   







z.GRPOTrainer._get_per_token_logps_and_entropiesextra_prefixesc                 C   s,   |pg }dg| }|D ]}| |d}q|S )Nz_checkpoint_wrapped_module. )replace)rS   namer  prefixesprefixrT   rT   rU   _fix_param_name_to_vllmn  s
   
z#GRPOTrainer._fix_param_name_to_vllmr  moduler  c                 C   s&  |du rt  }| D ]\}}|r| d| n|}| j|||d qt|trtj|dddW | D ]I\}}|rB| d| n|}	| j|	dgd}	|	|v rQq5||	 | j	dkrh| j
jrh| j|	|j q5| j	d	kr~| jjjjjj}
|
|	|jfg q5W d   dS 1 sw   Y  dS dS )
zdMemory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.Nr   )r  visitedF)recurse	writebackz_fsdp_wrapped_module.r  r  r  )r   named_children_sync_fsdp1_params_to_vllmr   FSDPsummon_full_paramsnamed_parametersr  addrO  r,  ri  rn  update_named_paramr  r~  
llm_enginemodel_executordriver_workermodel_runnerr   load_weights)rS   r  r  r  
child_namechild_modulechild_prefix
param_nameparam	full_name	llm_modelrT   rT   rU   r  u  s0   


"z&GRPOTrainer._sync_fsdp1_params_to_vllmc                 C   s   |   D ]9\}}|jr|td}| }| jdkr*| jj	r*| j
|| q| jdkr?| jjjjjj}|||fg qd S )Nrp  r  r  )
state_dictr   is_cputorO   r   full_tensorrO  r,  ri  rn  r  r~  r  r  r  r  r   r  )rS   r  r  r  r  rT   rT   rU   _sync_fsdp2_params_to_vllm  s   
z&GRPOTrainer._sync_fsdp2_params_to_vllmc           
   	   C   s  | j jj}|d uo|jdk}|rdd l}|jj}nt}t| j	r|t
| j	  | j	  | jrZt| j jdd }|rBt|ddnd}|dkrO| | j	 nZ|dkrY| | j	 nO| j	 D ]I\}}|ddd	}| j	j|v rsq_d
|v rxq_| j|dgd}| jdkr| j jr| j||j q_| jdkr| jjjjjj	}	|	 ||jfg q_| j	!  W d    n1 sw   Y  n{| jrt| j jdd }|rt|ddnd}|dkr| | j	 n[|dkr| | j	 nP| j	 D ]J\}}| |}||g3 | jdkr| j jr| j||j n| jdkr(| jjjjjj	}	|	 ||jfg W d    n	1 s3w   Y  q| jdkrK| j jrK| j"  d S | jdkrX| j"  d S d S )N   r   fsdp_pluginfsdp_versionr5   r(   zbase_model.model.z.base_layerr  original_modulezmodules_to_save.default.r  r  r  )#r,  statedeepspeed_plugin
zero_stage	deepspeedzeroGatheredParametersr   r   r   rb   r:  merge_adapterr  r2  r  r  r  removeprefixr  r  r  rO  ri  rn  r  r  r~  r  r  r  r  r  unmerge_adapterreset_prefix_cache)
rS   r  zero_stage_3r  gather_if_zero3r  r   r  r  r  rT   rT   rU   _move_model_to_vllm  sv   




%
zGRPOTrainer._move_model_to_vllmgeneration_batchc                 C   s   | j jrdnd}|dkrN| jj| j }| j| dks| jd u r;| |}t|}t	|}t
|| jj}dd |D | _| j| j| jj  }|  jd7  _|S | |}|S )Nr   r   r   c                 S   s   g | ]}t |qS rT   )r   )rX   r   rT   rT   rU   r\     r   z/GRPOTrainer._prepare_inputs.<locals>.<listcomp>r5   )r   r  r   r|  rZ  r\  r]  _generate_and_score_completionsr   r   r   )rS   r  modegenerate_everygeneration_batchesinputsrT   rT   rU   _prepare_inputs  s   

zGRPOTrainer._prepare_inputsc              
      s,  | j j}tjt|t| j|d}dd  d D } fdd|D }| j|d< tt| j| j	| j
D ]\}	\}
}t| | t|
tjrt d r`dd t||D }fd	d|D }n
d
d t||D }|ddddd}t |}t  |
di |jd d df |d d |	f< W d    n1 sw   Y  n!|
d|||d|}dd |D }tj|tj|d|d d |	f< W d    n1 sw   Y  q3t|jdd rt|jddjddd d fdd| D }| |d< | |d< td| d t|}|S )Nr
  c                 S   s   g | ]}|d vr|qS ))r  r  r  rT   )rX   r~   rT   rT   rU   r\     r   z2GRPOTrainer._calculate_rewards.<locals>.<listcomp>r   c                    s    i | ]   fd dD qS )c                    s   g | ]}|  qS rT   rT   rX   exampler~   rT   rU   r\     r   z=GRPOTrainer._calculate_rewards.<locals>.<dictcomp>.<listcomp>rT   r   )r  r  rU   r     s     z2GRPOTrainer._calculate_rewards.<locals>.<dictcomp>trainer_statec                 S   s   g | ]
\}}d || iqS )messagesrT   rX   pcrT   rT   rU   r\         c                    s   g | ]	}t | d  qS )text)r)   r   )r  rT   rU   r\     r   c                 S   s   g | ]\}}|| qS rT   rT   r  rT   rT   rU   r\   !  r   ptTrightFr  return_tensorspaddingpadding_sideadd_special_tokens)promptscompletionsr  c                 S   s   g | ]}|d ur
|nt jqS rL   )rO   r   )rX   rewardrT   rT   rU   r\   -  r]   r   r5   r   )as_tuplec                    s   i | ]	\}}||  qS rT   rT   )rX   r~   value)nan_row_idxrT   rU   r   4  r   r  r  z=All reward functions returned None for the following kwargs: zI. Please ensure that at least one reward function returns a valid reward.rT   )r,  r   rO   zerosrM   r   r  rE  rJ  r   rD  r,   r   r   rF  r*   r_  r  inference_moder  ro   rH  rs   r   rY  nonzeror   warningswarnr   )rS   r  r'  r(  completion_ids_listr   rewards_per_funcr;  reward_kwargsrY   r  reward_func_namer  textsreward_inputsoutput_reward_funcrow_reward_kwargsr  )r  r,  r  rU   _calculate_rewards  sT   


*"
zGRPOTrainer._calculate_rewardsr  c           J         st  j j jjr
dnd}dd |D }t|}i }d|d v }|rrdd |D }dd	d |D i}|D ]=}t|trq|D ]3}	t|	tsEq=|		d
}
|		d}t|
t
rp|dkrdddid|
dg|	d
< q=|dkrpd|
dg|	d
< q=q4fdd|D }jdb|ddddd|}t |}|d |d }}jd urֈjjjg}dd |D }t||j|\}}jj|ddd}fdd|D }jd urևfdd|D }jrjjjkr  jj_jdkr}t|}|rt|}j jrW|d d j }|r|d d j }nd }td3 jj ||jj!j"j#j$d u r0dnj$j%d u r:d nj%j&j'j(j)d!}W d    n	1 sQw   Y  nd gt*| }t+|dd"}t,j j-t*| j j-d# t*| }|| }njd$krj'rt.j'd%}nd }d#j!j"j#j$d u rdnj$j%d u rd nj%j&|d&}j(j)d ur|/j(j) t0dbi |}j1d#krt*|}d'd t2j1D }t3j4j5||j6d( d)d |D }|r
d*d t2j1D }t3j4j5||j6d( d+d |D }nd }n	|}|r|nd }|r@|r@g }t7||D ]\}}|d ur8|8|d|id, q#|8| q#n|}td j9j ||dd-}W d    n	1 s\w   Y  d.d |D }j1d#krt3j4j:j6d(} t,| | | d# | }!||! } fd/d|D }t;|j<d0}t3j=||gd#d1}"nj>rjdbd|i|}#j?j@jA}$tB rd2j?j@_And3j?j@_Atd4 tCj?j j(jDd5s}%t3E ^ jFrtGjHj?dd6ntI > j(jJr|%Kt3jL nj(jMr |%Kt3jN t3O  |%jP|#jQjRdd7}W d    n	1 sw   Y  W d    n	1 s*w   Y  W d    n	1 s:w   Y  W d    n	1 sJw   Y  W d    n	1 sZw   Y  d8d |S D } fd9d|D }t;|j<d:d;} fd<d|#jQD }t;|j<dd;}t3j=||gd#d1}"|$j?j@_Antd=p tCj?j j(jDd5T}%t3E ? jFrtGjHj?dd6ntI  |||d< |d< |%j dbi |jRdd>}"W d    n	1 sw   Y  W d    n	1 sw   Y  W d    n	1 sw   Y  W d    n	1 sw   Y  |Td#}&|"d d d |&f }|"d d |&d f }|jUk}'t3jV|'Tdf|'Td#t3jW d?}(|'X jYd#d1|'jZd#d1 |(|'jZd#d1< t3j[|'Td# d@\|'Tdd})|)|(]d#kX }*dAd t7||*D }+|*^d#},j_r|'jZd#d1 }-|*|- ]d#X  }*t3j=||*gd#d1}.|Td#}/|dkrj(j`nj(ja}0t3E  j(jbjc }1j(jd|1 dkrjej|"|.|/|0|	dB|	dC|	dD|	dEdF	\}2}3nd }2jfd krKjgd urjejg|"|.|/|0|	dB|	dC|	dD|	dEdG	\}4}3n:j hji % jej|"|.|/|0|	dB|	dC|	dD|	dEdG	\}4}3W d    n	1 sEw   Y  nd }4W d    n	1 sXw   Y  jj|ddH}5tj|d rg }6t7||5D ]!\}}7|d d dIkr|k d
 ndJ}8|68dI|8|7 dKg qsn|5}6l|||6|+}9|9jmK ]d jnd#d1}:|:odjjpd#d1};|:odjjqd#d1}<t3r|<t3s|<}=|;jtjdd1};|<jtjdd1}<|:|; }>jur|>|<dL  }>t,j j-t*| j j-d# t*| }|>v }?|>| }>|dkrj jwj x|.^ ^ y 7  _wjjwgjz| dM< j x|,}@jz| dN 8|@{ p y  jz| dO 8|@{ | y  jz| dP 8|@{ } y  j x|'jZd#d1}A|@|A }Bd#t*|Bt*|@  }Cjz| dQ 8|C t*|Bdkrt3j~d# d@}Bjz| dR 8|B{ p y  jz| dS 8|B{ | y  jz| dT 8|B{ } y  tjD ]:\}D}Et3|9d d |Df y }Fjz| dU|E dV 8|F t|9d d |Df y }Gjz| dU|E dW 8|G qÈjz| dX 8|;p y  jz| dY 8|<p y  jz| dZ 8|={ p y  jd[ t| jd\ t|5 tjD ]\}D}Hjd] |H |9d d |Df   qCjd^ |?  |rrjd t| ||||*|>d_}I|2d ur|2|Id`< |4d ur|4|Ida< dB|v r|dB |IdB< dC|v r|dC |IdC< dD|v r|dD |IdD< dE|v r|dE |IdE< |IS )cNr   r   c                 S   s   g | ]}|d  qS r  rT   r   rT   rT   rU   r\   G  r   z?GRPOTrainer._generate_and_score_completions.<locals>.<listcomp>r   r   c                 S   s   g | ]}| d qS )r   )r   r  rT   rT   rU   r\   T  s    imagesc                 S   s   g | ]}|gqS rT   rT   )rX   imgrT   rT   rU   r\   U      contentroleusertyper  )rB  r  systemc                    s   g | ]
}t | jd  qS r;  )r+   r   r  r_   rT   rU   r\   c  r  r   TleftFr"  r  r  c                 S   s   g | ]}|d ur|qS rL   rT   )rX   r   rT   rT   rU   r\   u  r   )skip_special_tokensclean_up_tokenization_spacesc                    s*   g | ]}t d t  j dd|qS )z^()+r  )resubescaperB  rX   r  r_   rT   rU   r\   }  s   * c                    s,   g | ]}t d t  j d j|qS )(rG  )rH  rI  rJ  r   rK  r_   rT   rU   r\     s     r  zvLLM.generater   r   )r'  r<  nr$  r   r!  r"  r#  
max_tokensr  r  )from_processr5   r  )regex)rM  r$  r   r!  r"  r#  rN  guided_decodingc                 S      g | ]}d qS rL   rT   rX   rd   rT   rT   rU   r\         )groupc                 S      g | ]	}|D ]}|qqS rT   rT   )rX   sublistr  rT   rT   rU   r\     r   c                 S   rR  rL   rT   rS  rT   rT   rU   r\     rT  c                 S   rV  rT   rT   )rX   rW  r=  rT   rT   rU   r\     r   )r  multi_modal_data)sampling_paramsuse_tqdmc                 S   s   g | ]}|j D ]}|jqqS rT   )outputs	token_ids)rX   r[  outputrT   rT   rU   r\     r]   c                       g | ]	}t j| d qS r
  rO   ro   rX   r   r
  rT   rU   r\     r   )padding_valuer   paged_attention
sdpa_pagedztransformers.generate_batch)gather_deepspeed3_params)r  )r  progress_barc                 S   s   g | ]}|j qS rT   )generated_tokens)rX   r]  rT   rT   rU   r\     r>  c                    r^  r_  r`  ra  r
  rT   rU   r\     r   r!  )rb  r%  c                    r^  r_  r`  ra  r
  rT   rU   r\     r   ztransformers.generate)r  disable_compiler   r
  c                 S   s$   g | ]\}}d d t ||D qS )c                 S   s   g | ]
\}}|r|  qS rT   r   )rX   idmrT   rT   rU   r\   ;  r  zJGRPOTrainer._generate_and_score_completions.<locals>.<listcomp>.<listcomp>)rJ  )rX   rowmask_rowrT   rT   rU   r\   :  s    r   r   r  r  )r   r   r  r  )rH   r   r   r  r  )rE  	assistantr  )r@  r?  g-C6?
num_tokenszcompletions/mean_lengthzcompletions/min_lengthzcompletions/max_lengthzcompletions/clipped_ratioz"completions/mean_terminated_lengthz!completions/min_terminated_lengthz!completions/max_terminated_lengthzrewards/z/meanz/stdr)  
reward_stdfrac_reward_zero_stdr  r  r  r  )
prompt_idsprompt_maskr  completion_maskr  old_per_token_logpsref_per_token_logpsrT   )r,  r   r   r  copydeepcopyr   rb   rX  r   r.  r   r_  r  rK  r   r   r   r   batch_decoder   rN  r  global_stepr  r  rO  r   ri  rL  r,   rn  generater$  r   r!  r"  r#  r   r  r   r  rM   r   slicerv  rD   r  rC   r  rc   rO   rs  all_gather_objectru  rJ  r   r~  get_rankr<   r  r   rM  model_wrappedr/  _attn_implementationr%   r3   ds3_gather_for_generationr  r  r  r  r   bf16r  bfloat16fp16float16r.  generate_batchr  r  r   r   r   fulllongrm   argmaxrY  arangeexpand	unsqueezerr   rT  r{  per_device_eval_batch_sizer|  rZ  gradient_accumulation_stepsr  r   r+  unwrap_modeldisable_adapterr*   popr:  rG  nansumviewmeanstdiscloser   repeat_interleaverR  clonenum_input_tokens_seenr   r   rc  r   r   r   r-  rE  rD  rq   rw   rh  extendra   )JrS   r  r  r'  original_promptskwargs
has_imagesr<  r  messager?  r@  prompts_textprompt_inputsrq  rr  	protectedall_prompts_text
all_imagesordered_set_of_promptsordered_set_of_imagesr  process_slicerQ  r  rY  	orig_sizegathered_promptsgathered_imagesvllm_inputsr   all_outputslocal_rank_in_grouptp_sliceprompt_completion_idspaged_prompt_inputsprevious_attnr  prompt_lengthis_eoseos_idxsequence_indicesrs  r2  completion_lengthstruncated_completionsr  r  rH   r  rt  rd   ru  completions_textr(  r  	bootstrapr3  r  mean_grouped_rewardsstd_grouped_rewardsis_std_zeror  all_process_advantagesagg_completion_lengthsagg_terminated_with_eosterm_completion_lengthsclipped_completions_ratiorY   r5  mean_rewardsstd_rewardsr  r]  r  )r   rS   rU   r  A  s  
















	


      

"("


5"
$        (





z+GRPOTrainer._generate_and_score_completionsc                 C   s<  |d |d }}|d |d }}t j||gdd}t j||gdd}|d}	| ||||	|d|d|d	|d
}
| j|
|jj|||d |jj|d|dd\}}| j	dkre|d nd }|d }| j
jrqdnd}| j	dkr| j| d | j|   | j| d | j|   |S )Nrq  rr  r  rs  r5   r   r   r   r  r  r  rt  ru  )_input
lin_weightselected_token_idsr  r  biasrt  ru  r   r   r   r   r   kl
clip_ratio)rO   r   r   r  r   rb  lm_headweightr  r   r   r  rc  r   r,  r   r  r   )rS   r  r  rq  rr  r  rs  r  r  r  r  lossmetricsmean_klr  r  rT   rT   rU   compute_liger_loss  s@   


$$zGRPOTrainer.compute_liger_lossc                 C   s>   |rt d| jr| j|}| ||| j||S | ||S )Nz2The GRPOTrainer does not support returning outputs)r   rQ  r,  r  ra  r  _compute_loss)rS   r   r  return_outputsnum_items_in_batchr  rT   rT   rU   compute_loss  s   zGRPOTrainer.compute_lossc           %         s6  |d |d }}|d |d } t j||gdd}t j| gdd}|d}| j||||d|d|d	|d
|dd	\}	}
| jdk rU| |
 d| j }nd }| jdkrm|d }t ||	 ||	  d }|d }|d}|d u r~|		 n|}|	| }| j
dkr|}n$| j
dkr|  d djdd }|d}n	td| j
 dt |}t |d| j d| j }| jjd urt j|| jjd}||d }||d }t || }|d ur|| }| jdkr|| j|  }| jdkr|  d djdd  }n2| jdkr(|     jdd }n| jdkr=|   |d| j  }ntd| j | jjrLdnd}  jdd fd d!}| jdkry||}| j| d" | j|   ||
}| j| d# | j|   |d| j k |ddk @ }|d| j k|ddk@ }||B }|| }|| } || }!| j|}"| j| d$ |"   | j| d% t |"  | j| }#| j| d& |#   | j| d' t!|#  | j|!}$| j| d( |$   |S ))Nrq  rr  r  rs  r5   r   Tr   r   r  r  )r  r   r   r  r  r   r   ru  r  rt  r   sequencer   )r   z#Unknown importance sampling level: z-. Possible values are 'token' and 'sequence'.)r   r   bnpodr_grpor   zUnknown loss type: r   r   c                    s&   | j d dkr|  S |     S r}   )r   r  rr   r   rs  completion_token_countrT   rU   masked_batch_meann  s   z4GRPOTrainer._compute_loss.<locals>.masked_batch_meanr  entropyzclip_ratio/low_meanzclip_ratio/low_minzclip_ratio/high_meanzclip_ratio/high_maxzclip_ratio/region_mean)"rO   r   r   r  r   rU  r  r   expdetachrS  rr   clampr  r   r   r   r   deltar   r   r  r   r   r  rc  r   r,  r   rq   r   r   r   r   )%rS   r   r  rq  rr  r  r  r  r  per_token_logpsr  r  ru  per_token_klr  rt  	log_ratiolog_importance_weightscoef_1coef_2per_token_loss1per_token_loss2per_token_lossr  r  r  r  mean_entropyis_low_clippedis_high_clippedis_region_clippedlow_clip	high_clipr  gathered_low_clipgathered_high_clipgathered_clip_ratiorT   r  rU   r    s   






 

&$$zGRPOTrainer._compute_lossignore_keysc              	   C   s|   |  |}t ( |   | ||}W d    n1 sw   Y  |  }W d    n1 s4w   Y  |d d fS rL   )r  rO   r  compute_loss_context_managerr  r  r  )rS   r   r  prediction_loss_onlyr  r  rT   rT   rU   prediction_step  s   



zGRPOTrainer.prediction_steplogs
start_timec           	         s  | j jrdnd}dd | j|  D }|dkr!dd | D }i ||}t || | j|   | jjr| j	rt
 rWt| jd | jd | jd | jd	 | jj| j | jjrd
| jjv rtjd urdd l}t| jjgt| jd  | jd | jd d| jd d| jd	 i}| jd rg |d< | jd D ]}|d ur|d t| q|d d  q||}| jr|jdgd}tdtj|di d S d S d S d S d S d S )Nr   r   c                 S   s"   i | ]\}}|t |t| qS rT   )rr   rM   r   rT   rT   rU   r     s   " z#GRPOTrainer.log.<locals>.<dictcomp>c                 S   s   i | ]
\}}d | |qS )eval_rT   r   rT   rT   rU   r     r  r  r  r  r  wandbr   )stepr  r  	advantager   )subsetr(  )	dataframe)r   r  rc  r   r_  logclearr,  ri  re  r'   r=   rh  r  ry  rg  r   	report_tor  runpandasr.  rM   r   Image	DataFramerf  drop_duplicatesTable)	rS   r  r  r  r  pdtabler=  dfr  rT   rU   r    sN   	


zGRPOTrainer.logc                    sL   | j jd u rt| j jj}n	| j jdd }| j|d t || d S )Nr   r   )r  )	r   hub_model_idr   
output_dirr  r   create_model_cardr_  _save_checkpoint)rS   r   trialr  r  rT   rU   r
    s
   zGRPOTrainer._save_checkpointr  dataset_nametagsc                 C   s   |   sdS t| jjdrtj| jjjs| jjj}nd}|du r&t }nt	|t
r/|h}nt|}t| jjdr?|d || j td}t||| j||t r]tjdur]tjjndt d|ddd	}|tj| jjd
 dS )a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        Nr0  unsloth_versionunslotha              @article{zhihong2024deepseekmath,
                title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
                author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
                year         = 2024,
                eprint       = {arXiv:2402.03300},
            }
            GRPOzRDeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Modelsz
2402.03300)r  r  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror6  r   r/  rw  pathisdirr0  r   r   r.  r  r  r  textwrapdedentr:   r  r"   r  r  urlr;   savejoinr   r  )rS   r  r  r  r  citation
model_cardrT   rT   rU   r	    s<    

zGRPOTrainer.create_model_card)NNNNNNr   NrL   )NNNN)NFNNNN)r  N)FN)NNN)7ri   rj   rk   rl   r  r   r.  r   
RewardFuncrb   r   r7   r   r   rX  r   r   r!   tuplerO   optim	Optimizerlr_schedulerLambdaLRrV   r  r  r   r  r  r?  r-   r  r   r   r  r  r  r   rF  r  r  r  r	   r  r:  r  r  r  r  r  r  r
  r	  __classcell__rT   rT   r  rU   r     s    e
 
	
   $+
?
K"4   '*
t(2
r   )wrv  r7  rw  rH  r  r0  collectionsr   r   collections.abcr   r   
contextlibr   	functoolsr   pathlibr   typingr	   r
   r   r   r  rO   torch.utils.datar4  accelerate.utilsr   r   r   r   r   r   r   r   torch.distributed.fsdpr   r  r   r   r   r   r   r   r   r   r   r   r    r!   r"   transformers.trainer_utilsr#   transformers.utilsr$   r%   r&   r'   
data_utilsr)   r*   r+   extras.profilingr,   r-   extras.vllm_clientr.   import_utilsr/   r0   modelsr1   r2   r3   models.utilsr4   r   r6   grpo_configr7   r  r8   r9   r:   r;   r<   r=   r>   peftr?   r@   liger_kernel.chunked_lossrA   vllmrB   rC   vllm.sampling_paramsrD   r  r.  rb   r   r!  rE   r   rw   rX  rm   r   r   r   r   r   r   r   r"  r   r   rT   rT   rT   rU   <module>   s   4$c
*44
9