o
    	Ti                     @   s*  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlZd dlZd dlmZ d dlm  mZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d d	l'm(Z(m)Z) d d
l*m+Z+ d dl,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZG e- rd dlHmIZImJZJ e% rd dlKmLZL e. rd dlMmNZO ePeOePdkZQndZQe5 rd dlRmSZSmTZT e& rd dlUZUe/VeWZXG dd de#ZYdS )    N)wraps)Path)AnyCallableOptionalUnion)Dataset)version)
DataLoaderIterableDataset)AutoModelForCausalLMBaseImageProcessorDataCollatorFeatureExtractionMixinGenerationConfigPreTrainedModelPreTrainedTokenizerBaseProcessorMixinTrainerTrainerCallbackis_apex_availableis_wandb_available)EvalPredictionseed_worker)OptimizerNames)is_peft_availableis_sagemaker_mp_enabledlogging   )apply_chat_templateis_conversationalmaybe_apply_chat_template)is_vllm_available)create_reference_model)unwrap_model_for_generation   )BasePairwiseJudge)OnlineDPOConfig)	SIMPLE_CHAT_TEMPLATEDPODataCollatorWithPaddingdisable_dropout_in_modelempty_cachegenerate_model_cardget_comet_experiment_url
get_rewardprepare_deepspeedtruncate_right)	PeftModelget_peft_model)amp)__version__z1.10F)LLMSamplingParamsc                "       s>  e Zd ZdZddgZ														d6deeeje	f deeejdf deeejdf d	e
e d
e
e de
e de
eeedf  de
eeee	ef df  de
eeeeef  de
e de
e de
eegef  de
ee  deejjejjjf de
eejejgejf  ddf  fddZ e!dd Z"e#de$dedee	e%f fddZ&e'e(j)de*fdd Z)e'e(j+d7de
ee	ef  de*fd!d"Z+d#d$ Z,d%d& Z-d'd( Z.	d7dejd)ee	eeje%f f d*e
e/ dejfd+d,Z0	d7d-d.Z1 fd/d0Z2			d8d1e
e	 d2e
e	 d3ee	ee	 df fd4d5Z3  Z4S )9OnlineDPOTrainera  
    Initialize OnlineDPOTrainer.

    Args:
        model (`Union[str, nn.Module, PreTrainedModel]`):
            Model to be trained. Can be either:

            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
              path to a *directory* containing model weights saved using
              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
              `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
        ref_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
            The reference model to use for training. If None is specified, the reference model will be created from the
            model.
        reward_model (`transformers.PreTrainedModel` or `torch.nn.Module` or `None`):
            The reward model to score completions with, preferably an `AutoModelForSequenceClassification`.
        judge (`BasePairwiseJudge`):
            The judge to use for pairwise comparison of model completions.
        args (`OnlineDPOConfig`):
            The online DPO config arguments to use for training.
        data_collator (`transformers.DataCollator`):
            The data collator to use for training. If None is specified, the default data collator
            (`DPODataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the
            sequences in the batch, given a dataset of paired sequences.
        train_dataset (`datasets.Dataset`):
            The dataset to use for training.
        eval_dataset (`datasets.Dataset`):
            The dataset to use for evaluation.
        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model.
        peft_config (`dict`):
            The peft config to use for training.
        compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*):
            The function to use to compute the metrics. Must take a `EvalPrediction` and return a dictionary string to
            metric values.
        callbacks (`list[transformers.TrainerCallback]`):
            The callbacks to use for training.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
            The optimizer and scheduler to use for training.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
            The function to use to preprocess the logits before computing the metrics.
    trlz
online-dpoNNNmodel	ref_modelreward_modeljudgeargsdata_collatortrain_datasetzdatasets.Dataseteval_datasetprocessing_classreward_processing_classpeft_configcompute_metrics	callbacks
optimizerspreprocess_logits_for_metricsreturnc                    s  ||u rt d|| _|d ur|d urtdt d }n|d u r(|d u r(t d|| _|
| _|| _|jd ur>|d ur>t d|d u rFt d|	d u rNt d|j	pRi }t
|tr|}|d}t
|tjsm|dksm|d u rnnt
|tr}tt|}||d< nt d	| d
tj|fi |}n	|j	d urt d|jj| _|d urt stdt
|tr| }t||}|jrt| | jd urt| j |d u r|d u rt|| _nd | _n|| _| j  | jd ur| j  |d u rt|	jd}|j| _g g g g g g g g g g g d| _ | jd urg | j d< g | j d< g | j d< |j!rDt" s)tdt#d|j$|j%dddd| _&t'|j(|j)tj*|j+d| _,nt-|j$|j%ddd|j.rRdndd| _&d|j/d< t0 j1||||||	||||d
 t2| j3drz| j34| j5 |j6| _7| j8r| jd urt9| j|j:|j;|j<| _| jd urt9| j|j:|j;|j<| _d S d S | jd ur| j=| j>j?| _| jd ur| j=| j>j?| _d S d S )Nz`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, either omit the `ref_model` argument or pass `None`.zBoth `reward_model` and `judge` are provided. Please choose provide only one of them. Ignoring `judge` and using `reward_model`.z2Either `reward_model` or `judge` must be provided.z@`missing_eos_penalty` is not supported when `judge` is provided.z`args` must be provided.z$`processing_class` must be provided.torch_dtypeautozInvalid `torch_dtype` passed to `OnlineDPOConfig`. Expected either 'auto' or a string representing a `torch.dtype` (e.g., 'float32'), but got .zYou passed `model_init_kwargs` to the `OnlineDPOConfig`, but your model is already instantiated. This argument can only be used when the `model` argument is a string.zfPEFT is not available and passed `peft_config`. Please install PEFT with `pip install peft` to use it.)pad_token_id)objective/klobjective/entropyobjective/non_score_rewardrewards/chosenrewards/rejectedrewards/accuraciesrewards/marginslogps/chosenlogps/rejectedval/contain_eos_tokenbetaobjective/rlhf_rewardobjective/scores_marginobjective/scoreszkvLLM is not available and `use_vllm` is set to True. Please install vLLM with `pip install vllm` to use it.r   2   g      ?F)n
max_tokenstemperaturetop_ktop_p
detokenize)r:   gpu_memory_utilizationdtype
model_implT)max_new_tokensr_   r`   ra   	do_sample	use_cacheestimate_tokens)
r:   r>   r?   r@   rA   rB   rE   rF   rG   rH   add_model_tags)@
ValueErrorr;   warningswarnUserWarningr<   rC   r=   missing_eos_penaltymodel_init_kwargs
isinstancestrgettorchrd   getattrr   from_pretrainedconfigis_encoder_decoderr   ImportErrorr1   merge_and_unloadr2   disable_dropoutr*   r#   evalr)   rM   
max_lengthstatsuse_vllmr"   r6   rf   r_   generation_configr5   name_or_pathrc   float32vllm_model_implllmr   gradient_checkpointingwarnings_issuedsuper__init__hasattrr:   rj   
_tag_namesrX   _betais_deepspeed_enabledr/   per_device_train_batch_sizefp16bf16toacceleratordevice)selfr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rp   model_idrJ   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/online_dpo_trainer.pyr      s  




















zOnlineDPOTrainer.__init__c                 C   s<   t | jtr| jj}|t| jk r| j| S | jd S | jS )N)rq   r   liststateepochlen)r   r   r   r   r   rX   o  s   "zOnlineDPOTrainer.betarx   	tokenizerc                 C   s   |s6|| d dd}|j dur5t|d }|dks"|j |d d kr5|j g|d  |d< dg|d  |d< n|| d d	d}d
d | D }|S )z2Tokenize a single row from a DPO specific dataset.promptF)add_special_tokensN	input_idsr   r%   attention_maskTc                 S   s   i | ]
\}}d | |qS )prompt_r   ).0keyvaluer   r   r   
<dictcomp>  s    z1OnlineDPOTrainer.tokenize_row.<locals>.<dictcomp>)bos_token_idr   items)featurerx   r   batchprompt_len_input_idsr   r   r   tokenize_roww  s   
zOnlineDPOTrainer.tokenize_rowc                 C   s   | j d u r	td| j }| j}| j|| jj| jj| jjd}t|t	j
jjs<|  |d< | jj|d< t|d< | jj|d< | jt|fi |S )Nz+Trainer: training requires a train_dataset.
batch_size
collate_fnnum_workers
pin_memorypersistent_workerssampler	drop_lastworker_init_fnprefetch_factor)r@   rk   r?   _train_batch_sizer>   dataloader_num_workersdataloader_pin_memorydataloader_persistent_workersrq   rt   utilsdatar   _get_train_samplerdataloader_drop_lastr   dataloader_prefetch_factorr   preparer
   )r   r@   r?   dataloader_paramsr   r   r   get_train_dataloader  s    
z%OnlineDPOTrainer.get_train_dataloaderc                 C   s   |d u r| j d u rtdt|tr|nd}t| dr-|| jv r-| jjr-| j	| j| S t|tr7| j | n|d ur=|n| j }| j
}| jj|| jj| jj| jjd}t|tjjjsn| ||d< | jj|d< | jj|d< t|fi |}| jjrt| dr|| j|< n||i| _| j	|S )Nz-Trainer: evaluation requires an eval_dataset.r|   _eval_dataloadersr   r   r   r   )rA   rk   rq   rr   r   r   r>   r   r   r   r?   eval_batch_sizer   r   rt   r   r   r   _get_eval_samplerr   r   r
   )r   rA   dataloader_keyr?   r   eval_dataloaderr   r   r   get_eval_dataloader  s@   


z$OnlineDPOTrainer.get_eval_dataloaderc                    sf  | j j | j j| jjjjjj}|	|
   td|d ir,| jj|| jddn
| jj|| jddfddtdD }fddtdD }td	d
 |D fdd|D }fdd|D }| jjfdd|D } fdd|D }fdd|D }tj|| jjd}tj|| jjd}tj|| jjd}tj|| jjd}||||fS )Nr   r   F)use_tqdmc                    s&   g | ]} D ]
}t |j| jqqS r   )r   outputs	token_ids)r   ioutputr   r   r   
<listcomp>  s   & z3OnlineDPOTrainer._generate_vllm.<locals>.<listcomp>r   c                    s    g | ]} D ]}t |jqqS r   )r   prompt_token_ids)r   _r   r   r   r   r     s     c                 s   s    | ]}t |V  qd S Nr   r   idsr   r   r   	<genexpr>  s    z2OnlineDPOTrainer._generate_vllm.<locals>.<genexpr>c                    s,   g | ]}d g t |  dgt |  qS )r   r%   r   r   )max_prompt_lengthr   r   r        , c                    s"   g | ]}g t |  | qS r   r   r   )r   rM   r   r   r        " c                    s,   g | ]}d gt | dg t |   qS )r%   r   r   r   )r^   r   r   r     r   c                    s2   g | ]}|d   krt |k r| g n|qS )r   r   r   )eos_token_idr^   r   r   r     s    $c                    s"   g | ]}|g t |   qS r   r   r   )r^   rM   r   r   r     r   r   )rB   r   rM   r   
llm_enginemodel_executordriver_workermodel_runnerr:   load_weights
state_dictr   r    chatr   generaterangemaxr^   rt   tensorr   r   )r   r:   prompts	llm_modelcompletion_ids
prompt_idsprompt_maskcompletion_maskr   )r   r   r^   r   rM   r   _generate_vllm  s.   zOnlineDPOTrainer._generate_vllmc                    s    j j} j j}dd |D } fdd|D } fdd|D } |} |}|d dd}|d dd}t| j jj	d	}|j
|| jd
}	W d    n1 sYw   Y  |	d d |dd f }
t|
||\}
}|||
|fS )Nc                 S   s   g | ]}d |iqS r   r   r   r   r   r   r   r         z.OnlineDPOTrainer._generate.<locals>.<listcomp>c                       g | ]}t | jqS r   )r!   rB   r   xr   r   r   r         c                    s   g | ]}  | j jqS r   )r   rx   rB   r   r   r   r   r     s    prompt_input_idsr   r%   prompt_attention_mask)gather_deepspeed3_params)r   r   r   )rB   r   rM   r?   _prepare_inputsrepeatr$   r   r>   ds3_gather_for_generationr   r   sizer0   )r   r:   r   r   rM   inputsr   r   unwrapped_modelr   r   r   r   r   r   	_generate  s,   

	zOnlineDPOTrainer._generatec                 C   s   t |d|d | j d}|d d |d f }|d d |d f }tj||fdd}tj||fdd}|||d}	|d}
|
dkrI|
d nd}|	jd d |df }tj|jdd|ddd	d}|S )Nr%   r   dim)r   r   r   )
r   r   r}   rt   catlogitstake_along_dimlog_softmax	unsqueezesqueeze)r   r:   r   r   r   r   num_tokens_to_truncateprompt_completion_idsprompt_completion_maskr   
prompt_len	start_idxr  logprobsr   r   r   _forward  s    
$zOnlineDPOTrainer._forwardr   num_items_in_batchc           =   	      s  |   |d }t|} jjr ||\}}}}	n
 ||\}}}}	tj| jj	kdd}
 
|||||	}t 7  jd urN 
 j||||	}n j   
 j||||	}W d    n1 shw   Y  W d    n1 sww   Y  |j} jj|dd}td|d irdd |D } jd urtd|d irt }|tfd	d|D }fd
d|D } j|tt|d | ||d  }tjdd |D |d}nd| }td|d irdd t||D } fdd|D }dd |D }dd |D } j|ddddd |}|jd } j|ddddd |}tj||fdd}t ' t j | jj!|\}}} jj"d ur[||
    jj"8  < W d    n	1 sfw   Y  |#|\}}||k}tj$||d}|| |  }|||  }tj||fdd}|| }|| } |	%  }!|!| }"||"  &d}#| |"  &d}$t#|#|\}%}&t#|$|\}'}(|%|& })|'|( }*|)|* }+ jj'dkrt() j*|+  },n jj'dkr|+dd j*   d },nt+d j' |,, }- j d ur2|| ||  }. j-d . j/0|., , 1   j-d . j/0|, , 1   j-d .|
2 , 1   j-d . j/0|%, 1   j-d . j/0|&, 1  || }/|/&d, }0 j-d  . j/0|0, 1   j* |/ &d}1|1, }2 j-d! . j/0|2, 1   j d ur||1 }3 j-d" . j/0|3, 1  |&d,  }4 j-d# . j/0|4, 1   j*|%|'  }5 j/0|5}6 j-d$ .|6, 1   j*|&|(  }7 j/0|7}8 j-d% .|8, 1  |6|8 }9 j-d& .|9, 1  |9dk}: j-d' .|:2 , 1   j-d( . j*  jj3d ur< j4j5 jj3 dkr<t6  i }; jj7t8j9t8j:fv rO ; |;d)<  jj<dkrZ|-, }- j=r{t>?|- j@}<|<A  W d    n	1 suw   Y  n
 j/jA|-fi |; |-B  jjC S )*Nr   r   r  T)skip_special_tokensr   c                 S   s   g | ]}d |dgqS )	assistant)rolecontentr   r   
completionr   r   r   r   D  r   z2OnlineDPOTrainer.training_step.<locals>.<listcomp>c                       g | ]} j |d qS )messagesrenderr   templater   r   r   O  r   c                    r  r  r  r  r  r   r   r   P  r   c                 S   s   g | ]}|d kqS )r   r   )r   rankr   r   r   r   Y  r   r   r   c                 S   s   g | ]	\}}||d qS ))r   r  r   )r   pcr   r   r   r   _  s    c                    r   r   )r   rC   r   exampler   r   r   r   `  r   c                 S      g | ]}|d  qS r   r   r!  r   r   r   r   a  r   c                 S   r#  )r  r   r!  r   r   r   r   b  r   ptleft)paddingreturn_tensorspadding_sider   r%   rightsigmoidipozinvalid loss type rZ   r[   rW   rU   rV   rN   rP   rY   rO   rQ   rR   rT   rS   rX   learning_rate)Dtrainr   r>   r   r   r   rt   anyrB   r   r  no_gradr;   r:   disable_adapterr   batch_decoder    r=   jinja2Environmentfrom_stringr(   r   zipr   rC   r   shaper  inference_moder.   r<   rM   ro   splitarangeboolsum	loss_typeF
logsigmoidrX   NotImplementedErrormeanr~   appendr   gather_for_metricsitemfloattorch_empty_cache_stepsr   global_stepr+   optimr   LOMOADALOMO_get_learning_raten_gpuuse_apexr3   
scale_loss	optimizerbackwarddetachgradient_accumulation_steps)=r   r:   r   r  r   r   r   r   r   r   contain_eos_tokenr  ref_logprobsr   completionsenvironmentranks_of_first_completionmaskexamplesprompts_idscontext_lengthcompletions_idsr
  r   scores
first_halfsecond_halfbatch_rangechosen_indicesrejected_indices
cr_indicescr_logprobscr_ref_logprobspadding_maskcr_padding_maskcr_logprobs_sumcr_ref_logprobs_sumchosen_logprobs_sumrejected_logprobs_sumchosen_ref_logprobs_sumrejected_ref_logprobs_sumpi_logratiosref_logratiosr  losseslossscores_marginklmean_klnon_score_rewardmean_non_score_rewardrlhf_rewardmean_entropychosen_rewardsgathered_chosen_rewardsrejected_rewardsgathered_rejected_rewardsmarginaccuracykwargsscaled_lossr   )r   r  r   training_step)  s  








$   
  
zOnlineDPOTrainer.training_stepc	                 C   sj  | j jr~| jj| jkr~i }	| |  }
||8 }t|
| jj| j  d|	d< |d ur<t	|t
jr8|  n||	d< |d urE||	d< n|  |	d< | j D ]\}}t|t| |	|< qPdd | jD | _|  j|
7  _| jj| _|   | |	| d }| j jr| ||}| j||d}| jjdkr|| j _| j jr| || | j| j| j| j | _ d S d S )	N   rp  	grad_normr,  c                 S   s   i | ]}|g qS r   r   )r   r   r   r   r   r     s    z=OnlineDPOTrainer._maybe_log_save_evaluate.<locals>.<dictcomp>)metricstrialbest)control
should_logr   rF  _globalstep_last_logged_nested_gatherr@  rC  roundrq   rt   TensorrP  rJ  r~   r   r;  r   _total_loss_scalar
store_floslogshould_evaluate	_evaluate_determine_best_metricr>   save_strategyshould_save_save_checkpointcallback_handleron_save)r   tr_lossr  r:   r  r   ignore_keys_for_eval
start_timer,  logstr_loss_scalarr   valr  is_new_best_metricr   r   r   _maybe_log_save_evaluate  s6    

z)OnlineDPOTrainer._maybe_log_save_evaluatec                    sL   | j jd u rt| j jj}n	| j jdd }| j|d t || d S )N/r   )
model_name)	r>   hub_model_idr   
output_dirnamer8  create_model_cardr   r  )r   r:   r  r  r   r   r   r  	  s
   z!OnlineDPOTrainer._save_checkpointr  dataset_nametagsc                 C   s   |   sdS t| jjdrtj| jjjs| jjj}nd}|du r&t }nt	|t
r/|h}nt|}t| jjdr?|d || j td}t||| j||t r]tjdur]tjjndt d|ddd	}|tj| jjd
 dS )a  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        N_name_or_pathunsloth_versionunslotha          @article{guo2024direct,
            title        = {{Direct Language Model Alignment from Online AI Feedback}},
            author       = {Shangmin Guo and Biao Zhang and Tianlin Liu and Tianqi Liu and Misha Khalman and Felipe Llinares and Alexandre Ram{'{e}} and Thomas Mesnard and Yao Zhao and Bilal Piot and Johan Ferret and Mathieu Blondel},
            year         = 2024,
            eprint       = {arXiv:2402.04792}
        }z
Online DPOz7Direct Language Model Alignment from Online AI Feedbackz
2402.04792)
base_modelr  r  r  r  	wandb_url	comet_urltrainer_nametrainer_citationpaper_titlepaper_idz	README.md)is_world_process_zeror   r:   rw   ospathisdirr  setrq   rr   addupdater   textwrapdedentr,   r  r   wandbrunurlr-   savejoinr>   r  )r   r  r  r  r  citation
model_cardr   r   r   r    s8    


z"OnlineDPOTrainer.create_model_card)NNNNNNNNNNNNr9   Nr   )NNN)5__name__
__module____qualname____doc__r   r   r   nnModulerr   r   r&   r'   r   r   r   dictr   r   r   r   r   r   r   r   tuplert   rG  	Optimizerlr_schedulerLambdaLRr  r   propertyrX   staticmethodr:  r   r   r   r   r   r
   r   r   r   r  intr  r  r  r  __classcell__r   r   r   r   r7   ]   s    /	

 a
 "/$
 7
+
r7   )Zr  r  rl   	functoolsr   pathlibr   typingr   r   r   r   datasetsr2  rt   torch.nnr  torch.nn.functional
functionalr=  torch.utils.datar   	packagingr	   r
   r   transformersr   r   r   r   r   r   r   r   r   r   r   r   transformers.trainer_utilsr   r   transformers.training_argsr   transformers.utilsr   r   r   
data_utilsr   r    r!   import_utilsr"   modelsr#   models.utilsr$   judgesr&   online_dpo_configr'   r   r(   r)   r*   r+   r,   r-   r.   r/   r0   peftr1   r2   apexr3   smdistributed.modelparallelr4   SMP_VERSIONparseIS_SAGEMAKER_MP_POST_1_10vllmr5   r6   r  
get_loggerr  loggerr7   r   r   r   r   <module>   sP   8,
