import os
import random
import textwrap
from typing import Any, Callable, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    BaseImageProcessor,
    DataCollator,
    FeatureExtractionMixin,
    GenerationConfig,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    ProcessorMixin,
    is_wandb_available,
)
from transformers.trainer_callback import TrainerCallback
from transformers.trainer_utils import EvalPrediction
from transformers.utils import is_peft_available

from ..models import prepare_deepspeed
from ..models.utils import unwrap_model_for_generation
from .gkd_config import GKDConfig
from .sft_trainer import SFTTrainer
from .utils import (
    DataCollatorForChatML,
    disable_dropout_in_model,
    empty_cache,
    generate_model_card,
    get_comet_experiment_url,
)


if is_peft_available():
    from peft import PeftConfig

if is_wandb_available():
    import wandb

class GKDTrainer(SFTTrainer):
    _tag_names = ["trl", "gkd"]

    def __init__(
        self,
        model: Optional[Union[PreTrainedModel, nn.Module, str]] = None,
        teacher_model: Union[PreTrainedModel, nn.Module, str] = None,
        args: Optional[GKDConfig] = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None,
        processing_class: Optional[
            Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
        ] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
        callbacks: Optional[list[TrainerCallback]] = None,
        optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        peft_config: Optional["PeftConfig"] = None,
        formatting_func: Optional[Callable] = None,
    ):
        # GKD needs extra columns such as "prompts", so the default column pruning must be disabled.
        args.remove_unused_columns = False
        data_collator = DataCollatorForChatML(tokenizer=processing_class, max_length=args.max_length)

        super().__init__(
            model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=processing_class,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
            peft_config=peft_config,
            formatting_func=formatting_func,
        )

        if args.teacher_model_init_kwargs is None:
            teacher_model_init_kwargs = {}
        elif not isinstance(teacher_model, str):
            raise ValueError(
                "You passed teacher_model_init_kwargs to the GKDConfig, but your teacher_model is already instantiated."
            )
        else:
            teacher_model_init_kwargs = args.teacher_model_init_kwargs
            teacher_model_init_kwargs["torch_dtype"] = (
                teacher_model_init_kwargs["torch_dtype"]
                if teacher_model_init_kwargs["torch_dtype"] in ["auto", None]
                else getattr(torch, teacher_model_init_kwargs["torch_dtype"])
            )

        if isinstance(teacher_model, str):
            teacher_model = AutoModelForCausalLM.from_pretrained(teacher_model, **teacher_model_init_kwargs)

        # Disable dropout in the student model if requested
        if args.disable_dropout:
            disable_dropout_in_model(self.model)

        if self.is_deepspeed_enabled:
            self.teacher_model = prepare_deepspeed(teacher_model, self.accelerator)
        else:
            self.teacher_model = self.accelerator.prepare_model(teacher_model, evaluation_mode=True)

        self.lmbda = args.lmbda
        self.beta = args.beta
        self.temperature = args.temperature
        self.seq_kd = args.seq_kd

        self.generation_config = GenerationConfig(
            max_new_tokens=args.max_new_tokens,
            temperature=args.temperature,
            do_sample=True,
            top_k=0,
            use_cache=False if args.gradient_checkpointing else True,
            pad_token_id=self.processing_class.pad_token_id,
        )
        # Set a custom EOS token if the model's generation config needs one (e.g. the Llama 3 chat template)
        if (
            hasattr(self.model.generation_config, "eos_token_id")
            and self.model.generation_config.eos_token_id is not None
        ):
            self.generation_config.eos_token_id = self.model.generation_config.eos_token_id

    @staticmethod
    def generalized_jsd_loss(
        student_logits, teacher_logits, labels=None, beta=0.5, temperature=1.0, reduction="batchmean"
    ):
        """
        Compute the generalized Jensen-Shannon Divergence loss for knowledge distillation using F.kl_div. See Eq. (1)
        of https://huggingface.co/papers/2306.13649 for the definition.

        Args:
            student_logits:
                Tensor of shape (batch_size, sequence_length, vocab_size)
            teacher_logits:
                Tensor of shape (batch_size, sequence_length, vocab_size)
            labels:
                Tensor of shape (batch_size, sequence_length) with -100 for padding tokens to ignore when computing
                loss
            beta:
                Interpolation coefficient between 0 and 1 (default: 0.5)
            temperature:
                Softmax temperature (default: 1.0)
            reduction:
                Specifies the reduction to apply to the output (default: 'batchmean')

        Returns:
            loss: Scalar tensor with the generalized JSD loss
        """
        # Apply temperature scaling
        student_logits = student_logits / temperature
        teacher_logits = teacher_logits / temperature

        # Compute log probabilities for student and teacher
        student_log_probs = F.log_softmax(student_logits, dim=-1)
        teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)

        if beta == 0:
            jsd = F.kl_div(student_log_probs, teacher_log_probs, reduction="none", log_target=True)
        elif beta == 1:
            jsd = F.kl_div(teacher_log_probs, student_log_probs, reduction="none", log_target=True)
        else:
            # Compute the log of the mixture distribution via logsumexp for numerical stability
            beta = torch.tensor(beta, dtype=student_log_probs.dtype)
            mixture_log_probs = torch.logsumexp(
                torch.stack([student_log_probs + torch.log(1 - beta), teacher_log_probs + torch.log(beta)]),
                dim=0,
            )

            # Compute KL divergences using F.kl_div (F.kl_div(input, target) computes KL(target || input))
            kl_teacher = F.kl_div(mixture_log_probs, teacher_log_probs, reduction="none", log_target=True)
            kl_student = F.kl_div(mixture_log_probs, student_log_probs, reduction="none", log_target=True)

            # Compute the generalized Jensen-Shannon Divergence
            jsd = beta * kl_teacher + (1 - beta) * kl_student

        # Mask out padding positions
        if labels is not None:
            mask = labels != -100
            jsd = jsd[mask]

        # Apply reduction
        if reduction == "batchmean":
            return jsd.sum() / mask.sum() if labels is not None else jsd.sum() / (jsd.size(0) * jsd.size(1))
        elif reduction == "sum":
            return jsd.sum()
        elif reduction == "mean":
            return jsd.mean()
        else:
            return jsd
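
    # A worked reading of `generalized_jsd_loss` (explanatory note, not part of the original TRL source):
    # with student distribution p_S, teacher distribution p_T, and mixture m = (1 - beta) * p_S + beta * p_T,
    # the interior branch computes
    #
    #     JSD_beta = beta * KL(p_T || m) + (1 - beta) * KL(p_S || m)
    #
    # The argument order looks reversed because F.kl_div(input, target, log_target=True) returns
    # KL(target || input). A quick sanity check, assuming only PyTorch: when student and teacher
    # logits coincide, the mixture equals both distributions and the loss is ~0:
    #
    #     >>> logits = torch.randn(2, 5, 10)
    #     >>> GKDTrainer.generalized_jsd_loss(logits, logits.clone())
    #     tensor(0.)  # up to floating-point rounding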

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Forward pass through the student model
        outputs_student = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )

        # Forward pass through the teacher model, without gradients
        self.teacher_model.eval()
        with torch.no_grad():
            outputs_teacher = self.teacher_model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )

        # Slice the logits for the completion tokens using the inputs["prompts"] lengths; logits at
        # position t predict token t + 1, hence the off-by-one window on the logits
        prompt_lengths = inputs["prompts"].shape[1]
        shifted_student_logits = outputs_student.logits[:, prompt_lengths - 1 : -1, :]
        shifted_teacher_logits = outputs_teacher.logits[:, prompt_lengths - 1 : -1, :]
        shifted_labels = inputs["labels"][:, prompt_lengths:]

        # Compute the distillation loss
        loss = self.generalized_jsd_loss(
            student_logits=shifted_student_logits,
            teacher_logits=shifted_teacher_logits,
            labels=shifted_labels,
            beta=self.beta,
        )

        # Release cached memory before the backward pass
        empty_cache()

        return (loss, outputs_student) if return_outputs else loss

    @staticmethod
    def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None):
        # Generate completions from the prompt tokens only
        generated_outputs = model.generate(
            input_ids=inputs["prompts"],
            attention_mask=inputs.get("prompt_attention_mask", None),
            generation_config=generation_config,
            return_dict_in_generate=True,
        )

        generated_tokens = generated_outputs.sequences
        new_attention_mask = torch.ones_like(generated_tokens)
        new_labels = generated_tokens.clone()

        # If a pad_token_id is given, drop padded positions from the labels and the attention mask
        if pad_token_id is not None:
            new_labels[new_labels == pad_token_id] = -100
            new_attention_mask[generated_tokens == pad_token_id] = 0

        return generated_tokens, new_attention_mask, new_labels

    def training_step(
        self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch: Optional[int] = None
    ) -> torch.Tensor:
        """
        Perform a training step for the Generalized Knowledge Distillation (GKD) model.

        This method implements the on-policy learning approach described in the GKD paper. With probability
        `self.lmbda`, it generates new responses using the student model, which are then used for training instead of
        the original inputs.
        """
        if self.seq_kd:
            with unwrap_model_for_generation(self.teacher_model, self.accelerator) as unwrapped_model:
                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
                )
            inputs["input_ids"] = new_input_ids
            inputs["attention_mask"] = new_attention_mask
            inputs["labels"] = new_labels

        if random.random() <= self.lmbda:
            with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
                new_input_ids, new_attention_mask, new_labels = self.generate_on_policy_outputs(
                    unwrapped_model, inputs, self.generation_config, self.processing_class.pad_token_id
                )
            inputs["input_ids"] = new_input_ids
            inputs["attention_mask"] = new_attention_mask
            inputs["labels"] = new_labels

        loss = super().training_step(model, inputs, num_items_in_batch)
        return loss

    def create_model_card(
        self,
        model_name: Optional[str] = None,
        dataset_name: Optional[str] = None,
        tags: Union[str, list[str], None] = None,
    ):
        """
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        """
        if not self.is_world_process_zero():
            return

        if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
            base_model = self.model.config._name_or_path
        else:
            base_model = None

        # Normalize `tags` to a set
        if tags is None:
            tags = set()
        elif isinstance(tags, str):
            tags = {tags}
        else:
            tags = set(tags)

        if hasattr(self.model.config, "unsloth_version"):
            tags.add("unsloth")

        tags.update(self._tag_names)

        citation = textwrap.dedent("""\
        @inproceedings{agarwal2024on-policy,
            title        = {{On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes}},
            author       = {Rishabh Agarwal and Nino Vieillard and Yongchao Zhou and Piotr Stanczyk and Sabela Ramos Garea and Matthieu Geist and Olivier Bachem},
            year         = 2024,
            booktitle    = {The Twelfth International Conference on Learning Representations, {ICLR} 2024, Vienna, Austria, May 7-11, 2024},
            publisher    = {OpenReview.net},
            url          = {https://openreview.net/forum?id=3zKtaqxLhW},
        }""")

        model_card = generate_model_card(
            base_model=base_model,
            model_name=model_name,
            hub_model_id=self.hub_model_id,
            dataset_name=dataset_name,
            tags=tags,
            wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None,
            comet_url=get_comet_experiment_url(),
            trainer_name="GKD",
            trainer_citation=citation,
            paper_title="On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes",
            paper_id="2306.13649",
        )

        model_card.save(os.path.join(self.args.output_dir, "README.md"))
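
# Minimal usage sketch (illustrative only, not part of the original module; assumes the public TRL API,
# a ChatML-style "messages" dataset, and the Qwen2 checkpoints named below as stand-ins for any
# student/teacher pair):
#
#     from datasets import Dataset
#     from transformers import AutoTokenizer
#     from trl import GKDConfig, GKDTrainer
#
#     tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
#     train_dataset = Dataset.from_dict(
#         {
#             "messages": [
#                 [
#                     {"role": "user", "content": "Hi, how are you?"},
#                     {"role": "assistant", "content": "I'm great thanks"},
#                 ]
#             ]
#             * 100
#         }
#     )
#     training_args = GKDConfig(output_dir="gkd-model", per_device_train_batch_size=1, lmbda=0.5, beta=0.5)
#     trainer = GKDTrainer(
#         model="Qwen/Qwen2-0.5B-Instruct",          # student, loaded by name via SFTTrainer
#         teacher_model="Qwen/Qwen2-1.5B-Instruct",  # teacher, loaded by name in __init__
#         args=training_args,
#         processing_class=tokenizer,
#         train_dataset=train_dataset,
#     )
#     trainer.train()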