o
    	TÃi%‡  ã                   @   s¢  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG e'ƒ rÅd dlHZHdZIG dd„ de$ƒZJdS )é    N)Údefaultdict)ÚPath)ÚCallableÚOptionalÚUnion)ÚAccelerator)Ú	broadcastÚgather_object)ÚDataset)Ú
DataLoader)
ÚBaseImageProcessorÚDataCollatorWithPaddingÚFeatureExtractionMixinÚGenerationConfigÚPreTrainedTokenizerBaseÚProcessorMixinÚTrainerÚTrainerCallbackÚTrainerControlÚis_wandb_available)Ú#get_reporting_integration_callbacks)ÚDEFAULT_CALLBACKSÚDEFAULT_PROGRESS_CALLBACK)ÚCallbackHandlerÚExportableStateÚPrinterCallback)Úis_rich_availableé   )Úunwrap_model_for_generation)ÚOnlineTrainerStateÚbatch_generationÚdisable_dropout_in_modelÚ	exact_divÚfirst_true_indicesÚforwardÚ
get_rewardÚprepare_deepspeedÚprint_rich_tableÚselective_log_softmaxÚtruncate_responseé   )Ú
RLOOConfig)Úempty_cacheÚgenerate_model_cardÚget_comet_experiment_urlÚlog_table_to_comet_experimentç      ð?c                       s.  e Zd ZddgZ				d#dedeeeee	e
f  dejdejd	eejeee gee f f d
edee deeeeeef f  deejjejjjf deee  ddfdd„Zdefdd„Zdefdd„Zdd„ Zd$de fdd„Z!‡ fdd„Z"			d%dee dee d eeee df fd!d"„Z#‡  Z$S )&ÚRLOOTrainerÚtrlÚrlooN©NNÚconfigÚprocessing_classÚpolicyÚ
ref_policyÚreward_modelÚtrain_datasetÚdata_collatorÚeval_datasetÚ
optimizersÚ	callbacksÚreturnc                 C   s(  ||u rt dƒ‚|| _|}|| _|| _|d u rt| jƒ}d | jj_d | jj_|| _|| _	|| _
t|ƒ| _|| _|| _|	\| _| _d | _|jd u rQt|j| j ƒ|_t|jd}|| _|j|_|j|j |j |_t|j|j ƒ|_t|j|j ƒ|_t|j|jdƒ|_ t|j|jdƒ|_!t" #|j|j ¡|_$t%j&tt' '¡ ƒ|j(d}t)|dƒ *¡ }|j+› d|j,› d|› |_-|j,|j.d  | _/|j0dkrÎt1d	|j$|j0 ƒ| _2t|j|j3d
ƒ| _4|||fD ]}t5|t6j7ƒrèt8|ƒ qÜ|j9rö|j9dkrö| jj|_:|| _;| j<|j$d t=t>| jj?ƒ }|
d u r|n||
 | _@tA| j@| j;| j| j| jƒ| _B|  C| jjDr+tEntF¡ tGƒ | _HtI|  J¡ |  K¡ dd„ | jBj@| jHg D ƒd| _Ld| _Md | _NtO| jjLdd ƒd u| _PtO| jjLdd ƒd u| _Qd | _R| jjSrq|  T¡  | jjUrtVjW| jjXdd d | _YtZ| j;dƒr| j; [| j\¡ t]| j
| j4d| jdd| _^t% _|j,¡ | `| j;| j| j^¡\| _;| _| _^t% _| j/¡ t]| j|ja| jdd| _b| `| jb¡| _b| jPröt5| j	t6j7ƒrätc| j	|j|jd|jeƒ| _	tc| j|j|jd|jeƒ| _| j;| _fd S | j g| jj(¡| _t5| j	t6j7ƒr| j	 g| jj(¡| _	d S d S )Nz `policy` and `ref_policy` cannot be the same object. If you want `ref_policy` to be the same as `policy`, you must mass a copy of it, or `None` if you use peft.)Úgradient_accumulation_stepsz5`batch_size` must be a multiple of `num_mini_batches`z;`local_batch_size` must be a multiple of `num_mini_batches`©Údevicer   Ú__i£† r*   z/`local_batch_size` must be a multiple of rloo_kÚeos)Únum_training_stepsc                 S   s   g | ]	}t |tƒr|‘qS © )Ú
isinstancer   )Ú.0ÚcbrF   rF   úL/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/rloo_trainer.pyÚ
<listcomp>³   s
    
ÿÿz(RLOOTrainer.__init__.<locals>.<listcomp>)Úis_local_process_zeroÚis_world_process_zeroÚstateful_callbacksÚdeepspeed_pluginÚfsdp_pluginT)Úexist_okÚadd_model_tags)Ú
batch_sizeÚshuffleÚ
collate_fnÚ	drop_last)rS   rU   rV   )hÚ
ValueErrorÚargsr6   r7   r   Úgeneration_configÚeos_token_idÚpad_token_idr8   r9   r:   ÚlenÚtrain_dataset_lenr;   r<   Ú	optimizerÚlr_schedulerÚoptimizer_cls_and_kwargsÚtotal_episodesÚintÚnum_train_epochsr   r@   ÚacceleratorÚnum_processesÚ
world_sizeÚper_device_train_batch_sizeÚnum_mini_batchesÚlocal_batch_sizeÚmicro_batch_sizerS   r"   Úmini_batch_sizeÚlocal_mini_batch_sizeÚmathÚceilÚnum_total_batchesÚtorchÚtensorÚtimerB   r   ÚitemÚexp_nameÚseedÚrun_nameÚprocess_indexÚ
local_seedÚnum_sample_generationsÚmaxÚsample_generations_freqÚrloo_kÚlocal_dataloader_batch_sizerG   ÚnnÚModuler!   Ú
stop_tokenÚstop_token_idÚmodelÚcreate_optimizer_and_schedulerr   r   Ú	report_tor>   r   Úcallback_handlerÚadd_callbackÚdisable_tqdmr   r   r   Úcontrolr   rL   rM   ÚstateÚcurrent_flosÚhp_search_backendÚgetattrÚis_deepspeed_enabledÚis_fsdp_enabledÚhub_model_idÚpush_to_hubÚinit_hf_repoÚshould_saveÚosÚmakedirsÚ
output_dirÚbackup_modelÚhasattrrR   Ú
_tag_namesr   Ú
dataloaderÚmanual_seedÚprepareÚper_device_eval_batch_sizeÚeval_dataloaderr&   Úfp16Úbf16Ú	deepspeedÚto)Úselfr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   rX   rd   Útime_tensorÚtime_intÚmoduleÚdefault_callbacksrF   rF   rJ   Ú__init__L   sä   ÿ
ÿ


ÿ
ÿ
ÿ
ÿ

ÿ€
ÿÿÿý

û	 üÿÿÿzRLOOTrainer.__init__c                 C   ó   | j S ©N©r™   ©r¢   rF   rF   rJ   Úget_train_dataloaderî   ó   z RLOOTrainer.get_train_dataloaderc                 C   r¨   r©   )r   r«   rF   rF   rJ   Úget_eval_dataloaderñ   r­   zRLOOTrainer.get_eval_dataloaderc           ^         s
  | j }| j}| j}| j}| j| _| j}| j}| j}| j‰ |j	}‡ fdd„}	t
|	ƒ ƒ}
t|j|jd dddd}| d¡ t ¡ }|j|j|jf}tj||d	}tj||d	}tj||d	}tj||d	}tj||d	}tj||d	}| ¡  d
| j_d
| j_|j|j d | j_|j| j | j_|jd ur¬|jdk r§t  | jj|j ¡| j_n|j| j_|j!d urÈ|j!dk rÃt  | jj|j! ¡| j_!n|j!| j_!|j"d urä|j"dk rßt  | jj|j" ¡| j_"n|j"| j_"| j# $|| j| j%¡| _%t&d|jd ƒD ]ä}| j jd|j' 7  _t(|
ƒ}t )¡ ! |d  *|¡}| +|j,d¡}|j-d }g }g }g }g }g }g }t.| j| j| j j/d}t0|||j1|j2|ƒ\}} W d   ƒ n	1 sQw   Y  t&d
|j-d
 |j1ƒD ]º}!||!|!|j1 … }"||!|!|j1 … }#|#d d …|d …f }$| |!|!|j1 … }%t3|%|$ƒ}&~%t4ƒ  t5||#|j2ƒ}'|'j6d d …|d d…f }(|(|jd  }(t3|(|$ƒ})~'~(t4ƒ  |$}*|j7d urÅt8|j7|j2|$ƒ}*t 9|"|*fd¡}+t:|*|j2kƒd },t;|t<j=ƒrét>||+|j2|ƒ\}-}.}-ntj?||j@|+ddƒtjAd *|¡}.| B|$¡ | B|*¡ | B|&¡ | B|)¡ | B|,¡ | B|.¡ q`t 9|d
¡}t 9|d
¡}t 9|d
¡}t 9|d
¡}t 9|d
¡}t 9|d
¡}~&~)~.t4ƒ  tC D¡  tjE||jFkdd}/|jGd urd||/   | j jG8  < tjH|j-d |j	d	 +|j-d
 d¡}0|0| Id¡k}1t J||1tK¡}t J||1tK¡}|| }2|jLr©|| M¡  | N¡ d  }t O||jP |jP¡}|jQrë|jR |2 }3|1 Sd¡d |1 T¡  U¡ jVddd }4t W|2¡}5| Xdd¡ *|2jY¡}6|5jZd|4|6d |3 [d¡}7|5|3 }8|8 [d¡}9n|2 [d¡}:|jR |: }7|7| }9|9 X|j,d¡}9|9 [d
¡|9 |j,d  };|9|; }<|< \¡ }<|j]r%|<|< M¡  |< N¡ d  }<t4ƒ  W d   ƒ n	1 s3w   Y  t&|jƒD ]U}=t^j_ `|ja¡}>d
}?t&d
|ja|jbƒD ]?}@|@|jb }A|>|@|A… }Bd
}Ct&d
|jb|jcƒD ]}D| d|¡ø |D|jc }E|B|D|E… }F|<|F }G||F }H||F }I||F }Jt5||I|j2ƒ}K|Kj6d d …|d d…f }%|%|jd  }%t3|%|Hƒ}Lt J|L|1|F tK¡}L|L|J  e¡ }M|L [d¡}L|J [d¡}J|L|J }Nt e|N¡}O|G |O }P|G t O|Od|jf d|jf ¡ }Qt g|P|Q¡}R|R M¡ }S|S}T| h|T¡ | i¡  | j¡  t )¡ X |Q|Pk A¡  M¡ }Utj<jkjl|%dd}Vtjm|%ddtj[|V|% dd }Wd|Nd  M¡  }X|X||=|?|Cf< |U||=|?|Cf< |S||=|?|Cf< |W M¡ ||=|?|Cf< |M M¡ ||=|?|Cf< W d   ƒ n	1 s]w   Y  W d   ƒ n	1 smw   Y  |Cd7 }Cqi|?d7 }?~K~%~L~N~O~P~Q~S~T~U~V~W~X~G~H~I~Jt4ƒ  qQq=t )¡ ê |2 [d¡ M¡ }Y|  [d¡ M¡ }Z|7 M¡ }[tn| jjt ¡ |  ƒ}\i }]|\|]d< | j o|Y¡ M¡  p¡ |]d< | j o|Z¡ M¡  p¡ |]d< | j o|[¡ M¡  p¡ |]d< | j o|9¡ M¡  p¡ |]d< | j o| M¡ ¡ M¡  p¡ |]d< | j o|¡ M¡  p¡ |]d< | j o|¡ M¡  p¡ |]d< | j o|¡ M¡  p¡ |]d< | j o|¡ M¡  p¡ |]d < | j o|¡ M¡  p¡ |]d!< | j o|¡ M¡  p¡ |]d"< | j o|¡ q¡  p¡ |]d#< ||jFk [¡  p¡ |]d$< | jr s¡ d
 |]d%< | jj|]d&< | jj|j,| j  | j_t|  u|]¡ W d   ƒ n	1 s†w   Y  ~2~Y~Z~| jr i¡  | j jd7  _| j# v|| j| j%¡| _%| j%jwr¿| jx|d d' | j# y| j | j| j%¡| _%t4ƒ  tC D¡  |jzd
krÜ|d | j{ d
krÜ| j|dd( q÷| j# }|| j| j%¡| _%| j%jwr| jx|d d d) | j# y| j | j| j%¡| _%d S d S )*Nc                   3   s    	 ˆ E d H  qr©   rF   rF   rª   rF   rJ   Úrepeat_generator   s   €
ÿz+RLOOTrainer.train.<locals>.repeat_generatorgH¯¼šò×z>ç        r0   T©Úmax_new_tokensÚtemperatureÚtop_kÚtop_pÚ	do_samplez===training policy===rA   r   r   r*   Ú	input_ids©Úgather_deepspeed3_paramséÿÿÿÿ©Úskip_special_tokens©Údtype)Údimg:Œ0âŽyE>)r¿   Úkeepdim)r¿   ÚindexÚsrcg      à?Úepszobjective/klzobjective/entropyzobjective/non_score_rewardzobjective/rlhf_rewardzobjective/scoreszpolicy/approxkl_avgzpolicy/clipfrac_avgzloss/policy_avgzval/clipfrac_avgzpolicy/entropy_avgz	val/ratiozval/ratio_varzval/num_eos_tokensÚlrÚepisode)Útrial)Úsampling)rÆ   Úmetrics)~rX   rd   r^   r‚   Úmodel_wrappedr8   r9   r6   r™   rB   Úiterr   Úresponse_lengthr³   Úprintrr   Únum_ppo_epochsrh   r@   rp   ÚzerosÚtrainr‰   Úglobal_steprÅ   ro   Ú	max_stepsra   r]   rc   Úlogging_stepsrm   rn   Ú
eval_stepsÚ
save_stepsr…   Úon_train_beginrˆ   ÚrangerS   ÚnextÚno_gradr¡   Úrepeatr|   Úshaper   Úds3_gather_for_generationr    Ú local_rollout_forward_batch_sizer[   r(   r,   r$   Úlogitsr   r)   Úcatr#   rG   r~   r   r%   rq   Úbatch_decodeÚfloatÚappendÚgcÚcollectÚanyrZ   Úmissing_eos_penaltyÚarangeÚ	unsqueezeÚmasked_fillÚINVALID_LOGPROBÚnormalize_rewardÚmeanÚstdÚclampÚreward_clip_rangeÚtoken_level_klÚkl_coefÚsizeÚlongÚfliplrÚargmaxÚ
zeros_likeÚreshaper¾   Úscatter_ÚsumÚflattenÚnormalize_advantageÚnpÚrandomÚpermutationri   rl   rg   Ú
accumulateÚexpÚ	cliprangerz   ÚbackwardÚstepÚ	zero_gradÚ
functionalÚsoftmaxÚ	logsumexprb   Úgather_for_metricsrs   Úvarr_   Úget_last_lrÚepochÚlogÚon_step_endr’   Ú_save_checkpointÚon_savery   r{   Úgenerate_completionsÚon_train_end)^r¢   rX   rd   r^   r‚   r8   r9   r6   rB   r¯   Úiter_dataloaderrY   Ú
start_timeÚstats_shapeÚapproxkl_statsÚpg_clipfrac_statsÚpg_loss_statsÚvf_clipfrac_statsÚentropy_statsÚratio_statsÚupdateÚdataÚqueriesÚcontext_lengthÚ	responsesÚpostprocessed_responsesÚlogprobsÚref_logprobsÚscoresÚsequence_lengthsÚunwrapped_modelÚquery_responsesÚlogitssÚiÚqueryÚquery_responseÚresponserÝ   ÚlogprobÚ
ref_outputÚ
ref_logitsÚref_logprobÚpostprocessed_responseÚpostprocessed_query_responseÚsequence_lengthÚ_ÚscoreÚcontain_eos_tokenÚresponse_idxsÚpadding_maskÚklÚ	kl_rewardÚeos_indicesÚlast_rewardÚscores_shapedÚnon_score_rewardÚrewardÚrlhf_rewardÚsequence_klÚbaselineÚ
advantagesÚppo_epoch_idxÚb_indsÚminibatch_idxÚmini_batch_startÚmini_batch_endÚmini_batch_indsÚgradient_accumulation_idxÚmicro_batch_startÚmicro_batch_endÚmicro_batch_indsÚmb_advantageÚmb_responsesÚmb_query_responsesÚmb_logprobsÚoutputÚnew_logprobsÚ	new_ratioÚlogprobs_diffÚratioÚ	pg_lossesÚ
pg_losses2Úpg_loss_maxÚpg_lossÚlossÚpg_clipfracÚ	prob_distÚentropyÚapproxklÚmean_klÚmean_entropyÚmean_non_score_rewardrÃ   rÈ   rF   rª   rJ   rÏ   ô   sö  
û










ÿ
ûý


ÿ
ÿÿüû




$$


 ö 



ÿ



 

 ÿõ€Ö6
º
Iÿç

 €
þzRLOOTrainer.trainFrÇ   c              
   C   s8  | j }| j}t| j jddddd}ttƒ}t| j| j| j j	d±}| j
D ]¥}|d }t ¡  |jd }	t|||jd	 |j|ƒ\}
}|
d d …|	d …f }|}|jd ur[t|j|j|ƒ}|d
  t|j|ddƒ¡ |d  t| |¡ƒ¡ t ||fd¡}t| jtjƒr‘t| j||j|	ƒ\}}}ntj|  |j|dd¡tjd |j¡}|d  | j |¡ ¡   ¡  !¡ ¡ W d   ƒ n1 sÀw   Y  |rÉ nq$W d   ƒ n1 sÔw   Y  t" #|¡}| jj$rt%ƒ rït&|j'd	d… ƒ d|j(v r
d	d l)}|j*d ur
| +d|j,|di¡ d|j(v rt-d|d d S d S d S )NgÙa§³îz„?r°   r0   Tr±   r¸   r·   r*   r   r(  r»   zmodel responser½   r3  é   ÚwandbÚcompletions)Ú	dataframeÚcomet_mlzcompletions.csv)ÚnameÚtable).rX   r6   r   rË   r   Úlistr   r‚   rd   rÛ   r   rp   rØ   rÚ   r    r[   r   r)   Úextendr	   rß   rÞ   rG   r9   r~   r   r%   rq   rà   r¡   rB   r  ÚcpuÚnumpyÚpdÚ	DataFrameÚis_main_processr   r'   Úilocr„   rb  Úrunr  ÚTabler/   )r¢   rÇ   rX   r6   rY   rg  r$  Úbatchr(  r  r)  r2  r*  r/  r0  r3  Údfrb  rF   rF   rJ   r  8  s–   ûÿ


û

ÿÿÿüÿüû$Ú(ÿ€Ó
/

þö	z RLOOTrainer.generate_completionsc                    sL   | j jd u rt| j jƒj}n	| j j d¡d }| j|d tƒ  ||¡ d S )Nú/rº   )Ú
model_name)	rX   r   r   r•   rf  ÚsplitÚcreate_model_cardÚsuperr  )r¢   r‚   rÆ   ru  ©Ú	__class__rF   rJ   r  …  s
   zRLOOTrainer._save_checkpointru  Údataset_nameÚtagsc                 C   sê   |   ¡ sdS t| jjdƒrtj | jjj¡s| jjj}nd}|du r&tƒ }nt	|t
ƒr/|h}nt|ƒ}t| jjdƒr?| d¡ | | j¡ t d¡}t||| j||tƒ r]tjdur]tjjndtƒ d|ddd	}| tj | jjd
¡¡ dS )aî  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        NÚ_name_or_pathÚunsloth_versionÚunslotha          @inproceedings{ahmadian2024back,
            title        = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}},
            author       = {Arash Ahmadian and Chris Cremer and Matthias Gall{'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {"{U}}st{"{u}}n and Sara Hooker},
            year         = 2024,
            booktitle    = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024},
            publisher    = {Association for Computational Linguistics},
            pages        = {12248--12267},
            editor       = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar},
        }ÚRLOOz`Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMsz
2402.14740)Ú
base_modelru  r   r{  r|  Ú	wandb_urlÚ	comet_urlÚtrainer_nameÚtrainer_citationÚpaper_titleÚpaper_idz	README.md)rM   r—   r‚   r5   r“   ÚpathÚisdirr}  ÚsetrG   ÚstrÚaddr  r˜   ÚtextwrapÚdedentr-   r   r   rb  rp  Úurlr.   ÚsaveÚjoinrX   r•   )r¢   ru  r{  r|  r  ÚcitationÚ
model_cardrF   rF   rJ   rw    s8    


õzRLOOTrainer.create_model_card)NNr4   N)F)NNN)%Ú__name__Ú
__module__Ú__qualname__r˜   r+   r   r   r   r   r   r   r~   r   r   rh  r‹  rà   r
   r   ÚdictÚtuplerp   ÚoptimÚ	Optimizerr_   ÚLambdaLRr   r§   r   r¬   r®   rÏ   Úboolr  r  rw  Ú__classcell__rF   rF   ry  rJ   r1   I   sb    òþÿýúùø	÷
öõó
ò
ñ #  FM
üþýür1   )Krâ   rm   r“   r  rr   Úcollectionsr   Úpathlibr   Útypingr   r   r   rk  rû   Úpandasrl  rp   Útorch.nnr~   Ú
accelerater   Úaccelerate.utilsr   r	   Údatasetsr
   Útorch.utils.datar   Útransformersr   r   r   r   r   r   r   r   r   r   Útransformers.integrationsr   Útransformers.trainerr   r   Útransformers.trainer_callbackr   r   r   Útransformers.utilsr   Úmodels.utilsr   Útrainer.utilsr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   Úrloo_configr+   Úutilsr,   r-   r.   r/   rb  ré   r1   rF   rF   rF   rJ   Ú<module>   s:   04