o
    	TÃi°œ  ã                   @   sö  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 d dl
mZ d dlmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: ddl;m<Z< ddl=m>Z> ddl?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZO e4ƒ rßd dlPmQZQmRZRmSZS e)ƒ ræd dlTZTdZUG dd„ dejVƒZWG dd„ de&ƒZXdS )é    N)Údefaultdict)ÚcontextmanagerÚnullcontext)ÚPath)ÚOptionalÚUnion)ÚAccelerator)Ú	broadcastÚgather_object)ÚDataset)Ú
DataLoader)
ÚBaseImageProcessorÚDataCollatorWithPaddingÚFeatureExtractionMixinÚGenerationConfigÚPreTrainedTokenizerBaseÚProcessorMixinÚTrainerÚTrainerCallbackÚTrainerControlÚis_wandb_available)Ú#get_reporting_integration_callbacks)ÚDEFAULT_CALLBACKSÚDEFAULT_PROGRESS_CALLBACK)ÚCallbackHandlerÚExportableStateÚPrinterCallback)Úis_peft_availableÚis_rich_availableé   )Úmasked_meanÚmasked_whiten)Úcreate_reference_model)Úunwrap_model_for_generationé   )Ú	PPOConfig)ÚOnlineTrainerStateÚbatch_generationÚdisable_dropout_in_modelÚempty_cacheÚ	exact_divÚfirst_true_indicesÚforwardÚgenerate_model_cardÚget_comet_experiment_urlÚ
get_rewardÚlog_table_to_comet_experimentÚpeft_module_casting_to_bf16Úprepare_deepspeedÚprint_rich_tableÚselective_log_softmaxÚtruncate_response)Ú
PeftConfigÚ	PeftModelÚget_peft_modelç      ð?c                       s&   e Zd Zd‡ fdd„Zdd„ Z‡  ZS )ÚPolicyAndValueWrapperÚreturnNc                    s(   t ƒ  ¡  || _|| _t||jƒ| _d S ©N)ÚsuperÚ__init__ÚpolicyÚvalue_modelÚgetattrÚbase_model_prefixÚcritic_backbone)Úselfr?   r@   ©Ú	__class__© úK/home/ubuntu/.local/lib/python3.10/site-packages/trl/trainer/ppo_trainer.pyr>   W   s   
zPolicyAndValueWrapper.__init__c                 K   s6   | j di |¤Ž}| j |jd ¡}| jdi |¤Ž|fS )NéÿÿÿÿrG   )rC   r@   ÚscoreÚhidden_statesr?   )rD   ÚkwargsÚoutputÚlogitsrG   rG   rH   r,   ]   s   zPolicyAndValueWrapper.forward)r;   N)Ú__name__Ú
__module__Ú__qualname__r>   r,   Ú__classcell__rG   rG   rE   rH   r:   V   s    r:   c                       sP  e Zd ZddgZ					d,dedeeeee	e
f  dejdeej d	ejd
edejdee deeeeeef f  deejjejjjf deee  ded ddfdd„Zdefdd„Zdefdd„Zedd„ ƒZd-dee def‡ fdd„Z d d!„ Z!d.d"efd#d$„Z"‡ fd%d&„Z#			d/d'ee d(ee d)eeee df fd*d+„Z$‡  Z%S )0Ú
PPOTrainerÚtrlÚppoN©NNÚargsÚprocessing_classÚmodelÚ	ref_modelÚreward_modelÚtrain_datasetr@   Údata_collatorÚeval_datasetÚ
optimizersÚ	callbacksÚpeft_configr6   r;   c                 C   s\  ||u rt dƒ‚|| _|| _|| _|d u rt| jƒ}|jr$|jr$t dƒ‚|jr?|jdkr6|j | jj_| _nt d|j› dƒ‚|j | jj_| _| jj	dvrRt dƒ‚t
ƒ s]|d ur]tdƒ‚t
ƒ r†|d ur†t| jtƒrp| j ¡ | _t| j|ƒ| _|jr†t| jd	d
ƒr†t| jƒ t
ƒ oŽt| jtƒ| _|j| _|j| _|rž|| _n| jr¥d | _nt| jƒ| _|| _|| _t|ƒ| _|| _|| _|	| _|
\| _| _ d | _!|j"d u rÖt#|j$| j ƒ|_"t%|j&d}|| _'|j(|_)|j*|j& |_+t#|j*|j) ƒ|_,t#|j+|j) ƒ|_-t.|j-|j/dƒ|_0t.|j+|j/dƒ|_1|j2r!|j1dks!J d|j1› dƒ‚t3 4|j"|j- ¡|_5t6j7t#t8 8¡ ƒ|j9d}t:|dƒ ;¡ }|j<› d|j=› d|› |_>|j=|j?d  | _@|jAdkrdtBd|j5|jA ƒ| _C|j+| _D| j| j| j| jfD ]}|d ur}tE|ƒ qrtF| j| jƒ| _G| jjH| jG_H| jI|j5d tJtK| jjLƒ }|d u r£|n|| | _MtN| jM| jG| j| j| j ƒ| _O|  P| jjQr¿tRntS¡ tTƒ | _UtV|  W¡ |  X¡ dd„ | jOjM| jUg D ƒd| _Yd| _Zd | _[t| j'jYdd ƒd u| _\t| j'jYdd ƒd u| _]d | _^| jj_r|  `¡  | jjartbjc| jjddd te| jGdƒr!| jG f| jg¡ th| j| jDd| jdd| _it6 j|j=¡ | k| jG| j| ji¡\| _G| _| _it6 j| j@¡ th| j|jl| jdd | _m| k| jm¡| _m| j\r‹tn| j|j*|jo|jƒ| _| jd u r}| js{t d!ƒ‚d S tn| j|j*|jo|jƒ| _d S | jd u rš| js™t d!ƒ‚n	| j p| j'j9¡| _| j p| j'j9¡| _d S )"Nzœ`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the same as `model`, you must make a copy of it, or `None` if you use peft.z5You cannot set both `stop_token` and `stop_token_id`.ÚeoszUnknown `stop_token` z9. Allowed values are: `'eos'` and `None` (no stop token).>   Úk1Úk3zákl_estimator must be either 'k1' (straightforward, unbiased) or 'k3' (lower variance, unbiased, appears to be a strictly better estimator). See [Approximating KL Divergence](http://joschu.net/blog/kl-approx.html) for details.zvPEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT modelsÚis_loaded_in_4bitF)Úgradient_accumulation_stepsz5`batch_size` must be a multiple of `num_mini_batches`z;`local_batch_size` must be a multiple of `num_mini_batches`é   zPer-rank minibatch size z is insufficient for whitening©Údevicer   Ú__i£† r$   )Únum_training_stepsc                 S   s   g | ]	}t |tƒr|‘qS rG   )Ú
isinstancer   )Ú.0ÚcbrG   rG   rH   Ú
<listcomp>ù   s
    
ÿÿz'PPOTrainer.__init__.<locals>.<listcomp>)Úis_local_process_zeroÚis_world_process_zeroÚstateful_callbacksÚdeepspeed_pluginÚfsdp_pluginT)Úexist_okÚadd_model_tags)Ú
batch_sizeÚshuffleÚ
collate_fnÚ	drop_last)rw   ry   rz   z1No reference model and model is not a Peft model.)qÚ
ValueErrorrW   rX   Úpolicy_modelr   Ú
stop_tokenÚstop_token_idÚeos_token_idÚgeneration_configÚkl_estimatorr   ÚImportErrorrl   r7   Úmerge_and_unloadr8   Úbf16rA   r1   Úis_peft_modelÚmodel_adapter_nameÚref_adapter_namerZ   r"   r[   r\   ÚlenÚtrain_dataset_lenr@   r]   r^   Ú	optimizerÚlr_schedulerÚoptimizer_cls_and_kwargsÚtotal_episodesÚintÚnum_train_epochsr   rf   ÚacceleratorÚnum_processesÚ
world_sizeÚper_device_train_batch_sizeÚlocal_batch_sizeÚmicro_batch_sizerw   r*   Únum_mini_batchesÚmini_batch_sizeÚlocal_mini_batch_sizeÚwhiten_rewardsÚmathÚceilÚnum_total_batchesÚtorchÚtensorÚtimeri   r	   ÚitemÚexp_nameÚseedÚrun_nameÚprocess_indexÚ
local_seedÚnum_sample_generationsÚmaxÚsample_generations_freqÚlocal_dataloader_batch_sizer(   r:   rY   ÚconfigÚcreate_optimizer_and_schedulerr   r   Ú	report_tor`   r   Úcallback_handlerÚadd_callbackÚdisable_tqdmr   r   r   Úcontrolr&   rp   rq   ÚstateÚcurrent_flosÚhp_search_backendÚis_deepspeed_enabledÚis_fsdp_enabledÚhub_model_idÚpush_to_hubÚinit_hf_repoÚshould_saveÚosÚmakedirsÚ
output_dirÚhasattrrv   Ú
_tag_namesr   Ú
dataloaderÚmanual_seedÚprepareÚper_device_eval_batch_sizeÚeval_dataloaderr2   Úfp16Úto)rD   rW   rX   rY   rZ   r[   r\   r@   r]   r^   r_   r`   ra   r   Útime_tensorÚtime_intÚmoduleÚdefault_callbacksrG   rG   rH   r>   f   s   ÿ

ÿÿÿ



ÿ
ÿÿ
ÿ
€ÿÿÿý

û	 üÿÿ
ÿÿzPPOTrainer.__init__c                 C   ó   | j S r<   ©r¿   ©rD   rG   rG   rH   Úget_train_dataloader8  ó   zPPOTrainer.get_train_dataloaderc                 C   rÊ   r<   )rÃ   rÌ   rG   rG   rH   Úget_eval_dataloader;  rÎ   zPPOTrainer.get_eval_dataloaderc                 c   s”    | j r| js| j | jj¡ ¡ ntƒ , | jr | jj | j¡ dV  | jr8| jj | j	p.d¡ W d  ƒ dS W d  ƒ dS 1 sCw   Y  dS )zWContext manager for handling null reference model (that is, peft adapter manipulation).NÚdefault)
r…   r‡   r   Úunwrap_modelrY   r?   Údisable_adapterr   Úset_adapterr†   rÌ   rG   rG   rH   Únull_ref_context>  s   €ÿÿý÷"øzPPOTrainer.null_ref_contextFr¼   Ú_internal_callc                    sL   | j }| j j| _ | jr| j}| j | _tƒ  ||¡ || _ | jr$|| _d S d S r<   )rY   r?   r´   Ú	deepspeedr=   Ú
save_model)rD   r¼   rÕ   Úbackup_modelÚbackup_deepspeedrE   rG   rH   r×   L  s   

ÿzPPOTrainer.save_modelc           r         s$  | j }| j}| j}| j}| j}| j}| j}| j‰ |j}‡ fdd„}	t	|	ƒ ƒ}
t
|j|jd dddd}| d¡ t ¡ }|j|j|jf}tj||d	}tj||d	}tj||d	}tj||d	}tj||d	}tj||d	}tj||d	}| ¡  d
| j_d
| j_|j| j_|j| j | j_|jd urª|jdk r¥t | jj|j ¡| j_n|j| j_|j d urÆ|j dk rÁt | jj|j  ¡| j_ n|j | j_ |j!d urâ|j!dk rÝt | jj|j! ¡| j_!n|j!| j_!| j" #|| j| j$¡| _$| j%rø| j| _&| j| _'t(d|jd ƒD ]è}| j jd|j) 7  _t*|
ƒ}t +¡ ˆ |d  ,|¡}|j-d }g }g }g }g }g }g }g }t.| j| j| j j/d} t0| j1||j2|j3|ƒ\}!}"W d   ƒ n	1 sVw   Y  t(d
|j-d
 |j2ƒD ]è}#||#|#|j2 … }$|!|#|#|j2 … }%|%d d …|d …f }&|"|#|#|j2 … }'t4|'|&ƒ}(~'t5ƒ  |d u r¸|  6¡  t7|j1|%|j3ƒ})W d   ƒ n	1 s²w   Y  nt7||%|j3ƒ})|)j8d d …|d d…f }*|*|jd  }*t4|*|&ƒ}+~)~*t5ƒ  |&},| j9d urít:| j9|j3|&ƒ},t ;|$|,fd¡}-t<|,|j3kƒd }.| =|¡j>}/t?|/|%|j3|ƒ\}0}1}1|0d d …|d d…f  @d¡}2t?||-|j3|ƒ\}1}3}1| A|&¡ | A|,¡ | A|(¡ | A|+¡ | A|.¡ | A|3¡ | A|2¡ qet ;|d
¡}t ;|d
¡}t ;|d
¡}t ;|d
¡}t ;|d
¡}t ;|d
¡}t ;|d
¡}~(~+~0~2~3~ t5ƒ  tB C¡  tjD|| jjEkdd}4| j jFd ur¢||4   | j jF8  < tjG|j-d |jd	 H|j-d
 d¡}5|5| Id¡k}6t J||6tK¡}t J||6tK¡}|d }7|5|7 Id¡k}8t J||8d
¡}|| }9|jLdkrè|9 n|9 M¡ d |9 }:|jN |: };|; O¡ }<tjG|< Pd
¡|<jd	}=t Q|7|< Pd¡k |7|¡}>|<|=|>g  |7  < |jRr.tS|<|8 dd}<t J|<|8d
¡}<d
}?g }@|j-d }AtTt(|AƒƒD ]:}B|B|Ad k rP|d d …|Bd f nd}C|<d d …|Bf |jU|C  |d d …|Bf  }D|D|jU|jV |?  }?|@ A|?¡ q=tjW|@d d d… dd}E|E| }FtS|E|6 ƒ}Et J|E|6d
¡}Et5ƒ  W d   ƒ n	1 s£w   Y  t(|jƒD ]Ä}GtXjY Z|j[¡}Hd
}It(d
|j[|j\ƒD ]®}J|J|j\ }K|H|J|K… }Ld
}Mt(d
|j\|j]ƒD ]r}N| ^|¡\ |N|j] }O|L|N|O… }P|E|P }Q||P }R|!|P }S||P }T|F|P }U||P }Vt7||S|j3ƒ\}W}X|Wj8d d …|d d…f }'|'|jd  }'t4|'|Rƒ}Yt J|Y|6|P tK¡}Y|Xd d …|d d…f  @d¡}Zt J|Z|8|P d
¡}Zt _|Z|V|j` |V|j` ¡}[t a|Z|U ¡}\t a|[|U ¡}]t b|\|]¡}^dtc|^|8|P  ƒ }_tc|]|\k d¡ |8|P  ƒ}`|Y|T }at M|a¡}b|Q |b }c|Q t _|bd|je d|je ¡ }dt b|c|d¡}etc|e|6|P  ƒ}f|f|jf|_  }g| g|g¡ | h¡  | i¡  t +¡ j tc|d|ck d¡ |6|P  ƒ}htjjjkjl|'dd}itjm|'ddtjn|i|' dd }jd|ad  o¡  }k|k||G|I|Mf< |h||G|I|Mf< |f||G|I|Mf< |_||G|I|Mf< |`||G|I|Mf< |j o¡ ||G|I|Mf< |b o¡ ||G|I|Mf< W d   ƒ n	1 s2w   Y  W d   ƒ n	1 sBw   Y  |Md7 }MqÙ|Id7 }I~W~X~'~Y~Z~[~\~]~_~`~a~b~c~d~e~f~g~h~i~j~k~U~Q~V~R~S~Tt5ƒ  qÁq­t +¡  |: nd¡ o¡ }l|  nd¡ o¡ }m|; nd¡ o¡ }n|n| o¡  }otp| jjt ¡ |  ƒ}pi }q|p|qd< | j q|l¡ o¡  r¡ |qd< | j q|m¡ o¡  r¡ |qd< | j q|n¡ o¡  r¡ |qd< | j q|o¡ o¡  r¡ |qd< | j q| o¡ ¡ o¡  r¡ |qd< | j q|¡ o¡  r¡ |qd< | j q|¡ o¡  r¡ |qd< | j q|¡ o¡  r¡ |qd< | j q|¡ o¡  r¡ |qd< | j q|¡ o¡  r¡ |qd < | j q|¡ o¡  r¡ |qd!< | j q|¡ o¡  r¡ |qd"< | j q|¡ s¡  r¡ |qd#< ||jEk n¡  r¡ |qd$< | jt u¡ d
 |qd%< | jj|qd&< | jj| j | j_v| j jd7  _|  w|q¡ W d   ƒ n	1 s€w   Y  | jt h¡  | j" x|| j| j$¡| _$| j$jyr­| jz|d d' | j" {| j | j| j$¡| _$~:~l~m~n~~q~;t5ƒ  tB C¡  |j|d
krÔ|d | j} d
krÔ| j~dd( t5ƒ  ~!~~~~~~~4~7~5~6~8~<~=~>~E~Ft5ƒ  q | j" || j| j$¡| _$| j$jyr| jz|d d d) | j" {| j | j| j$¡| _$d S d S )*Nc                   3   s    	 ˆ E d H  qr<   rG   rG   rË   rG   rH   Úrepeat_generatorf  s   €
ÿz*PPOTrainer.train.<locals>.repeat_generatorgH¯¼šò×z>ç        r9   T©Úmax_new_tokensÚtemperatureÚtop_kÚtop_pÚ	do_samplez===training policy===rh   r   r$   Ú	input_ids©Úgather_deepspeed3_paramsrI   )Údimrc   F)ÚmaskÚ
shift_mean)Úaxisg      à?r   Úepszobjective/klzobjective/entropyzobjective/non_score_rewardzobjective/rlhf_rewardzobjective/scoreszpolicy/approxkl_avgzpolicy/clipfrac_avgzloss/policy_avgzloss/value_avgzval/clipfrac_avgzpolicy/entropy_avgz	val/ratiozval/ratio_varzval/num_eos_tokensÚlrÚepisode)Útrial)Úsampling)rì   Úmetrics)€rW   r   rŠ   rY   rZ   r[   rX   r¿   ri   Úiterr   Úresponse_lengthrÞ   ÚprintrŸ   Únum_ppo_epochsr–   rf   r   ÚzerosÚtrainr±   Úglobal_steprë   rœ   Ú	max_stepsr   r‰   r   Úlogging_stepsrš   r›   Ú
eval_stepsÚ
save_stepsr­   Úon_train_beginr°   r´   rÖ   Úmodel_wrappedÚrangerw   ÚnextÚno_gradrÅ   Úshaper#   Úds3_gather_for_generationr'   r?   Ú local_rollout_forward_batch_sizeÚpad_token_idr4   r)   rÔ   r,   rN   r~   r5   Úcatr+   rÑ   r@   r/   ÚsqueezeÚappendÚgcÚcollectÚanyr   Úmissing_eos_penaltyÚarangeÚrepeatÚ	unsqueezeÚmasked_fillÚINVALID_LOGPROBr   ÚexpÚkl_coefÚcloneÚsizeÚwherer™   r!   ÚreversedÚgammaÚlamÚstackÚnpÚrandomÚpermutationr”   r˜   r“   Ú
accumulateÚclampÚcliprange_valueÚsquarer§   r    ÚfloatÚ	cliprangeÚvf_coefÚbackwardÚstepÚ	zero_gradÚnnÚ
functionalÚsoftmaxÚ	logsumexpÚsumÚmeanrŽ   Úgather_for_metricsr    Úvarr‹   Úget_last_lrÚepochÚlogÚon_step_endr¹   Ú_save_checkpointÚon_saver¦   r¨   Úgenerate_completionsÚon_train_end)rrD   rW   r   rŠ   rY   Ú
ref_policyr[   rX   ri   rÚ   Úiter_dataloaderr€   Ú
start_timeÚstats_shapeÚapproxkl_statsÚpg_clipfrac_statsÚpg_loss_statsÚvf_loss_statsÚvf_clipfrac_statsÚentropy_statsÚratio_statsÚupdateÚdataÚqueriesÚcontext_lengthÚ	responsesÚpostprocessed_responsesÚlogprobsÚref_logprobsÚscoresÚsequence_lengthsÚvaluesÚunwrapped_modelÚquery_responsesÚlogitssÚiÚqueryÚquery_responseÚresponserN   ÚlogprobÚ
ref_outputÚ
ref_logitsÚref_logprobÚpostprocessed_responseÚpostprocessed_query_responseÚsequence_lengthÚunwrapped_value_modelÚ
full_valueÚ_ÚvaluerJ   Úcontain_eos_tokenÚresponse_idxsÚpadding_maskÚsequence_lengths_p1Úpadding_mask_p1ÚlogrÚklÚnon_score_rewardÚrewardsÚactual_startÚ
actual_endÚ
lastgaelamÚadvantages_reversedÚ
gen_lengthÚtÚ
nextvaluesÚdeltaÚ
advantagesÚreturnsÚppo_epoch_idxÚb_indsÚminibatch_idxÚmini_batch_startÚmini_batch_endÚmini_batch_indsÚgradient_accumulation_idxÚmicro_batch_startÚmicro_batch_endÚmicro_batch_indsÚmb_advantageÚmb_responsesÚmb_query_responsesÚmb_logprobsÚ	mb_returnÚ	mb_valuesrM   Ú
vpred_tempÚnew_logprobsÚvpredÚvpredclippedÚ
vf_losses1Ú
vf_losses2Úvf_loss_maxÚvf_lossÚvf_clipfracÚlogprobs_diffÚratioÚ	pg_lossesÚ
pg_losses2Úpg_loss_maxÚpg_lossÚlossÚpg_clipfracÚ	prob_distÚentropyÚapproxklÚmean_klÚmean_entropyÚmean_non_score_rewardÚrlhf_rewardré   rî   rG   rË   rH   rô   [  sR  
û











ÿ
ûý


ÿ€

ÿ

ÿ

ÿ





$"
&*†}



ÿýÿ

 

ÿ ÿÿï€×;

µLÿä

 

þzPPOTrainer.trainrí   c              
   C   sú  | j }| j}t| j jddddd}ttƒ}t| j| j| j j	d–}| j
D ]Š}|d }t ¡ t |jd }	t|j||jd	 |j|ƒ\}
}|
d d …|	d …f }|}| jd ur\t| j|j|ƒ}|d
  t|j|ddƒ¡ |d  t| |¡ƒ¡ t ||fd¡}t| j||j|	ƒ\}}}|d  | j |¡ ¡  ¡  ¡ ¡ W d   ƒ n1 s¥w   Y  |r® nq$W d   ƒ n1 s¹w   Y  t |¡}| jjrùt ƒ rÓt!|j"d	d… ƒ d|j#v rìd	d l$}|j%d urì| &d|j'|di¡ d|j#v rût(d|d d S d S d S )NgÙa§³îz„?rÛ   r9   TrÜ   rã   râ   r$   r   rO  )Úskip_special_tokenszmodel responserJ   é   ÚwandbÚcompletions)Ú	dataframeÚcomet_mlzcompletions.csv)ÚnameÚtable))rW   rX   r   rð   r   Úlistr#   rY   r   r   rÃ   r   rþ   rÿ   r'   r?   r  r~   r5   Úextendr
   Úbatch_decoder  r/   r[   r+  r  ÚcpuÚnumpyÚpdÚ	DataFrameÚis_main_processr   r3   Úilocr¬   rš  Úrunr/  ÚTabler0   )rD   rí   rW   rX   r€   rŸ  rK  ÚbatchrO  rC  rP  r[  rQ  rV  rW  rJ   Údfrš  rG   rG   rH   r3  ¯  s~   ûÿ


û

ÿÿÿ
ÿ$æÿ€ß
#



þö	zPPOTrainer.generate_completionsc                    sL   | j jd u rt| j jƒj}n	| j j d¡d }| j|d tƒ  ||¡ d S )Nú/rI   )Ú
model_name)	rW   r¶   r   r¼   rž  ÚsplitÚcreate_model_cardr=   r1  )rD   rY   rì   r®  rE   rG   rH   r1  ð  s
   zPPOTrainer._save_checkpointr®  Údataset_nameÚtagsc                 C   sê   |   ¡ sdS t| jjdƒrtj | jjj¡s| jjj}nd}|du r&tƒ }nt	|t
ƒr/|h}nt|ƒ}t| jjdƒr?| d¡ | | j¡ t d¡}t||| j||tƒ r]tjdur]tjjndtƒ d|ddd	}| tj | jjd
¡¡ dS )aî  
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            model_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the model.
            dataset_name (`str` or `None`, *optional*, defaults to `None`):
                Name of the dataset used for training.
            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
                Tags to be associated with the model card.
        NÚ_name_or_pathÚunsloth_versionÚunslotha…          @article{mziegler2019fine-tuning,
            title        = {{Fine-Tuning Language Models from Human Preferences}},
            author       = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
            year         = 2019,
            eprint       = {arXiv:1909.08593}
        }ÚPPOz2Fine-Tuning Language Models from Human Preferencesz
1909.08593)Ú
base_modelr®  r¶   r±  r²  Ú	wandb_urlÚ	comet_urlÚtrainer_nameÚtrainer_citationÚpaper_titleÚpaper_idz	README.md)rq   r½   rY   rª   rº   ÚpathÚisdirr³  Úsetrl   ÚstrÚaddr@  r¾   ÚtextwrapÚdedentr-   r¶   r   rš  r©  Úurlr.   ÚsaveÚjoinrW   r¼   )rD   r®  r±  r²  r·  ÚcitationÚ
model_cardrG   rG   rH   r°  ø  s8    


õzPPOTrainer.create_model_card)NNrV   NN)NF)F)NNN)&rO   rP   rQ   r¾   r%   r   r   r   r   r   r   r%  ÚModuler   r   ÚdictrÁ  Útupler   ÚoptimÚ	Optimizerr‹   ÚLambdaLRr   r   r>   r   rÍ   rÏ   r   rÔ   Úboolr×   rô   r3  r1  r°  rR   rG   rG   rE   rH   rS   c   sr    ðþÿýúùø	÷
öõôò
ñð
ï S
  VA
üþýürS   )Yr  rš   rº   rÃ  rŸ   Úcollectionsr   Ú
contextlibr   r   Úpathlibr   Útypingr   r   r¤  r  Úpandasr¥  r   Útorch.nnr%  Ú
accelerater   Úaccelerate.utilsr	   r
   Údatasetsr   Útorch.utils.datar   Útransformersr   r   r   r   r   r   r   r   r   r   Útransformers.integrationsr   Útransformers.trainerr   r   Útransformers.trainer_callbackr   r   r   Útransformers.utilsr   r   Úcorer    r!   Úmodelsr"   Úmodels.utilsr#   Ú
ppo_configr%   Úutilsr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   Úpeftr6   r7   r8   rš  r  rÊ  r:   rS   rG   rG   rG   rH   Ú<module>   sD   0H