o
    	Ti^                     @   sv  d Z ddlZddlmZ ddlmZmZmZmZ ddl	m
Z
mZmZmZmZmZmZmZmZmZmZmZ ddlmZ eee
dZedkr7eeeefZe \ZZZd	d
ie_ej dv r`ej ne!eej Z eeZ"e#ej$ej%e ej&rudnd
e"dur}e nde"dZ'ej(ej)fdej*ie'Z+ej,durej(ej,fdej*de'Z-ej(ej,ej*d
ddZ.ndZ-dZ.ej/dureej/ Z0e0 Z/ndZ/ej(ej)fdej*de'Z1e1j2du ree1_2e1j3du re1j4e1_5eej6ej7dZ8ee+e-e/ee8ej9 ej:dkre8ej; nde1e.eed	Z<ej:dkr eej=d
ej>dZ?ee<e?ddZ@e<Ae@ e<B  e<CejD ejEr9e<jEej6d dS dS dS )a&  
Usage:

python examples/scripts/dpo_online.py     --model_name_or_path trl-lib/pythia-1b-deduped-tldr-sft      --reward_model_path trl-lib/pythia-1b-deduped-tldr-rm     --dataset_name trl-lib/tldr     --learning_rate 5.0e-7     --output_dir pythia-1b-tldr-online-dpo     --per_device_train_batch_size 8     --gradient_accumulation_steps 16     --warmup_ratio 0.1     --missing_eos_penalty 1.0

With LoRA:
python examples/scripts/dpo_online.py     --model_name_or_path trl-lib/pythia-1b-deduped-tldr-sft      --reward_model_path trl-lib/pythia-1b-deduped-tldr-rm     --dataset_name trl-lib/tldr     --learning_rate 5.0e-6     --output_dir pythia-1b-tldr-online-dpo     --per_device_train_batch_size 16     --gradient_accumulation_steps 8     --warmup_ratio 0.1     --missing_eos_penalty 1.0     --use_peft
    N)load_dataset)AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizerGenerationConfig)HfPairwiseJudgeLogCompletionsCallbackModelConfigOnlineDPOConfigOnlineDPOTrainerOpenAIPairwiseJudgePairRMJudgeScriptArguments	TrlParserget_kbit_device_mapget_peft_configget_quantization_config)SIMPLE_CHAT_TEMPLATE)pair_rmopenaihf__main__use_reentrantT)autoNF)revisionattn_implementationtorch_dtype	use_cache
device_mapquantization_configtrust_remote_code   )
num_labelsr    left)r    
truncationtruncation_side)padding_sider    )nameno)	modelreward_modeljudgeargstrain_dataseteval_datasetprocessing_classreward_processing_classpeft_config)max_new_tokens	do_sampletemperature   )num_prompts)dataset_name)F__doc__torchdatasetsr   transformersr   r   r   r   trlr   r   r	   r
   r   r   r   r   r   r   r   r   trl.trainer.utilsr   JUDGES__name__parserparse_args_and_configscript_argstraining_args
model_argsgradient_checkpointing_kwargsr   getattrr   dictmodel_revisionr   gradient_checkpointingmodel_kwargsfrom_pretrainedmodel_name_or_pathr    r)   reward_model_pathr*   reward_tokenizerr+   	judge_cls	tokenizerchat_templatepad_token_id	eos_token	pad_tokenr7   dataset_configdatasetdataset_train_spliteval_strategydataset_test_splittrainerr2   r4   generation_configcompletions_callbackadd_callbacktrain
save_model
output_dirpush_to_hub rb   rb   O/home/ubuntu/.local/lib/python3.10/site-packages/examples/scripts/dpo_online.py<module>   s   8

	






Q