o
    	Tiw                  	   @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	m
Z
mZmZmZmZmZmZmZ edkreeee
fZe \ZZZedde_de_d	d
ie_ejdv rTejneeejZeeZeejej eedurke ndedZ!ej"ej#ej$dZ%ej"ej#fdej$ie!Z&dd Z'eej(ej)dZ*ee&ee'e*ej+ ej,dkre*ej- nde%eedZ.e./  e.0ej1 ej2re.j2ej(d e.j3j4re%2ej5 dS dS dS dS )a$  
pip install pillow

# Tested on 8x H100 GPUs
accelerate launch
    --config_file=examples/accelerate_configs/deepspeed_zero3.yaml     sft_vlm_smol_vlm.py     --dataset_name HuggingFaceH4/llava-instruct-mix-vsft     --model_name_or_path HuggingFaceTB/SmolVLM-Instruct     --per_device_train_batch_size 1     --gradient_accumulation_steps 1     --output_dir sft-smol-vlm-hf     --bf16     --torch_dtype bfloat16     --gradient_checkpointing     --use_peft     --lora_target_modules down_proj, o_proj, k_proj, q_proj, gate_proj, up_proj, v_proj

For LLaVA-NeXT, use: (requires transformers>=4.45)
    --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf

For meta-llama/Llama-3.2-11B-Vision-Instruct, use: (requires transformers>=4.45.1)
    --model_name_or_path meta-llama/Llama-3.2-11B-Vision-Instruct
    N)load_dataset)AutoModelForVision2SeqAutoProcessor Idefics3ForConditionalGenerationLlavaForConditionalGeneration)ModelConfigScriptArguments	SFTConfig
SFTTrainer	TrlParserget_kbit_device_mapget_peft_configget_quantization_config__main__F)use_reentrantskip_prepare_datasetT)autoN)revisionattn_implementationtorch_dtype
device_mapquantization_config)trust_remote_coder   c                 C   s   dd | D }dd | D }t ttrdd |D }t||ddd}|d  }d	||tjjk< t ttrAtjjtjj	
d
 }ntjtj}d	|||k< ||d< |S )Nc                 S   s   g | ]}t j|d  ddqS )messagesF)tokenize)	processorapply_chat_template.0example r    U/home/ubuntu/.local/lib/python3.10/site-packages/examples/scripts/sft_vlm_smol_vlm.py
<listcomp>f   s    zcollate_fn.<locals>.<listcomp>c                 S      g | ]}|d  qS )imagesr    r   r    r    r!   r"   g       c                 S   r#   )r   r    )r   imager    r    r!   r"   j   r%   ptT)r$   textreturn_tensorspadding	input_idsiz<image>labels)
isinstancemodelr   r   clone	tokenizerpad_token_idr   additional_special_tokens_idsadditional_special_tokensindexconvert_tokens_to_idsimage_token)examplestextsr$   batchr,   image_token_idr    r    r!   
collate_fnd   s   

r;   )nameno)r.   argsdata_collatortrain_dataseteval_datasetprocessing_classpeft_config)dataset_name)6__doc__torchdatasetsr   transformersr   r   r   r   trlr   r   r	   r
   r   r   r   r   __name__parserparse_args_and_configscript_argstraining_args
model_argsdictgradient_checkpointing_kwargsremove_unused_columnsdataset_kwargsr   getattrr   model_revisionr   model_kwargsfrom_pretrainedmodel_name_or_pathr   r   r.   r;   rD   dataset_configdatasetdataset_train_spliteval_strategydataset_test_splittrainertrain
save_model
output_dirpush_to_hubacceleratoris_main_processhub_model_idr    r    r    r!   <module>   sf   (

P