o
    	Ti                      @   s  d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	Z	ddl
Z
ddlZddlmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZmZmZmZmZmZ d
ededefddZde eef dede ee!e eef  f fddZ"de!e eef  de ee
j#f fddZ$eG dd deZ%e&dkr^ee%eefZ'e'( \Z)Z*Z+e dde*_,de*_-ddie*_.ee)j/e)j0ddZ1e+j2dv re+j2ne3e
e+j2Z2eddde
j4d Z5e e+j6e+j7e2e e5d!Z8ej9e+j:fi e8Z;ed"d#d#d$d%g d&d'Z<e*j=re;>  de;j?_@e;A  ej9e+j:e+j7d(ZBd)d* e1D ZCe*jDd+kr'ejEd,d- ee;e*eCe$e<eBd.ZFeFG  eFHe*jI e*jJrQeFjJe)j/d/ eFjKjLrQeBJe*jM [;[Fe
jNO  eP  dS dS )0a  
Example usage:
accelerate launch     --config_file=deepspeed_zero2.yaml     sft_video_llm.py     --dataset_name=mfarre/simplevideoshorts     --video_cache_dir="/optional/path/to/cache/"     --model_name_or_path=Qwen/Qwen2-VL-7B-Instruct     --per_device_train_batch_size=1     --output_dir=video-llm-output     --bf16=True     --tf32=True     --gradient_accumulation_steps=4     --num_train_epochs=4     --optim="adamw_torch_fused"     --log_level="debug"     --log_level_replica="debug"     --save_strategy="steps"     --save_steps=300     --learning_rate=8e-5     --max_grad_norm=0.3     --warmup_ratio=0.1     --lr_scheduler_type="cosine"     --report_to="wandb"     --push_to_hub=False     --torch_dtype=bfloat16     --gradient_checkpointing=True
    N)	dataclassfield)Any)load_dataset)
LoraConfig)process_vision_info)AutoModelForVision2SeqAutoProcessorBitsAndBytesConfigQwen2VLProcessor)ModelConfigScriptArguments	SFTConfig
SFTTrainer	TrlParserget_kbit_device_mapurl	cache_dirreturnc              
   C   s  t j|dd | dd }t j||}t j|r|S zNtj| dd<}|  t	|d}|j
ddD ]	}|r?|| q6W d	   n1 sJw   Y  W d	   |W S W d	   |W S 1 sdw   Y  |W S  tjy } ztd
| |d	}~ww )z.Download video if not already present locally.T)exist_ok/)streamwbi    )
chunk_sizeNzFailed to download video: )osmakedirssplitpathjoinexistsrequestsgetraise_for_statusopeniter_contentwriteRequestException	Exception)r   r   filename
local_pathrfchunke r/   R/home/ubuntu/.local/lib/python3.10/site-packages/examples/scripts/sft_video_llm.pydownload_videoF   s4   


r1   examplec           	      C   s   | d }| d }t | d }d}d| d}t|dd }d	d
|dgdddt||dddd
| d|d  dgddd
|d dgdg}d|iS )z%Prepare dataset example for training.	video_urltimecoded_ccqaz.You are an expert in movie narrative analysis.zCAnalyze the video and consider the following timecoded subtitles:

zC

Based on this information, please answer the following questions:   r   systemtext)typer8   )rolecontentuservideoiN g      ?)r9   r=   
max_pixelsfpsz

Question: question	assistantanswermessages)jsonloadsrandomsampler1   )	r2   r   r3   r4   qa_pairssystem_messagebase_promptselected_qarC   r/   r/   r0   prepare_dataset[   s"   rL   examplesc                 C   s  g }g }t | D ]N\}}z3tdd |d D }tdtj|  |tj|d dd t	|d d d }|| W q t
yV } ztd	| d
| |d}~ww t||ddd}|d  }	d|	|	tjjk< tttrvg dntjtjg}
|
D ]}d|	|	|k< q|	|d< |S )z'Collate batch of examples for training.c                 s   s4    | ]}|d  D ]}| ddkr|d V  qqdS )r;   r9   r=   N)r"   ).0messager;   r/   r/   r0   	<genexpr>   s    zcollate_fn.<locals>.<genexpr>rC   zProcessing video: F)tokenizer6   r   zFailed to process example z: NptT)r8   videosreturn_tensorspadding	input_idsi)idP ieP ihP labels)	enumeratenextprintr   r   basenameappend	processorapply_chat_templater   r(   
ValueErrorclone	tokenizerpad_token_id
isinstancer   convert_tokens_to_idsimage_token)rM   textsvideo_inputsir2   
video_pathvideo_inputr.   inputsrW   visual_tokensvisual_token_idr/   r/   r0   
collate_fny   s4   
rn   c                   @   s*   e Zd ZU dZedddidZeed< dS )CustomScriptArgumentsz
    Arguments for the script.

    Args:
        video_cache_dir (`str`, *optional*, defaults to `"/tmp/videos/"`):
            Video cache directory.
    z/tmp/videos/helpzVideo cache directory.)defaultmetadatavideo_cache_dirN)__name__
__module____qualname____doc__r   rs   str__annotations__r/   r/   r/   r0   ro      s   
 ro   __main__F)use_reentrantskip_prepare_datasetTtrain)namer   )autoNnf4)load_in_4bitbnb_4bit_use_double_quantbnb_4bit_quant_typebnb_4bit_compute_dtype)revisiontrust_remote_codetorch_dtype
device_mapquantization_config	CAUSAL_LM   g?none)q_projk_projv_projo_proj)	task_typer+   
lora_alphalora_dropoutbiastarget_modules)r   c                 C   s   g | ]}t |tjqS r/   )rL   script_argsrs   )rN   r2   r/   r/   r0   
<listcomp>   s    r   wandbzvideo-llm-training)project)modelargstrain_datasetdata_collatorpeft_configprocessing_class)dataset_name)Qrw   rD   r   rF   dataclassesr   r   typingr   r!   torchr   datasetsr   peftr   qwen_vl_utilsr   transformersr   r	   r
   r   trlr   r   r   r   r   r   rx   r1   dictlistrL   Tensorrn   ro   rt   parserparse_args_and_configr   training_args
model_argsgradient_checkpointing_kwargsremove_unused_columnsdataset_kwargsr   dataset_configdatasetr   getattrbfloat16
bnb_configmodel_revisionr   model_kwargsfrom_pretrainedmodel_name_or_pathr   r   gradient_checkpointinggradient_checkpointing_enableconfigr{   enable_input_require_gradsr]   prepared_dataset	report_toinittrainerr}   
save_model
output_dirpush_to_hubacceleratoris_main_processhub_model_idcudaempty_cachefinishr/   r/   r/   r0   <module>   s    2((





