o
    }oi$>                     @   s   d dl Z de jd< d dlZd dlZd dlmZ d dlZd dlm	Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d
ZefddZdddZdejfddZdd ZdddZG dd de
jZ dd Z!e"dkrye!  dS dS )    Nz/tmp/hf_homeHF_HOME)Path)AutoModelForCausalLM)	lightning)llm)	JitConfigJitTransform)to_cpuz#/home/TestData/lite/hf_cache/squad/c                    s8    fdd}t j|d jd}|j|ddg dd |S )	Nc                    s   d| d  d| d  dd| d d d	    g}tt j|\}}t|d	kr8|d	  jkr8|d	 j t|d	krK|d
  jkrK| j t	|| dd  || d d
 d	gt|d  dgt|  dS )Nz	Context: contextz Question: questionz Answer: answerstextr      )labels	input_ids	loss_mask)
striplistmaptext_to_idslenbos_idinserteos_idappenddict)exampleformatted_textcontext_ids
answer_ids	tokenizer P/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/hf/sft.pyformatting_prompts_func$   s   z6make_squad_hf_dataset.<locals>.formatting_prompts_funcz
train[:10])splitpad_token_idF   )idtitler
   r   r   )batched
batch_sizeremove_columns)r   HFDatasetDataModuler   r   )r#   	data_pathr&   
datamoduler$   r"   r%   make_squad_hf_dataset#   s   r2   Fc                 C   sj   | dkrt jjd|j|ddS | dkrt jj|j|ddS | dkr1tj|| d|j|dd	S td
)Nautozcuda:0)adapter_only)devicecheckpoint_ioddp)r6   fsdp2r   )data_parallel_sizetensor_parallel_sizer6   zEncountered unknown strategy)pl
strategiesSingleDeviceStrategymake_checkpoint_ioDDPStrategynlFSDP2StrategyNotImplementedError)strategymodeldevices	num_nodesr4   r$   r$   r%   make_strategy?   s    


rG   returnc                 C   s*   t jddddddd}t jd| d|d dS )NTr   reduced_train_loss)	save_lastevery_n_train_stepsmonitor
save_top_ksave_on_train_epoch_endsave_optim_on_train_end	nemo2_sftF)namelog_diruse_datetime_versionckptwandb)r@   ModelCheckpoint
NeMoLogger)ckpt_folderrT   r$   r$   r%   loggerS   s   	rY   c                 C   s   d }d}d}t d}t| D ];\}}}|D ]3}||}	|	rJtt|	 \}
}}t|t|}}||ks?||krJ||krJ|}|}tj	
||}qqt|S )Nr   zD[a-z0-9_]+--reduced_train_loss=([\d\.]+)-epoch=(\d+)-step=(\d+)-last)recompileoswalkmatchr   floatgroupsintpathjoinr   )base_dirlatest_checkpoint	max_epochmax_steppatternrootdirs_dir_namer^   lossepochstepr$   r$   r%   get_latest_checkpointf   s"   

rp   c                    s  h d}d}d}d}t | }|d }| sJ d| t }g }	|dD ]}
|
j|r<|
jdr<|	|
j q'||
j q'||v }t	|	d	k}|rU|rUJ d
|r||v sbJ d| t
dd |	D   fddtd d d D }t|	t|ksJ d| d|	 n|sJ d||sJ d||  |d  sJ dddg}|D ]}
|d |
  sJ d|
 qtd d S )N>   config.jsontokenizer.modeltokenizer_config.jsongeneration_config.jsonspecial_tokens_map.jsonzmodel.safetensorsmodel-zmodel.safetensors.index.json
hf_weightszMissing hf_weights directory: *.safetensorsr   zGBoth model.safetensors and sharded model files exist, which is invalid.zMissing index file: c                 S   s   g | ]}t |d d qS )-r   )ra   r'   ).0fr$   r$   r%   
<listcomp>   s    z3verify_sft_checkpoint_structure.<locals>.<listcomp>c                    s&   g | ]}d |dd d ddqS )rv   05dz-of-r   ry   r$   )r{   ishard_numbersr$   r%   r}      s    r   r   z(Missing or extra shard files. Expected: z	, Found: zMissing model file(s)zMissing files: z
trainer.ptzMissing trainer.pt filez
model.yamlzio.jsonr
   zMissing context file: z)Checkpoint structure verification passed.)r   existssetglobrQ   
startswithendswithr   addr   sortedrangeissubsetprint)rb   has_io_bytesexpected_files
model_filemodel_shard_prefix
index_fileckpt_dirrw   found_filesfound_shardsfilehas_model_filehas_sharded_filesexpected_shardscontext_filesr$   r   r%   verify_sft_checkpoint_structure~   sN   
 r   c                       s`   e Zd ZdZddededef fddZd	d
dddeddfddZe	dd Z
dddZ  ZS )!ValidateCheckpointRestoreCallbackzThis callback checks that the model weights and optimizer states are exactly restored
    from the checkpoint on the first training batch.
    T-C6?check_weightscheck_optimizer	tolerancec                    s,   t    || _|| _d | _d | _|| _d S N)super__init__r   r   loaded_model_stateloaded_optimizer_statesr   )selfr   r   r   	__class__r$   r%   r      s   

z*ValidateCheckpointRestoreCallback.__init__trainer
pl.Trainer	pl_modulezpl.LightningModule
checkpointrH   Nc                 C   s   |d | _ |d | _dS )z
        Save the loaded model and optimizer states so we can compare them
        to the actual states after resuming.
        
state_dictoptimizer_statesNr   r   )r   r   r   r   r$   r$   r%   on_load_checkpoint   s   
z4ValidateCheckpointRestoreCallback.on_load_checkpointc                 C   s   | j d uo	| jd uS r   r   )r   r$   r$   r%   has_loaded_checkpoint   s   z7ValidateCheckpointRestoreCallback.has_loaded_checkpointc              	   C   s0  | j r^| jdur^|j D ]K\}}|| jvrtd| d| j| }t|t|}}tj||| jdsXt	j
dd}|| d    }	t| d| d	|	 d
qtd ntd | jr| jdur|j}
t|
t| jkrtdt|
 dt| j dt|
| jD ]k\}}| }| | krtd|d  D ]P\}}|d | }| D ]A\}}||vrtd| d|| }t|tjrtj| | ddstd| d| dq||krtd| d| dqqqtd ntd | js|j|  dS td t d dS )z`
        Verify that the loaded model weights and optimizer state matches checkpoints'.
        NzParameter 'z%' not found in checkpoint state dict.)atol
LOCAL_RANK0r   z: Model parameter 'z(' does not match the checkpointed value .zmodel weights matchzDid not test model weightszNumber of optimizers (z!) does not match the checkpoint (z).zFMismatch in optimizer state keys between current state and checkpoint.statezKey 'z(' missing in the loaded optimizer state.h㈵>zOptimizer state for param_id=z, key='z ' does not match the checkpoint.z' differs from checkpoint.zoptim weights matchzdid not test optim weightszAll weights matchr   )!r   r   rD   named_parameters
ValueErrorr	   torchallcloser   r\   environgetviewr_   abscpumaxr   r   r   
optimizersr   zipr   keysitems
isinstanceTensorr   	callbacksremovesysexit)r   r   lightning_modulebatch	batch_idxrQ   current_paramcheckpoint_paramrankdiffr   	optimizerloaded_opt_statecurrent_opt_stateparam_idparam_stateloaded_param_state	state_keycurrent_tensor_or_valloaded_tensor_or_valr$   r$   r%   on_train_batch_start   sl   



z6ValidateCheckpointRestoreCallback.on_train_batch_start)TTr   )r   r   rH   N)__name__
__module____qualname____doc__boolr_   r   r   r   propertyr   r   __classcell__r$   r$   r   r%   r      s    
r   c                  C   s  ddl } |  }|jdtdd |jdtdg dd	 |jd
tdd |jdtdd |jdtddgd	 |jdtdd |jdtddgd	 |jdtdd |jddd |jdtdd |jdtdd |jddd |jddd | }d}|jdurddlm	} d
|jdd d }||j| d!|j d"|j d#}d}|jdkrdd$lm} ||jd%}g }|jrtd&d'd(id(d)}	t|	g}|jr|t  G d*d+ d+tj}
|jr|
ntj}||j|d,}t|j||j|jd(}|jrtjd&d(d-nd}| jt|du7  _tj |j|j|j|j!|dd.dd|j"d(||d/d0}tj#j$|t%tj&|j|t'(tj)j*d1d2t+|j,|d3 ~~t-|j,}t.||du t/j0|d4 d&d5}t1|d d6 dksjJ d7|ft1|d d8 dks{J d9|fdS ):zMExample script to run SFT with a HF transformers-instantiated model on squad.r   Nz--modelzmeta-llama/Llama-3.2-1B)typedefaultz
--strategyr3   )r3   r7   r8   )r   r   choicesz	--devicesr   z--num-nodesz--acceleratorgpuz--grad-clipg      ?z--model-acceleratortez--max-stepsd   z--fp8-autocast
store_true)actionz--wandb-projectz--ckpt-folderz/tmp/nemo_automodel_sft/z--use-torch-jitz--auto-resume)WandbLoggerrk   /_dev_strat_)projectrQ   )TEConfig)fp8_autocastTdynamicF)	use_torchtorch_kwargsuse_thunderc                       s   e Zd Z fddZ  ZS )z,main.<locals>.ZeroInitHFAutoModelForCausalLMc                    s.   t  j|i |}|  D ]}|d q|S )Nr   )r   configure_model
parametersfill_)r   argskwargsansparamr   r$   r%   configure_moduleN  s   z=main.<locals>.ZeroInitHFAutoModelForCausalLM.configure_module)r   r   r   r   r   r$   r$   r   r%   ZeroInitHFAutoModelForCausalLMM  s    r   )
model_namemodel_accelerator)resume_if_existsresume_ignore_no_checkpointg        bf16)rE   rF   	max_stepsacceleratorrC   log_every_n_stepslimit_val_batchesnum_sanity_val_stepsaccumulate_grad_batchesgradient_clip_valuse_distributed_samplerrY   r   	precisionr   )lr)rD   datar   optimlogresumerw   )output_loading_infomissing_keyszNOT LOADABLE #1mismatched_keyszNOT LOADABLE #2)2argparseArgumentParseradd_argumentstrra   r_   
parse_argswandb_projectlightning.pytorch.loggersr   rc   rD   r'   rE   rC   r   4nemo.lightning.pytorch.accelerate.transformer_enginer   r   use_torch_jitr   r   auto_resumer   r   r   HFAutoModelForCausalLMrG   rF   r@   
AutoResumer  Trainerr  	grad_clipapifinetuner2   configure_tokenizerfdlbuildadampytorch_adam_with_flat_lrrY   rX   rp   r   r   from_pretrainedr   )r  parserr   rU   r   rD   r   r   r   
jit_configr   	model_clsrC   r  r   rb   r   r$   r$   r%   main!  s   


	
"&r.  __main__)F)#r\   r   rZ   r   pathlibr   fiddler&  lightning.pytorchpytorchr;   r   transformersr   nemor   r@   nemo.collectionsr    nemo.lightning.pytorch.callbacksr   r   'nemo.lightning.pytorch.strategies.utilsr	   	DATA_PATHr2   rG   rW   rY   rp   r   Callbackr   r.  r   r$   r$   r$   r%   <module>   s0   


?df
