o
    }oi$4                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlm	Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	Zd
d ZdddZdejfddZG dd de
jZdd Z dd Z!dd Z"e#dkr{e"  dS dS )    N)Path)AutoModelForCausalLM)	lightning)llm)
NeMoLogger)	JitConfigJitTransform)to_cpuz#/home/TestData/lite/hf_cache/squad/c                    sL   j   fdd}tdtj| djd}|j|ddg dd	 |S )
Nc                    s\   d}| d }| d }| d d }t |tr|d }||||  }|}|d |d< |S )	NzBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}contextquestionanswerstextr   	input_idslabels)
isinstancelistformat)examplesalpaca_promptinstructioninputoutputr   ans	EOS_TOKEN	tokenizer T/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/hf/peft_hf.pyformatting_prompts_func&   s   

z6make_squad_hf_dataset.<locals>.formatting_prompts_funcr   ztrain[:100])splitpad_token_idF   )idtitler
   r   r   )batched
batch_sizeremove_columns)	eos_tokengetattrr   HFDatasetDataModuleeos_token_idmap)	data_pathr   r   
datamoduler   r   r   make_squad_hf_dataset#   s   r.   Fc                 C   sj   | dkrt jjd|j|ddS | dkrt jj|j|ddS | dkr1tj|| d|j|dd	S td
)Nautozcuda:0)adapter_only)devicecheckpoint_ioddp)r2   fsdp2   )data_parallel_sizetensor_parallel_sizer2   zEncountered unknown strategy)pl
strategiesSingleDeviceStrategymake_checkpoint_ioDDPStrategynlFSDP2StrategyNotImplementedError)strategymodeldevices	num_nodesr0   r   r   r   make_strategyG   s    


rD   returnc                 C   s*   t jddddddd}t jd| d|d dS )NTr5   reduced_train_loss)	save_lastevery_n_train_stepsmonitor
save_top_ksave_on_train_epoch_endsave_optim_on_train_end
nemo2_peftF)namelog_diruse_datetime_versionckptwandb)r=   ModelCheckpointr   )ckpt_folderrQ   r   r   r   logger[   s   	rU   c                       sP   e Zd ZdZddedef fddZddd	d
deddfddZdddZ  Z	S )!ValidateCheckpointRestoreCallbackzThis callback checks that the model weights and optimizer states are exactly restored
    from the checkpoint on the first training batch.
    Tcheck_weightscheck_optimizerc                    s&   t    || _|| _d | _d | _d S )N)super__init__rW   rX   loaded_model_stateloaded_optimizer_states)selfrW   rX   	__class__r   r   rZ   s   s
   

z*ValidateCheckpointRestoreCallback.__init__trainer
pl.Trainer	pl_modulezpl.LightningModule
checkpointrE   Nc                 C   s.   |d | _ tdt| j    |d | _dS )z
        Save the loaded model and optimizer states so we can compare them
        to the actual states after resuming.
        
state_dictzself.loaded_model_state.keys= optimizer_statesN)r[   printstrkeysr\   )r]   r`   rb   rc   r   r   r   on_load_checkpointz   s   
z4ValidateCheckpointRestoreCallback.on_load_checkpointc              	   C   s
  | j rU| jdurU| D ]D\}}t||j |jdkr#|| jvs"J q|| jvr0td| d| j| }t|t|}}tj||ddsPtd| dt	 qtd	 | j
r| jdur|j}t|t| jkrytd
t| dt| j dt|| jD ]p\}	}
|	 }| |
 krtd|d  D ]U\}}|
d | }| D ]F\}}||vrtd| d|| }t|t|}}t|tjrtj||ddstd| d| dq||krtd| d| dqqqtd |j|  td td dS )z`
        Verify that the loaded model weights and optimizer state matches checkpoints'.
        NFzParameter 'z%' not found in checkpoint state dict.g-C6?)atolzModel parameter 'z)' does not match the checkpointed value. zmodel weights matchzNumber of optimizers (z!) does not match the checkpoint (z).zFMismatch in optimizer state keys between current state and checkpoint.statezKey 'z(' missing in the loaded optimizer state.gHz>zOptimizer state for param_id=z, key='z ' does not match the checkpoint.z' differs from checkpoint.zoptim weights matchzAll weights matchr   )rW   r[   named_parametersrf   requires_grad
ValueErrorr	   torchallclosediffrX   r\   
optimizerslenziprd   rh   itemsr   Tensor	callbacksremovesysexit)r]   r`   lightning_modulebatch	batch_idxrN   current_paramcheckpoint_paramrr   	optimizerloaded_opt_statecurrent_opt_stateparam_idparam_stateloaded_param_state	state_keycurrent_tensor_or_valloaded_tensor_or_valr   r   r   on_train_batch_start   sn   


z6ValidateCheckpointRestoreCallback.on_train_batch_start)TT)r`   ra   rE   N)
__name__
__module____qualname____doc__boolrZ   dictri   r   __classcell__r   r   r^   r   rV   n   s
    	rV   c                 C   s   d }d}d}t d}t| D ];\}}}|D ]3}||}	|	rJtt|	 \}
}}t|t|}}||ks?||krJ||krJ|}|}tj	
||}qqt|S )NzD[a-z0-9_]+--reduced_train_loss=([\d\.]+)-epoch=(\d+)-step=(\d+)-last)recompileoswalkmatchr+   floatgroupsintpathjoinr   )base_dirlatest_checkpoint	max_epochmax_steppatternrootdirs_dir_namer   lossepochstepr   r   r   get_latest_checkpoint   s"   

r   c                 C   s   t ddg}t| }|d }| sJ t||dD ]}|j|v s(J |||j qt|dks7J |d  s?J ddg}|D ]}|d	 |  sQJ qEd S )
Nzadapter_model.safetensorszadapter_config.json
hf_adapter*r   z
trainer.ptz
model.yamlzio.jsonr
   )setr   existsrg   globrN   rx   rs   )r   expected_filesckpt_dir
hf_weightsfilecontext_filesr   r   r    verify_peft_checkpoint_structure   s"   r   c                  C   sR  ddl } |  }|jdtdd |jdtdg dd	 |jd
tdd |jdtdd |jdtddgd	 |jdtdd |jdtdd |jdtdd |jddd |jddd |jdtt jd |	 }d}|j
durddlm} d|jddd }||j
| d|j d|j d}g }|jrtd d!d id"d#}t|g}tj|jd$}t|j||j|jd }|jr|t  |jrtjd d"d%nd}	tj|j|j|j|j |dd&dd|j!d"||d'd(}
tj"j#|t$t%tj&|j|
t'(tj)j*d)d*t+|j,tj-j.d+gd,d-|	d. ~~
t/|j,}t0| t12|j}|3| d/ dS )0zNExample script to run PEFT with a HF transformers-instantiated model on squad.r   Nz--modelzmeta-llama/Llama-3.2-1B)typedefaultz
--strategyr/   )r/   r3   r4   )r   r   choicesz	--devicesr5   z--num-nodesz--acceleratorgpuz--grad-clipg      ?z--max-stepsd   z--wandb-projectz--use-torch-jit
store_true)actionz--auto-resumez--ckpt-folder)WandbLoggerr   /_dev_strat_)projectrN   TdynamicF)	use_torchtorch_kwargsuse_thunder)
model_name)resume_if_existsresume_ignore_no_checkpointg        bf16)rB   rC   	max_stepsacceleratorr@   log_every_n_stepslimit_val_batchesnum_sanity_val_stepsaccumulate_grad_batchesgradient_clip_valuse_distributed_samplerrU   rw   	precisiongh㈵>)lrz*_proj   )target_modulesdim)rA   datar`   optimlogpeftresumez/hf_adapter)4argparseArgumentParseradd_argumentrg   r   r   tempfileTemporaryDirectoryrN   
parse_argswandb_projectlightning.pytorch.loggersr   r   rA   r   rB   r@   use_torch_jitr   r   r   HFAutoModelForCausalLMrD   rC   auto_resumeappendrV   r=   
AutoResumeTrainerr   r   	grad_clipapifinetuner.   	DATA_PATHconfigure_tokenizerfdlbuildadampytorch_adam_with_flat_lrrU   rT   r   LoRAr   r   r   from_pretrainedload_adapter)r   parserargsrR   r   rA   rw   
jit_configr@   r   r`   r   r   r   r   main   s   


r   __main__)F)$r   r   ry   r   pathlibr   fiddler   lightning.pytorchpytorchr8   ro   transformersr   nemor   r=   nemo.collectionsr   nemo.lightningr    nemo.lightning.pytorch.callbacksr   r   'nemo.lightning.pytorch.strategies.utilsr	   r   r.   rD   rU   CallbackrV   r   r   r   r   r   r   r   r   <module>   s2   
$bW
