o
    }oi                     @   s   d dl Z dd Zd dlmZ d dlmZ d dlZd dlZd dl	m
Z d dlmZ d dlmZ d dlmZmZ dd	 ZddejdedefddZdddZG dd dZdS )    Nc                   C   s   dt jd< d S )N1NVTE_APPLY_QK_LAYER_SCALING)osenviron r   r   R/home/ubuntu/.local/lib/python3.10/site-packages/tests/lightning/test_dist_ckpt.pyset_env   s   r   )Path)llm)MegatronCheckpointIO)AsyncFinalizableCheckpointIOAsyncFinalizerCallbackc                  C   s   t jddd} | S )NF)enable_nemo_ckpt_iockpt_async_save)nlMegatronStrategy)strategyr   r   r   _get_strategy"   s
   r    modelsuffixreturnc                 C   s$   d| j jd  d| j jd  | S )Nzepoch=   z-step=)trainercurrent_epoch	max_steps)r   r   r   r   r   _get_last_checkpoint_dir*   s   $r      c                 C   s>   d}t j|| |d}t jdddd|dd}t j||jd	|fS )
N   )
seq_lengthmicro_batch_sizeglobal_batch_sizer   @         r   )
num_layershidden_sizeffn_hidden_sizenum_attention_headsr   apply_query_key_layer_scaling)	tokenizer)r
   MockDataModule	GPTConfigGPTModelr*   )mbsgbsr   dataconfigr   r   r   get_model_and_data.   s   r2   c                   @   s<   e Zd Zejddd Zejddd Zdd ZdS )	TestDistCkptIOGPUc              
   C   s   t   tjd dksJ d\}}t||\}}ddlm} |dd ||dd t }tjdd|d	d
t	|dd}|
|| W d    n1 sHw   Y  t|jjtsVJ tt|d }	t|	dksgJ |	d }
t	|
t|ksuJ |  d S )Nr   r   r   r   r   /reconfigure_num_microbatches_calculator_managerr   data_parallel_sizegpuTr   F)devicesacceleratorr   enable_checkpointingr   default_root_dirloggercheckpoints)r   r   r   r2   &tests.lightning.mcore_microbatch_utilsr7   r   r   Trainerstrfit
isinstancer   checkpoint_ior   listdirr	   lenr   	_teardown)selftmp_pathr/   r.   r   r0   r7   r   r   ckptsckptr   r   r   )test_dist_ckpt_io_called_for_mcore_models?   s0   
z8TestDistCkptIO.test_dist_ckpt_io_called_for_mcore_modelsc                 C   s  t   tjd dksJ d\}}t||\}}ddlm} |dd ||ddk |d }|d	 }td
}	ttd
dd}
tj	dddt
 d}||| t
 }t||\}}tj	ddddt
 |	gt|d}||| t||\}}tj	ddddt
 |
gt t|d}||| W d    n1 sw   Y  d|j i}|	jt| dt| |d}|
jt| dt| |d}|d  D ]}t|d | tjrt|d | |d | ksJ q|  d S )Nr   r   r5   r   r6   r   r8   sync_checkpointsasync_checkpoints
torch_distT)
async_saveFr   )r;   r?   r   r   )r;   r=   r?   r   r   pluginsr>   )r;   r=   r?   r   r   rS   	callbacksr>   sharded_state_dictz/checkpoints/)rU   )r   r   r   r2   rA   r7   r   r   plrB   r   rD   rC   r   modulerU   load_checkpointr	   r   keysrE   torchTensorallrI   )rJ   rK   r/   r.   r   r0   r7   sync_ckpt_dirasync_ckpt_dirsync_checkpoint_ioasync_checkpoint_iodummy_trainerr   sync_test_trainerasync_test_trainer
checkpointsync_state_dictasync_state_dictkr   r   r   1test_async_save_produces_same_checkpoints_as_synca   sl   	
0"z@TestDistCkptIO.test_async_save_produces_same_checkpoints_as_syncc                 C   s   t   tjd dksJ t }tjdddddd}tj|g|d}t|jt	s*J t|jj
ts3J |jj
}|jdks>J |jsCJ |jdksJJ |  d S )Nr   r   FrQ   T)r   save_ckpt_formatckpt_parallel_saveckpt_load_directly_on_devicer   )rT   r   )r   r   r   r   ModelCheckpointr   rB   rE   rF   r   _checkpoint_ior   ri   parallel_saveload_directly_on_devicerI   )rJ   model_checkpointr   r   base_checkpoint_ior   r   r   test_sharded_strategies   s*   
z&TestDistCkptIO.test_sharded_strategiesN)	__name__
__module____qualname__pytestmarkrun_only_onrN   rh   rr   r   r   r   r   r3   =   s    


!
Gr3   )r   r5   )r   r   pathlibr	   lightning.pytorchpytorchrV   rv   rZ   nemo.lightning	lightningr   nemo.collectionsr
   nemo.lightning.io.plr   !nemo.utils.callbacks.dist_ckpt_ior   r   r   LightningModulerC   r   r2   r3   r   r   r   r   <module>   s   
