o
    ίi3                     @   s  d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlm  mZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZm Z m!Z! d d
l"m#Z# e$dZ%dZ&dZ'dZ(dZ)dZ*dZ+dZ,e-dZ.eG dd dZ/eG dd dZ0de1fddZ2de1fddZ3			d0de1dej4d eej5j6 d!e1d"e1f
d#d$Z7G d%d& d&Z8d1ded(e9d)e9fd*d+Z:	d2dej4d,e1d-ee1 fd.d/Z;dS )3    N)	dataclassfield)Path)ListOptionalTuple)	OmegaConf)
DeviceMesh)FileSystemReader)dcp_to_torch_save)get_model_state_dictget_state_dictset_state_dict)get_is_master
CHECKPOINTz{:010d}z\d{10}z__\d_\d\.distcpconsolidatedconsolidated.pthzparams.jsonztrain_state_{:05d}.jsonz\d+c                   @   s&   e Zd ZU dZeed< dZeed< dS )	SaveEveryi  everyr   keepN)__name__
__module____qualname__r   int__annotations__r    r   r   C/home/ubuntu/.local/lib/python3.10/site-packages/core/checkpoint.pyr   )   s   
 r   c                   @   sz   e Zd ZU eedZeed< eedZeed< dZe	e
 ed< dZe	e
 ed< dZe	e
 ed< dZeed	< dZeed
< dS )CheckpointArgs)default_factorydumpevalNpathinit_ckpt_pathvision_model_pathFis_consolidated_modelcontinue_training_from_init)r   r   r   r   r   r   r   r    r!   r   strr"   r#   r$   boolr%   r   r   r   r   r   /   s   
 r   namec                 C   s   t tt| d S )N)r   refindall	RE_DIGITS)r(   r   r   r   _get_key_step:   s   r-   ckpt_dirc                 C   sr   t | t }|t  s7|jdd tdt|  t| t|t  |t	 
t | t	   td |S )a  
    Consolidates all FSDP checkpoints in a directory to a single file
    Consolidate checkpoint is saved in a subdirectory of ckpt_dir

    Parameters:
        ckpt_dir: str - path to the directory containing the checkpoints

    Returns the path to the consolidated checkpoint
    Texist_okzConsolidating to: zConsolidated !)r   CONSOLIDATE_FOLDERCONSOLIDATE_NAMEexistsmkdirloggerinfor&   r   CONFIG_NAME
write_text	read_text)r.   consolidate_pathr   r   r   consolidate_checkpoints>   s   

r;   modeloptim	optimizer	model_key	optim_keyc                 C   sl   t | d  stdi }|d urt||\||< ||< nt|||< |dkr-||}tj|| d d S )Nz	.metadatazPlease convert the checkpoint distcp format using `torch.distributed.checkpoint.format_utils.torch_save_to_dcp` before loading it checkpoint_id)r   r3   
ValueErrorr   r   popdcpload)r.   r<   r>   r?   r@   
state_dictr   r   r   load_from_checkpointT   s   
rI   c                	   @   s   e Zd ZdefddZdee fddZdd Zdd
e	de
e fddZdededefddZ	d de
e dee	e	f fddZe dd Z	d de
e defddZe 	d dejdede
e fddZedefddZdS )!CheckpointManagerargsc                 C   sV   |j | _ |j| _|j| _|j| _|j| _tj | j s$J d| j  d| 	 | _
d S )NzPath ze does not exist and needs to be created before using CheckpointManager (use instantiate_and_make_dir))r!   r   
dump_everyr    
eval_everyr"   r%   osr3   get_existing_savesexisting_saves)selfrK   r   r   r   __init__l   s   zCheckpointManager.__init__returnc                 C   s,   dd t | j D }|jdd d |S )Nc                 S   s&   g | ]}|  rtt|jr|qS r   )is_dirr*   match	RE_FOLDERr(   ).0pr   r   r   
<listcomp>z   s    z8CheckpointManager.get_existing_saves.<locals>.<listcomp>c                 S   
   t | jS Nr-   r(   rX   r   r   r   <lambda>      
 z6CheckpointManager.get_existing_saves.<locals>.<lambda>key)r   r!   iterdirsort)rQ   foldersr   r   r   rO   y   s
   z$CheckpointManager.get_existing_savesc                 C   s  t d g }g }g }| jD ]/}t|j| jj dk}t|j| jj dk}|r-|| |r4|| |s=|s=|| qt d|  t d|  t d|  | jj	dkre|| jj	 d  }| jj	dkrt|| jj	 d  }t
|| | }t
| j| }t d|  t dkr|D ]1}	|	 D ]&}
|
 r|
  q|
 r|
jtfv sJ |
 D ]}|  q|
  q|	  qt  t|| _| jjdd d	 d S )
NzCleaning up checkpoints...r   zDump folders: zEval folders: zOther folders: zRemoving folders: c                 S   rZ   r[   r\   r]   r   r   r   r^      r_   z,CheckpointManager.clean_up.<locals>.<lambda>r`   )r5   r6   rP   r-   r(   rL   r   rM   appendr   setdistget_rankrb   is_fileunlinkrT   r1   rmdirbarrierlistrc   )rQ   dump_folderseval_foldersother_foldersrX   is_dumpis_evalfolder_to_keepfolder_to_removefolderfilefr   r   r   clean_up   sN   








zCheckpointManager.clean_upr   dp_rankc                 C   s4   d }t | jD ]}|t|  r|} |S q|S r[   )reversedrP   TRAIN_STATE_NAMEformatri   )rQ   ry   r!   rX   r   r   r   get_last_step_path   s   z$CheckpointManager.get_last_step_path	base_pathfolder_namec                 C   s0   || }t  r|jddd t rt  |S )NFT)parentsr0   )r   r4   rg   is_initializedrl   )rQ   r~   r   ru   r   r   r   _create_folder   s   z CheckpointManager._create_folderNdevice_meshc                 C   sd   d}d}|d ur.d|j v r$|d}d|j v r$||d   |d }d|j v r.|d}||fS )Nr   dp_replicatedp_shardtp)mesh_dim_namesget_local_ranksize)rQ   r   ry   tp_rankr   r   r   _get_dp_tp_mesh   s    




z!CheckpointManager._get_dp_tp_meshc                 C   s   t ||\}}||dS )N)r<   r=   )r   )rQ   r<   r>   model_sdoptim_sdr   r   r   r      s   
z CheckpointManager.get_state_dictc                 C   s|  t | j}| |t|j}tdt|  t	
 r!t	  td | ||}tj||d td t	
 r@t	  t rjt|t d}	tjtjt|dd|	dd	 W d    n1 sew   Y  | |\}
}|d
krt|
}tdt||   t|| d}	t| |	 W d    n1 sw   Y  td | j| |   t	
 rt	  dS )NzSaving to: z	Saving...rB   zState dict saved!wT)resolve   )indentr   zSaving train state to: zTrain state saved !)r   r!   r   FOLDER_NAMEr|   stepr5   r6   r&   rg   r   rl   r   rF   saver   openr7   jsonr   r   to_container
structuredr   r{   rH   rP   re   rx   )rQ   r<   r>   train_stateconfigr   r!   curr_save_dirrH   rw   ry   r   train_state_namer   r   r   r      sD   





zCheckpointManager.saver<   r!   c                 C   s   |  |\}}|p| j|d}|d u rd S t|}td t|| d}	t|	}
W d    n1 s6w   Y  |	|
 td tdt
|  | j||d}tj||d td td	 t|||d
 |d d td d S )N)ry   zReloading train staterzTrain state reloadedzLoading from: )r<   r>   rB   zState dict loaded.zReloading model and optimr<   r=   )model_state_dictoptim_state_dictzModel and optim reloaded)r   r}   r{   r|   r5   r6   r   r   rG   load_state_dictr&   r   rF   r   )rQ   r<   r>   r   r   r!   ry   r   r   rw   train_state_dictrH   r   r   r   rG     s4   	





zCheckpointManager.loadc                 C   s&   t  rtj|jdd t  | |S )NTr/   )r   rN   makedirsr!   rg   rl   )clsrK   r   r   r   instantiate_and_make_dir9  s   z*CheckpointManager.instantiate_and_make_dir)r   r[   )r   r   r   r   rR   r   r   rO   rx   r   r   r}   r&   r   r	   r   r   torchno_gradr   r'   r   nnModulerG   classmethodr   r   r   r   r   rJ   k   s>    	.	



5*rJ      mp_rankmp_sizec                 C   s:   |dkr|dks
J | d }|  r|S | d|dd S )Nr   r   r   zconsolidated.02dz.pth)r3   )r.   r   r   no_rank_pathr   r   r   get_consolidated_ckpt_pathB  s   r   consolidated_pathr#   c                 C   s>  t |}t|ddd}| rtj|dd}d|v r|d }n,t|d}|s0td| d	i }|D ]}tj|dd}d|v rE|d }|| q4| j	
  | j
  | j  |d
urd| j| | j|dd\}	}
dd |	D }	|d
urdd |	D }	t|	dkrtd|	  t|
dkrtd|
  d
S d
S )z
    Loads a consolidated checkpoint into the model.
    This version supports both:
      - a single file named 'consolidated.pth'
      - multiple parts named like 'consolidated.00.pth', 'consolidated.01.pth', etc.
    r   r   )r   r   T)weights_onlyr<   zconsolidated.*.pthz)No consolidated checkpoint file found in .NF)strictc                 S      g | ]}d |vr|qS )ztied_module.weightr   rW   kr   r   r   rY   u      z0load_consolidated_checkpoint.<locals>.<listcomp>c                 S   r   )zvision_model.r   r   r   r   r   rY   x  r   zMissing keys when reloading: z Unexpected keys when reloading: )r   r   r3   r   rG   sortedglobFileNotFoundErrorupdatevision_projectorinit_tensorsvision_modelrope_embeddingsreset_parameters	load_ckptr   lenr5   warning)r<   r   r#   	ckpt_pathcp_filest_dictcheckpoint_files	ckpt_filepartmissing_keysunexpected_keysr   r   r   load_consolidated_checkpointK  s@   



r   )Nr<   r=   )r   r   r[   )<r   loggingrN   r*   dataclassesr   r   pathlibr   typingr   r   r   r   torch.distributeddistributedrg   torch.distributed.checkpoint
checkpointrF   torch.nnr   torch.optim.optimizer	omegaconfr   torch.distributed._tensorr	   r
   )torch.distributed.checkpoint.format_utilsr   'torch.distributed.checkpoint.state_dictr   r   r   core.distributedr   	getLoggerr5   r   rV   RE_CKPTr1   r2   r7   r{   compiler,   r   r   r&   r-   r;   r   r=   	OptimizerrI   rJ   r   r   r   r   r   r   r   <module>   st   




 X