o
    ॵi                  
   @   s  d dl Z d dlZd dlmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ e	 Zdddd	ddd
dd	d
ddZdZdad2ddZdefddZdejdeeee jf deeee jf ddfddZdeeee jf ddfddZdeddfddZdejdeeee jf dedeeejf fd d!Zdejdeeee jf dedeeejf fd"d#Zd$eeejf deeee jf ddfd%d&Zd'ejd(ejdefd)d*Zdeeee jf d+edeeejf fd,d-Z d.ejded/edeej fd0d1Z!dS )3    N)DictListUnion)nn)
get_logger)	is_mastermoe   )version
world_sizev1i  )r
   r   tensor_model_parallel_sizeseed)r
   r   )zgpt-moeplugzmglm-text-summarizationzmp_rank_XX_model_states.ptFc                 K   s   ddl m} ddlm} | du r|du rJ d| du rM||}z|j} W n' tyL   z|jj}W n ty?   |jj}Y nw |t	v rHt	| ni } Y nw | 
| ||  dadS )a  Initialize megatron_util environment for megatron_based model.

    If argument `megatron_cfg` is not specified, then the megatorn_cfg will be load
    from configuration.json file in the model_dir.

    Args:
        megatron_cfg (Dict, optional): Megatron Config will be send to megatron_util.
        model_dir (str, optional): The model path for configuration. Defaults to None.
    r   )read_config)initialize_megatronNzEcfg and model_dir cannot both be None when initializing megatron_utilT)modelscope.utils.hubr   megatron_utilr   megatronAttributeErrormodeltypepipeline_DEFAULT_CFG_WITH_MODEL_TYPEupdate_IS_MEGATRON_INITIALIZED)megatron_cfg	model_dirkwargsr   r   cfg
model_type r!   S/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/utils/megatron_utils.pyinit_megatron_util$   s.   


r#   returnc                   C   s   t S N)r   r!   r!   r!   r"   is_megatron_initializedE   s   r&   r   checkpoint_dir
target_dirc                 C   s  dt fdd}tjtj|drtj|d}tt|}ttd}t	| t
| |d| d|  ||k rWtj|dd	 t| ||| }t|| |d
 dS ||krutj|dd	 t| ||| }t|| |d dS t|| |d dS )a-  Split or Merge checkpoint for megatron_based model.

    Args:
        model (nn.Module): Any megatron_based model.
        checkpoint_dir (Union[str, bytes, os.PathLike]): The save path of origin checkpoint.
        target_dir (Union[str, bytes, os.PathLike]): The target path of new checkpoint.
    informationc                 S   s   t  r
t|  d S d S r%   )r   loggerinfo)r)   r!   r!   r"   
log_masterT   s   z/convert_megatron_checkpoint.<locals>.log_masterr   
WORLD_SIZEzorigin_num_partitions: z, target_num_partitions: T)exist_okzSplit checkpoints succeeded.zMerge checkpoints succeeded.zCopy checkpoints succeeded.N)strospathexistsjoinlenlistdirintgetenv_check_origin_dir_check_target_num_partitionsmakedirs_split_checkpoint_save_converted_checkpoint_merge_checkpointshutilcopytree)r   r'   r(   r,   origin_num_partitionstarget_num_partitions
state_dictr!   r!   r"   convert_megatron_checkpointI   s8   

rC   
origin_dirc                 C   sh   t | }t|t|d @ dksJ dtt|D ]}td|d}||v s1J d| dqd S )N   r   z)The number of files must be a power of 2!XX02dzCan not find z file!)r0   r5   r4   range_CHECKPOINT_FORMATreplace)rD   	filenamesicheckpoint_namer!   r!   r"   r8   w   s   



r8   num_partitionsc                 C   s   | | d @ dksJ dd S )NrE   r   z5The number of target partitions must be a power of 2!r!   )rN   r!   r!   r"   r9      s   r9   c                 C   s   t td}|| }t||}i }|  D ](\}}t||| }	|	dkr,|| ||< qt|| ||	}
|
||   ||< q|S )NRANK)r6   r0   r7   _load_by_ranknamed_parameters_get_diff_dim_split_tensorclone)r   r'   rN   target_rankorigin_rankrB   target_state_dictname	parameterdimpartitions_listr!   r!   r"   r;      s    
r;   c                    s   t tdfddtD } fdd|D }i }|  D ],\}t||d  }|dkr<|d  |< q"tjfdd|D |d |< q"|S )	NrO   c                    s   g | ]}  | qS r!   r!   .0rL   )rN   rV   r!   r"   
<listcomp>   s    z%_merge_checkpoint.<locals>.<listcomp>c                    s   g | ]}t  |qS r!   )rQ   r]   )r'   r!   r"   r_      s    
r   rP   c                    s   g | ]}|  qS r!   r!   )r^   rB   )rY   r!   r"   r_      s    r[   )	r6   r0   r7   rH   rR   rS   torchcatrU   )r   r'   rN   origin_rank_liststate_dict_listrX   rZ   r[   r!   )r'   rY   rN   rV   r"   r=      s(   
r=   rB   c                 C   s8   t td}td|d}t| tj|| d S )NrO   rF   rG   )	r6   r0   r7   rI   rJ   ra   saver1   r3   )rB   r(   rV   target_namer!   r!   r"   r<      s   r<   tensor1tensor2c                 C   s4   t t| j|jD ]\}\}}||kr|  S q	dS )NrP   )	enumeratezipshape)rg   rh   rL   s1s2r!   r!   r"   rS      s
   rS   rankc                 C   s@   t d|d}tjtj| |dd d}d|v r|d S |S )NrF   rG   c                 S   s   | S r%   r!   )storagelocr!   r!   r"   <lambda>   s    z_load_by_rank.<locals>.<lambda>)map_locationmodule)rI   rJ   ra   loadr0   r1   r3   )r'   rn   rM   rB   r!   r!   r"   rQ      s   rQ   tensorpartition_dimc                 C   s4   ddl m} |j| ||}tj| ||d}|S )Nr   )mpur`   )r   rw   utilsdividesizera   split)ru   rN   rv   rw   per_partition_sizer\   r!   r!   r"   rT      s   
rT   )NN)"r0   r>   typingr   r   r   ra   r   modelscope.utils.loggerr   modelscope.utils.torch_utilsr   r*   r   rI   r   r#   boolr&   Moduler/   bytesPathLikerC   r8   r6   r9   Tensorr;   r=   r<   rS   rQ   rT   r!   r!   r!   r"   <module>   s   
!
.




	