o
    ߥiUJ                     @   s2  d Z ddlZddlZddlZddlZddlZddlZddlZddl	m
Z
mZ ddlmZ dZdd Zdd	 Zd
d Zd2ddZdd ZG dd dZdd Z		d3ddZdd Zdd Zdd Z					d4ddZd d! Zd"d# Z		d3d$d%Zd5d&d'Zd5d(d)Zd5d*d+Z d5d,d-Z!d5d.d/Z"d0d1 Z#dS )6z'Utilities for logging and serialization    N)mpuprint_rank_0)FP16_Optimizerrunsc                 C   s   t j|t| S N)ospathjoinSUMMARY_WRITER_DIR_NAME)namebase r   T/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/utils.pyget_log_dir   s   r   c                  C   s*   dg} t j| dd}|d d }|S )Nzhostname -ITshellutf-8r   )
subprocesscheck_outputdecodesplit)hostname_cmdresultmaster_addrr   r   r   get_hostname"   s   r   c                 C   s   t j dkr0tjdgdd}t| }|| jkr(tjdgdd}t| }t j	|g}nt j	dg}t j
|d | }|S )Nr   zshuf -n 1 -i 10000-65535Tr   )torchdistributedget_rankr   r   intstripmaster_portcuda
LongTensor	broadcastitem)argsportr   r   r   get_spare_port)   s   
r'   Tc           
   	   C   s<  |r%t ddd t| D ]}ddt|  }t d||t| |dd q|durtj|d}t|d	}t	j
t| |dd
 W d   n1 sJw   Y  | jr| jdurt| jdd}t	|}W d   n1 snw   Y  tj|d}	t|	d	}t	
|| W d   dS 1 sw   Y  dS dS dS dS )zPrint arguments.z
arguments:Tflush.   z
  {} {} {}Nzconfig.jsonw)	sort_keysr   encodingzconfig_gpt_large.json)printvarslenformatgetattrr   r   r	   openjsondump	deepspeeddeepspeed_configload)
r%   verboselog_dirargdots	json_fileoutputfiler9   deepspeed_json_filer   r   r   print_and_save_args:   s0   "rC   c              
   C   s   d}t j }d}| }t| tr| j}|jD ]2}|d D ]+}|d7 }|j }|j	 }	|j
 }
|d|||t|j7 }|d||	|
7 }qqt|dd d	S )
z+Print min, max, and norm of all parameters.r   z6iteration, rank, index, model-parallel,min, max, norm
params   z{:7d}, {:4d}, {:4d}, {:2d}, z{:.6E}, {:.6E}, {:.6E}
Tr(   N)r   r   r   
isinstancer   	optimizerparam_groupsdataminmaxnormr3   r   model_parallelr0   )rG   	iterationindexrankstring
optimizer_param_groupparammin_max_rL   r   r   r   print_params_min_max_normO   s$   





rW   c                   @   s8   e Zd ZdZG dd dZdd Zdd Zdd
dZdS )TimerszGroup of timers.c                   @   s:   e Zd ZdZdd Zdd Zdd Zdd	 ZdddZdS )zTimers.TimerzTimer.c                 C   s    || _ d| _d| _t | _d S )N        F)name_elapsed_started_time
start_timeselfr   r   r   r   __init__i   s   zTimers.Timer.__init__c                 C   s,   | j rJ dtj  t | _d| _ dS )zStart the timer.ztimer has already been startedTN)r\   r   r!   synchronizer]   r^   r`   r   r   r   starto   s   


zTimers.Timer.startc                 C   s:   | j sJ dtj  |  jt | j 7  _d| _ dS )zStop the timer.ztimer is not startedFN)r\   r   r!   rb   r[   r]   r^   rc   r   r   r   stopv   s   

zTimers.Timer.stopc                 C   s   d| _ d| _dS )zReset timer.rY   FN)r[   r\   rc   r   r   r   reset}   s   
zTimers.Timer.resetTc                 C   s6   | j }| j r
|   | j}|r|   |r|   |S )zCalculate the elapsed time.)r\   re   r[   rf   rd   )r`   rf   r\   r[   r   r   r   elapsed   s   zTimers.Timer.elapsedN)T)	__name__
__module____qualname____doc__ra   rd   re   rf   rg   r   r   r   r   Timerf   s    rl   c                 C   s
   i | _ d S r   )timersrc   r   r   r   ra      s   
zTimers.__init__c                 C   s$   || j vr| || j |< | j | S r   )rm   rl   r_   r   r   r   __call__   s   

zTimers.__call__      ?Tc                 C   sP   |dksJ d}|D ]}| j | j|dd | }|d||7 }q
t| dS )zLog a group of timers.rY   z	time (ms))rf   g     @@z | {}: {:.2f}N)rm   rg   r3   r   )r`   names
normalizerrf   rQ   r   elapsed_timer   r   r   log   s   
z
Timers.logN)ro   T)rh   ri   rj   rk   rl   ra   rn   rs   r   r   r   r   rX   c   s    ,rX   c                 C   sx   d}| d }|d tj | 7 }|d tj | 7 }|d tj | 7 }|d tj | 7 }t| dS )zSimple GPU memory report.g      0Az memory (MB)z | allocated: {}z | max allocated: {}z | cached: {}z | max cached: {}N)r3   r   r!   memory_allocatedmax_memory_allocatedmemory_cachedmemory_reservedr   )r   
mega_bytesrQ   r   r   r   report_memory   s   ry   Fc                 C   sH   |rd}nd |}|rt }|d |7 }tj| |d t S )Nreleasez{}z_zero_dp_rank_{}zmp_rank_{:02d}_model_states.pt)r3   r   get_data_parallel_rankr   r   r	   get_model_parallel_rank)checkpoints_pathrN   rz   zeroddp_rankr   r   r   get_checkpoint_name   s   
r   c                 C   s.   t j| }t j|st j|dd d S d S )NT)exist_ok)r   r   dirnameexistsmakedirs)filenamer   r   r   r   ensure_directory_exists   s   r   c                 C   s   t j| dS )Nz!latest_checkpointed_iteration.txt)r   r   r	   )r}   r   r   r   get_checkpoint_tracker_filename   s   r   c                 C   sD   ||  d}t| j|dd}t| t|| td| d S )N)rN   optimizer_state_dictT)r~     successfully saved {})
state_dictr   saver   r   r0   r3   )r%   rN   rG   zero_sdzero_checkpoint_namer   r   r   save_zero_checkpoint   s   r   c
                    s  |du rt | }|jr|st| ||||d nt dkrt|j|}
tdt	j
 | |
 d| i}|jr9|j}| }|rZi  | D ]	\}}|j |< qE fdd| D }||d< |jsw|	sw|durm| |d	< |durw| |d
< |jst |d< tj |d< t	 |d< t	j |d< t  |d< t|
 t	||
 td|
 |rt	j
  t	j
 dkrt|j}t|d}|| W d   dS 1 sw   Y  dS dS )Save a model checkpoint.N)tagr   z<global rank {} is saving checkpoint at iteration {:7d} to {}rN   c                    s   i | ]\}} | r||qS r   r   ).0keyvaluerequires_grad_dictr   r   
<dictcomp>   s    z#save_checkpoint.<locals>.<dictcomp>modulerG   lr_schedulerrandom_rng_statenp_rng_statetorch_rng_statecuda_rng_staterng_tracker_statesr   r,   ) strr8   save_ds_checkpointr   r{   r   r   r0   r3   r   r   r   r   r   named_parametersrequires_graditemsno_save_optimno_save_rngrandomgetstatenp	get_stateget_rng_stater!   get_cuda_rng_tracker
get_statesr   barrierr   r5   write)rN   modelrG   r   r%   r   r   only_changed_parametersno_deepspeedr   checkpoint_namesdr   r   	parametertracker_filenamefr   r   r   save_checkpoint   s`   




"r   c                 C   s   i }| |d< |dur|  |d< |js5t |d< tj |d< t |d< tj |d< t	
  |d< |j|j||d	 dS )
r   rN   Nclient_lr_schedulerr   r   r   r   r   )client_state)r   r   r   r   r   r   r   r   r!   r   r   r   r   r   )rN   r   r   r%   r   r   r   r   r   r     s   r   c                 C   s   t | }tj|s9td| tj| r/tj| }tj|\}}td ||ddfS td | dddfS t	|ddd	}|
  }|d
k}W d    n1 sUw   Y  | ||dfS )Nz-WARNING: could not find the metadata file {} z6Try to directly load the checkpoint from the directoryFTz<    will not load any checkpoints and will start from randomr   rr   r.   rz   )r   r   r   isfiler   r3   isdirnormpathr   r5   readr   )	load_pathr   r   load_dirr   r   
metastringrz   r   r   r   get_checkpoint_iteration/  s&   
r   c                 C   s|  t |j\}}}}	|	sdS |jrG|sG| j|||j o| |j d\}
}|js6d|v r6||d  td |
du rFt	 dkrDt
d |S npt|||}
t	 dkr^t
dtj |
 tj|
dd	}|jrk| j} | j|d
 dd\}}|sz|rtd| d|  |s|js|js|sz|dur||d  |dur||d  W n ty   td|
 Y nw |js|rd}n)z|d }W n" ty   z|d }W n ty   td|
 d}Y nw Y nw |s.|js.|js.z)t|d  tj|d  t|d  tj|d  t |d  W n ty-   td|
 Y nw t	 dkr<t
d|
 |S )zLoad a model checkpoint.r   )load_optimizer_statesload_lr_scheduler_statesr   zLoad lr scheduler stateNzUnable to load checkpoint.z'global rank {} is loading checkpoint {}cpu)map_locationr   F)strictzMissing keys z, unexpected keys rG   r   zUnable to load optimizer from checkpoint {}, exiting. Specify --no-load-optim or --finetune to prevent attempting to load the optimizer state.rN   total_iterszbA metadata file exists but Unable to load iteration  from checkpoint {}, starting from 0 iterationr   r   r   r   r   zUnable to load random state from checkpoint {}, exiting. Specify --no-load-rng or --finetune to prevent attempting to load the random state.z  successfully loaded {})r   r:   r8   load_checkpointno_load_optimno_load_lr_schedulerload_state_dictr   r   r{   r0   r   r3   r   r   r   r   finetuneKeyErrorno_load_rngr   setstater   	set_stateset_rng_stater!   r   
set_states)r   rG   r   r%   r   r   r   r   rz   successr   r   missing_keysunexpected_keysrN   r   r   r   r   P  s   




r   c                 C   sr   dt t| v }|  D ]*\}}|r|j| j}|j}n	|j}|j| j}|r1d|v r1|  }|| qdS )z
    Loads weights from src to dst via in place copy.
    src is a huggingface gpt2model, while dst is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src is still untested
    Conv1DweightN)r   typer   _parametersrI   t
contiguouscopy_)srcdstdst2src
conv_layernprI   r:   r   r   r   load_weights  s   r   c                 C   $   t |j| j| t |j| j| d S r   )r   c_fcdense_h_to_4hc_projdense_4h_to_houroair   r   r   r   load_mlp     r   c                 C   r   r   )r   c_attnquery_key_valuer   denser   r   r   r   load_attention  r   r   c                 C   sD   t |j| j| t |j| j| t| j|j| t| j|j	| d S r   )
r   ln_1input_layernormln_2post_attention_layernormr   mlpr   	attentionattnr   r   r   r   load_transformer_layer  s   r   c                 C   sd   |j }t|j| j j| t|j| j| t|j| j| t| j j	|j j
D ]
\}}t||| q%dS )z
    Loads weights from `oai` to `our` via in place copy.
    `oai` is a huggingface gpt2model, while `our` is one of our models.
    dst2src=True loads parameters from our models into huggingface's.
    ^dst2src=True is still untested
    N)transformerr   ln_ffinal_layernormwteword_embeddingswpeposition_embeddingsziplayershr   )r   r   r   transformer_model	our_layer	oai_layerr   r   r   move_weights  s   	r  c                 C   sT  | d | d }}| d | d | d }}}g }||   }	t|| d |	  D ] \}
}||}|dkrFd|| d|
f    d	}|| q+td
| g }t|	|dD ]}
|| |
 rj||
 q]t| t|	|| |   t
|jdkrt|	|| |   nt|	||   t|| d d |f  d S )Ntokens
target_idsattention_mask
logit_maskposition_idsz[MASK][r   ]    )r$   	enumeratetolist	IdToTokenappendr0   r	   rangesize	DecodeIdsr2   shape)
local_varsbatch_id	tokenizerr	  r
  r  r  r  output_tokenssepitokentarget_positionsr   r   r   debug_finetune_data  s:   
 

r#  )TN)FF)NTFFF)F)$rk   r   r   r   r]   r6   numpyr   r   megatron_utilr   r   megatron_util.fp16r   r
   r   r   r'   rC   rW   rX   ry   r   r   r   r   r   r   r   r   r   r   r   r   r  r#  r   r   r   r   <module>   sR   
B

E%

e



