o
    ߥiE                     @   s  d dl Z d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlmZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ d ddZ				d!ddZdd Zdd Zdd Z 				d!ddZ!dd Z"d"ddZ#		d#ddZ$dS )$    N)	FusedAdam)mpu)DynamicLossScalerFP16_ModuleFP16_Optimizer)distributed   )DistributedDataParallel)GLMForMultiTokenClozeGLMForMultiTokenClozeFastGLMForSequenceClassificationGLMForSingleTokenClozeGLMModel)PyTorchDistributedDataParallel),glm_get_params_for_weight_decay_optimization)get_checkpoint_iterationget_checkpoint_nameprint_rank_0c                    s  t |\}}}}t|||}t dkrtdtj | tj	|dd}	 j
r,| j} t| tr4| j} t| tr<| j} t| drD| j}  fdd}
 jrd|	d	 v r{|	d	 d
 } jd |jd kr{|
||  d
 j|	d	 d
< td jd   d|	d	 v r|	d	 d } jd |jd kr|
||  d j|	d	 d< td jd   t|   D ]}t| |  ||  |dddd< q| j|	d	 dd\}}|s|rtd| d|   jr jr| j| j j!j| d S d S d S )Nr   z-global rank {} is loading pretrained model {}cpu)map_locationmodelc                    s4   | j d }| jd ksJ | }| |d |< |S )Nr   r   )shapemax_position_embeddingsclone)state_weightsmodel_weightsoriginal_lengthnew_weightsargs Z/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/train_utils.pyextend_embedding_weights%   s
   
z1load_pretrained.<locals>.extend_embedding_weightsz,transformer.block_position_embeddings.weightmodulez&transformer.position_embeddings.weightr   zExtend position embedding to z#Extend block position embedding to z@mixins.block_position_embedding.block_position_embeddings.weightz"transformer.word_embeddings.weightzword_embeddings.weightF)strictzMissing keys z, unexpected keys )"r   r   r   get_data_parallel_rankprintformattorchr   get_rankload	deepspeedr#   
isinstanceTorchDDPr   hasattrr   block_lmr   r   
state_dictdatar   listkeyspopreplaceload_state_dictcontinuous_promptprompt_initprompt_spellinit_embeddingword_embeddingsweight)r   checkpoint_pathr   task_tokensload_dirtagreleasesuccesscheckpoint_namesdr"   position_weightsblock_position_weightskeymissing_keysunexpected_keysr    r   r!   load_pretrained   s   





rJ   Tc           	      C   s  t d | jr2|dkrtj| j| j| j| j| jd}n|dkr0t	j| j| j| j| j| j|d}nt
d\}}|dks>|dkrC| jsCd}|durId}|durTt d	|  td+i d
| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd|d| jd| jo| j d|d|d| jd| j}| jr|j| jd |dur|dkr| jr|r| jrt|| j d}n7t!|| j d}n/t"|| j#d}n't$|| j| j%| j&|d}n|dkrt$|| j| j%| j&|d}n	|d krnt
|t'( d!krt)d"*t'+ t,d#d$ |- D d%d& | j.r|/  |0t1j02  | j.r/t3|}| j4sc| j5s;| j6rc| j7d'krSt1j02 }t8||g|t'9 d(}|S | j7d)kr_t:|}|S t d* |S ),zBuild the model.zbuilding GPT2 model ...multiple_choice)	cache_dirfp32_layernormfp32_embeddinglayernorm_epsilonclassification)rL   rM   rN   rO   
num_labels)TTFNzContinuous spell length 
num_layers
vocab_sizehidden_sizenum_attention_headsembedding_dropout_probattention_dropout_proboutput_dropout_probmax_sequence_lengthmax_memory_lengthcheckpoint_activationscheckpoint_num_layersparallel_outputrelative_encodingblock_position_encodingoutput_predictspell_length
spell_funcattention_scale)tune_prefix_layers)length_penalty)take_softmax)	num_class
generationr   z5 > number of parameters on model parallel rank {}: {}c                 S   s   g | ]}|  qS r    )nelement).0pr    r    r!   
<listcomp>   s    zget_model.<locals>.<listcomp>T)flushr(   )
device_idsoutput_deviceprocess_grouplocalzSkip DDP modelr    );r   pretrained_bertBertForMultipleChoicefrom_pretrainedtokenizer_model_typerL   rM   rN   rO   BertForSequenceClassificationNotImplementedError
cloze_evalr   rR   rS   rT   rU   hidden_dropoutattention_dropoutr   
mem_lengthr[   r\   transformer_xlr/   	masked_lmprompt_funcrc   freeze_transformerrd   fast_decoder   re   r
   r   adapetr   output_dropout
pool_tokenr   r%   r&   r'   get_model_parallel_ranksum
parametersfp16halfcudar(   current_devicer   r+   train_itersepochsDDP_implr-   get_data_parallel_groupLocalDDP)	r   
model_typemulti_tokenrQ   ra   r   r`   paralle_outputir    r    r!   	get_modelX   s  	

	r   c                 C   sZ   t | tttfr| j} t | tttfst| }|D ]}|d D ]
}t|ds)d|_qq|S )Nparamsmodel_parallelF)r,   r   r-   r   r#   r   r.   r   )r   param_groupsparam_groupparamr    r    r!   get_optimizer_param_groups   s   
r   c                 C   s   |j r|jrtjj}nddlm} |}|| |j|jd}n-|j	dkr3t
| |j|j|j|jf|jd}n|j	dkrHddlm} || |jddd	}nttd
|jj  t|dr]|jr]t|jrqt||j|j|j|j|jdd}|S )zSet up the optimizer.r   )DeepSpeedCPUAdam)lrweight_decayadam)r   r   betaseps	adafactor)	AdafactorF)r   relative_stepwarmup_initzOptimizer = r+   )scale_window	min_scaledelayed_shift)static_loss_scaledynamic_loss_scaledynamic_loss_args)cpu_optimizercpu_torch_adamr(   optimAdamWdeepspeed.ops.adamr   r   r   	optimizerAdam
adam_beta1
adam_beta2adam_epstransformersr   rw   r&   	__class____name__r.   r+   r   r   
loss_scaler   loss_scale_windowr   
hysteresis)r   r   cpu_adam_optimizerr   r   r   r    r    r!   get_optimizer   sN   





r   c              	   C   sd   |j dur	|j }n|j}|jr||j }td|}d}|j| }t| |j||| |j||j	d}|S )z"Build the learning rate scheduler.Nr   )start_lrwarmup_iter	num_itersdecay_style	last_iterdecay_ratio)
lr_decay_itersr   finetunegradient_accumulation_stepsmaxwarmupAnnealingLRr   lr_decay_stylelr_decay_ratio)r   r   r   	init_stepr   lr_schedulerr    r    r!   get_learning_rate_scheduler  s$   



	r   c           
      C   s   t | ||||d}t|}| jdus!| jdurB| jdks!| jdkrB| jr7td tj||| t	dd\}}}}nt
|| }t|| }	nd\}}	|||	fS )zSetup model and optimizer.)r   r   rQ   ra   Nr   zDeepSpeed is enabled.F)r   model_parametersr   r   dist_init_required)NN)r   r   
train_datadata_dirr   r   r+   r   
initializer   r   r   )
r   r   r   rQ   ra   r   r   r   _r   r    r    r!   setup_model_and_optimizer.  s.   

r   c                 C   s   |}|j r|| n|jr| j|dd n|  |j s"|jdkr)|d  n|d  |jd|jd |d  |j s`|jrG| 	  |j
dkr`|jsZt| |j
 |S | |j
 |S )zBackward step.F)update_master_gradsr(   	allreduce)reduce_afterfp32_allreducer   )r+   backwardr   r   resetstartallreduce_paramsr   stopr   	clip_gradr   clip_grad_normr   clip_master_grads)r   r   lm_lossr   timerslossr    r    r!   backward_stepQ  s,   
r   Fc                 C   s   |sd S t   t  dkrDt|  tdtj d d tdtj d d tdtj d d tdtj	 d d td d S d S )	Nr   zMemory Allocated i   @	GigaByteszMax Memory Allocated zCache Allocated zMax cache Allocated  )
distbarrierr)   r&   r(   r   memory_allocatedmax_memory_allocatedmemory_cachedmax_memory_cached)messageforcer    r    r!   see_memory_usagez  s,   r   c	                 C   s  d\}	}
|du r
g n|}|j s|  	 d\}}|d  || ||||\}}}|d  |j s7||j }|  d}tj	j
|jt d |j|j|j  |_t|s|	|7 }	|
d7 }
|d  t||||| |d  |d	  |j r| r|  d}|jr|js|  n d}n|  n|
|jkr|  d}|jr|js|  nd}|d	  |rnntd
 ~~g }|rnq|j r|	|
 }	|	||fS )zSingle training step.)g        r   NT)r   Fforwardr   )groupr   r   zFound NaN loss, skip backward)r+   	zero_gradr   r   r   detachr   viewr(   r   
all_reducer1   r   r   
world_sizemodel_parallel_sizer   _has_inf_or_nanr   !is_gradient_accumulation_boundarystepr   overflowr   )data_iteratorr   r   r   r   r   forward_step_funcmemssingle_steplm_loss_totalcountskipped_itercompleter   r   reduced_lossr    r    r!   
train_step  sl   









9
r  )N)NTNN)F)NF)%r+   r(   apex.optimizersr   r   megatron_utilr   megatron_util.fp16r   r   r   r   r   r   r	   r   r
   r   r   r   r   r   r-   r   utilsr   r   r   rJ   r   r   r   r   r   r   r   r  r    r    r    r!   <module>   s<   
F
z3
#
)