o
    Ii                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm	  m
Z d dlmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ e rd dlmZmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z:m;Z;m<Z<m=Z=m>Z> d dl?m@Z@mAZAmBZB d dlCmDZD d dlEmFZF d dlGmHZHmIZImJZJmKZKmLZLmMZM d dlNmOZO d dlPmQZQmRZRmSZSmTZTmUZUmVZV d dlWmXZXmYZYmZZZ dGd!d"Z[d#d$ Z\G d%d& d&Z]d'd( Z^d)d* Z_G d+d, d,eZ`d-d. ZaG d/d0 d0ZbG d1d2 d2eZcd3d4 ZdG d5d6 d6eZeG d7d8 d8eeZfG d9d: d:eeZgG d;d< d<eeZhd=d> ZidHd?d@ZjG dAdB dBej	jkZldCdD ZmdEdF ZndS )I    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_available)recursively_applysend_to_device)mputensor_parallel)DistributedDataParallel)finalize_model_grads)	ModelType)get_num_microbatches)get_megatron_optimizer)get_tensor_model_parallel_group"get_tensor_model_parallel_src_rank)get_forward_backward_func)get_model_config)build_train_valid_test_datasets)	BertModelT5Model)Classification)get_argsget_tensorboard_writerget_tokenizerprint_rank_last)_add_data_args_add_validation_args!core_transformer_config_from_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint)set_global_variables)gpt_builder)_compile_dependencies_init_autoresume_initialize_distributed_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)_vocab_size_with_padding)%build_train_valid_test_data_iteratorsget_optimizer_param_schedulernum_floating_point_operationssetup_model_and_optimizer
train_steptraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsTc           	   	   C   s   t  }|jrdnd}|jdkrtd|j d| d td t|}|jdkrK|jr?|jr0d	nd}t|||jd
| |d}|S t||j	d	| |d}|S |jdkr^d|_
t|| |ddd}|S |jdkrpt|dd
| |||d}|S td|j )zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr   T)confignum_tokentypesadd_binary_headparallel_outputpre_processpost_process)r<   num_classesr=   r@   rA   gptFN)vp_stager<   t5)r<   r=   r?   r@   rA   add_encoderadd_decoderUnsupported model type: )r   pretraining_flagrankprintmodel_type_namer#   bert_binary_headr   r   
num_labelsuse_legacy_modelsr*   r   
ValueError)	r@   rA   rF   rG   argsmoder<   r=   model rT   P/home/ubuntu/.local/lib/python3.10/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcU   sX   

 

rV   c                 C   s   |  d t }| jjjd ur3| jjjd u rtd| jjj}| jj|}t| |}t| |d d}n!t	j
}|jdkr>t	j}t}| jjjd urL| jjj}t||\}}}t||_|||fS )Nz#Preparing model optimizer schedulerzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)	schedulerrE   )rK   r   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrP   prepare_optimizerprepare_schedulerr   encoder_or_decoderrL   encoder_and_decoderrV   r5   len	model_len)acceleratorrQ   custom_model_provider_funcrS   	optimizerrW   
model_typemodel_provider_func_rT   rT   rU   !prepare_model_optimizer_scheduler   s.   







rg   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )MegatronLMDummyDataLoaderz
    Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

    Args:
        **dataset_kwargs: Megatron data arguments.
    c                 K   sH   t  }t|}t|}| }t|d | _| j| d| jd< d S )Nr   Tmegatron_dataset_flag)argparseArgumentParserr!   r"   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argsrT   rT   rU   __init__   s   z"MegatronLMDummyDataLoader.__init__c              
   C   s^   t  }| j D ]$\}}t||d}||kr&td| d| d| d|  t||| qd S )N z<WARNING: MegatronLMDummyDataLoader overriding arguments for : with )r   rn   itemsgetattrrK   setattr)rp   rQ   keyvalue	old_valuerT   rT   rU   set_megatron_data_args   s   z0MegatronLMDummyDataLoader.set_megatron_data_argsc                 C   s   dd }|j jjd ur|j jjS z9t }|jdkr%ddlm} d|_|W S |jdkr6ddlm} d|_|W S |jdkrGddl	m} d|_|W S W |S  t
yS   Y |S w )	Nc                 S   s   t  }t|jttfr|jn|jg|j| |jd}|jdkr)||j	|j
d n'|jdkr7|d|j	i n|jdkrH||j|jdd ntd|j td
i |\}}}|||fS )z&Build train, valid, and test datasets.)data_prefixsplits_stringtrain_valid_test_num_samplesseedr;   )max_seq_lengthbinary_headrC   r   rE   )r   max_seq_length_decdataset_typerH   NrT   )r   
isinstance	data_pathlisttuplesplitr   rL   ro   
seq_lengthrM   encoder_seq_lengthdecoder_seq_lengthrP   r   )train_val_test_num_samplesrQ   rn   train_dsvalid_dstest_dsrT   rT   rU   "train_valid_test_datasets_provider   s6   



zlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_providerr;   r   )r   TrC   rE   )rX   rY   *custom_megatron_datasets_provider_functionr   rL   pretrain_bertr   is_distributedpretrain_gptpretrain_t5ImportError)rp   rb   r   rQ   rT   rT   rU   &get_train_valid_test_datasets_provider   s.   #



z@MegatronLMDummyDataLoader.get_train_valid_test_datasets_providerc           	      C   s   t  }| |}|jd ur=g }g }g }tt|ddD ] }t| t|}||d  ||d  ||d  qnt|\}}}|||fS )Nra   r   r
   r   )	r   r   $virtual_pipeline_model_parallel_sizerangery   r   (set_virtual_pipeline_model_parallel_rankr2   append)	rp   rb   rQ   !train_valid_test_dataset_providertrain_data_iteratorvalid_data_iteratortest_data_iteratori	iteratorsrT   rT   rU   r2      s"   




z?MegatronLMDummyDataLoader.build_train_valid_test_data_iteratorsN)__name__
__module____qualname____doc__rt   r~   r   r2   rT   rT   rT   rU   rh      s    	
<rh   c                 C   sR   G dd d}|d u }t j|t j| jd}t jj|t t d |s'|r'| S |S )Nc                   @   s   e Zd Zdd Zdd ZdS )z?_handle_megatron_data_iterator.<locals>.DummyMegatronDataloaderc                 S   s   | S NrT   rp   rT   rT   rU   __iter__     zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__iter__c                 S   s   i S r   rT   r   rT   rT   rU   __next__  r   zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__next__N)r   r   r   r   r   rT   rT   rT   rU   DummyMegatronDataloader  s    r   dtypedevicegroup)torchtensorboolr   distributed	broadcastr   r   )rb   data_iteratorr   is_data_iterator_emptyis_src_data_iterator_emptyrT   rT   rU   _handle_megatron_data_iterator  s   
r   c           	   
      sh  |  d t }|jspddlm m} |j|j } fdd D }|d d u rHt|d t	j
jjr9||d _n|d= |d= |d= ||d	 _n|d	= ||d< t	j
jjjfi ||| jt t d
d| j | jdS |jd ur|j\|_|_|_nd\|_|_|_|j|j |_| \}}}|j|j |_t| |d}t| |d}t| |d}|||fS )NzPreparing dataloaderr   )_PYTORCH_DATALOADER_KWARGSprepare_data_loaderc                    s   i | ]}|t | | qS rT   )ry   ).0kr   
dataloaderrT   rU   
<dictcomp>(  s    z'prepare_data_loader.<locals>.<dictcomp>
batch_sizesamplershufflebatch_samplerFT)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batches)r   r   r   )rb   r   )rK   r   ri   data_loaderr   r   micro_batch_sizenum_micro_batchesr   r   utilsdataBatchSamplerr   
DataLoaderdatasetr   r   get_data_parallel_world_sizeget_data_parallel_rankr   copyr   consumed_samplesconsumed_train_samplesconsumed_valid_samplesconsumed_test_samplesr2   r   )	rb   r   rQ   r   r   kwargsr   r   r   rT   r   rU   r   !  s`   


r   c                       s:   e Zd Z fddZd
ddZdd Zedd	 Z  ZS )MegatronLMOptimizerWrapperc                    s   t  j|dd d d S )NF)device_placementscalersuperrt   )rp   rd   	__class__rT   rU   rt   d  s   z#MegatronLMOptimizerWrapper.__init__Nc                 C      d S r   rT   )rp   set_to_nonerT   rT   rU   	zero_gradg  r   z$MegatronLMOptimizerWrapper.zero_gradc                 C   r   r   rT   r   rT   rT   rU   stepj  r   zMegatronLMOptimizerWrapper.stepc                 C   s   | j jS )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)rd   skipped_iterr   rT   rT   rU   step_was_skippedm  s   z+MegatronLMOptimizerWrapper.step_was_skippedr   )	r   r   r   rt   r   r   propertyr   __classcell__rT   rT   r   rU   r   c  s    
r   c                 C   s$   |  d t }t||j|j|jS )NzPreparing optimizer)rK   r   r   no_wd_decay_condscale_lr_condlr_mult)rb   rS   rQ   rT   rT   rU   r\   s  s   
r\   c                   @   s   e Zd ZdZdddZdS )MegatronLMDummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    Nr   c                 K   s   || _ || _|| _|| _d S r   )rd   total_num_stepswarmup_num_stepsr   )rp   rd   r   r   r   rT   rT   rU   rt     s   
z!MegatronLMDummyScheduler.__init__Nr   )r   r   r   r   rt   rT   rT   rT   rU   r   z  s    r   c                       s$   e Zd Z fddZdd Z  ZS )MegatronLMSchedulerWrapperc                    s   t  || d S r   r   )rp   rW   
optimizersr   rT   rU   rt     s   z#MegatronLMSchedulerWrapper.__init__c                 O   r   r   rT   )rp   rQ   r   rT   rT   rU   r     r   zMegatronLMSchedulerWrapper.step)r   r   r   rt   r   r   rT   rT   r   rU   r     s    r   c                 C   s   |  d t|}|S )NzPreparing scheduler)rK   r3   )rb   rd   rW   rT   rT   rU   r]     s   
r]   c                       8   e Zd ZdZ fddZdd Zdd Zdd	 Z  ZS )
AbstractTrainStepz;Abstract class for batching, forward pass and loss handler.c                    s   t    || _d S r   )r   rt   name)rp   r   r   rT   rU   rt     s   

zAbstractTrainStep.__init__c                 C   r   r   rT   )rp   rb   ri   rT   rT   rU   get_batch_func  r   z AbstractTrainStep.get_batch_funcc                 C   r   r   rT   r   rT   rT   rU   get_forward_step_func  r   z'AbstractTrainStep.get_forward_step_funcc                 C   r   r   rT   )rp   rb   rT   rT   rU   get_loss_func  r   zAbstractTrainStep.get_loss_func)	r   r   r   r   rt   r   r   r   r   rT   rT   r   rU   r     s    r   c                       r   )
BertTrainStepzg
    Bert train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    sh   t  d | ||j| _| ||j|j| _| 	|j|j
| _|js)d | _d S ddlm} || _d S )Nr   r   )SequenceClassifierOutput)r   rt   r   ri   	get_batchr   rI   rN   	loss_funcr   rM   forward_stepmodel_return_dictmodel_output_classtransformers.modeling_outputsr   )rp   rb   rQ   r   r   rT   rU   rt     s   

zBertTrainStep.__init__c                 C   X   dd }dd }|j jjd ur|j jjS |r*z	ddlm} |W S  ty)   Y |S w |S )Nc                 S   s   g d}t j}| durt| }nd}t|||}|d  }|d  }|d  }|d  }|d  }	|d  }
|||||	|
fS )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr  r  r	  r
  r  r  r   int64nextr   broadcast_datalongfloat)r   keysdatatyper   data_btokensr  sentence_orderr
  	lm_labelsr  rT   rT   rU   get_batch_megatron  s   
z8BertTrainStep.get_batch_func.<locals>.get_batch_megatronc                 S   s   t | }t|tj }|d  }|d  }d|v r#|d  }nd}d|v r:|d  }|d dktj}nd}d}d|v rI|d  }nd}||||||fS )r  	input_idsattention_masktoken_type_idsNr  next_sentence_label)r  r   r   cudacurrent_devicer  tor  )r   r   r  r  r  r  r
  r  rT   rT   rU   get_batch_transformer  s    z;BertTrainStep.get_batch_func.<locals>.get_batch_transformerr   r   )rX   rY   custom_get_batch_functionr   r   r   rp   rb   ri   r  r!  r   rT   rT   rU   r     s   
zBertTrainStep.get_batch_funcc                    s:   dd } fdd}|j jjd ur|j jjS |r|S |S )Nc           	      S   s   |\}}|  }|   } t|d| d |   }|d urKtj|dd  |ddd}|  }|| }t||g}||d |d dfS |}t|g}|d|d ifS )Nr   )ignore_indexr   r
   )lm losszsop lossr'  )r  r   sumviewreshapeFcross_entropyr8   )	r
  r  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossesrT   rT   rU   loss_func_pretrain  s   ""
z7BertTrainStep.get_loss_func.<locals>.loss_func_pretrainc                    s    dkrt  }||d| d}n&jdkr1| jtjtjfv r1t }||d | d}nt }||| }t	|g}|d|d ifS )Nr
   r%  r2  r   )
r   r)  rN   r   r   r  intr   r   r8   )r  logitsloss_fctr2  r3  rN   rp   rT   rU   loss_func_finetune  s   

z7BertTrainStep.get_loss_func.<locals>.loss_func_finetunerX   rY   custom_loss_function)rp   rb   rI   rN   r4  r9  rT   r8  rU   r     s   
zBertTrainStep.get_loss_funcc                    s    fdd}|S )Nc           
         sb    | \}}}}}} sd}r"|||||d}|tj||fS ||||d}	|	tj|fS )Forward step.Ntokentype_idsr  )r>  r   r   r   )
r   rS   r  r  r  r
  r  r  r-  r6  rM   rI   rp   rT   rU   r   .  s   z9BertTrainStep.get_forward_step_func.<locals>.forward_steprT   )rp   rI   rM   r   rT   r@  rU   r   -  s   z#BertTrainStep.get_forward_step_func	r   r   r   r   rt   r   r   r   r   rT   rT   r   rU   r     s    @)r   c                       r   )
GPTTrainStepzf
    GPT train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    s   t  d | ||j| _| || _|  | _|j	d ur%t
 }|j| _|j| _|j| _|j| _|j| _|j| _|jsAd | _d S ddlm} || _d S )NrB  r   )!CausalLMOutputWithCrossAttentions)r   rt   r   ri   r   r   r   r   r   
vocab_filer   eod	eod_tokeneos_token_id	pad_tokenreset_position_idsreset_attention_maskeod_mask_lossr  r  r  rC  )rp   rb   rQ   	tokenizerrC  r   rT   rU   rt   F  s    



zGPTTrainStep.__init__c                    s`    fdd} fdd}|j jjd ur|j jjS |r.z	ddlm} |W S  ty-   Y |S w |S )Nc              	      s   dg}t j}| durt| }nd}t|||}|d  }|ddddf  }|ddddf  }t| j j j	 j
 jdd\}}	}
|||	||
fS )zGenerate a batchr  Nr
   r%  TrF  rH  rI  rJ  rK  pad_mask_loss)r   r  r  r   r  r  
contiguousr:   rF  rI  rJ  rK  )r   r  r  r   r  tokens_r  r  r  r
  position_idsr   rT   rU   r  [  s&   
	z7GPTTrainStep.get_batch_func.<locals>.get_batch_megatronc           	   	      s   t | }d|d i}t|tj }|d  }tj|jd df|j|j	d j
 }tj||gdd}|d d dd f  }|d d d df  }t| j
 j
 j j jdd\}}}|||||fS )	Nr  r   r
   r   dimr%  TrM  )r  r   r   r  r  r  zerosshaper   r   rF  concatrO  r:   rI  rJ  rK  )	r   r   rP  paddingr  r  r  r
  rQ  r   rT   rU   r!  y  s$   $	z:GPTTrainStep.get_batch_func.<locals>.get_batch_transformerr   r"  )rX   rY   r#  r   r   r   r$  rT   r   rU   r   Z  s   
zGPTTrainStep.get_batch_funcc                    s.   t    fdd}|jjjd ur|jjjS |S )Nc                    s   j r|\}}n|}| }| d }  jdkrDtt|d|  d|  dg}tjj|t	
 d |d |d  }nt|d|  |   } jrrtj }| rrJ d| dtj  dt d  t|g}d|d i} j r|d	|i ||fS )
Nr%  r
   r   r   zRank z7: found NaN in local forward loss calculation. Device: z, node: r'  r6  )return_logitsr  r)  context_parallel_sizer   catr(  r   
all_reducer   get_context_parallel_groupcheck_for_nan_in_loss_and_gradget_rankisnanr  r  osunamer8   ro   )r
  r-  lossesr6  r2  global_rankaveraged_lossoutput_dictrQ   rT   rU   r     s0   

.



z-GPTTrainStep.get_loss_func.<locals>.loss_func)r   rX   rY   r;  rp   rb   r   rT   rf  rU   r     s
   
zGPTTrainStep.get_loss_funcc                        fdd}|S )Nc                    s4     | \}}}}}|||||d}|t j|fS )r<  )r  r?  )r   rS   r  r  r
  r  rQ  r-  r   rT   rU   r     s   z8GPTTrainStep.get_forward_step_func.<locals>.forward_steprT   rp   r   rT   r   rU   r     s   z"GPTTrainStep.get_forward_step_funcrA  rT   rT   r   rU   rB  >  s    C%rB  c                       s\   e Zd ZdZ fddZedd Zedd Zedd	 Zd
d Z	dd Z
dd Z  ZS )T5TrainStepze
    T5 train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    sX   t  d | ||j| _| || _|  | _|j	s!d | _
d S ddlm} || _
d S )Nrj  r   )Seq2SeqLMOutput)r   rt   r   ri   r   r   r   r   r   r  r  r  rk  )rp   rb   rQ   rk  r   rT   rU   rt     s   


zT5TrainStep.__init__c                 C   s(   |  d}|  d}|| }|dk }|S )Nr
   r         ?)	unsqueeze)r  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_maskrT   rT   rU   attn_mask_postprocess  s
   

z!T5TrainStep.attn_mask_postprocessc                 C   s&   t t jd| | f|d}|dk }|S Nr
   r   rl  )r   trilones)r   r   r  rT   rT   rU   get_decoder_mask  s   zT5TrainStep.get_decoder_maskc           	      C   s<   | j \}}| d}tj||df|d}|| }|dk }|S rs  )rU  rm  r   rv  )	r  dec_seq_lengthr   r   _rn  ro  rp  rq  rT   rT   rU   get_enc_dec_mask  s   

zT5TrainStep.get_enc_dec_maskc                 C   r  )Nc                 S   s   g d}t j}| durt| }nd}t|||}|d  }|d  }|d  }|d  }|d dk }	|d	 dk }
|d
 dk }|||||	|
|fS )r  )text_enctext_decr  r
  enc_maskdec_maskenc_dec_maskNr{  r|  r  r
  r}  rl  r~  r  r  )r   r  r  r   r  
tokens_enc
tokens_decr  r
  r}  r~  r  rT   rT   rU   r    s   
z6T5TrainStep.get_batch_func.<locals>.get_batch_megatronc           	      S   s   t | }t|tj }|d  }|d  }|dktj}d|v r+|d  }n'|j|j	|j
tjd}|dddf  |dd	df< d
|d< ||dkd
 t|d  }t|j	d	 |j
}t|d  |j	d	 |j
}|||||||fS )r  r  r  r  decoder_input_ids)r   r   .Nr%  r
   r   ).r   r  )r  r   r   r  r  r  r   r  	new_zerosrU  r   clonemasked_fill_rj  rr  rw  rz  )	r   r   r  r  r
  r  r}  r~  r  rT   rT   rU   r!    s"    z9T5TrainStep.get_batch_func.<locals>.get_batch_transformerr   r"  )rX   rY   r#  r   r   r   r$  rT   rT   rU   r     s   
zT5TrainStep.get_batch_funcc                 C   s$   dd }|j jjd ur|j jjS |S )Nc                 S   sH   |  }t|d| d |   }|}t|g}|d|d ifS )Nr%  r'  r   )r  r   r(  r)  r*  r8   )r
  r-  r.  r0  r2  r3  rT   rT   rU   r   A  s
   "
z,T5TrainStep.get_loss_func.<locals>.loss_funcr:  rg  rT   rT   rU   r   @  s   	
zT5TrainStep.get_loss_funcc                    rh  )Nc           
   	      s>     | \}}}}}}}||||||d|d}	|	t j|fS )r<  Nr=  r?  )
r   rS   r  r  r
  r  r}  r~  r  r-  r   rT   rU   r   O  s   z7T5TrainStep.get_forward_step_func.<locals>.forward_steprT   ri  rT   r   rU   r   N  s   z!T5TrainStep.get_forward_step_func)r   r   r   r   rt   staticmethodrr  rw  rz  r   r   r   r   rT   rT   r   rU   rj    s    


?rj  c                  C   s@   t  } td d d  | jdkrtd| j d t| j| j d S )Nr   z> setting random seeds to z ...)r   r-   rJ   rK   r   r.   data_parallel_random_initrf  rT   rT   rU   finish_mpu_init_  s
   
r  c              
   C   sD  |d u ri }|  d tj sJ dt|dd}| D ],\}}t||d d urD|jdkrDt d| dt|| d| d| dd	 t||| q|j	sT|
d
dra|jd us]J dt| t| t|dd t  t  t  t  t }t|dd d u rt|j||_|jdkr|jr|jdkrd|_nd|_d|_d S )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z*WARNING: overriding default arguments for rv   rw   )flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argument)build_tokenizerpadded_vocab_sizer;   r   )rK   r   r  is_availabler$   rx   ry   rJ   rz   r  getloadr&   r%   r)   r  r,   r+   r/   r   r1   orig_vocab_sizer  rL   rI   rN   rM   	iteration)rb   extra_args_providerargs_defaultsrQ   r{   r|   rT   rT   rU   
initializel  s:   

 
r  c                       sp   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Z  ZS )MegatronEnginez
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    c                    s   t    || _|d | _|| _|| _t }|jjj	d ur-|jjj	|fi |jjj
| _n,|jdkr9t||| _n |jdkrEt||| _n|jdkrQt||| _ntd|j d| j_i | _i | _d| _d| _d| _d | _|jd uryt  d S d S )Nr   r;   rC   rE   rH   FT)r   rt   module
base_modelrd   rW   r   rX   rY   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrL   r   rB  rj  rP   r   total_loss_dicteval_total_loss_dictr  report_memory_flag$num_floating_point_operations_so_farmodule_configtensorboard_dirr0   )rp   rb   rS   rd   rW   rQ   r   rT   rU   rt     s:   







zMegatronEngine.__init__c                    s   t  }t jd } jj|_t jd trR|jrR|j	d u s#J ddd  jD |_	t
 jdkr9|j	d |_	|jrRdd  jD |_t
 jdkrR|jd |_|jrt|jrt fddtt
 jD |_t
 jdkrt|jd |_t|_|S )Nr   zWhen overlap_grad_reduce is True, config.no_sync_func must be None; a custom no_sync_func is not supported when overlapping grad-reducec                 S      g | ]}|j qS rT   )no_syncr   model_chunkrT   rT   rU   
<listcomp>      z4MegatronEngine.get_module_config.<locals>.<listcomp>r
   c                 S   r  rT   )start_grad_syncr  rT   rT   rU   r    r  c                    s   g | ]	  fd dqS )c                    s   j  | S r   )rd   finish_param_sync)x)model_indexrp   rT   rU   <lambda>  s    z=MegatronEngine.get_module_config.<locals>.<listcomp>.<lambda>rT   )r   r   )r  rU   r    s    )r   r   r  rd   
scale_lossgrad_scale_funcr   LocalDDPoverlap_grad_reduceno_sync_funcr`   delay_grad_reducegrad_sync_funcoverlap_param_gatherdelay_param_gatherr   param_sync_funcr   finalize_model_grads_func)rp   rQ   r<   rT   r   rU   get_module_config  s,   

z MegatronEngine.get_module_configc                 C   s4   | j D ]}|  q| jd u r|  | _|   d S r   )r  trainr  r  log_eval_resultsrp   model_modulerT   rT   rU   r    s
   



zMegatronEngine.trainc                 C   s0   | j D ]}|  q| jd u r|  | _d S d S r   )r  evalr  r  r  rT   rT   rU   r    s
   


zMegatronEngine.evalc                    s   t   g t|dkr, jdkr)td jD ] fdd| D  qn|gt| jdkrSt|dkrIfddtt| jD }|S d gt| j }|S t|dkr]tnd }|S )Nr   r
   c                    s.   i | ]\}}|| j  d   j   qS )r
   )r   )r   r   v)rQ   r   rT   rU   r     s    z:MegatronEngine.get_batch_data_iterator.<locals>.<dictcomp>c                    s   g | ]}t  qS rT   )iterr   ry  )data_chunksrT   rU   r        z:MegatronEngine.get_batch_data_iterator.<locals>.<listcomp>)r   r`   r   r   r   rx   r  r  )rp   
batch_databatch_data_iteratorrT   )rQ   r  r   rU   get_batch_data_iterator  s,   
z&MegatronEngine.get_batch_data_iteratorc              	   K   sT   |  |}t| jj|| j| j| j| jt d\}}}}}}}|dk| j_	||||fS )z
        Training step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to train on.
        )forward_step_funcr   rS   rd   opt_param_schedulerr<   forward_backward_funcr
   )
r  r6   r  r   r  rd   rW   r  r   r   )rp   r  r  loss_reducedr   ry  	grad_normnum_zeros_in_gradrT   rT   rU   r6     s   

zMegatronEngine.train_stepc              	      s   t  }| |}t }|| jj|| jt |j|jdd}|j	dkr&t
j  | jt |j t  7  _tjddrji }|d D ]&  fdd|D }t|d jdkr`t|t| | < qAt
|| < qA|S i S )z
        Evaluation step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.
        T)r  r   rS   num_microbatchesr   r   forward_onlyr
   )ignore_virtualr   c                    s   g | ]}|  qS rT   rT   )r   r  r{   rT   rU   r  C  r  z,MegatronEngine.eval_step.<locals>.<listcomp>)r   r  r   r  r   r  r   r   r   empty_unused_memory_levelr   r  empty_cacher   r   r   is_pipeline_last_stager`   rU  r(  rV  )rp   r  rQ   r  r  
loss_dictsr  losses_reduced_for_keyrT   r  rU   	eval_step#  s4   



zMegatronEngine.eval_stepc                 K   s  t  }| jd jre| jd
i |\}}}}|  jd7  _t |j t  }| j	|7  _	|  j
t||7  _
|jd urd| j  }d }	|jrMt| j}	t|| j| jjd d | j|| j|||	|
| _n?| jd
i |}|jd ur|D ]/}
| j|
tjdg||
  | j|
< | j|
d tjdgtjdg | j|
d < qttjdtj d}|D ]}
t||
 j dkr|||
 7 }qd }d|v r|d }| j!j"d ur| j!j"||d	S |S )Nr   r
   lrg        
_num_itersg      ?rt  r6  )r2  r6  rT   )#r   r  trainingr6   r  r   r   r   r   r   r  r4   r  rd   get_loss_scaleitemlog_params_normr9   rS   r7   r  param_groupsr  r  r  r  r   r  FloatTensorr   r  r`   rU  r  r  )rp   r  rQ   	loss_dictr   r  r  r   
loss_scaleparams_normr{   r2  r6  rT   rT   rU   forwardK  s\   


zMegatronEngine.forwardc                 C   s  t  }|jd u s| jdkrd S t  }t }d| j d}| jD ]R}|dr'q| j| | j|d   }|| d| d7 }ttd|	 }|j
rT|| d| d7 }|rq|| d|	 | j |j
rq|| d	|| j qt|d
 }td|  t| td|  i | _d S )Nr   zvalidation loss at iteration z | r  z value:    z PPL: z validationz validation pplr
   -)r   r  r  r   r  endswithmathexpminr  rI   
add_scalarr`   r    )rp   rQ   writerstringr{   r|   ppllengthrT   rT   rU   r    s0   


zMegatronEngine.log_eval_resultsc                 C   sH   |    t }||_tj  t| j| j| j	| j
| jd tj  d S )N)r  )r  r   saver   r   barrierr(   r  r  rd   rW   r  )rp   
output_dirrQ   rT   rT   rU   r(     s   
zMegatronEngine.save_checkpointc                 C   st   t  }||_d|_d|_tj  t| j| j	| j
\}}tj  || _|| _|jr6| jdkr8| j	  d S d S d S r   )r   r  r   r   r   r   r  r'   r  rd   rW   r  r  fp16reload_model_params)rp   	input_dirrQ   r  r  rT   rT   rU   r'     s   

zMegatronEngine.load_checkpoint)r   r   r   r   rt   r  r  r  r  r6   r  r  r  r(   r'   r   rT   rT   r   rU   r    s    
	(?r  c                 C   s   t | S )z
    Average losses across data parallel group.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.
    )r8   )rb  rT   rT   rU   %avg_losses_across_data_parallel_group  s   r  c                 C   s   dd }t || ddS )z
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    c                    s^    j dkr  d    fddttjjt dD }tjj| t d tj	|ddS )Nr   c                    s   g | ]}t  qS rT   )r   
empty_liker  r   rT   rU   r    s    zOgather_across_data_parallel_groups.<locals>._gpu_gather_one.<locals>.<listcomp>r   rR  )
ndimr  r   r   r   get_world_sizer   get_data_parallel_group
all_gatherrZ  )r   output_tensorsrT   r  rU   _gpu_gather_one  s   

z;gather_across_data_parallel_groups.<locals>._gpu_gather_oneT)error_on_other_type)r   )r   r  rT   rT   rU   "gather_across_data_parallel_groups  s   

r  )TTTT)NN)orj   r  r`  abcr   	functoolsr   r   torch.nn.functionalnn
functionalr+  torch.nnr   r   r   rd   r   rW   r	   importsr   
operationsr   r   megatron.corer   r   megatron.core.distributedr   r  r   megatron.core.enumsr   )megatron.core.num_microbatches_calculatorr   megatron.core.optimizerr   megatron.core.parallel_stater   r   megatron.core.pipeline_parallelr   megatron.core.utilsr   "megatron.legacy.data.dataset_utilsr   megatron.legacy.modelr   r   $megatron.legacy.model.classificationr   megatron.trainingr   r   r   r    megatron.training.argumentsr!   r"   r#   r$   r%   megatron.training.checkpointingr&   r'   r(   megatron.training.global_varsr)   megatron.training.gpt_buildersr*   megatron.training.initializer+   r,   r-   r.   r/   r0   %megatron.training.tokenizer.tokenizerr1   megatron.training.trainingr2   r3   r4   r5   r6   r7   megatron.training.utilsr8   r9   r:   rV   rg   rh   r   r   r   r\   r   r   r]   r   r   rB  rj  r  r  Moduler  r  r  rT   rT   rT   rU   <module>   st     
1mB   
2  %