o
    }oi-                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d d	lmZmZmZmZmZmZmZmZ d d
lmZ d dlmZmZmZ G dd dZ G dd de Z!G dd de Z"G dd de Z#G dd de Z$dS )    N)OptionalUnion)MisconfigurationException)Trainer)ModelSummary)TorchElasticEnvironment)
DictConfig	open_dict)FLOPsMeasurementCallback)CustomProgressBarFSDPMixedPrecisionPlugin
GradScalerMegatronHalfPrecisionPluginNLPDDPStrategyNLPDDPStrategyNotebookNLPFSDPStrategyPipelineMixedPrecisionPlugin)logging)AsyncFinalizableCheckpointIOAsyncFinalizerCallbackDistributedCheckpointIOc                   @   sz   e Zd ZdZdeddfddZdeeef fddZ	de
fd	d
ZdefddZdee defddZddefddZdS )MegatronTrainerBuilderz
    Builder type to hide complex configuration of PTL Trainers for Megatron LLM models.
    Can be extended to change behavior for a specific model.
    cfgreturnNc                 C   s
   || _ d S N)r   )selfr    r   g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/nlp/parts/megatron_trainer_builder.py__init__1   s   
zMegatronTrainerBuilder.__init__c                 C   s  t tdp
ttjj}|r| jjjdkrt	d t
dddS | jjddr| jjjdd	ks;| jjjdd
kr?J d| jjdd}| jjdddkrW|rWJ d| jjddrit	d d| jj_t| jjdd| jjdd| jjdd| jjdd|| jjj| jjdd| jjdd| jjddd	S td| jjjd| jjdd| jjdd| jjdddS )zN
        Returns a DDP or a FSDP strategy passed to Trainer.strategy.
        ps1   z>Detected interactive environment, using NLPDDPStrategyNotebookTF)no_ddp_communication_hookfind_unused_parametersfsdpnamedistributed_fused_adammcore_distributed_optimz/Distributed optimizer cannot be used with FSDP.fsdp_sharded_checkpointtensor_model_parallel_sizez:FSDP sharded checkpoint is not supported when TP size > 1.megatron_amp_O2zJTorch FSDP is not compatible with O2 precision recipe. Setting O2 `False`.fsdp_limit_all_gathersfsdp_sharding_strategyfullfsdp_cpu_offloadfsdp_grad_reduce_dtype    nccl_communicator_config_pathNsharpfsdp_use_orig_params)	limit_all_gatherssharding_strategycpu_offloadgrad_reduce_dtypesharded_checkpoint	precisionr0   r1   use_orig_paramsdist_ckpt_parallel_dist_opt)r!   gradient_as_bucket_viewr"   r0   r1   dist_ckpt_parallel_save)hasattrsysboolflagsinteractiver   trainerdevicesr   infor   modelgetoptimr)   r   r8   r   r;   )r   _IS_INTERACTIVEr7   r   r   r   _training_strategy4   sJ   


z)MegatronTrainerBuilder._training_strategyc                 C   s2   t | jjdd| jjdd| jjdddS )z9
        Returns a scaler for precision plugins.
        native_amp_init_scale        native_amp_growth_interval  
hysteresis   )
init_scalegrowth_intervalrN   r   r   rE   rF   r   r   r   r   _grad_scalera   s
   z#MegatronTrainerBuilder._grad_scalerc           	      C   s  | j jdd}| j jdr#| j jjddkp"| j jjddknd}g }| j jjdv rxd}| j jjd	v rF| j jd
dsC|  }d}nd}|rW|sW|t|d|d n| j jd
dri|t	||d n
|t
|d|d d| j j_| j dddkr|t  | j jd
d o| j jddp| j jdd}| j di pi di dd}|rt| j j|}|rt|}|| |S |rtd|S )zv
        Returns:
            plugins: list of plugins passed to Trainer.plugins including precision plugins.
        r)   FrG   r$   r%   r&   )   16bf1616-mixed
bf16-mixedN)rU   rV   rX   r#   rX   rY   cuda)r8   devicescaler)r8   r\   cluster_typeBCP	mcore_gpt
mcore_bertexp_managercheckpoint_callback_params
async_savezpexp_manager.checkpoint_callback_params.async_save=True withoutdistributed checkpoints is currently not supported)r   rE   rF   rG   rB   r8   rT   appendr   r   r   r   r   from_configr   r   )	r   r)   with_distributed_adampluginsr\   plugin_precisionuse_dist_ckptrc   checkpoint_ior   r   r   _pluginsk   sT   	
 
zMegatronTrainerBuilder._plugins	callbacksc                 C   s   |du rg }d| j jvs| j jjr|t  | j di pi di ddr.|t  | j di p6i ddrC|t| j  |S )	z`
        Returns:
            callbacks: list of callbacks passed to Trainer.callbacks.
        Nenable_progress_barra   rb   rc   Flog_tflops_per_sec_per_gpuT)r   rB   rm   rd   r   rF   r   r
   r   rl   r   r   r   
_callbacks   s   "z!MegatronTrainerBuilder._callbacksc                 C   s   | j jddr2d| j j_d| j j_t| j j d| j j_W d    n1 s(w   Y  d| j j_	| j jj
}|  }|  }| |}td||d| j jd|i}|| j j_
|S )N
skip_trainFr    r   rg   strategyrl   r   )r   rE   rF   rB   	max_stepsval_check_intervalr	   num_sanity_val_stepsra   create_checkpoint_callbackr8   rI   rk   rp   r   )r   rl   r8   rs   rg   rB   r   r   r   create_trainer   s   




 
z%MegatronTrainerBuilder.create_trainerr   )__name__
__module____qualname____doc__r   r   r   r   r   rI   r   rT   listrk   r   rp   r   rx   r   r   r   r   r   +   s    -
<r   c                   @      e Zd ZdZdefddZdS )MegatronBertTrainerBuilderz.Builder for BERT model Trainer with overrides.r   c                 C   s$   t | jjdd| jjdddS )NrJ   rK   rL   rM   )rP   rQ   rR   rS   r   r   r   rT      s   z'MegatronBertTrainerBuilder._grad_scalerNry   rz   r{   r|   r   rT   r   r   r   r   r          r   c                       s>   e Zd ZdZdee def fddZd	defddZ  Z	S )
MegatronT5TrainerBuilderz,Builder for T5 model Trainer with overrides.rl   r   c                    s    t  |}|tdd |S )N   )	max_depth)superrp   rd   r   ro   	__class__r   r   rp      s   z#MegatronT5TrainerBuilder._callbacksNc                 C   s:   |   }|  }| |}td||d| jjd|iS )Nrr   rl   r   )rI   rk   rp   r   r   rB   )r   rl   rs   rg   r   r   r   rx      s   
 z'MegatronT5TrainerBuilder.create_trainerr   )
ry   rz   r{   r|   r   r}   rp   r   rx   __classcell__r   r   r   r   r      s    r   c                   @   r~   )%MegatronStableDiffusionTrainerBuilderz,Builder for SD model Trainer with overrides.r   c                 C   s>   | j jdd}|rtd| j jjdddS td| j jjddS )zD
        Returns a ddp strategy passed to Trainer.strategy.
        ddp_overlapTF   )r!   r;   r"   bucket_cap_mb)r!   r;   r"   )r   rE   rF   r   r;   )r   r   r   r   r   rI      s   z8MegatronStableDiffusionTrainerBuilder._training_strategyN)ry   rz   r{   r|   r   rI   r   r   r   r   r      r   r   c                   @   r~   )MegatronLMPPTrainerBuilderzlBuilder for scripts where grad scaler is turned off for pipeline parallel LM model. E.g. PEFT tuning scriptsr   c                 C   sJ   t | jjdd| jjdd| jjdd| jjjdkr!dd
S d	d
S )NrJ   rK   rL   rM   rN   rO   r    FT)rP   rQ   rN   enabled)r   r   rE   rF   pipeline_model_parallel_sizerS   r   r   r   rT     s   z'MegatronLMPPTrainerBuilder._grad_scalerNr   r   r   r   r   r     r   r   )%r>   typingr   r   %lightning.fabric.utilities.exceptionsr   lightning.pytorchr   lightning.pytorch.callbacksr   &lightning.pytorch.plugins.environmentsr   	omegaconfr   r	   ,nemo.collections.common.metrics.perf_metricsr
   (nemo.collections.nlp.parts.nlp_overridesr   r   r   r   r   r   r   r   
nemo.utilsr   !nemo.utils.callbacks.dist_ckpt_ior   r   r   r   r   r   r   r   r   r   r   r   <module>   s"   (
 '
