import random

import numpy as np
import torch

from nemo.utils import AppState, logging

try:
    from apex.transformer.log_util import set_logging_level

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False

try:
    from megatron.core import tensor_parallel
    from megatron.core.parallel_state import (
        RankGenerator,
        get_pipeline_model_parallel_rank,
        set_expert_model_parallel_rank,
        set_expert_model_parallel_world_size,
        set_pipeline_model_parallel_rank,
        set_pipeline_model_parallel_world_size,
        set_tensor_model_parallel_rank,
        set_tensor_model_parallel_world_size,
    )

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    HAVE_MEGATRON_CORE = False

try:
    from megatron.core.num_microbatches_calculator import (
        ConstantNumMicroBatchesCalculator,
        get_current_global_batch_size,
        get_micro_batch_size,
        get_num_microbatches,
        init_num_microbatches_calculator,
    )

    MCORE_MB_CALCULATOR = True
except (ImportError, ModuleNotFoundError):
    logging.warning("Megatron num_microbatches_calculator not found, using Apex version.")
    if HAVE_APEX:
        from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator
        from apex.transformer.pipeline_parallel.utils import (
            get_current_global_batch_size,
            get_micro_batch_size,
            get_num_microbatches,
            setup_microbatch_calculator,
        )

    MCORE_MB_CALCULATOR = False


def initialize_model_parallel_for_nemo(
    world_size,
    global_rank,
    local_rank,
    tensor_model_parallel_size=1,
    expert_model_parallel_size=1,
    expert_tensor_parallel_size=None,
    pipeline_model_parallel_size=1,
    virtual_pipeline_model_parallel_size=None,
    pipeline_model_parallel_split_rank=None,
    pipeline_model_parallel_comm_backend=None,
    context_parallel_size=1,
    encoder_tensor_model_parallel_size=0,
    encoder_pipeline_model_parallel_size=0,
    micro_batch_size=None,
    global_batch_size=None,
    rampup_batch_size=None,
    use_fp8=False,
    init_mpi_proc_group=False,
    seed=1234,
    apex_transformer_log_level=30,
    use_tp_pp_dp_mapping=False,
    use_te_rng_tracker=False,
    num_distributed_optimizer_instances=1,
    nccl_communicator_config_path=None,
    use_sharp=False,
    use_gloo_process_groups: bool = True,
):
    """Initialize model parallel groups in NeMo."""
    assert pipeline_model_parallel_split_rank is None, "pipeline_model_parallel_split_rank is deprecated."
    assert not encoder_pipeline_model_parallel_size, (
        "encoder_pipeline_model_parallel_size is temporarily unavailable. "
        "We are working on a refactoring to add it back."
    )

    app_state = AppState()
    app_state.global_rank = global_rank
    app_state.world_size = world_size
    app_state.local_rank = local_rank
    app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping
    app_state.expert_model_parallel_size = expert_model_parallel_size
    app_state.tensor_model_parallel_size = tensor_model_parallel_size
    app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
    app_state.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size
    app_state.context_parallel_size = context_parallel_size
    app_state.encoder_tensor_model_parallel_size = encoder_tensor_model_parallel_size
    app_state.encoder_pipeline_model_parallel_size = encoder_pipeline_model_parallel_size
    app_state.pipeline_model_parallel_comm_backend = pipeline_model_parallel_comm_backend
    app_state.use_fp8 = use_fp8
    app_state.use_sharp = use_sharp
    app_state.init_mpi_proc_group = init_mpi_proc_group
    app_state.expert_tensor_parallel_size = expert_tensor_parallel_size
    app_state.num_distributed_optimizer_instances = num_distributed_optimizer_instances
    app_state.nccl_communicator_config_path = nccl_communicator_config_path
    app_state.use_gloo_process_groups = use_gloo_process_groups

    # Compute all parallel ranks without creating any distributed groups.
    (
        app_state.tensor_model_parallel_rank,
        app_state.pipeline_model_parallel_rank,
        app_state.expert_model_parallel_rank,
        app_state.expert_tensor_parallel_rank,
        app_state.model_parallel_size,
        app_state.data_parallel_size,
        app_state.pipeline_model_parallel_split_rank,
        app_state.virtual_pipeline_model_parallel_rank,
    ) = fake_initialize_model_parallel(
        world_size=world_size,
        rank=global_rank,
        tensor_model_parallel_size_=tensor_model_parallel_size,
        pipeline_model_parallel_size_=pipeline_model_parallel_size,
        virtual_pipeline_model_parallel_size_=virtual_pipeline_model_parallel_size,
        pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank,
        context_parallel_size_=context_parallel_size,
        expert_model_parallel_size_=expert_model_parallel_size,
        expert_tensor_parallel_size_=expert_tensor_parallel_size,
        encoder_tensor_model_parallel_size_=encoder_tensor_model_parallel_size,
        encoder_pipeline_model_parallel_size_=encoder_pipeline_model_parallel_size,
        use_tp_pp_dp_mapping=use_tp_pp_dp_mapping,
    )

    # Mirror the computed sizes/ranks into Megatron-Core's global parallel state.
    set_tensor_model_parallel_world_size(app_state.tensor_model_parallel_size)
    set_tensor_model_parallel_rank(app_state.tensor_model_parallel_rank)
    set_expert_model_parallel_world_size(app_state.expert_model_parallel_size)
    set_expert_model_parallel_rank(app_state.expert_model_parallel_rank)
    set_pipeline_model_parallel_world_size(
        app_state.pipeline_model_parallel_size + app_state.encoder_pipeline_model_parallel_size
    )
    set_pipeline_model_parallel_rank(app_state.pipeline_model_parallel_rank)

    tensor_parallel.random.initialize_rng_tracker(use_te_rng_tracker=use_te_rng_tracker)
    if seed is not None:
        _set_random_seed(seed)

    if global_batch_size and micro_batch_size is not None:
        # TODO: add rampup_batch_size here when we have it implemented
        if MCORE_MB_CALCULATOR:
            from megatron.core.num_microbatches_calculator import _GLOBAL_NUM_MICROBATCHES_CALCULATOR

            if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None:
                init_num_microbatches_calculator(
                    rank=global_rank,
                    global_batch_size=global_batch_size,
                    micro_batch_size=micro_batch_size,
                    data_parallel_size=app_state.data_parallel_size,
                    rampup_batch_size=rampup_batch_size,
                    decrease_batch_size_if_needed=False,
                )
            else:
                if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator):
                    assert get_current_global_batch_size() == global_batch_size
                    assert get_micro_batch_size() == micro_batch_size
                    assert get_num_microbatches() == global_batch_size // (
                        micro_batch_size * app_state.data_parallel_size
                    )
                else:
                    raise Exception("Microbatch calculator already initialized.")
        else:
            from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR

            if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None:
                setup_microbatch_calculator(
                    rank=global_rank,
                    global_batch_size=global_batch_size,
                    micro_batch_size=micro_batch_size,
                    data_parallel_size=app_state.data_parallel_size,
                    rampup_batch_size=rampup_batch_size,
                )
            else:
                if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatchesCalculator):
                    assert get_current_global_batch_size() == global_batch_size
                    assert get_micro_batch_size() == micro_batch_size
                    assert get_num_microbatches() == global_batch_size // (
                        micro_batch_size * app_state.data_parallel_size
                    )
                else:
                    raise Exception("Microbatch calculator already initialized.")

    app_state._is_megatron_initialized = True

    if HAVE_APEX:
        set_logging_level(apex_transformer_log_level)

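# Usage sketch (hypothetical values, not part of this module): NeMo's Lightning
# strategy calls the function above once per process before the model is built,
# e.g. on global rank 5 of a 16-GPU run with tensor parallel 2 and pipeline
# parallel 4:
#
#     initialize_model_parallel_for_nemo(
#         world_size=16,
#         global_rank=5,
#         local_rank=5,
#         tensor_model_parallel_size=2,
#         pipeline_model_parallel_size=4,
#         micro_batch_size=2,
#         global_batch_size=16,
#     )
#
# Afterwards AppState() reports data_parallel_size == 16 // (2 * 4) == 2, and the
# microbatch calculator yields 16 // (2 * 2) == 4 microbatches per step, all
# without creating any torch.distributed groups.
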
def _set_random_seed(seed_):
    """Set random seed for reproducibility."""
    if seed_ is not None and seed_ > 0:
        # Ensure that different pipeline MP stages get different seeds.
        seed = seed_ + (100 * get_pipeline_model_parallel_rank())
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        # Seed the model-parallel CUDA RNG tracker only when GPUs are present.
        if torch.cuda.device_count() > 0:
            tensor_parallel.model_parallel_cuda_manual_seed(seed)
    else:
        raise ValueError('Seed ({}) should be a positive integer.'.format(seed_))


def set_jit_fusion_options():
    """Set PyTorch JIT layer fusion options."""
    # These flags target the nvfuser build shipped in the NGC 21.10 PyTorch
    # container; on any other torch build this function is a no-op.
    if torch.__version__ == "1.10.0a0+0aef44c":
        # nvfuser
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_profiling_mode(True)
        torch._C._jit_override_can_fuse_on_cpu(False)
        torch._C._jit_override_can_fuse_on_gpu(False)
        torch._C._jit_set_texpr_fuser_enabled(False)
        torch._C._jit_set_nvfuser_enabled(True)
        torch._C._debug_set_autodiff_subgraph_inlining(False)

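# Worked example of the seed offset in _set_random_seed (illustrative only):
# with seed_=1234, every rank on pipeline stage 0 seeds python/numpy/torch with
# 1234, stage 1 with 1334, stage 2 with 1434, and so on, so different pipeline
# stages do not draw identical random streams:
#
#     seed = 1234 + 100 * get_pipeline_model_parallel_rank()  # stage 2 -> 1434
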
def fake_initialize_model_parallel(
    world_size,
    rank,
    tensor_model_parallel_size_,
    pipeline_model_parallel_size_,
    pipeline_model_parallel_split_rank_=None,
    virtual_pipeline_model_parallel_size_=None,
    expert_model_parallel_size_=1,
    expert_tensor_parallel_size_=None,
    context_parallel_size_=1,
    encoder_tensor_model_parallel_size_=0,
    encoder_pipeline_model_parallel_size_=0,
    use_tp_pp_dp_mapping=False,
):
    """
    Fake initialize model data parallel groups so that we can instantiate model parallel
    models before DDP is initialized. This is needed because PTL execution flow is init
    model, init trainer -> call trainer.fit(model). DDP is initialized during .fit.
    This function is taken from megatron.core.parallel_state and modified so that the
    distributed groups are not created.
    We only need the tensor parallel and pipeline parallel ranks to instantiate the model.

    Arguments:
        tensor_model_parallel_size: number of GPUs used to parallelize model tensor.
        pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline.
        context_parallel_size: number of GPUs used to parallelize tokens of each input.

    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
    and 8 data-parallel groups as:
        8 data_parallel groups:
            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
        8 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
        4 pipeline model-parallel groups:
            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    assert pipeline_model_parallel_split_rank_ is None, "pipeline_model_parallel_split_rank is deprecated."
    if encoder_pipeline_model_parallel_size_ > 0:
        assert (
            encoder_tensor_model_parallel_size_ == 0
            or encoder_tensor_model_parallel_size_ <= tensor_model_parallel_size_
        ), "We do not support encoders with more TP than the decoder."

    # An encoder with unspecified TP inherits the decoder's TP size.
    if encoder_tensor_model_parallel_size_ == 0 and encoder_pipeline_model_parallel_size_ > 0:
        encoder_tensor_model_parallel_size_ = tensor_model_parallel_size_

    # The expert (MoE) TP size defaults to the attention TP size.
    if expert_tensor_parallel_size_ is None:
        expert_tensor_parallel_size_ = tensor_model_parallel_size_

    encoder_model_size = (
        encoder_tensor_model_parallel_size_ * encoder_pipeline_model_parallel_size_ * context_parallel_size_
    )
    decoder_model_size = tensor_model_parallel_size_ * pipeline_model_parallel_size_ * context_parallel_size_
    total_model_size = encoder_model_size + decoder_model_size

    assert world_size % total_model_size == 0, (
        f"world_size: {world_size} must be divisible by total world_size: "
        f"(decoder_)tensor_model_parallel_size {tensor_model_parallel_size_} "
        f"* (decoder_)pipeline_model_parallel_size {pipeline_model_parallel_size_} "
        f"* (decoder_)context_parallel_size {context_parallel_size_} "
        f"+ encoder_tensor_model_parallel_size {encoder_tensor_model_parallel_size_} "
        f"* encoder_pipeline_model_parallel_size {encoder_pipeline_model_parallel_size_} "
        f"* context_parallel_size {context_parallel_size_}"
    )

    data_parallel_size = world_size // total_model_size
    encoder_world_size = encoder_model_size * data_parallel_size
    decoder_world_size = decoder_model_size * data_parallel_size

    virtual_pipeline_model_parallel_rank = None
    if virtual_pipeline_model_parallel_size_ is not None:
        virtual_pipeline_model_parallel_rank = 0

    order = 'tp-cp-ep-dp-pp' if use_tp_pp_dp_mapping else 'tp-cp-ep-pp-dp'

    # The encoder (when present) occupies the first encoder_world_size ranks;
    # the decoder's rank generator is offset past it.
    encoder_rank_generator = None
    if encoder_world_size > 0:
        encoder_rank_generator = RankGenerator(
            tp=encoder_tensor_model_parallel_size_,
            ep=1,
            dp=data_parallel_size,
            pp=encoder_pipeline_model_parallel_size_,
            cp=context_parallel_size_,
            order=order,
            rank_offset=0,
        )

    decoder_rank_generator = RankGenerator(
        tp=tensor_model_parallel_size_,
        ep=1,
        dp=data_parallel_size,
        pp=pipeline_model_parallel_size_,
        cp=context_parallel_size_,
        order=order,
        rank_offset=encoder_world_size,
    )

    # A separate generator for the expert layers, which may use a different TP
    # size and fold context parallelism into data parallelism.
    expert_tensor_model_pipeline_parallel_size = (
        expert_tensor_parallel_size_ * expert_model_parallel_size_ * pipeline_model_parallel_size_
    )
    if decoder_world_size % expert_tensor_model_pipeline_parallel_size != 0:
        raise RuntimeError(
            f"decoder world_size ({decoder_world_size}) is not divisible by "
            f"expert_tensor_model_pipeline_parallel size ({expert_tensor_model_pipeline_parallel_size})"
        )
    expert_data_parallel_size = decoder_world_size // expert_tensor_model_pipeline_parallel_size
    if data_parallel_size != expert_data_parallel_size and not use_tp_pp_dp_mapping:
        raise RuntimeError(
            "When not using pp-last rank ordering, the data parallel size of the attention and moe layers must be the same"
        )

    expert_decoder_rank_generator = RankGenerator(
        tp=expert_tensor_parallel_size_,
        ep=expert_model_parallel_size_,
        dp=expert_data_parallel_size,
        pp=pipeline_model_parallel_size_,
        cp=1,
        order=order,
        rank_offset=encoder_world_size,
    )

    assert decoder_rank_generator.get_ranks("pp") == expert_decoder_rank_generator.get_ranks("pp"), (
        f"Pipeline parallel groups are expected to be the same for Non-Expert and Expert part, "
        f"but got {decoder_rank_generator.get_ranks('pp')} and {expert_decoder_rank_generator.get_ranks('pp')}"
    )

    def generator_wrapper(group_type, is_expert=False, **kwargs):
        # Stitch the encoder and decoder generators together: encoder groups
        # either stand alone, or ('pp' / 'tp-pp') are merged with the decoder
        # groups they feed into.
        from itertools import cycle

        if is_expert:
            d_ranks = expert_decoder_rank_generator.get_ranks(group_type, **kwargs)
        else:
            d_ranks = decoder_rank_generator.get_ranks(group_type, **kwargs)

        if encoder_rank_generator is None:
            for x in d_ranks:
                yield x
            return

        e_ranks = encoder_rank_generator.get_ranks(group_type, **kwargs)
        if group_type == 'pp':
            # Map one encoder tp rank to several decoder tp ranks, because
            # encoder and decoder tp groups are generally different sizes.
            for x, y in zip(cycle(e_ranks), d_ranks):
                yield x + y
        elif group_type == 'tp-pp':
            # The tp-pp groups have the same size on both sides, so they can
            # simply be concatenated pairwise.
            assert len(e_ranks) == len(d_ranks)
            for x, y in zip(e_ranks, d_ranks):
                yield x + y
        else:
            for x in e_ranks:
                yield x
            for x in d_ranks:
                yield x

    # Data-parallel groups, with and without context parallelism folded in.
    all_data_parallel_group_ranks_with_cp = []
    for ranks in generator_wrapper('dp'):
        if rank in ranks:
            data_parallel_group = list(ranks)
            logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}')
    for ranks_with_cp in generator_wrapper('dp-cp'):
        all_data_parallel_group_ranks_with_cp.append(ranks_with_cp)
        if rank in ranks_with_cp:
            data_parallel_group_with_cp = ranks_with_cp
            logging.info(
                f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}'
            )

    data_parallel_rank = data_parallel_group.index(rank)
    logging.info(
        f'All data parallel group ranks with context parallel combined: {all_data_parallel_group_ranks_with_cp}'
    )
    logging.info(f'Rank {rank} has data parallel rank: {data_parallel_rank}')

    # Context-parallel groups.
    all_context_parallel_group_ranks = []
    for ranks in generator_wrapper('cp'):
        all_context_parallel_group_ranks.append(ranks)
        if rank in ranks:
            context_parallel_group = ranks
            logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}')

    context_parallel_rank = context_parallel_group.index(rank)
    logging.info(f'All context parallel group ranks: {all_context_parallel_group_ranks}')
    logging.info(f'Rank {rank} has context parallel rank: {context_parallel_rank}')

    # Model-parallel groups (tensor and pipeline combined).
    all_model_parallel_group_ranks = []
    for ranks in generator_wrapper('tp-pp'):
        all_model_parallel_group_ranks.append(ranks)
        if rank in ranks:
            logging.info(f'Rank {rank} has model parallel group: {list(ranks)}')
    logging.info(f'All model parallel group ranks: {all_model_parallel_group_ranks}')

    # Tensor model-parallel groups.
    all_tensor_model_parallel_group_ranks = []
    for ranks in generator_wrapper('tp'):
        all_tensor_model_parallel_group_ranks.append(ranks)
        if rank in ranks:
            tensor_model_parallel_group = ranks
            logging.info(f'Rank {rank} has tensor model parallel group: {tensor_model_parallel_group}')

    tensor_model_parallel_rank = tensor_model_parallel_group.index(rank)
    logging.info(f'All tensor model parallel group ranks: {all_tensor_model_parallel_group_ranks}')
    logging.info(f'Rank {rank} has tensor model parallel rank: {tensor_model_parallel_rank}')

    # Expert model-parallel rank.
    expert_model_parallel_rank = 0
    if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1:
        all_expert_model_parallel_ranks = []
        for ranks in generator_wrapper('ep', is_expert=True):
            all_expert_model_parallel_ranks.append(ranks)
            if rank in ranks:
                expert_model_parallel_rank = list(ranks).index(rank)
        logging.info(f'All expert model parallel group ranks: {all_expert_model_parallel_ranks}')
        logging.info(f'Rank {rank} has expert model parallel rank: {expert_model_parallel_rank}')

    # Expert tensor-parallel rank.
    expert_tensor_parallel_rank = 0
    if expert_tensor_parallel_size_ is not None and expert_tensor_parallel_size_ > 1:
        all_expert_tensor_parallel_ranks = []
        for ranks in generator_wrapper('tp', is_expert=True):
            all_expert_tensor_parallel_ranks.append(ranks)
            if rank in ranks:
                expert_tensor_parallel_rank = list(ranks).index(rank)
        logging.info(f'All expert tensor parallel group ranks: {all_expert_tensor_parallel_ranks}')
        logging.info(f'Rank {rank} has expert tensor parallel rank: {expert_tensor_parallel_rank}')

    # Pipeline model-parallel groups, plus the embedding groups formed from the
    # first and last rank of each pipeline group.
    all_pipeline_model_parallel_group_ranks = []
    all_embedding_group_ranks = []
    pipeline_model_parallel_group = None
    embedding_group = None
    embedding_rank = None
    for ranks in generator_wrapper('pp'):
        all_pipeline_model_parallel_group_ranks.append(ranks)
        if rank in ranks:
            pipeline_model_parallel_group = ranks
            logging.info(f'Rank {rank} has pipeline model parallel group: {pipeline_model_parallel_group}')

        if len(ranks) > 1:
            embedding_ranks = [ranks[0], ranks[-1]]
            all_embedding_group_ranks.append(embedding_ranks)
        else:
            embedding_ranks = ranks
            all_embedding_group_ranks.append(list(embedding_ranks))
        if rank in embedding_ranks:
            embedding_group = list(embedding_ranks)
            logging.info(f'Rank {rank} has embedding group: {embedding_group}')

    pipeline_model_parallel_rank = pipeline_model_parallel_group.index(rank)
    if embedding_group is not None:
        embedding_rank = embedding_group.index(rank)

    logging.info(f'All pipeline model parallel group ranks: {all_pipeline_model_parallel_group_ranks}')
    logging.info(f'Rank {rank} has pipeline model parallel rank {pipeline_model_parallel_rank}')
    logging.info(f'All embedding group ranks: {all_embedding_group_ranks}')
    logging.info(f'Rank {rank} has embedding rank: {embedding_rank}')

    return (
        tensor_model_parallel_rank,
        pipeline_model_parallel_rank,
        expert_model_parallel_rank,
        expert_tensor_parallel_rank,
        total_model_size,
        data_parallel_size,
        pipeline_model_parallel_split_rank_,
        virtual_pipeline_model_parallel_rank,
    )
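# Worked example (hypothetical, mirroring the 16-GPU layout in the docstring of
# fake_initialize_model_parallel): with TP=2 and PP=4, rank 5 belongs to tensor
# group [g4, g5], pipeline group [g1, g5, g9, g13] and data parallel group
# [g5, g7]. Since no distributed groups are created, this runs in one process:
#
#     (
#         tp_rank,              # 1 -> index of rank 5 in [g4, g5]
#         pp_rank,              # 1 -> index of rank 5 in [g1, g5, g9, g13]
#         ep_rank,              # 0 (no expert model parallelism requested)
#         etp_rank,             # 1 (expert TP defaults to the decoder TP group)
#         model_parallel_size,  # 8 = 2 * 4 * 1
#         data_parallel_size,   # 2 = 16 // 8
#         pp_split_rank,        # None (deprecated)
#         virtual_pp_rank,      # None (no interleaved schedule)
#     ) = fake_initialize_model_parallel(
#         world_size=16,
#         rank=5,
#         tensor_model_parallel_size_=2,
#         pipeline_model_parallel_size_=4,
#     )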