o
    wi8                     @   sH  d dl mZmZmZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ G dd dejZG dd	 d	ejZG d
d dZG dd deZG dd dZG dd dZdd Zd(ddZd(ddZedddeddd Zedddeddd  Zejd!d"d# Zejd!edddedd$d% Zd(d&d'ZdS ))    )ANY	MagicMockpatchN)nn)MainParamsOptimizerWrapper)MegatronStrategy_strategy_libc                          e Zd Z fddZ  ZS )Identityc                    s   t    d S Nsuper__init__self	__class__ ^/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/tests/lightning/test_strategy_lib.pyr         zIdentity.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r
          r
   c                   @   s   e Zd Zdd ZdS )WithCopyc                 C   s   t  S r   )r   r   r   r   r   copy   s   zWithCopy.copyN)r   r   r   r   r   r   r   r   r      s    r   c                   @   s(   e Zd Zdd Zdd Zedd ZdS )	Optimizerc              	   C   s.   dt jt jdddt jdigi i ddS )Nparams   cudadevicedtype)r      )param_groupsstate)torchr   	Parameterrandnfloat32r   r   r   r   
state_dict$   s    zOptimizer.state_dictc                 C   s   |   S r   )r+   )r   r+   r   r   r   load_state_dict*   s   zOptimizer.load_state_dictc                 C   s0   t jt jdddt jd}d|_|gddgS )Nr   r    r!   T)r   	is_expert)r'   r   r(   r)   r*   requires_grad)r   r   r   r   r   r%   -   s   zOptimizer.param_groupsN)r   r   r   r+   r,   propertyr%   r   r   r   r   r   #   s
    r   c                       r	   )OptimizerWrapperc                    s   t  | d S r   r   )r   	optimizerr   r   r   __init_6   s   zOptimizerWrapper.__init_)r   r   r   _OptimizerWrapper__init_r   r   r   r   r   r0   5   r   r0   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )DummyOptimizerc                 C   s   d| _ d| _d S )NTF)_custom_amp_unscale_gradsstep_calledr   r   r   r   r   ;      
zDummyOptimizer.__init__c                 G   s   t d| d S )Nz Dummy unscale_grads called with:)print)r   argsr   r   r   unscale_grads?   r   zDummyOptimizer.unscale_gradsc                 O   s   t d d| _dS )NzDummy optimizer step called.Tstep_result)r8   r6   )r   r9   kwargsr   r   r   stepB   s   zDummyOptimizer.stepN)r   r   r   r   r:   r=   r   r   r   r   r4   :   s    r4   c                   @   s    e Zd ZdddZdddZdS )Model Nc                 C   s   || _ || _d S r   )prefixmetadtar   r@   metadatar   r   r   r   I   r7   zModel.__init__c                 C   s
   t ddS )Ntest)rD   )dictrB   r   r   r   sharded_state_dictM   s   
zModel.sharded_state_dict)r?   N)r   r   r   r   rF   r   r   r   r   r>   H   s    
r>   c                  C   s   ddi} ddd |   D iS )Nzcuda:0g        found_inf_per_devicec                 S   s$   i | ]\}}|t j|t jd dqS )r    )r#   r"   )r'   tensorr*   ).0r"   valr   r   r   
<dictcomp>U   s    z(make_optimizer_state.<locals>.<dictcomp>)items)found_inf_valuesr   r   r   make_optimizer_stateQ   s
   rN   returnc                     s   t dddtjd} ddlm  G  fddd}| }|jjdks#J |jjdks+J |jjtjks4J t	
|| j |jjdksCJ |jjdksKJ |jjdksSJ |jjtjks\J d S )N   F)pipeline_model_parallel_sizeexpert_model_parallel_sizesequence_parallelpipeline_dtyper   TransformerConfigc                       s    e Zd Z fddZdd ZdS )z6test_set_model_parallel_attributes.<locals>.DummyModelc                    s    dddddd| _ d S )N   rP   F)hidden_sizenum_attention_heads
num_layersnum_moe_expertsadd_bias_linear)configr   rU   r   r   r   e   s   
z?test_set_model_parallel_attributes.<locals>.DummyModel.__init__c                 S   s   d S r   r   r   r   r   r   configure_modelj   s   zFtest_set_model_parallel_attributes.<locals>.DummyModel.configure_modelN)r   r   r   r   r^   r   rU   r   r   
DummyModeld   s    r_   )r   r'   r*   ,megatron.core.transformer.transformer_configrV   r]   rQ   rR   rT   r   set_model_parallel_attributesparallelismrS   )strategyr_   modelr   rU   r   "test_set_model_parallel_attributes[   s"   	re   c            	      C   s  ddl m}  ddlm} ddlm} | }d|_d|_d|_d|_	d|_
d|_t }d|_d|_d|_d|_d|_	d |_d	|_d	|_tjd
dd|dd	d d
dddddddd	d	d
}| D ]&\}}t||soJ d| dt||}||ksJ | d| d| q^|  |   d S )Nr   )#destroy_num_microbatches_calculator)destroy_model_parallelAppStaterP   r   r$      F   i  )
world_sizeglobal_rank
local_rankparallel_configseedfp8)
rl   rm   rn   tensor_model_parallel_sizerQ   $virtual_pipeline_model_parallel_sizecontext_parallel_sizerR   use_fp8init_mpi_proc_groupzExpected to find z in AppStatez% in AppState is incorrect, Expected: z	 Actual: ))megatron.core.num_microbatches_calculatorrf   megatron.core.parallel_staterg   
nemo.utilsri   rr   rQ   rt   rR   rm   rn   r   rs   expert_tensor_parallel_sizetp_comm_overlapuse_te_rng_trackerr   init_parallel_ranksrL   hasattrgetattr)	rf   rg   ri   	app_statemock_parallel_configexpected_app_statekvapp_attrr   r   r   test_init_parallel_ranksx   sX   	
"
r   z torch.distributed.is_initializedT)return_valuezmegatron.core.parallel_statec                 G   s   ddl m} | }d|_d|_d|_d |_d|_d|_d|_d|_	d|_
d|_d|_t|  tt  | jjddd d ddddddd dd d S )	Nr   rh   r$   rP   Fztp-cp-ep-dp-ppTrr   rQ   rs   $pipeline_model_parallel_comm_backendrt   rR   rz   	use_sharporder#num_distributed_optimizer_instancesnccl_communicator_config_pathcreate_gloo_process_groups)ry   ri   model_parallel_sizerr   rQ   r   rt   rR   rz   expert_tensor_parallel_rankrv   tensor_model_parallel_rankpipeline_model_parallel_rank	_mpu_tp_2r   init_model_parallelr   r
   initialize_model_parallelassert_called_once_withmock_mpur9   ri   r   r   r   r   test_init_model_parallel   s:   
r   c                 G   s   ddl m} | }d|_d|_d|_d |_d|_d|_d|_d|_	d|_
d|_d|_d|_t|  tt  | jjddd d ddddddd dd d S )	Nr   rh   r$   rP   FTztp-cp-ep-pp-dpr   )ry   ri   r   rr   rQ   r   rt   rR   rz   r   rv   r   r   use_tp_pp_dp_mappingr   r   r   r   r
   r   r   r   r   r   r   &test_init_model_parallel_with_tp_pp_dp   s<   
r   GPUc                  C   s:   t  } t }t|}tj| |dd}|d g gksJ d S )NrD   )sharding_typefp32_from_fp16_params)r>   r   r0   r   optimizer_sharded_state_dict)rd   r1   optimizer_state_dictr   r   r   !test_optimizer_sharded_state_dict   s
   r   c                 G   sp   t  }t }|| t }||| | }t|tu s!J |	| z|
  W d S  ty7   Y d S w r   )r   
GradScalerr4   _unscale_grads_rN   _maybe_opt_stepr+   typerE   r,   updateAssertionError)r   r9   scalerr1   optimizer_stater+   r   r   r   test_grad_scaler  s   

r   c                 C   s4   d| j _d| j_d| j_d| j_d| j_d| j_d S )NrP   r   r$   )get_tensor_model_parallel_rankr    get_pipeline_model_parallel_rank&get_pipeline_model_parallel_world_size!get_pipeline_model_parallel_groupget_tensor_model_parallel_groupget_expert_tensor_parallel_rank)r   r   r   r   r     s   r   )rO   N)unittest.mockr   r   r   pytestr'   r   nemo.core.optimr   nemo.lightningr   r   r
   r   r   r0   r4   r>   rN   re   r   r   r   markrun_only_onr   r   r   r   r   r   r   <module>   s:   	



6
#
%

	
 