o
    }oi                     @   s   d dl Zd dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ G d
d dZdS )    N)DistributedDataParallelConfig)finetunepretrain)MockDataModule)SquadDataModule)MixtralConfig8x7BMixtralModel)LoRAmixtral_8x7b)Trainerc                   @   sP   e Zd Zejdddd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd ZdS )TestMixtral8x7Bclass)scopec                 C   s   t S Nr
   )self r   c/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/recipes/test_mixtral_8x7b.pyrecipe_module   s   zTestMixtral8x7B.recipe_modulec                 C   sL   |  }t|tjsJ |jtksJ t|jtjsJ |jjtks$J d S r   )model
isinstancerunConfig__fn_or_cls__r   configr   )r   r   model_configr   r   r   
test_model"   s
   zTestMixtral8x7B.test_modelc                 C   s4  |  }t|tjsJ |jtksJ |jdksJ |jdks!J |jdks(J t|j	tjs1J |j	jj
dks:J |j	jdksBJ |j	jdksJJ |j	jtjksSJ |j	jdks[J |j	jdkscJ |j	jdu skJ |j	jdkssJ t|j	jtjs}J |j	jjtksJ |j	jjdu sJ |j	jjdu sJ d S )Ngpu   MegatronStrategy      FT)trainerr   r   r   r   r   acceleratordevices	num_nodesstrategy__name__tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtypetorchbfloat16$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelexpert_model_parallel_sizeddpr   check_for_nan_in_gradgrad_reduce_in_fp32r   r   trainer_configr   r   r   test_trainer)   s&   zTestMixtral8x7B.test_trainerc                 C   s   |  }t|tjsJ |jtksJ t|jtjsJ |jjtks$J t|j	tjs-J |j	jt
ks5J t|jtjs>J |jjtksFJ |jjdksNJ |jjdksVJ |jjdks^J d S )N   i   r    )pretrain_reciper   r   Partialr   r   r   r   r   r"   r   datar   
seq_lengthglobal_batch_sizemicro_batch_sizer   r   reciper   r   r   test_pretrain_recipeB   s   z$TestMixtral8x7B.test_pretrain_recipec                 C   s  |  }t|tjsJ |jtksJ t|jtjsJ |jjtks$J t|j	tjs-J |j	jt
ks5J t|jtjs>J |jjtksFJ |jjdksNJ |jjdksVJ |jjdks^J t|jtjsgJ |jjtksoJ |jjddgksyJ |jjdksJ d S )Ni      r    
linear_qkvlinear_proj    )finetune_reciper   r   r9   r   r   r   r   r   r"   r   r:   r   r;   r<   r=   peftr	   target_modulesdimr>   r   r   r   test_finetune_recipeP   s    z$TestMixtral8x7B.test_finetune_recipec                 C   sh   |j dddddd}|jjdksJ |jjdksJ |jjdks"J |jjdu s*J |jjdks2J d S )Nr!      F)tensor_parallelismpipeline_parallelismcontext_parallelismsequence_parallelismexpert_parallelism)r"   r&   r(   r)   r.   r/   r0   r4   r   r   r    test_trainer_parallelism_optionsb   s   z0TestMixtral8x7B.test_trainer_parallelism_optionsc                 C   sX   |  }|j}|jdksJ |jdksJ |jdksJ |jdks#J |jdks*J d S )NrD   r7   r   )r   r   
num_layershidden_sizenum_attention_headsr;   num_moe_experts)r   r   r   mixtral_configr   r   r   test_model_config_parametersp   s   z,TestMixtral8x7B.test_model_config_parametersN)r'   
__module____qualname__pytestfixturer   r   r6   r@   rI   rP   rV   r   r   r   r   r      s    

r   )nemo_runr   rY   r+   megatron.core.distributedr   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr   #nemo.collections.llm.gpt.data.squadr   &nemo.collections.llm.gpt.model.mixtralr   r   nemo.collections.llm.peft.lorar	   nemo.collections.llm.recipesr   nemo.lightningr   r   r   r   r   r   <module>   s   