o
    Ti                     @   sr   d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 ddlmZ ddlmZmZ G d	d
 d
ejZdS )    )OptionalTupleN)nn)
functional)groupslog_dist   )Experts)MOELayerTopKGatec                !       s   e Zd ZdZ													d#ded	ejd
ededededededede	e
 dedededededdf  fddZd$deddfddZd$deddfddZ	d%dejd e	ej deejejejf fd!d"Z  ZS )&MoEav  Initialize an MoE layer.

    Arguments:
        hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
        expert (nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
        num_experts (int, optional): default=1, the total number of experts per layer.
        ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
        k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
        capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
        eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
        min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
        use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
        noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
        use_rts (bool, optional): default=True, whether to use Random Token Selection.
        use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
        enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts
        top2_2nd_expert_sampling (bool, optional): default=True, whether to perform sampling for 2nd expert
    r         ?   FNThidden_sizeexpertnum_expertsep_sizekcapacity_factoreval_capacity_factormin_capacityuse_residualnoisy_gate_policydrop_tokensuse_rts	use_tutel enable_expert_tensor_parallelismtop2_2nd_expert_samplingreturnc                    s  t t|   |	| _|| _|| dksJ d| d| d|| _d| j | _|| _|| j | _t	d| d| j d| j dg |
d u sP|
d	v sPJ d
|
 t
|| j| j}tt|||||||
||d ||| j| j| j|d| _| jr|| _t|d| _d S d S )Nr   zNumber of experts (z/) should be divisible by expert parallel size ()ep_size_z%Creating MoE layer with num_experts: z | num_local_experts: z | expert_parallel_size: )NoneJitterRSamplezUnsupported noisy_gate_policy: )r      )superr   __init__r   r   r   expert_group_namer   num_local_expertsr   r	   r
   r   deepspeed_moemlpr   Linearcoefficient)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   experts	__class__ G/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/moe/layer.pyr&   &   s<   "
zMoE.__init__ use_data_before_expert_parallel_c                 C   s   | j |d d S )Nr3   )_create_process_groupsr-   r3   r1   r1   r2   set_deepspeed_parallelismV   s   zMoE.set_deepspeed_parallelismc                 C   sl   | j t vr*td| j   tjd u s| js tj| j|d n
tj| jtj|d | j	
t| j  d S )Nz=No existing process group found, creating a new group named: r4   )mpur3   )r'   r   _get_expert_parallel_group_dictprintr8   r    _create_expert_and_data_parallelr   &_create_expert_data_and_model_parallelr)   _set_ep_group_get_expert_parallel_groupr6   r1   r1   r2   r5   Y   s   
zMoE._create_process_groupshidden_states
used_tokenc                 C   s   |  ||}| jr7| |}t|tr|d }| |}tj|dd}||dddf  ||dddf   }|| j j| j j	fS )a   MoE forward

        Arguments:
            hidden_states (Tensor): input to the layer
            used_token (Tensor, optional): default: None, mask only used tokens

        Returns:
            A tuple including output, gate loss, and expert count.

            * output (Tensor): output of the model

            * l_aux (Tensor): gate loss value

            * exp_counts (Tensor): expert count
        r   )dim.r   N)
r)   r   r*   
isinstancetupler,   Fsoftmaxl_aux
exp_counts)r-   r?   r@   output
output_mlpcoefr1   r1   r2   forwardi   s   


(zMoE.forward)r   r   r   r   r   r   FNTTFFT)F)N)__name__
__module____qualname____doc__intr   Modulefloatboolr   strr&   r7   r5   torchTensorr   rL   __classcell__r1   r1   r/   r2   r      st    	
0r   )typingr   r   rV   r   torch.nnr   rE   deepspeed.utilsr   r   r.   r	   sharded_moer
   r   rR   r   r1   r1   r1   r2   <module>   s   