o
    ߥi                     @   sT   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	 G dd dej
jZdS )	z-
Copyright 2020 The Microsoft DeepSpeed Team
    N)mpu   )Experts)MOELayerTopKGatec                       s^   e Zd Z													ddeje d	ed
ededef
 fddZdddZ  Z	S )MoEr         ?   FNTnormalnoisy_gate_policydrop_tokens	use_tuteltop_k_linear_strategyuse_expert_residual_networkc                    s   t t|   |	| _|| dksJ d| d| d|| _d| j | _|| _|| j | _|
d u s<|
dv s<J d|
 t|| j| j}t	t
|||||||
|||d
|| j| j| j||d	| _| jt| j | jry|| _tj|d
| _d S d S )Nr   zNumber of experts (z/) should be divisible by expert parallel size ()ep_size_)NoneJitterRSamplezUnsupported noisy_gate_policy: )r   )r   r      )superr   __init__use_residualep_sizeexpert_group_namenum_expertsnum_local_expertsr   r   r   deepspeed_moe_set_ep_groupr   get_expert_parallel_groupmlptorchnnLinearcoefficient)selfhidden_sizeexpertr   r   kcapacity_factoreval_capacity_factormin_capacityr   r   r   use_rtsr   r   r   experts	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/gpt_moe/moe/layer.pyr      sT   

zMoE.__init__c                 C   s   |  ||}| jr:| |}t|tu r|d }| |}tjjj	|dd}||dddf  ||dddf   }|| j j
| j jfS )a   MoE forward

        Arguments:
            hidden_states (Tensor): input to the layer
            used_token (Tensor, optional): default: None, mask only used tokens

        Returns:
            A tuple including output, gate loss, and expert count.

            * output (Tensor): output of the model

            * l_aux (Tensor): gate loss value

            * exp_counts (int): expert count
        r   r   )dim.N)r   r   r    typetupler$   r!   r"   
functionalsoftmaxl_aux
exp_counts)r%   hidden_states
used_tokenoutput
output_mlpcoefr0   r0   r1   forwardI   s   

(zMoE.forward)r   r   r   r   r   r	   FNTTFr
   F)N)
__name__
__module____qualname__typingOptionalstrboolr   r>   __classcell__r0   r0   r.   r1   r      s4    
9r   )__doc__rB   r!   megatron_utilr   r-   r   sharded_moer   r   r"   Moduler   r0   r0   r0   r1   <module>   s    