o
    Tij                     @   s  U d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlZddlmZ ddlmZ ddlm  mZ dd	lmZ d
dlmZmZ e
rUee ZneZdZdZdZdZi Z e	ej!ef e"d< i Z#e	ej!ef e"d< i Z$e	ej!ef e"d< z
ddl%m&Z' dZ(W n   dZ(Y dAdej!fddZ)dedej!defddZ*ddl+m,Z- G dd  d ej.j/Z0dZ1d!d" Z2ej3j4d#ed$ed%edefd&d'Z5ej3j4d(d) Z6ej3j4d*d+ Z7						dBd,ed$e8d%e9d-ed.ee: d/e;d0e;d1eej<j=df d2e;deeeeef fd3d4Z>			dCd,ed$e8d%e9d/e;d1eej<j=df d5e;deeeeef fd6d7Z?			8dDd,ed9e9d$e8d%e9d/e;d1eej<j=df d:e:deeeeef fd;d<Z@G d=d> d>eZAG d?d@ d@eZBdS )Eac  
The file has been adapted from two fairscale files:
 (1) https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/moe/moe_layer.py
 (2) https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/moe/top2gate.py
 Git commit hash: 34df606902a240567a0d898037ece55c2f1336cf
 We retain the following license from the original files:
    )SynchronizedWallClockTimer)logger)$bwc_tensor_model_parallel_world_size)CallableDictTYPE_CHECKINGAnyOptionalTupleUnionN)Tensor)Module)groups   )drop_tokensgather_tokens	topk_gatemoe1st_a2a2nd_a2auniform_map
gumbel_mapexp_selection_uniform_map)r   TF{Gz?devicec                 C   sd   |dkr| S t |}|du r+tjjjtjd| |dtjd| |ddj}|t |< | || j S )a  
    Modified from switch transformer paper. mesh transformers
    Multiply values by a random number between 1-epsilon and 1+epsilon.
    Makes models more resilient to rounding errors introduced by bfloat16.
    This seems particularly important for logits.
    Args:
        x: a torch.tensor
        device: torch.device
        epsilon: a floating point value
    Returns:
        a jittered x.
    r   N      ?r   lowhigh)	r   gettorchdistributionsuniformUniformtensorrsampleshape)xr   epsilonr#    r*   M/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/moe/sharded_moe.pymultiplicative_jitter7   s   

r,   r'   returnc                 C   sP   t |}|d u r$tjd|d}tjd|d}tjj||j}|t |< || S )Nr   r           )r   r    r!   r%   r"   gumbelGumbelr&   )r'   r   r/   onezeror*   r*   r+   gumbel_rsampleO   s   
r3   )commc                	   @   sN   e Zd ZededejdedefddZededede	def fd	d
Z
dS )	_AllToAllctxgroupinputr-   c                 C   s,   || _ | }t|}tj|||d |S )N)r7   )r7   
contiguousr!   
empty_likedistall_to_all_single)r6   r7   r8   outputr*   r*   r+   forwardb   s
   
z_AllToAll.forwardgrad_outputNc                 G   s   d t j| jg|R  fS N)r5   applyr7   )r6   r?   r*   r*   r+   backwardj   s   z_AllToAll.backward)__name__
__module____qualname__staticmethodr   r;   ProcessGroupr   r>   r
   rB   r*   r*   r*   r+   r5   `   s
    $r5   c                 C   s  t r	t| ||S | dkr||jd d| S | dkr&|d|d S | dkr9t|d|ddS | dkrD|d| S | d	krm|jd }|jd }|jd }|jd }t||d ||||S | d
krt||jd d|d|jd S | dkr|jd }|jd }|jd }| d}||d |||}t||	dd
dS t| ||S )Ns,se->ser   
se,sc->sec   r   se,se->sse,sec->secsec,sm->ecmsec,ecm->smz
ks,ksm->sm)
USE_EINSUMr!   einsumreshaper'   	unsqueezebmmmatmult	transposesqueeze)ruleabsecmkr*   r*   r+   rQ   v   s4   



"(


rQ   gatescapacity_factormin_capacityc                 C   sF   | j d }| j d }t|| | tj}||k r!|tj}|S )Nr   r   )r'   r!   ceiltoint64)ra   rb   rc   
num_tokensnum_expertscapacityr*   r*   r+   	_capacity   s   

rj   c                 C   s   t j| |ddd S )Nr   r`   dimr   )r!   topk)sourcer`   r*   r*   r+   _top_idx   s   ro   c                 C   s   t j| |d S )Nnum_classes)Fone_hotfloat)r(   rq   r*   r*   r+   _one_hot_to_float   s   ru   logits
used_tokennoisy_gate_policyr   use_rtsep_group	use_tutelc	           !      C   s  |dkr| t | j| jd }	tj| dd}
t|
t|t|}tj|dkr)|	n|
dd}t	|
jd }tj
||d}|durFtd||}tj|dd | j}|st|| j}|durltj|tjj|d	 t dkrtjdu rydnttjd
}t|| ||j}t|t|d|j}tj|
dd}tj| dd}t|| | }|rt | j}|du rtj!j"j#tjd| jdtjd| jddj$}|t| j< |||j }n|}| jd |ksJ dt%||}|t&|'d|d }|}|r|jdd| d }t||}|r t()|}n	tj*|ddd }|rI|
| jdd}tj|| dd}||||g|g|g|fS tj|| dd}| }|
| }
t+||}td|
|}|, } ||| |fS )z Implements Top1Gating on logits.RSampler   r   rl   rp   NrH   r   opr7   mpur.   r   r   zyNo. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size.rJ   )-r3   r'   r   rr   softmaxrj   r!   r%   argmaxintrs   rQ   sumdetachre   maxr;   
all_reduceReduceOpMAXr   %_get_expert_model_parallel_world_sizer   r   rd   muldtypeminsizemeanrt   r   r    r"   r#   r$   r&   ro   
zeros_likescatter_	tutel_moefast_cumsum_sub_onecumsumru   bool)!rv   rb   rc   rw   rx   r   ry   rz   r{   logits_w_noisera   ri   
indices1_srh   mask1
exp_countsnew_capacitytpmecel_auxr#   
mask1_randtop_idx	new_mask1indices_mask
locations1gates1_slocations1_smask1_floatlocations1_sccombine_weightsdispatch_maskr*   r*   r+   
top1gating   s   


	
r   top2_2nd_expert_samplingc           %      C   s  t j| dd}tj|dd}t|jd }t j||d}	|r(| t| j| jd7 } | 	|	
 td}
tj|
dd}t j||d}tj|	ddd }tj|ddd }|tj|	ddd7 }tj|dd}tj|	 dd}t|| | | }tj|	| dd | j}|rt|t|d	 t|}|	t||9 }	|t||9 }n6t|}|d
urtj|tjj|d t dkrtjd
u rdnttjd}t|| ||j}|}tj||	 dd}tj|| dd}|	 }| }t d||}t d||}|| }tj!|t"|jj#d}|| }|| }t d||}t d||}t$||}t$||} t d||}!t d|| }"|!|" }#|#
 }$||#|$|fS )z Implements Top2Gating on logits.r   r}   rp   r   z-infr   Trl   keepdimrK   Nr~   r   rL   r   rH   rJ   )%rr   r   r!   r   r   r'   rs   r3   r   masked_fillr   rt   r   r   r   r   re   rj   r%   ltr   r;   r   r   r   r   r   r   r   rd   r   r   rQ   clampfinfoepsru   )%rv   rb   rc   r   rz   r   ra   r   rh   r   logits_except1
indices2_smask2r   
locations2r   r   r   r   ri   r   r   r   locations2_sr   mask2_floatr   gates2_sdenom_sgates1gates2r   locations2_sccombine1_seccombine2_secr   r   r*   r*   r+   
top2gating"  s\   


r   probsr`   drop_policyc                 C   s2  t j| |dd\}}tj| dd}	t|	jd }
t | d||}t j|	t jd	d|d}t j
|dd | j}t j|	dd}t j| dd}t || |
 |
 | }|rt|	t || t |}|dkrt j||ddd\}}t | d|d}t ||}t j|ddd }nS|d	krt j|ddd }|t ||9 }n=td
| t |}|durtj|tjj|d t dkrtjdu rdnttjd}t ||  ||j!}|}|	| }t j
|ddd}t j"|t #|j!j$d}|| }t%|| |}t &d||}| }||||fS )z Implements TopKGating on logits.r   rk   r}   )r   r   r   F)r`   rl   sortedpositionzInvalid drop_policy: Nr~   r   rI   Tr   r   rM   )'r!   rm   rr   r   r   r'   r   scatterr   r   r   r   re   r   r   rt   rj   r%   logical_andr   r   
ValueErrorr   r;   r   r   r   r   r   r   r   rd   r   r   r   r   r   ru   rQ   )rv   r`   rb   rc   r   rz   r   top_gater   ra   rh   topk_masked_gatesmaskr   r   r   r   ri   capacity_probscapacity_indicescapacity_mask	locationsr   r   gates_maskedgates_sr   locations_scr   r   r*   r*   r+   
topkgatingv  sF   
r   c                       s   e Zd ZU dZejjed< 									dded	ed
ede	de	dede
e dededeejjdf deddf fddZdd Z		ddejdejdedeeeef fddZ  ZS ) TopKGatea  Gate module which implements Top2Gating as described in Gshard_.
    ::

        gate = TopKGate(model_dim, num_experts)
        l_aux, combine_weights, dispatch_mask = gate(input)

    .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf

    Args:
        model_dim (int):
            size of model embedding dimension
        num_experts (int):
            number of experts in model
    wgr   r      NT	model_dimrh   r`   rb   eval_capacity_factorrc   rx   r   ry   rz   r   r-   c                    sl   t    tjj||dd| _|
| _|| _|| _|| _	|| _
|| _t | _d| _d| _|| _|	| _|| _d S )NF)biasr.   )super__init__r!   nnLinearr   rz   r`   rb   r   rc   rx   r   timerswall_clock_breakdown	gate_timer   ry   r   )selfr   rh   r`   rb   r   rc   rx   r   ry   rz   r   	__class__r*   r+   r     s   

zTopKGate.__init__c                 C   s   | j d u s	J d|| _ d S )Nz+Attempting to override an existing ep_group)rz   r   rz   r*   r*   r+   _set_ep_group  s   
zTopKGate._set_ep_groupFr8   rw   r{   c              
   C   s$  | j r
| t  | }| jdkr| jrt||jd}t	j
jj|| jj d d}| jdkrNt|| jr8| jn| j| j|| jrC| jnd | j| j| j|	}n.| jdkrht|| jr[| jn| j| j| j| j| j}nt|| j| jrr| jn| j| j| j| j}| j r| t  | tjdd| _|S )NJitterr   )weightr   r   rK   Freset)r   r   TOPK_GATE_TIMERstartrt   rx   trainingr,   r   r!   r   
functionallinearr   r   r`   r   rb   r   rc   r   ry   rz   r   r   r   stopelapsedr   )r   r8   rw   r{   
input_fp32rv   gate_outputr*   r*   r+   r>     s.   

zTopKGate.forward)	r   r   r   r   NTTNT)NF)rC   rD   rE   __doc__r!   r   r   __annotations__r   rt   r	   strr   r   distributedrG   r   r   r   r
   r>   __classcell__r*   r*   r   r+   r     sb   
 	
r   c                       sX   e Zd ZdZ	ddededededdf
 fd	d
Zdd Zde	de
de	fddZ  ZS )MOELayera  MOELayer module which implements MixtureOfExperts as described in Gshard_.
    ::

        gate = TopKGate(model_dim, num_experts)
        moe = MOELayer(gate, expert)
        output = moe(input)
        l_aux = moe.l_aux

    .. Gshard_: https://arxiv.org/pdf/2006.16668.pdf

    Args:
        gate (torch.nn.Module):
            gate network
        expert (torch.nn.Module):
            expert network
    Fgateexpertsnum_local_expertsr{   r-   Nc                    s   t    || _|| _d | _|| _|| _|| _d| _d| _	d| _
t | _d| _|o/to/|jdk| _| jr;td d S |rFtsFtd d S |rVtrX|jdkrZtd d S d S d S d S )Nr.   Fr   zUsing Tutel optimizations.zITutel optimization requested but not installed. Proceeding without Tutel.zXTo enable Tutel optimization, use top-1 instead of top-2 gate. Proceeding without Tutel.)r   r   r   r   rz   ep_sizeep_group_namer   time_falltoalltime_salltoalltime_moer   r   r   TUTEL_INSTALLEDr`   r{   r   infowarning)r   r   r   r   r   r   r{   r   r*   r+   r   '  s(   
zMOELayer.__init__c                 C   s   || _ | j| d S r@   )rz   r   r   r   r*   r*   r+   r   F  s   zMOELayer._set_ep_groupr8   kwargsc                 O   s  | j r
| t  |d jd }|d d|}| jrZ| ||d d\| _}}}}}	| _	|
d|
d}
}t| dsItj||||jd| _| jj|||	|d | j|}n| ||d \| _}}| _	td||d |}| j r}| t  ttj}|dkrt|dd	}t| j|}| j r| t  | tjd
d| _|dkrt dkrt |dd	}|| j!| j"d|}| #|}|| j!| j" d|}|dkrt dkrt|dd	}| j r| t$  t| j|}| j r| t$  | t$jd
d| _%|dkrt |dd	}| jr#| j&|'|| |}ntd||d |}||d j}| j rK| t  | tjd
d| _(|S )Nr   rI   r   T_tutel_dispatcher)dispatch_dtype)ri   rN   r}   Fr   rO   ))r   r   	MOE_TIMERr   r'   rR   r{   r   r   r   r   hasattrr   fast_dispatcherr   r  updateencoderQ   type_asFIRST_ALLTOALL_TIMERr   r   r   r   r5   rA   rz   r   r   r   r   r   r   r   r   SECOND_ALLTOALL_TIMERr   decodeviewr  )r   r8   r  d_modelreshaped_inputCEindices_
locations_gates_SMdispatched_inputr   r   tensor_model_world_sizeexpert_outputcombined_outputrZ   r*   r*   r+   r>   J  sZ   $



zMOELayer.forward)F)rC   rD   rE   r   r   r   r   r   r   r   r   r>   r   r*   r*   r   r+   r     s     r   )r   )NNTTNF)TNT)TNr   )Cr   deepspeed.utils.timerr   deepspeed.utilsr   deepspeed.utils.bwcr   typingr   r   r   r   r	   r
   r   r!   r   torch.nnr   torch.nn.functionalr   r   rr   r   mappingsr   r   Baser   r  r  r  r   r   r   r   r   tutelr   r   r  r,   r3   	deepspeedr4   r;   autogradFunctionr5   rP   rQ   jitscriptrj   ro   ru   rt   r   r   r   r   rG   r   r   r   r   r   r*   r*   r*   r+   <module>   s   $

*


n
Y
KT