o
    پik5                     @   sp  d dl mZmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ e Zejdejdejdejdejd	ejd
ejdejfddZ	d$dejdejdededeej f
ddZejdejdejdejd	ejdejdejdejdejdejdejdejdejfddZ	d$dejdejdedededededeej fddZ		d%deej defd d!ZG d"d# d#ZdS )&    )OptionalTupleN)
fused_topk)is_hipis_correction_biasnum_expertstopkmoe_softcappingmoe_renormalize
hidden_dim
BLOCK_SIZEc           #      C   s  t jdd}t d|}||
k }t d|d d d f }|d d d f }t j|||
  |d d d f  |dd}t j| ||
  | |dd}t j|t j|d d d f t j dd}|dkrd|}n|| }t d| }|d }|d }|| | }|rt |t d| }|| }t j|dd}t 	|||  d | t j
|dd}dt jt || dd }t 	|||  d | |dkr	t jt t d||k|td	dd}t 	|||  d | t j|t d||k dd}t 	|||  d t || |  |dkrt j|jd|jd
}t t d||k|td	}t t d||k|td	}td|D ]K} t j|| dd}!t t d||!k|td	}t 	|||  |  |! t j|t d||!k dd}"t 	|||  |  t |"| |  q;d S d S )Nr   axis        maskother            ?-infdtype)tl
program_idarangeloadsumtofloat32expargmaxstoremaxwherefloatfullshaper   range)#	input_ptrmoe_router_weight_ptrtopk_weights_ptrtopk_ids_ptrcorrection_bias_ptrr   r   r   r	   r
   r   r   pidoffsetsr   expert_offsetsrouter_maskw_routerxlogitslogits_softcappedlogits_scaledexpedtopbottombiastop1top1_v	invsumexptop2top2_v	topk_maskitopitopi_v rE   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/router.py fused_moe_router_cudacore_kernel   s   .

rG   r4   router_weightcorrection_biasc              	   C   s   t | jdkr| jd |jd ksJ | j\}}|jd }tj||ftj| jd}tj||ftj| jd}	|d u}
tr=dnd}t	|t
tt	t|d|dd	}t|f | |||	|f|
|||d
|d| ||	fS )Nr   r   r   r   device             )r   	num_warpsF)r   r   r   r	   r
   r   )lenr(   torchemptyr    rK   int32_is_hiptritonnext_power_of_2r$   mincdivrG   )r4   rH   r   r	   rI   bsr   r   topk_weightstopk_idsr   	max_warpsconfigrE   rE   rF   fused_moe_router_cudacoreu   s:   &

r_   KBLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_K	stride_am	stride_bndp_attn_workaround_flagc           /   	   C   sT  t jdd}|| t d|d d d f  }||k }t d|d d d f }| || |  }t d|d d d f }t d|d d d f }||k }||| |  }t j||ft jd}td|| D ]+}t j||ddt j}t j||ddt jj}|t 	||7 }||7 }||7 }qc|dkr|}n|| }t 
d| } | d | d  | }|
rt j|	t d|d d d f  |jdd}!||! }|rt ||kd|}t d|d d d f }"|"|k }#t jt |#|td	dd}$t jt |#|td	dd
d}%dt jt |#t 
||% ddd }&|| | |t d|  }'|'|| k }(t j||' |$|(d t j||' |&|(d |dkr|"|k |"|$d d d f k@ })t jt |)|td	dd
d}*t j||"|*k dd
d}+t 
|+|% |&d d d f  },|| | |t d|d d d f   d }-|-|| k }.t j||- |*|.d t j||- |,|.d d S d S )Nr   r   r   r   r   r   r   g    er   T)r   	keep_dimsr   )r   )r   r   r   zerosr    r)   r   r   Tdotr!   r%   r"   r&   r$   r   r#   )/a_ptrb_ptrr,   r-   rZ   r   r   r	   r
   r.   r   r`   ra   rb   rc   rd   re   rf   r/   offs_mbs_maskoffs_ka_ptrsoffs_nexpert_maskb_ptrsacckabr6   r7   r8   r;   arange_block_size_n	cond_top1r<   r=   top1_invsumexp	offs_top1	top1_mask	cond_top2r?   r@   top2_invsumexp	offs_top2	top2_maskrE   rE   rF   "fused_moe_router_tensorcore_kernel   s    


*
r   c                 C   sH  t | jdkr| jd |jd ksJ | j\}}	|jd }
|
|ks#J |	| dks+J |dks1J tj||ftj| jd}tj||ftj| jd}|d u}t||t|
| f}ddl	m
} | }t| di d| d|d|d	|d
|d|
d|d|ddd|	d|d|d|d|d|d|	d|	d| ||fS )Nr   r   r   rJ   )is_dp_attention_enabledrk   rl   r,   r-   rZ   r   r   r	   r
   Fr`   r.   r   ra   rb   rc   rd   re   rf   rE   )rQ   r(   rR   rS   r    rK   rT   rV   rY   sglang.srt.layers.dp_attentionr   r   )r4   rH   r   r	   ra   rb   rc   rI   rZ   r   r   r[   r\   r   gridr   rf   rE   rE   rF   fused_moe_router_tensorcore  sd   &


	
r   Fenable_deterministic_inferencec              
   C   s   |rJ t |jdkr|jd |jd ksJ |j\}}|jd }	d}
t|	d}|	dk r.dnd}|dks8|	d	krL|| dkrL|sLt|||| |
|||d
S t|||| |dS )Nr   r   r   rM   rL   rN   @   i      )r4   rH   r   r	   ra   rb   rc   rI   )r4   rH   r   r	   rI   )rQ   r(   r$   r   r_   )r	   hidden_statesgating_outputr   renormalizerI   r   rZ   r   r   ra   rb   rc   rE   rE   rF   fused_moe_router_shimT  s<   	


r   c                   @   s   e Zd ZdddZdd Zdejdejdeejejf fd	d
Z	ddejdeejejf fddZ	dejdeejejf fddZ
dS )FusedMoeRouterreturnNc                 C   s   || _ || _|| _d S N)router_linearr   r	   )selfr   r   r	   rE   rE   rF   __init__  s   
zFusedMoeRouter.__init__c                 O   s   | j |i |S r   )forward)r   argskwargsrE   rE   rF   __call__  s   zFusedMoeRouter.__call__r4   residualc                 C   s   |j r	| ||S | ||S r   )is_cudaforward_cudaforward_vllm)r   r4   r   rE   rE   rF   r     s   zFusedMoeRouter.forwardFc                 C   s   t | j|| jj| jddS )NF)r	   r   r   r   r   )r   r	   r   weightr   )r   r4   autotunerE   rE   rF   r     s   zFusedMoeRouter.forward_cudac                 C   s@   |  | jjj   }t|  | j | j }t||| jdS NF)	r&   r   r   ri   rR   tanhr	   r   r   )r   r4   grE   rE   rF   forward_torch  s   zFusedMoeRouter.forward_torch)r   N)F)__name__
__module____qualname__r   r   rR   Tensorr   r   r   r   rE   rE   rE   rF   r     s*    

	
r   r   r   )typingr   r   rR   rV   triton.languagelanguager   sglang.srt.layers.moe.topkr   sglang.srt.utilsr   rU   jit	constexprrG   r   intr&   r_   r   r   boolr   r   rE   rE   rE   rF   <module>   s    	
l
*	 
<
4