o
    پis                     @   s  d dl Z d dlZd dlZd dlmZmZ e eZe Z	e	r$d dl
mZ d dlmZ dd Zdd Zejdejfd	d
ZejdejfddZejdejfddZejdejfddZdejdefddZejdd ZdejfddZejdejdejfddZdd Zejdejdejd ejfd!d"Z	#dd$ejd%ejd&ejd'ed(ejd)e fd*d+Z!ejdejdejfd,d-Z"d$ejd%ejd.ejd/ejd0ed1efd2d3Z#ejd4e$dejdejfd5d6Z%d4e$fd7d8Z&ejdejfd9d:Z'ejdejd;ejd<ejfd=d>Z(ejd?ejd@ejdAejdBejdCejf
dDdEZ)e* 	#ddFejdGejdHejdIejdJejdKejdLejdMejdNejd)e fdOdPZ+ejd?ejdQejfdRdSZ,e* dTejdUejdVejdWejdKejf
dXdYZ-dZed[ed\efd]d^Z.ejd_ejfd`daZ/dbejfdcddZ0ejdedf Z1ejdejfdgdhZ2ejdejfdidjZ3ej4fdejdkedlejdmednej5f
dodpZ6ejdejfdqdrZ7dsdt Z8ejdejfdudvZ9dwdx Z:dydz Z;ejdejdejfd{d|Z<d$ejd%ejd(ejd.ejd\ejf
d}d~Z=dS )    N)ceil_divis_cuda) sglang_per_token_group_quant_fp8c                    s   d}d}d}t j| }|j}|j}|| | }|}	 fdd}
|	|kr<|
|	d |kr<|	d }	|	|kr<|
|	d |ks,|
|	}t||| }|f|	fS )N         c                    s   t  | S Ntritoncdiv	block_dimnumel X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/ep_moe/kernels.pyget_num_blocks   s   z-_get_launch_config_1d.<locals>.get_num_blocks   )torchcudaget_device_propertiesmulti_processor_countmax_threads_per_multi_processormin)devicer   MAX_THREADS_PER_BLOCKMIN_THREADS_PER_BLOCK	MAX_WAVESpropssm_countmax_threads_per_smmax_num_blocksr   r   
num_blocksgrid_dimr   r   r   _get_launch_config_1d   s"   
r$   c                    s   d}d}d}t j| }|j}|j}|| | }	|}
 fdd}|
|kr=||
d |	kr=|
d }
|
|kr=||
d |	ks-t|
}tt |	| | d}||f|
fS )Nr   r   r   c                    s    t |  S r   r	   r   mnr   r   r   :   s   z-_get_launch_config_2d.<locals>.get_num_blocksr      )	r   r   r   r   r   r
   r   maxr   )r   r&   r'   r   r   r   r   r   r    r!   r   r   
grid_dim_x
grid_dim_yr   r%   r   _get_launch_config_2d.   s"   r,   
BLOCK_SIZEc                 C   s   |j j}td}	||	|  }||	|  }| |	|  }
td||D ]<}|td| }||k }tj|
| |d|}t|D ]}t|| }|dkr]|||  }tj|| ||d q@q"d S Nr   mask)	dtype
element_tytl
program_idrangearangeloadtostore)	input_ptrgateup_input_ptrsrc2dst_ptrtopk_ids_ptra1_scales_ptrtopkhidden_sizer-   OutDtypesrc_idxsrc_ptrstart_offsetoffsetr0   in_dataidxdst_idxdst_ptrr   r   r   deepep_permute_triton_kernelI   s"   
rJ   c                 C   s   | j j}td}	||	|  }||	|  }||	|  }||	|  }
td||D ]Q}|td| }||k }tj|g|d}t|D ],}t|| }|dkrnt|| |}| ||  }tj|| |d}||| 7 }qBtj	|
| ||d q(d S Nr   r1   r/   )
r1   r2   r3   r4   r5   r6   zerosr7   r8   r9   )down_output_ptr
output_ptrr<   r=   topk_weights_ptrr?   r@   r-   InDtyperB   	store_ptrrD   rE   r0   sum_vecrG   rH   weigh_scaleload_ptrrF   r   r   r   !deepep_post_reorder_triton_kernelh   s(   
rV   c                 C   sR   t jdd}|| t d| }||k }t j| | |d}t j|| ||d d S Nr   axisr/   r3   r4   r6   r7   r9   )reorder_idssrc2dstnum_toksr-   piddst_idr0   src_idr   r   r   compute_src2dst_triton_kernel   s
   ra   c           
      C   s`   t jdd}|| t d| }||k }t j| | |d}t |}	t j|| ||	 |d d S rW   rZ   )
r[   r\   r]   num_minus_oner-   r^   r_   r0   r`   num_invalidr   r   r   $deepep_compute_src2dst_triton_kernel   s   
rd   topk_idsnum_expertsc           
      C   s   t j| ddd\}}t j|d | jt jd}t j|  | jt jd}t j|d | j|jd}t j	|||d |d }|| }d}t
|  |f}	t|	 |||  || ||d  }|||fS )	NTstabler(   r   r1   )outr   r   )r   sortviewemptyr   int64r   r6   r1   searchsortedr
   r   rd   )
re   rf   reorder_topk_idsr[   
seg_indptrr\   
expert_idsrb   r-   gridr   r   r   deepep_run_moe_deep_preprocess   s    
ru   c                 C   s~   t dd }d}|d }d}||kr1|| d }t | | |kr'|d }n|d }|}||kst || d |d  d S )Nr   r(   rg   r   r3   r4   r7   r9   )rq   rr   r]   expert_id_minus_1lowhightarget_locationmidr   r   r    compute_seg_indptr_triton_kernel   s   
r|   c                 C   s`   t j| ddd\}}d}t|  |f}t j|  | jt jd}t	| |||  | |S )Nrg   Trh   r   rj   )
r   rl   rm   r
   r   r   rn   r   int32ra   )re   _r[   r-   rt   r\   r   r   r   cutlass_w4_run_moe_ep_preproess   s   r   
NUM_STAGESc                 C   s   |j j}|d urdt| }nd}|	td td|	 }||k }td}td}tj||||
dD ]W}|tj	}|||  }|||  }| ||  | }|| }tj||dtj
}|| |}t|D ] }t|| }||krt|| }tj|||  ||d qlq6d S )N      ?r(   r   
num_stagesr/   )r1   r2   r3   r7   r4   r6   num_programsr5   r8   ro   float32r9   )r:   r;   r<   r=   r>   num_local_expertsr?   
num_tokensr@   r-   r   rA   a1_scalerE   r0   start_src_idxstepsrc_idx_int32rB   token_src2dst_ptrtoken_topk_ids_ptrsrc_ptr_offsdst_ptr_offsrF   out_datarG   	expert_idrH   r   r   r   )pre_reorder_triton_kernel_for_cutlass_moe   s4   


r   c	                 C   s8   t | j||\}	}
t|	 | |||||||||
dd d S )N   )r:   r;   r<   r=   r>   r   r?   r   r@   r-   r   )r,   r   r   )inputgateup_inputr\   re   	a1_scalesr   r?   r   r@   rt   r   r   r   r   pre_reorder_for_cutlass_moe  s   
r   BLOCK_N	NUM_STAGESCALE_UE8M0c           #   
   C   s  t d}t d}t d}t d}t || }t j|t jd}t j|t jd}t j|t jd}t j|t jd}|| t d| }| ||  | }|||  | }|||	  ||  }t j||||dD ]}t j|||  ||k ddt j	}t j|||  | ||k dd}|dt 
|   }|| jj}|| }t t t |d} | | }!|rt t t t |!}!t ||! |||jj}"t j|||  |"||k d	 t |||
  |! qhd S )
Nr   r(   r   rL   r           r0   otherg|=r/   )r3   r4   r   r7   castro   r6   r5   r8   r   expr1   r2   maximumr)   absexp2ceillog2clampr9   )#r:   stride_input_0stride_input_1stride_input_2rO   stride_output_0stride_output_1stride_output_2output_scale_ptrstride_output_scale_0stride_output_scale_1stride_output_scale_2masked_m_ptrsize_nfp8_maxfp8_minr   r   r   r   token_idhidden_dim_block_indexblock_num_per_experttoken_num_cur_expert	offs_in_dinput_ptr_offsoutput_ptr_offsoutput_scale_offstoken_indexgateupgate_up_absmaxoutput_soutput_qr   r   r   _silu_and_mul_post_quant_kernel  sj   







r   Fr   outputoutput_scalequant_group_sizemasked_mscale_ue8m0c                 C   s:  |   sJ |jtjksJ |  sJ t| jdksJ | jd |jd ks)J | jd d dks4J | jd d }|| dksCJ t|}|dk rNd}nd}|}	d}
d	}t||	}|	| dksdJ |||f}ttj}|j	}| }t
| | g|  || || ||||R |	||
|d
 dS )a  
    input shape [expert_num, token_num_padded, hidden_dim]
    output shape [expert_num, token_num_padded, hidden_dim // 2], dtype fp8
    output_scale [expert_num token_num_paddded, hidden_dim // 2 // 128] dtype float32
    quant_group_size  int,
    masked_m shape [expert_num],
    r   r   rg   r      @       r(      )r   r   	num_warpsr   N)is_contiguousr1   r   float8_e4m3fnlenshaper
   r   finfor)   r   stride)r   r   r   r   r   r   r   
expert_numBLOCK_NUM_PER_EXPERTr   r   r   hidden_dim_split_block_numrt   r   r   r   r   r   r   "silu_and_mul_masked_post_quant_fwdl  sf   	
r   c                 C   s  |j j}t|}|| }	| }
| | }dt| }td| }td| }tj||	||dD ]O}|td| }|| }||	k }|||  }tj|
| |ddtj	}tj|| |ddtj	}|dt
|   | | }tj|| |||d q1d S )Nr   r   r   r   r   r(   r/   )r1   r2   r3   r7   r4   r   r5   r6   r8   r   r   r9   )r:   rO   	scale_ptrnum_tokens_tensor_ptrintermediate_sizer-   r   rA   r   r   gate_ptrup_ptrscale	start_idxr   idids	token_idsr0   offsr   r   r   r   r   r   >silu_mul_static_tensorwise_quant_triton_kernel_for_cutlass_moe  s$   

r   r   num_tokens_tensorexpected_num_tokensr   c              	   C   s2   t | j|| \}}t| | |||||dd d S )Nr   )r:   rO   r   r   r   r-   r   )r$   r   r   )r   r   r   r   r   r   rt   r   r   r   r   0silu_mul_static_tensorwise_quant_for_cutlass_moe  s   

r   routed_scaling_factorc           !      C   s\  |j j}|
td td|
 }||k }| | }|| }td}td}tj||||dD ]{}|tj}|||  }|||  }|||  }tj	|
gtj
d}t|D ]>}t|| }||krt|| }|tj}|}t|| tj
}|||  }tj||dtj
}||| 7 }qW||	9 }|||  } tj| |||d q0d S )Nr(   r   r   rL   r/   )r1   r2   r3   r4   r6   r   r5   r8   ro   rM   r   r7   r9   )!rN   rO   r<   r=   rP   r   r?   r   r@   r   r-   r   rA   rE   r0   down_output_ptr_offsr   r   r   r   rB   r   r   token_topk_weights_ptrrS   rG   r   dst_idx_int32rH   weight_scaleload_ptr_offsrF   store_ptr_offsr   r   r   *post_reorder_triton_kernel_for_cutlass_moe  s<   


r   c
                 C   s:   t | j||\}
}t|
 | |||||||||	|dd d S )Nr   )rN   rO   r<   r=   rP   r   r?   r   r@   r   r-   r   )r,   r   r   )down_outputr   r\   re   topk_weightsr   r?   r   r@   r   rt   r   r   r   r   post_reorder_for_cutlass_moe  s   
r   c                 C   s"  | j j}td}	|	tj}
||
|  }||
|  }||
|  }||
|  }td|}td||D ]Z}|| }||k }tj|g|d}t|D ]9}t	|| }|dkrt	|| }|tj}t	|| |}| ||  }tj	|| |d}||| 7 }qJtj
|| ||d q4d S rK   )r1   r2   r3   r4   r8   ro   r6   r5   rM   r7   r9   )rN   rO   r<   r=   rP   r?   r@   r-   rQ   r   rB   rR   vecrD   rE   r0   rS   rG   r   r   rH   rT   rU   rF   r   r   r   post_reorder_triton_kernel8  s0   
r   BLOCK_EBLOCK_EXPERT_NUMc                 C   s   t d}t d|}t j| | ||k dd}t || }	t j|| |	||k d t || }
t | | }||
 }t d|}t jd||ddD ]}t || | | qKd S )Nr   r   r/   r   r   )r3   r4   r6   r7   cumsumr9   r5   )num_recv_tokens_per_expertexpert_start_loc	m_indicesrf   r   r   
cur_expertoffset_cumsumtokens_per_expertr   cur_expert_startcur_expert_token_numm_indices_start_ptr
off_expertstart_mr   r   r   _fwd_kernel_ep_scatter_1`  s&   
	
r  topk_numHIDDEN_SIZEHIDDEN_SIZE_PADSCALE_HIDDEN_SIZESCALE_HIDDEN_SIZE_PADc           *      C   sP  t d}t d}t d|}||k }t d|}||k }t|| |D ]}|t j} t j|| |  | |d}!t j|| |  ||  |d}"t jd|dddD ]S}#|#t j}$t || |	  |$ }%|%dkrt ||% d}&|&t j}'t 	|| |  |$ |& ||'|  }(||'|  })t j	|(| |!|d t j	|)||  |"|d qQq$d S )Nr   r/   r(   r   r   )
r3   r4   r   r6   r5   r8   ro   r7   
atomic_addr9   )*total_token_numr   recv_xrecv_x_stride0recv_x_stride1recv_x_scalerecv_x_scale_stride0recv_x_scale_stride1	recv_topkrecv_topk_stride0recv_topk_stride1output_tensoroutput_tensor_stride0output_tensor_stride1output_tensor_scaleoutput_tensor_scale_stride0output_tensor_scale_stride1output_indexoutput_index_stride0output_index_stride1r  r  r  r  r  start_token_idgrid_num	offset_inr0   
index_in_smask_stoken_id_int32r   to_copy	to_copy_stopk_idx_int32
topk_indexr   dest_token_index_int32dest_token_indexoutput_tensor_ptroutput_tensor_scale_ptrr   r   r   _fwd_kernel_ep_scatter_2  sP   




r*  r
  r  r  r   r   r  r  r   r  c
                 C   sx  d}
d}d}|j d }| j d }|}|| }|	rt|d}|j d |
 dks(J |j|jks:J d|j d|j |j d |j d   krK|ksNJ  J t|f ||||||
t|d t|j d d	}t|f |j d || | d| d||d|d||d|d||d|d||d|d||d|df|j d ||t||t|d
 d S )N   r   r   r(   r   zrecv_x_scale.dtype: z, output_tensor_scale.dtype: )rf   r   r   r       )r  r   r  r  r  r  )	r   r   r1   r  r
   next_power_of_2r   r*  r   )r
  r  r  r   r   r  r  r   r  r   r   BLOCK_Dr   rf   r@   rt   scale_hidden_sizer   r   r   
ep_scatter  sl   


(
r0  r.  c           !      C   s8  t d}|t j}t d}t d}t|| |D ]~}|t j}t d|}t j|gt jd}td|D ]L}|t j}t 	|||  | }|dkrt 	|
||  | }|t j}t 	|||  | }t 	|||  ||  | } || t j| 7 }q7t 
|||  ||  | ||jj qd S )Nr   r(   rL   )r3   r4   r8   ro   r   r5   r6   rM   r   r7   r9   r1   r2   )!r	  input_tensorinput_tensor_stride0input_tensor_stride1recv_topk_idsrecv_topk_ids_stride0recv_topk_ids_stride1recv_topk_weightrecv_topk_weight_stride0recv_topk_weight_stride1input_indexinput_index_stride0input_index_stride1r  r  r  r  r.  cur_block_int32	cur_blockstart_cur_token_int32r  cur_token_int32	cur_tokenoff_daccumulatortopk_index_int32r%  r   source_token_index_int32source_token_index
acc_weighttmpr   r   r   _fwd_kernel_ep_gather  sV   


rI  r1  r4  r7  r:  c           
      C   s   d}|j d }| j d }|d dkrdnd}|| dksJ t||t|df}	t|	 || | d| d||d|d||d|d||d|d||d|d|j d ||d d S )Nr   r   r(   r   r+  )r  r   r.  )r   r
   r   r   rI  r   )
r1  r4  r7  r:  r  r   r   r@   r.  rt   r   r   r   	ep_gatherX  s8   

rJ  xelement_sizereturnc                 C   s*   d}|| dks
J || }t | || S )a  
    Global memory address of TMA must be 16-byte aligned.
    Since we use column-major layout for the LHS scaling tensor,
        the M-axis of the LHS scaling tensor needs to be padded to a multiple of 16 bytes.

    Arguments:
        x: original M-axis shape of the LHS scaling tensor.
        element_size: element size of the LHS scaling tensor.

    Returns:
        M-axis shape of the LHS scaling tensor after padding.
       r   )r   )rK  rL  tma_alignment_bytes	alignmentr   r   r   get_tma_aligned_size  s   rQ  BLOCK_SIZE_Kc	                 C   s   t jdd}	t d}
t d|}t|	||
D ])}| ||  ||  }t j|||k d}|||  ||  }t j||||k d qd S rW   )r3   r4   r   r6   r5   r7   r9   )input_scale_ptrrO   r&   k_div_block_sizeinput_scale_stride_minput_scale_stride_koutput_stride_moutput_stride_krR  pid_mgrid_m	k_offsetsm_baseinput_offset
input_dataoutput_offsetr   r   r   _tma_align_input_scale_kernel  s   
r`  input_scalec                 C   s   |   dksJ | j\}}t||  }tj||f| j| jd}t|d}t	
|}t|f | |||| d| d|d|d|d	 | d | S )Nr   )r1   r   r,  r   r(   )	rS  rO   r&   rT  rU  rV  rW  rX  rR  )dimr   rQ  rL  r   rn   r1   r   r   r
   r-  r`  r   t)ra  r&   rT  padd_mr   rZ  rR  r   r   r   tma_align_input_scale  s(   


re  c                 C   sB   t d}t | | }t | | d }t || ||  d S )Nr   r(   rv   )rr   r   r   startendr   r   r   compute_masked_m_triton_kernel  s   
rh  c                 C   s   t jdd}|| t d| }||k }	t j|| |	d}
t j| |
 |
|k d}t j|| |dkd}|| }|| | }t j||
 ||	d d S rW   rZ   )re   r[   rr   r\   m_maxr]   r-   r^   r_   r0   r`   r   expert_dst_startexpert_dst_offsetr   r   r   &deepgemm_compute_src2dst_triton_kernel  s   
rl  c
                 C   s@  t d}
|
t j}|||  }|||  }| ||  }|||  }t d|	}t|D ]p}t || }|dkrt || }|t j}|||  }t d||	D ]}|| }||k }t j|| |d}t j|| ||d qT|||  }t d||	D ]}|| }||k }t j|| |d}t j|| ||d qq-d S r.   )r3   r4   r8   ro   r6   r5   r7   r9   )r:   r   r;   gateup_input_scale_ptrr<   r=   r?   r@   
scale_sizer-   r   rB   rC   scale_src_ptrr   rG   r   r   rH   rI   rD   rE   r0   rF   scale_dst_ptrin_scaler   r   r   fill_gateup_input_triton_kernel  s4   
rr  r   hidden_statestop_koutput_dtypec                    s  t j ddd\}}t j|d  jt jd}t j   jt jd}	t j| jt jd}
t	|d f ||    fdd}t
|f ||
 |dd	 d d	 }  d | d }t j|||df|j|d}t|  |||	|  d	d
 |d u rddg}t|dksJ |d |d }}t||\}}t j|d|d|df|j|jd}t|jd f |||||	 ||d|ddd

 |
||	||fS )Nrg   Trh   r(   rj   c                    s   t   | d fS Nr-   )r
   r   r   metare   r   r   <lambda>$  s    z,moe_ep_deepgemm_preprocess.<locals>.<lambda>r      r-   r+  r   r   )r   rl   rm   rM   r   ro   rn   r   r}   r|   rh  sizerl  r   per_token_group_quant_fp8r1   rr  r   )re   r   rs  rt  block_shaperu  rq   r[   rr   r\   r   rt   ri  
expected_mr   block_nblock_kr   gateup_input_scaler   ry  r   moe_ep_deepgemm_preprocess  sn   

r  c                 C   s   t d}|||  }	|||  | }
|	|ks|
|krd S t j||	|  |
 t d| |
t d| |k d}t j|gt jd}t| D ]}t ||	|  | }||| 7 }qEt j||	|  |
 t d| ||
t d| |k d d S )Nr   r/   rL   )r3   r4   r7   r6   rM   r   r5   r9   )rt  hidden_states_ptrexpert_scales_ptrr   rO   
hidden_dimscales_strider-   r^   batch_id
dim_offsethresultir   r   r   r   compute_identity_kernel^  s0   


r  c              
      s   |    | d} fdd}|dkr| |k }| }d||< | |k}	d| |	< d||	< t||j}
|d|dfdd}t| ||||
|ddd	 |
S )
Nrg   c                       t  | d fS rv  r	   rw  )Nr   r   rz        z-zero_experts_compute_triton.<locals>.<lambda>identityr   r   c                    s    | d   fS rv  r   rw  )r  r   r   r   rz    r  r{  r|  )	r   r}  cloner   
zeros_liker8   r   r  r   )expert_indicesexpert_scalesrf   zero_expert_typers  rt  rt   zero_expert_maskzero_expert_scalesnormal_expert_maskr   r   )r  r  r   r   zero_experts_compute_triton  s2   


r  c                 C   s0  t jdd| t d| }||k }t j| | |dd}	|d }
|
d }|
d }|d }|d }|d }|
|d k }||d k }||d k }||d k }||d k }||d k }t j||
 d| |d t j|| |	|d t j|| ||d t j|| ||d t j|| |	|d t j|| ||d d S )Nr   rX   r   r   r(   r   r/   rZ   )r   problem_sizes1_ptrproblem_sizes2_ptrr'   krf   r-   r^   r0   final_occurrences	ps1_idx_0	ps1_idx_1	ps1_idx_2	ps2_idx_0	ps2_idx_1	ps2_idx_2
ps1_mask_0
ps1_mask_1
ps1_mask_2
ps2_mask_0
ps2_mask_1
ps2_mask_2r   r   r   !compute_problem_sizes_w4a8_kernel  s*   
r  c              	      s2   d} fdd}t | | |||| |d ||fS )Nr{  c                    r  rv  r	   rw  rf   r   r   rz    r  z,compute_problem_sizes_w4a8.<locals>.<lambda>r|  )r  )r   problem_sizes1problem_sizes2r'   r  rf   r-   rt   r   r  r   compute_problem_sizes_w4a8  s   	r  c                 C   s.   t | |||||\}}|tj|tjfS r   )r  r8   r   r}   )r   r  r  rf   r'   r  r   r   r   &deepep_ll_get_cutlass_w4a8_moe_mm_data  s   

r  c           "      C   s  t d}t d}t d}t d}t |	| }dt |t j }t |t j}t |t j}t |t j}t |t j}|| t d| }||
k }| ||  | }|||  | }t j	||||dD ]W}|||  }||
 }t j||ddt j}t j||ddt j}|dt 
|   }|| jj}|| }|| }t ||||jj} |||  }!t j|!| |d qid	S )
z
    Triton kernel: fused SiLU(gate) * up + per-tensor FP8 quantization.

    Shape:
        input:  [E, T_padded, 2*D]  -> gate: [:,:,D], up: [:,:,D]
        output: [E, T_padded, D], dtype=float8_e4m3fn
    r   r(   r   r   r   r   r   r/   N)r3   r4   r   r7   r8   r   r   r}   r6   r5   r   r1   r2   r   r9   )"r:   stride_input_expertstride_input_tokenstride_input_dimrO   stride_output_expertstride_output_tokenstride_output_dimr   r   	inner_dimr   r   r   r   r   block_id_tokenblock_id_dimnum_token_blocksr   r   offset_dmask_dinput_base_offsoutput_base_offs	token_idxr   r   r   r   r   scaledr   out_ptrr   r   r   *_silu_and_mul_post_per_tensor_quant_kernel  s:   




r  c                 C   s*  |   sJ |  sJ |jtjksJ | jdksJ | jd |jd ks'J | jd d dks2J | dksD|jd | jd ksDJ | jd }| jd d }d}|dk rXdnd	}d}t||}	|	||f}
t	tj}|j
}| }t|
 | g|  || |||||R ||d
 |S )a  
    Fused SiLU + Mul + Per-Tensor Quantization to FP8.

    Args:
        input: [expert_num, token_num_padded, 2 * inner_dim]
        output: [expert_num, token_num_padded, inner_dim], dtype=torch.float8_e4m3fn
        masked_m: [expert_num], actual token count for each expert
        scale: [1] or [expert_num], quantization scale (per-tensor or per-expert)

    Returns:
        output tensor
    r   r   rg   r   r(   r{  r   r   r   )r   r   )r   r1   r   r   ndimr   r   r
   r   r   r)   r  r   )r   r   r   r   r   r  r   BLOCK_Mr   r   rt   r   r   r   r   r   r   -silu_and_mul_masked_post_per_tensor_quant_fwd2  sP   $

	
r  )F)>loggingr   r
   sglang.srt.utilsr   r   	getLogger__name__logger_is_cuda)sglang.srt.layers.quantization.fp8_kernelr   r~  triton.languagelanguager3   r$   r,   jit	constexprrJ   rV   ra   rd   Tensorintru   r|   r   r   r   r   boolr   r   r   floatr   r   r   r  r*  no_gradr0  rI  rJ  rQ  r`  re  rh  rl  rr  r   r1   r  r  r  r  r  r  r  r  r   r   r   r   <module>   s   
 


,S
D 

1

' H	
KA'	

.
M&#%A