o
    پiI                     @   s   d dl mZ d dlZd dlmZmZ d dlmZ dejde	fddZ
			dd	eejeej f d
ejdejdejdejdejdejdejdee deej deejj fddZdS )    )OptionalN)scaled_fp4_grouped_quantize*silu_and_mul_scaled_nvfp4_experts_quantize)grouped_gemm_nt_maskedinputreturnc                 C   s@   | j tjkrdS | j tjkrdS | j tjkrdS td| j  )Nbfloat16float16float32zUnsupported cute dtype )dtypetorchr   r	   r
   
ValueError)r    r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/flashinfer_cutedsl_moe.pyget_cute_dtype   s   r   hidden_statesinput_global_scalew1w1_blockscalew2a2_global_scalew2_blockscalemasked_mdown_sm_countdown_signalsdown_start_eventc                 C   s  |j tjksJ d|j  |j tjksJ d|j  |j tjks*J d|j  |j tjks8J d|j  |j tjksFJ d|j  |j tjksTJ d|j  |j tjksbJ d|j  t| dksqJ d	t|  |jd
 d }| d dur| d tj}| d tj}|j\}}}|d }n.| d j\}}}|j tjksJ d|j  |j|fksJ d|j t| d |	|\}}|jd d| ksJ d|j |jd
 d |ksJ d|jd
  d| |jdd ||d fksJ d|jdd  d||d f |j|fksJ d|j |j|fks-J d|j |j|fks<J d|j tj	|||d ftj
|jd}|ddd}d}|j tjks]J |j tjksfJ d}d}d}t||f|ddd|f||	|||||dd|t|d
 t|ddd|	|\}}|dur|  tj	|||ftj
|jd}|ddd}t||f|ddd|f||	f|||||dd|t|d|
dus|durt|
|dni  |dddS ) a  
    Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL
    kernels.

    Args:
        hidden_states: Either of the following case
            * tuple[torch.Tensor, None]: [num_experts, m, k], bf16, None means no quant
            * tuple[torch.Tensor, torch.Tensor]: [num_experts, m, k // 2], uint8, [num_experts, m, k // 16], float8_e4m3fn
        input_global_scale (torch.Tensor): (l,)
        w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8
        w1_blockscale (torch.Tensor): blockscale factors, e4m3,
        w1_alpha (torch.Tensor): (l,)
        w2 (torch.Tensor): fp4 weights, [l, k, n // 2], uint8
        a2_global_scale (torch.Tensor): (l,)
        w2_blockscale (torch.Tensor): blockscale factors, e4m3,
        w2_alpha (torch.Tensor): (l,)
        masked_m (torch.Tensor): Masked dimension indices

    Notes:
        - Assumes max(masked_m) == m.
    z#w1 must be uint8 (fp4 packed), got z)w1_blockscale must be float8_e4m3fn, got zw1_alpha must be float32, got z#w2 must be uint8 (fp4 packed), got z%a2_global_scale must be float32, got z)w2_blockscale must be float8_e4m3fn, got zw2_alpha must be float32, got    z/hidden_states must be a tuple of length 2, got    Nr   z(input_global_scale must be float32, got z%input_global_scale must be (l,), got zw1 last-2 dim must be 2*n, got z"w1 last dim * 2 must equal k, got z vs k=zw2 shape mismatch, got z, expected zw1_alpha must be (l,), got z"a2_global_scale must be (l,), got zw2_alpha must be (l,), got )r   device   float4_e2m1fnfloat8_e4m3fnr   )ab_dtypesf_dtypec_dtypesf_vec_sizealphaalpha_dtype)sm_countdst_signals)r   r   uint8r#   r
   lenshapeviewr   emptyr   r    permuter   r   r   recorddict)r   r   r   r   w1_alphar   r   r   w2_alphar   r   r   r   na_qa_q_sfmk_by_2num_expertskgateup_outputr'   r$   r%   r&   diqdiq_sfoutr   r   r   flashinfer_cutedsl_moe_masked   s   &







"
 






rA   )NNN)typingr   r   
flashinferr   r   $flashinfer.cute_dsl.blockscaled_gemmr   Tensorstrr   tupleintcudaEventrA   r   r   r   r   <module>   s>    

