o
    پiJ                  3   @   s  d Z ddlmZmZ ddlZddlmZ ddlmZm	Z	m
Z
 e Zer;ddlmZmZmZmZmZmZmZmZmZmZ 					d3d
ejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdeded eej d!eeef d"ejf2d#d$Zd%Zd&Z	d4d
ejd'ejd(ejd)ejd*ejd+ejd,ejd-ejd.ejdejdejd/ed0efd1d2ZdS )5z CUTLASS based Fused MoE kernels.    )OptionalTupleN)CutlassMoEParams)is_cudais_sm90_supportedis_sm100_supported)
apply_shuffle_mul_sumcutlass_fp4_group_mm"es_fp8_blockwise_scaled_grouped_mm%es_sm100_mxfp8_blockscaled_grouped_mm(es_sm100_mxfp8_blockscaled_grouped_quantfp8_blockwise_scaled_grouped_mmprepare_moe_inputscaled_fp4_experts_quantshuffle_rowssilu_and_mulTFFFaw1_qw2_qw1_scalew2_scaletopk_weightstopk_ids
a1_strides
c1_strides
a2_strides
c2_strides	workspacea_ptrsb_ptrsout_ptrsa_scales_ptrsb_scales_ptrsexpert_offsetsproblem_sizes1problem_sizes2use_fp8_blockscale	use_mxfp8output	enable_esreturnc           8      C   s  |sJ d|j |j ksJ d|jtjksJ |jtjks J | j d |j d ks.J d|j d |j d d ks>J d|j d |j d ksLJ d|j d |j d ksZJ d	|j d |j d kshJ d
|j d |j d ksvJ d| jtjtjfv sJ dtrddlm} |\}}| j}|	d}| 	d}|	d}|	d}|	d} | j
}!tj| tj|!d}"tj| tj|!d}#|rW|r|sJ dt sJ d|d dksJ d|d dksJ d|jtjksJ d|jtjksJ d||j d d |j d f}$||j d d |j d f}%|j |$ks)J d|$ d|j  |j |%ks:J d|% d|j  d}&||  }'t||'}(|'|&d |(  })|)|& d |& |& }*d}+|rn|sb|rntj|d ftj|!d}+t|||||"|#||||+
 |r|rt| |"||  |f},tj|,tjd}-tj|*|d ftj|!d}.t|,||dd |+dd |-|. n|| d\}/}0t|/|"||  |f}-t|0|"||  t|d f}.tj||  |d f|!|d}1tj||  |f|!|d}2tj|df|!tjd}3tj|df|!tjd}4t r|rt|1|-||.||||||dd | n3|r5|r5t|1|-||.|||dd |+dd  nt|1||||||-||.|||||3|4||dd | tj||  |f|!|d}5t|1|5 |r|rtj|5tjd}6tj|*|d ftj|!d}7t|5||dd |+dd |6|7 n||5d\}6}7t r|rt|2|6||7||	|	|
||dd | n3|r|rt|2|6||7|||dd |+dd  nt|2||||||6||7||	|	|
|3|4||dd | |du rtj||f|!|d}t|2||#|| |S )a2  Performs Fused MoE computation using CUTLASS-like kernels with FP8 weights and activations.

    This function implements a Mixture of Experts (MoE) layer with a SwiGLU/SiLU
    activation, leveraging custom kernels likely derived from CUTLASS principles
    for grouped matrix multiplication (`fp8_blockwise_scaled_grouped_mm`) and
    data preparation (`prepare_moe_input`, `silu_and_mul`).

    It handles per-token routing, quantizes input activations to FP8 with
    per-token scales, performs the expert computations using FP8 GEMMs with
    pre-quantized FP8 weights (per-block scales), applies the SiLU activation,
    and combines the results weighted by the router scores.

    Args:
        a (torch.Tensor): Input activations. Shape: `(m, k)`, where `m` is the total
            number of tokens and `k` is the hidden size. Expected dtype: `torch.half`
            or `torch.bfloat16`.
        w1_q (torch.Tensor): Pre-quantized FP8 weight tensor for the first GEMM
            (up-projection part of SwiGLU). Expected shape: `(E, k, n*2)`, where
            `E` is the number of experts, `k` is the hidden size, and `n*2` is the
            intermediate size (`I`). Expected dtype: `torch.float8_e4m3fn`.
            Note: This shape implies weights are stored as (num_experts, hidden_size, intermediate_size).
        w2_q (torch.Tensor): Pre-quantized FP8 weight tensor for the second GEMM
            (down-projection). Expected shape: `(E, n, k)`, where `n` is half the
            intermediate size (`I // 2`). Expected dtype: `torch.float8_e4m3fn`.
            Note: This shape implies weights are stored as (num_experts, intermediate_size // 2, hidden_size).
        w1_scale (torch.Tensor): Scales corresponding to `w1_q` (per-block scales).
            Shape: `(E, num_blocks_n, num_blocks_k)`. Dtype: `torch.float32`.
        w2_scale (torch.Tensor): Scales corresponding to `w2_q` (per-block scales).
             Shape: `(E, num_blocks_k, num_blocks_n)`. Dtype: `torch.float32`.
        topk_weights (torch.Tensor): Router weights for the selected top-k experts
            for each token. Shape: `(m, topk)`. Dtype should ideally match `a`.
        topk_ids (torch.Tensor): Indices of the selected top-k experts for each token.
            Shape: `(m, topk)`. Dtype: `torch.int32`.
        a1_strides (torch.Tensor): Stride information for the first GEMM's 'a' input.
            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
            Note: Its exact usage within `fp8_blockwise_scaled_grouped_mm` needs clarification
            as it's passed as both a_stride and b_stride in the first call.
        c1_strides (torch.Tensor): Stride information for the first GEMM's 'c' output.
            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
        a2_strides (torch.Tensor): Stride information for the second GEMM's 'a' input.
            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
            Note: Its exact usage within `fp8_blockwise_scaled_grouped_mm` needs clarification
            as it's passed as both a_stride and b_stride in the second call.
        c2_strides (torch.Tensor): Stride information for the second GEMM's 'c' output.
            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
        workspace (torch.Tensor): Reusable workspace for the underlying kernel.
        a_ptrs (torch.Tensor): Pointers container for calculating offsets of the input activations for each expert.
        b_ptrs (torch.Tensor): Pointers container for calculating offsets of the input weights for each expert.
        out_ptrs (torch.Tensor): Pointers container for calculating offsets of the output activations for each expert.
        a_scales_ptrs (torch.Tensor): Pointers container for calculating offsets of the input scales for each expert.
        b_scales_ptrs (torch.Tensor): Pointers container for calculating offsets of the input scales for each expert.
        use_fp8_blockscale (bool, optional): Flag indicating usage of FP8 with
            block scaling. Currently, only `True` is supported. Defaults to `True`.
        use_mxfp8 (bool, optional): Flag indicating usage of MXFP8 (UE8M0 scales)
            with SM100 expert-specialization kernels. Defaults to `False`.
        output (torch.Tensor, optional): Output tensor. If not provided, a new tensor will be created.
        enable_es (tuple(bool, bool)): Flag indicating usage of expert specialization kernel for (up-projection, down-projection)
    Returns:
        torch.Tensor: The computed MoE layer output. Shape: `(m, k)`, dtype matches `a`.

    Raises:
        AssertionError: If input shapes, dtypes, or flags are inconsistent or unsupported.
        NotImplementedError: If CUDA is not available or `sgl_kernel` is not properly installed.
    z#Only support fp8 blockscale for nowtopk shape mismatch   zHidden size mismatch w1   zHidden size mismatch w2r   zExpert number mismatchzWeights expert number mismatchz w1 scales expert number mismatchz w2 scales expert number mismatchzInvalid output dtype) sglang_per_token_group_quant_fp8dtypedevicez3MXFP8 requires expert-specialization for both GEMMszMXFP8 requires SM100    z0MXFP8 requires hidden size to be divisible by 32z6MXFP8 requires intermediate size to be divisible by 32zMXFP8 w1_scale must be uint8zMXFP8 w2_scale must be uint8zMXFP8 w1_scale must be z, got zMXFP8 w2_scale must be    N)r1   r2   r1      )shaper1   torchfloat8_e4m3fnhalfbfloat16r   )sglang.srt.layers.quantization.fp8_kernelr/   sizer2   emptynumelint32r   uint8minr   r   
empty_liker   intr   r
   r   r   r   r   to)8r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r/   es_upes_down	out_dtypenum_expertsmkntopkr2   a_mapc_mapexpected_w1_scale_shapeexpected_w2_scale_shapemxfp8_blockscale_aligntotal_tokensnonzero_experts	max_totalmax_blockscaleblockscale_offsetsrep_arep_a_qrep_a1_scalesa_qa1_scalec1c2a_sf_layoutw_sf_layoutintermediateintemediate_qa2_scale re   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/cutlass_moe.pycutlass_fused_experts_fp8   s  Z 







	






	




rg   g      @g      |@	a1_gscalew1_fp4w1_blockscale	w1_alphas	a2_gscalew2_fp4w2_blockscale	w2_alphasparamsapply_router_weight_on_inputc           !      C   st  |	j |
j ks
J d|jtjksJ d|jtjksJ d|jdkr2|jdkr2|jdkr2|jdks6J d| j \}}|j \}}}|j \}}}||krP||jksTJ d|d |kr_|j|kscJ d||jd krq||jd ksuJ d	d| |ksJ d
| jtjtj	fv sJ d| j}|
j d }| j
}tj|
 tj|d}tj|
 tj|d}t|
|j|j|j|||j|j|j|j
 t| ||j|j||d\}}t|||||||| }~~tj|| |j d d f||d}t|| t|||j|j|\}}t|||||||| } ~~t| ||| |jf} | |||j} |s1| |	||d| } | jdd|S )a  
    MoE implementation for FP4 Inputs

    # Gemm 1
    a: Input tensor: [m, k] (half/bfloat16)
    a1_gscale: Activation scale per expert: [e]  (float32)
    w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
    w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
    (Note: `n` is the up projection output dim, `k` is the input dim in
     full precision)
    w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
                   (Block size = 16 for NVFP4)

    # Gemm 2
    a2_gscale: Activation scale per expert: [e]
    w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
    w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
    w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3

    Strides for activations, weights and output in logical number of elements.
    The activations & output stride is the number of elements to the next row.
    The weights stride is the number of elements to the next row per expert.
    For example, if the weight is [e, n, k], then the b_stride is a tensor of
    shape [e] with each element being k. Similarly for activations, if the
    shape is [m, k], then the a_stride has shape [e] with each value k.
    Similarly for output, if the output is [m, n], then the c_stride is a
    tensor of shape [e] with each element being k.

    Note: cutlass_fp4_group_mm is designed to accept the strides of
    activations and weights to be the same, so it is passed in as a single
    tensor.
    ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
    ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
    c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
    c_strides_2: [e] dtype: int64 [Gemm 1: Output Strides]

    topk_weights: [m, topk] dtype: float8
    topk_ids: [m, topk] dtype: float8

    m, n, k: Unquantized weight shapes, dtype: int
    e: number of experts for the current rank, dtype: int
    assumes that topk < k < n to satisfy - up/down projection expectations.
    r,   zweight 1 must be uint8zweight 2 must be uint8   z1All Weights must be of rank 3 for cutlass_moe_fp4)zNumber of experts must matchz between weights.r.   z)Hidden size mismatch between a, w1 and w2zmismatch in expected `n`zHidden size mismatch w2 and w1zInvalid input dtyper-   r0   )
expert_mapr6   )dim)r8   r1   r9   rB   ndimrJ   hidden_sizeintermediate_size_per_partitionr;   r<   r2   r?   r@   rA   r   r$   r%   r&   rX   r   r	   to_gemm1_argsr   to_gemm2_argsr   viewrF   sum)!r   rh   ri   rj   rk   rl   rm   rn   ro   r   r   rp   rq   m_ak_ae_w1nx2_w1	half_k_w1e_w2k_w2	half_n_w2rI   num_topkr2   rO   rP   	rep_a_fp4rep_a_blockscaler^   rb   int_fp4int_blockscaler_   re   re   rf   cutlass_moe_fp4Y  s   :








r   )TFNr   )F)__doc__typingr   r   r9   (sglang.srt.layers.moe.cutlass_moe_paramsr   sglang.srt.utilsr   r   r   _is_cuda
sgl_kernelr   r	   r
   r   r   r   r   r   r   r   Tensorboolrg   FLOAT4_E2M1_MAXFLOAT8_E4M3_MAXr   re   re   re   rf   <module>   s    0#	


  =	
