o
    پiK                  /   @   s`  d Z ddlmZ ddlZddlmZ e ZerddlmZm	Z	 ddlm
Z
 ddlmZ ddlmZ dd	lmZmZmZmZmZmZmZmZmZ 			
	d+dejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdejdeej deej d ed!ed"ejf.d#d$Z		d,dejdejdejdejdejdejd%ejdejdejdejdejdejdejdejdejdejdejdejdeej deej d"ejf*d&d'Z		d,dejdejdejdejdejd%ejd(ejdejdejdejdejdejdejdejdejdejdejdejdeej deej d"ejf*d)d*ZdS )-zCutlass W4A8 MoE kernel.    )OptionalN)is_cuda_alike)cutlass_w4a8_moe_mmget_cutlass_w4a8_moe_mm_data)silu_and_mul)per_tensor_quant_fp8)"get_moe_expert_parallel_world_size)	cutlass_w4_run_moe_ep_preproess&deepep_ll_get_cutlass_w4a8_moe_mm_datadeepep_permute_triton_kernel!deepep_post_reorder_triton_kerneldeepep_run_moe_deep_preprocesspost_reorder_for_cutlass_moepre_reorder_for_cutlass_moe-silu_and_mul_masked_post_per_tensor_quant_fwd0silu_mul_static_tensorwise_quant_for_cutlass_moeF      ?aw1_qw2_qw1_scalew2_scaletopk_weightstopk_ids
a_strides1
b_strides1
c_strides1
a_strides2
b_strides2
c_strides2s_strides13
s_strides2expert_offsetsproblem_sizes1problem_sizes2a1_scalea2_scaleapply_router_weight_on_inputrouted_scaling_factorreturnc           $      C   s  |j |j ks
J d|jtjksJ |jtjksJ | j d d |j d ks*J d|j d d |j d ks:J d|j d |j d ksHJ d|j d |j d ksVJ d|j d |j d ksdJ d	|j d |j d ksrJ d
|j d |j d ksJ d|
j d |j d ksJ d|j d |j d ksJ d|d}| d}|dd }|dd }|d}|r|dksJ d| j}t dkrt|dk||}t|}tj	|| |f|tj
d}t| ||||||||	 tj	| tj|d}tj	| tj|d}t|||||||||	 tj	|| |d f|tjd} tj	|| |f|tjd}!t| ||| ||dd ||||	|d| tj	|| |ftj
|d}"t| |"| |dd || | t|!|"|| ||dd ||
|||d| t| }#t|!|#||||||||
 |#S )a  
    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
    using two sets of quantized weights, w1_q and w2_q, and top-k gating
    mechanism. The matrix multiplications are implemented with CUTLASS
    grouped gemm.

    Parameters:
    - a (torch.Tensor): The input tensor to the MoE layer.
        Shape: [M, K]
    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
        Shape: [num_experts, N * 2,  K // 2]
        (the weights are passed transposed and int4-packed)
    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
        Shape: [num_experts, K, N // 2]
        (the weights are passed transposed and int4-packed)
    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
        Shape: [num_experts, K // 512, N * 8]
    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
        Shape: [num_experts, N // 512, K * 4]
    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
    - topk_ids (torch.Tensor): The ids of each token->expert mapping.
    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
        Shape: scalar or [1, K]
    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
        quantize the intermediate result between the gemms.
        Shape: scalar or [1, N]
    - apply_router_weight_on_input (bool): When true, the topk weights are
        applied directly on the inputs. This is only applicable when topk is 1.

    Returns:
    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
    topk shape mismatch      Hidden size mismatch w1Hidden size mismatch w2r   Expert number mismatch w1 scales expert number mismatch w2 scales expert number mismatch"A Strides 1 expert number mismatch"B Strides 1 expert number mismatch"A Strides 2 expert number mismatch"B Strides 2 expert number mismatchz;apply_router_weight_on_input is only implemented for topk=1devicedtyper9   r8   N   )shaper9   torchint8sizer8   r   wherer	   emptyfloat8_e4m3fnr   numelint32r   bfloat16r   floatr   
empty_liker   )$r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   num_local_expertsmkntopkr8   src2dstgateup_inputa_mapc_mapc1c2intermediate_qoutput rU   Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/moe/cutlass_w4a8_moe.pycutlass_w4a8_moe#   s   @  







rW   	topk_ids_c           )      C   s  |j |j ks
J d|jtjksJ |jtjksJ | j d d |j d ks*J d|j d d |j d ks:J d|j d |j d ksHJ d|j d |j d ksVJ d|j d |j d ksdJ d	|j d |j d ksrJ d
|j d |j d ksJ d|
j d |j d ksJ d|j d |j d ksJ d|d}| d}|dd }|dd }|d}|d}| d}|dd }|dd }|d}| j}t||\}}}| }tjt	|| j d f|| jd}t
| j d f | |||tjd|| j d dd tj|j tj|d}t||| d ~|} t| dk||tj } tj|  tj|d}!tj|  tj|d}"t| ||||!|"|||	 tj|| |d f|tjd}#tj|| |f|tjd}$t|#||| ||dd ||||	|d| tj|| |f|tjd}%t|#|% tj|%j tj|d}&t|%|&| d t|$|&|| ||dd ||
|||d| |j d | }'tj|'|$j d f|$jtjd}(t|'f |$|(|||||$j d dd |(S )al  
    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
    using two sets of quantized weights, w1_q and w2_q, and top-k gating
    mechanism. The matrix multiplications are implemented with CUTLASS
    grouped gemm.

    Parameters:
    - a (torch.Tensor): The input tensor to the MoE layer.
        Shape: [M, K]
    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
        Shape: [num_experts, N * 2,  K // 2]
        (the weights are passed transposed and int4-packed)
    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
        Shape: [num_experts, K, N // 2]
        (the weights are passed transposed and int4-packed)
    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
        Shape: [num_experts, K // 512, N * 8]
    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
        Shape: [num_experts, N // 512, K * 4]
    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
        Shape: scalar or [1, K]
    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
        quantize the intermediate result between the gemms.
        Shape: scalar or [1, N]
    - apply_router_weight_on_input (bool): When true, the topk weights are
        applied directly on the inputs. This is only applicable when topk is 1.

    Returns:
    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
    r*   r+   r,   r-   r.   r   r/   r0   r1   r2   r3   r4   r5   r7   Ni   )
BLOCK_SIZEr:   Tr6   r;   )r<   r9   r=   r>   r?   r8   r   rC   rA   intr   toint64rB   r   rF   r@   rD   
contiguousr   rE   zerosr   r   r   ))r   r   r   r   r   r   rX   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   num_expertsrI   rJ   rK   rL   r8   reorder_topk_idsrM   _num_total_tokensgateup_input_pre_reorderrN   local_topk_idsrO   rP   rQ   rR   intermediaterS   
num_tokensrT   rU   rU   rV   cutlass_w4a8_moe_deepep_normal   s   =  













rg   masked_mc                 C   sX  |j tjksJ |j tjksJ | jd d |jd ks J d|jd d |jd ks0J d|jd |jd ks>J d|jd |jd ksLJ d|jd |jd ksZJ d|jd |jd kshJ d	|jd |jd ksvJ d
|
jd |jd ksJ d|jd |jd ksJ d|d}| d}|dd }|dd }|d}| j}t||||||\}}tj| jtj|d}t	| ||
 d tj|||d f|tjd}tj|||f|tjd}t||||
 ||dd ||||	|d| tj|||f| jtjd}t|||| t||||
 ||dd ||
|||d| |S )a  
    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
    using two sets of quantized weights, w1_q and w2_q, and top-k gating
    mechanism. The matrix multiplications are implemented with CUTLASS
    grouped gemm.

    Parameters:
    - a (torch.Tensor): The input tensor to the MoE layer.
        Shape: [num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, K]
    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
        Shape: [num_experts, N * 2,  K // 2]
        (the weights are passed transposed and int4-packed)
    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
        Shape: [num_experts, K, N // 2]
        (the weights are passed transposed and int4-packed)
    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
        Shape: [num_experts, K // 512, N * 8]
    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
        Shape: [num_experts, N // 512, K * 4]
    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
        Shape: scalar or [1, K]
    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
        quantize the intermediate result between the gemms.
        Shape: scalar or [1, N]
    - apply_router_weight_on_input (bool): When true, the topk weights are
        applied directly on the inputs. This is only applicable when topk is 1.

    Returns:
    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
    r,   r-   r+   r.   r   r/   r0   r1   r2   r3   r4   r5   r:   Tr7   Nr6   r;   )r9   r=   r>   r<   r?   r8   r
   rA   rB   r   rF   rE   r   r   )r   r   r   r   r   rX   rh   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r_   rI   rJ   rK   rL   r8   rN   rQ   rR   rS   rU   rU   rV   cutlass_w4a8_moe_deepep_ll  s   =  


	

ri   )NNFr   )NN)__doc__typingr   r=   sglang.srt.utilsr   _is_cuda_alike
sgl_kernelr   r   r   &sglang.jit_kernel.per_tensor_quant_fp8r   sglang.srt.distributedr   $sglang.srt.layers.moe.ep_moe.kernelsr	   r
   r   r   r   r   r   r   r   TensorboolrF   rW   rg   ri   rU   rU   rU   rV   <module>   s4  , 	

 P	

 R	
