o
    ÁÙ¾iÐ#  ã                #   @   sx  d dl mZmZmZ d dlZ	d:dd„Z			d;dejdejd	ejd
ededeej ddfdd„Z			d<dejdejd	ejd
edeej ddfdd„Z
	 d=dd„Zdejdejfdd„Z	 	 	d>dd„Z			d?dd„Zdd„ Z	d@d eej fd!d"„Zd#d$„ Z	d@d%ejd&ed'ed(ed)ed*ed+ejd,ejd-ed.ed/ejd0ed1ed2ed3ed4ee ddf"d5d6„Zd7eeef fd8d9„ZdS )Aé    )ÚAnyÚDictÚOptionalNFc              
   C   s"   t jjj | |||||||¡ d S ©N)ÚtorchÚopsÚ
sgl_kernelÚmoe_align_block_sizeÚdefault)Útopk_idsÚnum_expertsÚ
block_sizeÚsorted_token_idsÚexperts_idsÚnum_tokens_post_padÚcumsum_bufferÚpad_sorted_token_ids© r   úB/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/moe.pyr	      s   

ør	   ç        Útopk_weightsr   Úgating_outputÚrenormalizeÚmoe_softcappingÚcorrection_biasÚreturnc                 C   s   t jjj | |||||¡ dS )aí  
    Compute top-k softmax for MoE routing.

    Args:
        topk_weights: Output tensor for top-k weights [num_tokens, topk]
        topk_ids: Output tensor for top-k expert indices [num_tokens, topk]
        gating_output: Gating logits [num_tokens, num_experts]
        renormalize: Whether to renormalize the top-k weights
        moe_softcapping: Tanh softcapping value (0.0 to disable)
        correction_bias: Per-expert bias correction [num_experts], must be float32 if provided
    N)r   r   r   Útopk_softmaxr
   )r   r   r   r   r   r   r   r   r   r      s   
úr   c                 C   s   t jjj | ||||¡ dS )a¬  
    Compute top-k sigmoid for MoE routing.

    Args:
        topk_weights: Output tensor for top-k weights [num_tokens, topk]
        topk_ids: Output tensor for top-k expert indices [num_tokens, topk]
        gating_output: Gating logits [num_tokens, num_experts]
        renormalize: Whether to renormalize the top-k weights
        correction_bias: Per-expert bias correction [num_experts], must be float32 if provided
    N)r   r   r   Útopk_sigmoidr
   )r   r   r   r   r   r   r   r   r   9   s   
ûr   c                 C   s   t jjj | ||¡ d S r   )r   r   r   Úmoe_sum_reducer
   )Úinput_tensorÚoutput_tensorÚrouted_scaling_factorr   r   r   r   S   s
   
ýr   r   r    c                 C   s   t jjj | |¡ d S r   )r   r   r   Úmoe_sumr
   )r   r    r   r   r   r"   _   s   
þr"   c              
   C   s   t jjj | |||||||¡S r   )r   r   r   Úmoe_fused_gater
   )r   ÚbiasÚnum_expert_groupÚ
topk_groupÚtopkÚnum_fused_shared_expertsr!   Ú%apply_routed_scaling_factor_on_outputr   r   r   r#   i   s   
ør#   Tç      ð?c                 C   s   t jjj | |||||¡S )aÖ  
    Simplified fused kernel for Kimi K2 model (num_expert_group=1).
    This kernel removes the grouped topk logic since all experts belong to a single group.

    Args:
        input_tensor: Gating output tensor [num_tokens, num_experts]
        bias: Correction bias tensor [num_experts]
        topk: Number of experts to select per token
        renormalize: Whether to renormalize the topk weights
        routed_scaling_factor: Scaling factor for expert weights
        apply_routed_scaling_factor_on_output: If true, apply scaling factor to output

    Returns:
        Tuple of (topk_weights, topk_ids)
        - topk_weights: [num_tokens, topk] float32 tensor
        - topk_ids: [num_tokens, topk] int32 tensor
    )r   r   r   Úkimi_k2_moe_fused_gater
   )r   r$   r'   r   r!   r)   r   r   r   r+   Œ   s   
úr+   c                 C   s6   t jjj | |||||||||	|
|||||||¡ d S r   )r   r   r   Úfp8_blockwise_scaled_grouped_mmr
   )ÚoutputÚa_ptrsÚb_ptrsÚout_ptrsÚa_scales_ptrsÚb_scales_ptrsÚaÚbÚscales_aÚscales_bÚstride_aÚstride_bÚstride_cÚ
layout_sfaÚ
layout_sfbÚproblem_sizesÚexpert_offsetsÚ	workspacer   r   r   r,   ¯   s(   
îr,   Úblockscale_offsetsc
           
      C   s&   t jjj | ||	|||||||¡
 d S r   )r   r   r   Úprepare_moe_inputr
   )
r   r=   Úproblem_sizes1Úproblem_sizes2Úinput_permutationÚoutput_permutationr   ÚnÚkr?   r   r   r   r@   Ù   s   
ör@   c                 C   s   t jjj | |||¡ d S r   )r   r   r   Úapply_shuffle_mul_sumr
   )Úinputr-   ÚpermutationÚfactorsr   r   r   rG   ó   s   
ÿrG   ÚqkvÚnum_heads_qÚnum_heads_kÚnum_heads_vÚhead_dimÚepsÚq_weightÚk_weightÚbaseÚis_neoxÚposition_idsÚfactorÚlowÚhighÚattention_factorÚ
rotary_dimc                 C   sB   t jj | |||||||||	|
|||||d ur|¡ d S |¡ d S r   )r   r   r   Úfused_qk_norm_rope)rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r   r   r   r[   þ   s(   
ððr[   Úparamsc                 C   sp   | j d }|j d }	||	f}
tj|
||d}tjjj || |||||d |d |d |d |d ¡ |j|d	S )
a¼  
    An FP4 Blockscaled Group Gemm that takes in  a_tensors, b_tensors and runs
    the gemms for each combination based on the specified problem sizes.

    This is used as the MoE gemm during NVFP4 Quantized FusedMoE forward.
    - a/b_tensors: the NVFP4 a_ptrs and b_ptrs tensors which are quantized
                     input and expert weights.
    - a_/b_scales: The blockscales in FP8-E4M3 precision
    - ab_strides/c_strides: Strides for the a/b tensors between rows.
    - expert_offsets/sf_offsets: Indices that mark at which token index
                    each expert begins its computation. The number of tokens
                    computed with expert E is expert_offsets[E + 1] -
                    expert_offsets[E] And the sf_size per expert is
                    sf_offset[E+1] - sf_offset[E]
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    r   é   )ÚdeviceÚdtypeÚ
ab_stridesÚ	c_stridesr<   r=   r?   )r_   )Úshaper   Úemptyr   r   Úcutlass_fp4_group_mmr
   Úto)Úa_fp4Úb_fp4Úa_blockscaleÚb_blockscaleÚalphasÚ	out_dtyper^   r\   Úm_topkrE   Úc_shapeÚcr   r   r   rd   $  s$   


õrd   )F)Fr   N)FN)r   )r   r   F)Tr*   Fr   )Útypingr   r   r   r   r	   ÚTensorÚboolÚfloatr   r   r   r"   r#   r+   r,   r@   rG   Úintr[   Ústrrd   r   r   r   r   Ú<module>   sÂ    
øúÿþýüûú
ù!ûÿþýüû
ú
ýÿ
þ
ø'
ú#4ö

öðÿþýüûúùø	÷
öõôóòñð
ï&
ø