o
    پi#                  #   @   sx  d dl mZmZmZ d dlZ	d:ddZ			d;dejdejd	ejd
ededeej ddfddZ			d<dejdejd	ejd
edeej ddfddZ
	 d=ddZdejdejfddZ	 	 	d>ddZ			d?ddZdd Z	d@d eej fd!d"Zd#d$ Z	d@d%ejd&ed'ed(ed)ed*ed+ejd,ejd-ed.ed/ejd0ed1ed2ed3ed4ee ddf"d5d6Zd7eeef fd8d9ZdS )A    )AnyDictOptionalNFc              
   C   s"   t jjj| ||||||| d S N)torchops
sgl_kernelmoe_align_block_sizedefault)topk_idsnum_experts
block_sizesorted_token_idsexperts_idsnum_tokens_post_padcumsum_bufferpad_sorted_token_ids r   B/home/ubuntu/.local/lib/python3.10/site-packages/sgl_kernel/moe.pyr	      s   

r	           topk_weightsr   gating_outputrenormalizemoe_softcappingcorrection_biasreturnc                 C   s   t jjj| ||||| dS )a  
    Compute top-k softmax for MoE routing.

    Args:
        topk_weights: Output tensor for top-k weights [num_tokens, topk]
        topk_ids: Output tensor for top-k expert indices [num_tokens, topk]
        gating_output: Gating logits [num_tokens, num_experts]
        renormalize: Whether to renormalize the top-k weights
        moe_softcapping: Tanh softcapping value (0.0 to disable)
        correction_bias: Per-expert bias correction [num_experts], must be float32 if provided
    N)r   r   r   topk_softmaxr
   )r   r   r   r   r   r   r   r   r   r      s   
r   c                 C   s   t jjj| |||| dS )a  
    Compute top-k sigmoid for MoE routing.

    Args:
        topk_weights: Output tensor for top-k weights [num_tokens, topk]
        topk_ids: Output tensor for top-k expert indices [num_tokens, topk]
        gating_output: Gating logits [num_tokens, num_experts]
        renormalize: Whether to renormalize the top-k weights
        correction_bias: Per-expert bias correction [num_experts], must be float32 if provided
    N)r   r   r   topk_sigmoidr
   )r   r   r   r   r   r   r   r   r   9   s   
r   c                 C   s   t jjj| || d S r   )r   r   r   moe_sum_reducer
   )input_tensoroutput_tensorrouted_scaling_factorr   r   r   r   S   s
   
r   r   r    c                 C   s   t jjj| | d S r   )r   r   r   moe_sumr
   )r   r    r   r   r   r"   _   s   
r"   c              
   C   s   t jjj| |||||||S r   )r   r   r   moe_fused_gater
   )r   biasnum_expert_group
topk_grouptopknum_fused_shared_expertsr!   %apply_routed_scaling_factor_on_outputr   r   r   r#   i   s   
r#   T      ?c                 C   s   t jjj| |||||S )a  
    Simplified fused kernel for Kimi K2 model (num_expert_group=1).
    This kernel removes the grouped topk logic since all experts belong to a single group.

    Args:
        input_tensor: Gating output tensor [num_tokens, num_experts]
        bias: Correction bias tensor [num_experts]
        topk: Number of experts to select per token
        renormalize: Whether to renormalize the topk weights
        routed_scaling_factor: Scaling factor for expert weights
        apply_routed_scaling_factor_on_output: If true, apply scaling factor to output

    Returns:
        Tuple of (topk_weights, topk_ids)
        - topk_weights: [num_tokens, topk] float32 tensor
        - topk_ids: [num_tokens, topk] int32 tensor
    )r   r   r   kimi_k2_moe_fused_gater
   )r   r$   r'   r   r!   r)   r   r   r   r+      s   
r+   c                 C   s6   t jjj| |||||||||	|
||||||| d S r   )r   r   r   fp8_blockwise_scaled_grouped_mmr
   )outputa_ptrsb_ptrsout_ptrsa_scales_ptrsb_scales_ptrsabscales_ascales_bstride_astride_bstride_c
layout_sfa
layout_sfbproblem_sizesexpert_offsets	workspacer   r   r   r,      s(   
r,   blockscale_offsetsc
           
      C   s&   t jjj| ||	|||||||
 d S r   )r   r   r   prepare_moe_inputr
   )
r   r=   problem_sizes1problem_sizes2input_permutationoutput_permutationr   nkr?   r   r   r   r@      s   
r@   c                 C   s   t jjj| ||| d S r   )r   r   r   apply_shuffle_mul_sumr
   )inputr-   permutationfactorsr   r   r   rG      s   
rG   qkvnum_heads_qnum_heads_knum_heads_vhead_dimepsq_weightk_weightbaseis_neoxposition_idsfactorlowhighattention_factor
rotary_dimc                 C   sB   t jj| |||||||||	|
|||||d ur| d S | d S r   )r   r   r   fused_qk_norm_rope)rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r   r   r   r[      s(   
r[   paramsc                 C   sp   | j d }|j d }	||	f}
tj|
||d}tjjj|| |||||d |d |d |d |d  |j|d	S )
a  
    An FP4 Blockscaled Group Gemm that takes in  a_tensors, b_tensors and runs
    the gemms for each combination based on the specified problem sizes.

    This is used as the MoE gemm during NVFP4 Quantized FusedMoE forward.
    - a/b_tensors: the NVFP4 a_ptrs and b_ptrs tensors which are quantized
                     input and expert weights.
    - a_/b_scales: The blockscales in FP8-E4M3 precision
    - ab_strides/c_strides: Strides for the a/b tensors between rows.
    - expert_offsets/sf_offsets: Indices that mark at which token index
                    each expert begins its computation. The number of tokens
                    computed with expert E is expert_offsets[E + 1] -
                    expert_offsets[E] And the sf_size per expert is
                    sf_offset[E+1] - sf_offset[E]
    - problem_sizes: MxNxK sizes of each expert's multiplication in two grouped
                     MMs used in the fused MoE operation.
    r      )devicedtype
ab_strides	c_stridesr<   r=   r?   )r_   )shaper   emptyr   r   cutlass_fp4_group_mmr
   to)a_fp4b_fp4a_blockscaleb_blockscalealphas	out_dtyper^   r\   m_topkrE   c_shapecr   r   r   rd   $  s$   


rd   )F)Fr   N)FN)r   )r   r   F)Tr*   Fr   )typingr   r   r   r   r	   Tensorboolfloatr   r   r   r"   r#   r+   r,   r@   rG   intr[   strrd   r   r   r   r   <module>   s    

!



'
#4

	

&
