o
    iC                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlZd dlm	Z	m
Z
mZ e
g ddd Zejd	d
 Zei ede	ddejdejdededededejdejdeddfddZdS )    )flashinfer_api)gen_dsv3_fused_routing_moduleN)SimpleNamespace)register_custom_opsupported_compute_capabilitybackend_requirement)Y   Z   d   g   x   y   c	              
   C   s   | j d }	|| |k s||kr"td||  d| d| d| d	|dkrT|	| }
|
| }|dkr:td| d	|
d
krFtd|
 d|dkrRtd| ddS |	dkr`td|	 d|dkrltd| d	dS )a  Validate configuration parameters for DSv3 fused routing kernel.

    Args:
        scores: Input routing scores tensor
        bias: Per-expert routing bias tensor
        n_group: Number of expert groups
        topk_group: Number of top groups to select
        topk: Number of top experts to select per token
        routed_scaling_factor: Scaling factor for normalized weights
        topk_values: Output tensor for normalized expert weights
        topk_indices: Output tensor for selected expert indices
        launch_with_pdl: Whether to use Persistent Device-side Launch

    Raises:
        ValueError: If configuration is invalid or exceeds kernel limits
       z-Invalid configuration: topk_group * n_group (z) must be >= topk (z) and topk_group (z) must be <= n_group ()   z-Invalid configuration for n_group > 1: topk (z) must be <= 8    z>Invalid configuration for n_group > 1: num_experts / n_group (z) must be <= 32   zKInvalid configuration for n_group > 1: num_experts / n_group * topk_group (z) must be <= 128i  z4Invalid configuration for n_group = 1: num_experts (z) must be <= 384z-Invalid configuration for n_group = 1: topk (T)shape
ValueError)scoresbiasn_group
topk_grouptopkrouted_scaling_factortopk_valuestopk_indiceslaunch_with_pdlnum_expertsexperts_per_groupmax_experts_in_selected_groups r!   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/flashinfer/fused_moe/fused_routing_dsv3.py#_check_dsv3_fused_routing_supported   sL   



r#   c                     sf   t    tdddgd	ddtjdtjdtd	td
tdtdtjdtjdtdd f fdd} t| dS )Nzflashinfer::NoAuxTcr   r   )mutates_argsTr   r   r   r   r   r   r   returnc	           	         s     | ||||||||	 d S )NNoAuxTc	r   r   r   r   r   r   r   r   r   moduler!   r"   r'   W   s   z.get_dsv3_fused_routing_module.<locals>.NoAuxTcr&   T)	r   build_and_loadr   torchTensorintfloatboolr   r&   r!   r)   r"   get_dsv3_fused_routing_moduleS   s<   
	
r2   )common_checkTr   r   r   r   r   r   r   r   r   r%   c	           	      C   s    t  | ||||||||	 dS )a  Fused expert routing with top-k selection for DeepSeek-V3.

    This function performs a highly optimized fused routing operation specifically
    designed for DeepSeek-V3's Mixture of Experts (MoE) architecture with grouped
    expert routing and no auxiliary loss. It combines score computation, expert
    selection, and normalization into a single kernel operation.

    The routing algorithm consists of the following steps:
    1. Compute biased scores: sigmoid(scores) + bias for each expert
    2. Group experts and compute group scores (sum of top-2 experts per group)
    3. Select top-k groups based on group scores
    4. From selected groups, select top-k experts based on biased scores
    5. Normalize selected expert weights: sigmoid_scores / sum(sigmoid_scores) * scale

    Args:
        scores (torch.Tensor): Input routing scores of shape (num_tokens, num_experts).
            The logits produced by the router network before activation. Supports
            bfloat16, float16, or float32.
        bias (torch.Tensor): Per-expert routing bias of shape (num_experts,). Added to
            sigmoid-activated scores to produce biased scores for expert selection.
            Must match the dtype of scores.
        n_group (int): Number of expert groups. Experts are divided into groups for
            hierarchical selection. Typical value is 8 for DeepSeek-V3 with 256 experts
            (32 experts per group).
        topk_group (int): Number of top groups to select. Must be <= n_group. Typical
            value is 4, meaning the top 4 groups are selected from 8 groups.
        topk (int): Number of top experts to select per token. Must be <= num_experts.
            Typical value is 8, meaning 8 experts are routed per token.
        routed_scaling_factor (float): Scaling factor applied to normalized expert
            weights. The final output weights are:
            sigmoid_scores / sum(sigmoid_scores) * routed_scaling_factor.
        topk_values (torch.Tensor): Pre-allocated output tensor of shape
            (num_tokens, topk) for the normalized expert weights. Must be float32.
            This tensor is mutated in-place.
        topk_indices (torch.Tensor): Pre-allocated output tensor of shape
            (num_tokens, topk) for the selected expert indices. Must be int32 or int64.
            This tensor is mutated in-place.
        launch_with_pdl (bool, optional): Whether to launch the kernel using Persistent
            Device-side Launch. Defaults to True.

    Returns:
        None: Results are written directly to `topk_values` and `topk_indices` tensors.

    Note:
        - The kernel uses float32 internally for all computations to ensure numerical
          precision, even when inputs are float16 or bfloat16.
        - This implementation is optimized for Hopper (compute capability 90, 100),
          Ada (compute capability 89), and Blackwell (compute capability 120, 121)
          architectures.
        - The "NoAux" prefix indicates this variant does not compute auxiliary losses
          (e.g., load balancing loss) during routing.
        - The "Tc" suffix indicates the use of Tensor Core optimizations in the
          underlying CUDA kernel.
    N)r2   r'   r(   r!   r!   r"   fused_topk_deepseekw   s   Cr4   r+   )flashinfer.api_loggingr   flashinfer.jitr   	functoolstypesr   r-   flashinfer.utilsr   r   r   r#   cacher2   r.   r/   r0   r1   r4   r!   r!   r!   r"   <module>   sD    

E

#
	
