o
    (i                      @   s   d dl mZ ddlmZ ddlZddlmZ ddlZddlm	Z	m
Z
mZ dd Ze
d	gd
d Ze
d	gdd Zejdd Zei ede	ddejdejdejdeddf
ddZei ede	ddejdejdejdeddf
ddZdS )   )flashinfer_api    )gen_dsv3_router_gemm_moduleN)SimpleNamespace)register_custom_opsupported_compute_capabilitybackend_requirementc           	      C   s  |   dkr
td|  dkrtd|  dkrtd| ddkr)td|ddkr4td|ddkr?td	| jd |jd krMtd
|jd | jd kr[td|jd |jd kritdd}d}d}| jd |k s}| jd |krtd| d| | jd |krtd| |jd |krtd| | jtjkrtd|jtjkrtd|j|krtd| ddS )Nr   zmat_a must be a 2D tensorzmat_b must be a 2D tensorzout must be a 2D tensor   zmat_a must be row-majorzout must be row-majorr   zmat_b must be column-majorz.mat_a.shape[1] must be equal to mat_b.shape[0]z,out.shape[0] must be equal to mat_a.shape[0]z,out.shape[1] must be equal to mat_b.shape[1]i      z,mat_a.shape[0] (num_tokens) must be between z and z-mat_a.shape[1] (hidden_dim) must be equal to z.mat_b.shape[1] (num_experts) must be equal to zmat_a must be a bfloat16 tensorzmat_b must be a bfloat16 tensorzout must be a z tensorT)dim
ValueErrorstrideshapedtypetorchbfloat16)	mat_amat_boutlaunch_with_pdlexpected_num_expertsexpected_out_dtypeexpected_hidden_dim
min_tokens
max_tokens r   \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/flashinfer/gemm/routergemm_dsv3.py_mm_M1_16_K7168_shape_checks   sP   
r   d   c                 C      t | |||dtjdS )N   r   r   )r   r   float32r   r   r   r   r   r   r   !_mm_M1_16_K7168_N256_shape_checksD      r$   c                 C   r   )N   r!   )r   r   r   r#   r   r   r   !_mm_M1_16_K7168_N128_shape_checksQ   r%   r'   c                     s   t    tddgd	ddtjdtjdtjdtdd f
 fd	d
} tddgd	ddtjdtjdtjdtdd f
 fdd}t| |dS )Nzflashinfer::ml3_router_gemm_opr   )mutates_argsFr   r   r   returnc                         | ||| d S N)ml3_router_gemm_opr#   moduler   r   mm_M1_16_K7168_N128a      
z8get_dsv3_router_gemm_module.<locals>.mm_M1_16_K7168_N128zflashinfer::dsv3_router_gemm_opc                    r*   r+   )dsv3_router_gemm_opr#   r-   r   r   mm_M1_16_K7168_N256m   r0   z8get_dsv3_router_gemm_module.<locals>.mm_M1_16_K7168_N256r/   r2   F)r   build_and_loadr   r   Tensorboolr   r3   r   r-   r   get_dsv3_router_gemm_module]   sJ   
r8   )common_checkFr   r   r   r   r)   c                 C      t  | ||| dS )a0  Optimized GEMM for the router operation in Mistral Large 3.

    This function performs a highly optimized matrix multiplication specifically tailored
    for the expert routing GEMM in Mistral Large 3's Mixture of Experts (MoE) architecture.
    It computes out = mat_a @ mat_b where mat_a contains token embeddings and mat_b
    contains expert routing weights.

    The implementation is optimized for the specific problem dimensions used in Mistral Large 3:
    - Hidden dimension (K): 7168
    - Number of experts (N): 128
    - Number of tokens (M): 1-16

    Args:
        mat_a (torch.Tensor): Input token embeddings of shape (M, K) where M is the number
            of tokens (1-16) and K is the hidden dimension (7168). Must be bfloat16,
            row-major (contiguous).
        mat_b (torch.Tensor): Expert routing weights of shape (K, N) where K is the hidden
            dimension (7168) and N is the number of experts (128). Must be bfloat16,
            column-major (transposed layout).
        out (torch.Tensor): Pre-allocated output tensor of shape (M, N) containing the
            routing scores. Must be bfloat16, row-major (contiguous). This tensor is
            mutated in-place.
        launch_with_pdl (bool, optional): Whether to launch the kernel using Persistent
            Device-side Launch. Defaults to False.

    Returns:
        None: The result is written directly to the `out` tensor.

    Raises:
        ValueError: If tensor dimensions, strides, or data types do not match the
            expected Mistral Large 3 router configuration.

    Note:
        This kernel is specialized for compute capability 10.0 (Blackwell architecture).
        The specific problem size optimization makes this significantly faster than
        general-purpose GEMM implementations for the router operation.
    N)r8   r/   r#   r   r   r   r/         -r/   c                 C   r:   )a  Optimized GEMM for the router operation in DeepSeek-V3.

    This function performs a highly optimized matrix multiplication specifically tailored
    for the expert routing GEMM in DeepSeek-V3's Mixture of Experts (MoE) architecture.
    It computes out = mat_a @ mat_b where mat_a contains token embeddings and mat_b
    contains expert routing weights.

    The implementation is optimized for the specific problem dimensions used in DeepSeek-V3:
    - Hidden dimension (K): 7168
    - Number of experts (N): 256
    - Number of tokens (M): 1-16

    Args:
        mat_a (torch.Tensor): Input token embeddings of shape (M, K) where M is the number
            of tokens (1-16) and K is the hidden dimension (7168). Must be bfloat16,
            row-major (contiguous).
        mat_b (torch.Tensor): Expert routing weights of shape (K, N) where K is the hidden
            dimension (7168) and N is the number of experts (256). Must be bfloat16,
            column-major (transposed layout).
        out (torch.Tensor): Pre-allocated output tensor of shape (M, N) containing the
            routing scores. Must be float32, row-major (contiguous). This tensor is
            mutated in-place.
        launch_with_pdl (bool, optional): Whether to launch the kernel using Persistent
            Device-side Launch. Defaults to False.

    Returns:
        None: The result is written directly to the `out` tensor.

    Raises:
        ValueError: If tensor dimensions, strides, or data types do not match the
            expected DeepSeek-V3 router configuration.

    Note:
        This kernel is specialized for compute capability 10.0 (Blackwell architecture).
        The specific problem size optimization makes this significantly faster than
        general-purpose GEMM implementations for the router operation.
    N)r8   r2   r#   r   r   r   r2      r;   r2   r4   )api_loggingr   flashinfer.jitr   	functoolstypesr   r   flashinfer.utilsr   r   r   r   r$   r'   cacher8   r6   r7   r/   r2   r   r   r   r   <module>   sR    7



!
0