o
    
۾ivX                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ eeZejdd	Zejd
efddZejd
efddZde
de
d
efddZde d
e
dB fddZ!efde de dede
f fddZ"e"ddZ#e"ddZ$e"ddZ%e"ddZ&e"d d!Z'e"d"d#Z(e"d"d$Z)e"d"d%Z*e"d"d&Z+e"d'd(Z,e"d"d)Z-e"d*d+d,d- d.Z.ejd
efd/d0Z/ejd
efd1d2Z0ejd
efd3d4Z1ejd
efd5d6Z2ejd
efd7d8Z3ejd
efd9d:Z4ejd
efd;d<Z5ejd
efd=d>Z6ejd
efd?d@Z7d
edB fdAdBZ8dCe9dDe9d
efdEdFZ:		G	GddCe9dDe9dHe9dIe9dJe9dKe dLej;dMedNedB dOedPed
efdQdRZ<e rTddSl=m>Z> dTej?dUej?dVej?d
dfdWdXZ@dTej?dUej?dVej?d
dfdYdZZAe>d[e@dTgeAd\ ejBjCd]g d^d_d`ej?daej?dbej?dcej?ddej?deej;dfedge d
ej?fdhdiZDejBEd]d`ej?daej?dbej?dcej?ddej?deej;dfedge d
ej?fdjdkZFejBjCdlg d^d_d`ej?daej?dbej?dcej?deej;dge d
ej?fdmdnZGejBEdld`ej?daej?dbej?dcej?deej;dge d
ej?fdodpZHejBjCdqg d^d_drej?dsej?d
eIej?ej?f fdtduZJejBEdqdrej?dsej?d
eIej?ej?f fdvdwZKdrej?dxej?dyej?dzej?d{ej?d|ej;dge d
ej?fd}d~ZL	ddrej?dxej?dej?dej?d|ej;dej?dB d
ej?fddZMdrej?dsej?d
eIej?ej?f fddZNe"ddZOejd
efddZPejd
efddZQdedej;dej?dej?fddZRg dZSdS )zoCompatibility wrapper for FlashInfer API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)AnyNoReturn)init_logger)vllm_is_batch_invariant)current_platformFLASHINFER_CUBINS_REPOSITORYzWhttps://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/returnc                   C   s,   t jrdS tjddurdS td dS )z7Return `True` if flashinfer-cubin package is available.Tflashinfer_cubinNz&flashinfer-cubin package was not foundF)envsVLLM_HAS_FLASHINFER_CUBIN	importlibutil	find_speclogger
debug_once r   r   I/home/ubuntu/.local/lib/python3.10/site-packages/vllm/utils/flashinfer.pyhas_flashinfer_cubin&   s   
r   c                   C   sD   t jddu rtd dS t s tddu r td dS dS )z8Return `True` if flashinfer-python package is available.
flashinferNz2FlashInfer unavailable since package was not foundFnvcczSFlashInfer unavailable since nvcc was not found and not using pre-downloaded cubinsT)r   r   r   r   r   r   shutilwhichr   r   r   r   has_flashinfer1   s   
r   ___c                  O   s   t d)z/Placeholder for unavailable FlashInfer backend.zFlashInfer backend is not available. Please install the package to enable FlashInfer kernels: https://github.com/flashinfer-ai/flashinfer)RuntimeError)r   r   r   r   r   _missingD   s   r   module_namec              	   C   s&   zt | W S  ttfy   Y dS w )zBSafely import a submodule and return it, or None if not available.N)r   import_moduleImportErrorModuleNotFoundError)r   r   r   r   _get_submoduleM   s
   r"   	attr_namefallback_fn.c                    s&   t jfdd  fdd}|S )z5Create a lazy import wrapper for a specific function.c                     s&   t  sd S t} | rt|  d S d S N)r   r"   getattr)mod)r#   r   r   r   	_get_impl[   s   z'_lazy_import_wrapper.<locals>._get_implc                     s*     }|d u r| i |S || i |S r%   r   )argskwargsimpl)r(   r$   r   r   wrapperb   s   z%_lazy_import_wrapper.<locals>.wrapper)	functoolscache)r   r#   r$   r,   r   )r(   r#   r$   r   r   _lazy_import_wrapperV   s   r/   flashinfer.fused_moetrtllm_bf16_moetrtllm_fp8_block_scale_moetrtllm_fp8_per_tensor_scale_moecutlass_fused_moe$flashinfer.cute_dsl.blockscaled_gemmgrouped_gemm_nt_maskedr   fp4_quantizenvfp4_batched_quantize*silu_and_mul_scaled_nvfp4_experts_quantizescaled_fp4_grouped_quantizezflashinfer.fp4_quantizationblock_scale_interleavetrtllm_fp4_block_scale_moezflashinfer.autotunerautotunec                  O   s   t  S r%   )
contextlibnullcontext)r)   r*   r   r   r   <lambda>   s    r@   )r$   c                   C      t  o
tjdduS )z5Return `True` if FlashInfer comm module is available.flashinfer.commNr   r   r   r   r   r   r   r   has_flashinfer_comm   s   rD   c                  C   @   t  sdS g d} | D ]\}}t|}|rt||s dS qdS )z7Return `True` if FlashInfer mnnvl all2all is available.F))rB   Mapping)zflashinfer.comm.mnnvlMnnvlMemory)flashinfer.comm.trtllm_alltoallMnnvlMoe)rH   MoEAlltoallInfoT)rD   r"   hasattrrequired_functionsr   r#   r'   r   r   r   has_flashinfer_all2all      rN   c                   C   rA   )z4Return `True` if FlashInfer MoE module is available.r0   NrC   r   r   r   r   has_flashinfer_moe   s   rP   c                   C   rA   )z:Return ``True`` if FlashInfer cutedsl module is available.zflashinfer.cute_dslNrC   r   r   r   r   has_flashinfer_cutedsl   s   rQ   c                  C   rE   )z:Return `True` if FlashInfer TRTLLM fused MoE is available.F))r0   r2   )r0   r3   r0   r<   )r0   trtllm_mxint4_block_scale_moeTrP   r"   rK   rL   r   r   r   has_flashinfer_trtllm_fused_moe   s   rU   c                  C   rE   )z;Return `True` if FlashInfer CUTLASS fused MoE is available.F))r0   r4   )r   r7   )r   nvfp4_block_scale_interleaverR   TrT   rL   r   r   r    has_flashinfer_cutlass_fused_moe   rO   rW   c                  C   rE   )z=Return ``True`` if FlashInfer CUTLASS fused MoE is available.F))r5   r6   )r   r:   )r   &silu_and_scaled_nvfp4_experts_quantizeT)rQ   r"   rK   rL   r   r   r   -has_flashinfer_cutedsl_grouped_gemm_nt_masked   s   rY   c               
   C   s~   t  rdS z tjtdd} | jdk}|rtd |W S td| j |W S  ty> } ztd| W Y d}~d	S d}~ww )
zReturn `True` if NVIDIA's artifactory is accessible.

    This checks connectivity to the kernel inference library artifactory
    which is required for downloading certain cubin kernels like TRTLLM FHMA.
    T   )timeout   z NVIDIA artifactory is accessiblez2NVIDIA artifactory returned failed status code: %dz+Failed to connect to NVIDIA artifactory: %sNF)	r   requestsgetr   status_coder   r   warning_once	Exception)response
accessibleer   r   r   has_nvidia_artifactory   s$   

re   c                   C   s   t  rdS tdot S )z
    TRTLLM attention is supported if the platform is SM100,
    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
    Fd   )r   r   is_device_capability_familyre   r   r   r   r   supports_trtllm_attention  s   rh   c                  C   s   ddl m}  |  }|jjS )a,  
    This function should only be called during initialization stage when vllm config
    is set.
    Return `None` if --attention-config.use_trtllm_attention is not set,
    return `True` if TRTLLM attention is forced to be used,
    return `False` if TRTLLM attention is forced to be not used.
    r   )get_current_vllm_config)vllm.configri   attention_configuse_trtllm_attention)ri   vllm_configr   r   r   force_use_trtllm_attention$  s   rn   num_qo_headsnum_kv_headsc                 C   s$   t  du rdS t }|o| | dkS )z=Check if the current configuration supports TRTLLM attention.Fr   )rn   rh   )ro   rp   
has_trtllmr   r   r   can_use_trtllm_attention2  s   
rr   F
num_tokensmax_seq_lendcp_world_sizekv_cache_dtypeq_dtype
is_prefillforce_use_trtllm	has_sinkshas_specc                 C   s   |dur|sdS |dkrt d dS t s|rt d dS | | dkr.|r,t d dS |
r9|s9t d d	S |t krFt d
 d	S |	rOt d d	S |du rs|rb|dk}|r`t d |S |dkoi|dk}|rqt d |S t d d	S )z*Return `True` if TRTLLM attention is used.NF   zcTrtllm does not support returning LSE and as a result does not support DCP, reverting to FlashInferzkTRTLLM attention is not supported on this platform, but --attention-config.use_trtllm_attention is set to 1r   zTRTLLM attention is not supported for this combination of query and key heads, but --attention-config.use_trtllm_attention is set to 1z:Using TRTLLM attention (enabled for speculative decoding).Tz,Using TRTLLM attention (query is quantized).z6Using TRTLLM attention (required for attention sinks).autoz/Using TRTLLM prefill attention (auto-detected).   z.Using TRTLLM decode attention (auto-detected).zLUsing TRTLLM attention (--attention-config.use_trtllm_attention is set to 1))r   r`   rh   	info_oncer   	fp8_dtype)ro   rp   rs   rt   ru   rv   rw   rx   ry   rz   r{   
use_trtllmr   r   r   rl   :  sT   




rl   )direct_register_custom_opkk_nopek_pec                 C   s   ddl m} || || dS )ar  Custom op wrapper for flashinfer's concat_mla_k.

        This is an in-place operation that concatenates k_nope and k_pe into k.

        The kernel is optimized for DeepSeek V3 dimensions:
        - num_heads=128
        - nope_dim=128
        - rope_dim=64

        Key optimizations:
        - Warp-based processing with software pipelining
        - Vectorized memory access (int2 for nope, int for rope)
        - L2 prefetching for next row while processing current
        - Register reuse for rope values across all heads

        Args:
            k: Output tensor, shape [num_tokens, num_heads, nope_dim + rope_dim].
                Modified in-place.
            k_nope: The nope part of k, shape [num_tokens, num_heads, nope_dim].
            k_pe: The rope part of k (shared), shape [num_tokens, 1, rope_dim].
                  This is broadcast to all heads.
        r   )concat_mla_kN)flashinfer.concat_opsr   )r   r   r   r   r   r   r   _flashinfer_concat_mla_k  s   r   c                 C   s   d S r%   r   )r   r   r   r   r   r   _flashinfer_concat_mla_k_fake  s   r   flashinfer_concat_mla_k)op_nameop_funcmutates_args	fake_implzvllm::flashinfer_mm_fp4cuda)r   device_typesABA_scaleB_scaleg_scaledtypeuse_8x4_sf_layoutbackendc           	      C   s&   ddl m} || |||||d||d	S )Nr   )mm_fp4   )
block_sizer   r   )r   r   )	r   r   r   r   r   r   r   r   flashinfer_mm_fp4_r   r   r   flashinfer_mm_fp4  s   r   c                 C   s    t j| jd |jd || jdS )Nr   r|   r   devicetorchemptyshaper   )r   r   r   r   r   r   r   r   r   r   r   flashinfer_mm_fp4_fake  s    r   zvllm::bmm_fp8c                 C   s    ddl m} || ||||d |S )Nr   )bmm_fp8)r   r   )r   r   r   r   r   r   bmm_fp8_r   r   r   r     s   r   c                 C   s(   t j| jd | jd |jd || jdS )Nr   r|      r   r   )r   r   r   r   r   r   r   r   r   bmm_fp8_fake  s   r   zvllm::flashinfer_nvfp4_quantizeaa_global_sfc                 C   s*   ddl m} ddl m} || ||jddS )Nr   )SfLayout)nvfp4_quantizeF)sfLayout
do_shuffle)r   r   r   
layout_8x4)r   r   r   nvfp4_quantize_r   r   r   flashinfer_nvfp4_quantize  s
   
r   c                 C   s^   | j \}}dd }||d}|d }||d}tj||d tj| jdtj||tj| jdfS )Nc                 S   s   | | d | | S )Nr|   r   )xyr   r   r   r@   "  s    z0flashinfer_nvfp4_quantize_fake.<locals>.<lambda>   r      r   r   )r   r   r   uint8r   )r   r   mnround_up	rounded_mscale_n	rounded_nr   r   r   flashinfer_nvfp4_quantize_fake  s   


r   bblock_scale_ablock_scale_balpha	out_dtypec              
   C   s   | j dkr
|j dksJ |j dkr|j dksJ | ddkr&|ddks(J | jd |jd ks4J |dv rD|tj}|tj}|dkrQ| jd dkrQdnd	}t| | || ||||d
S )Nr   r|   )cutlasscudnntrtllmr       TF)r   r   )ndimstrider   viewr   r   r   t)r   r   r   r   r   r   r   r   r   r   r   flashinfer_scaled_fp4_mm-  s$   	 r   scale_ascale_bbiasc                 C   s  | j dkr
|j dksJ | jd |jd ksJ | dkr$| dks&J | jtjkr2|jtjks4J | jjdkr@|jjdksBJ |jtjkrN|jtjksPJ |jjdkr\|jjdks^J t	| 
d|
d|||d| jd |jd }|d ur|| }|S )Nr   r|   r   r   r}   )r   r   numelr   r   float8_e4m3fnr   typefloat32r   	unsqueezer   )r   r   r   r   r   r   outputr   r   r   flashinfer_scaled_fp8_mmM  s(   	r   c                 C   s
   t | |S r%   )r   )r   r   r   r   r   $flashinfer_quant_nvfp4_8x4_sf_layoutk  s   
r   flashinfer.gemmfp8_blockscale_gemm_sm90c                   C   s   t  otdottddS )z>Return `True` if FlashInfer block-scale FP8 GEMM is available.Z   r   r   )r   r   is_device_capabilityrK   r"   r   r   r   r   "has_flashinfer_fp8_blockscale_gemmv  s
   r   c                   C   s   t jot S )z>Return `True` if FlashInfer block-scale FP8 GEMM is supported.)r   #VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFERr   r   r   r   r   +is_flashinfer_fp8_blockscale_gemm_supported  s   r   is_flashinfer_supportedoutput_dtypeinputweightc           	      C   sb   | sdS d}d}|j }|j }|tjko.|tjko.|tjko.|jd | dko.|jd | dk}|S )NF@      r   r|   )r   r   bfloat16r   r   )	r   r   r   r   
N_MULTIPLE
K_MULTIPLEweight_dtypeinput_dtypeshould_use_flashinferr   r   r   -should_use_flashinfer_for_blockscale_fp8_gemm  s    
r   )r   %flashinfer_trtllm_fp8_block_scale_moeflashinfer_cutlass_fused_moe)flashinfer_cutedsl_grouped_gemm_nt_maskedflashinfer_fp4_quantizer9   r:   rV   r<   r=   rP   rD   rN   rW   rY   r   re   rh   rr   rl   r   r   r   flashinfer_fp8_blockscale_gemmr   r   )NFFr%   )T__doc__r>   r-   r   importlib.utilosr   collections.abcr   typingr   r   r]   r   	vllm.envsr   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   __name__r   environr^   r   r.   boolr   r   r   strr"   r/   flashinfer_trtllm_bf16_moer   *flashinfer_trtllm_fp8_per_tensor_scale_moer   r   r   r8   r9   r:   rV   r<   r=   rD   rN   rP   rQ   rU   rW   rY   re   rh   rn   intrr   r   rl   vllm.utils.torch_utilsr   Tensorr   r   library	custom_opr   register_faker   r   r   tupler   r   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   sb  
	






T

		

&

	
