o
    .icQ                     @   s8  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
mZ ddlZddlZddlmZ ddlmZ ddlmZ ddlmZ eeZejdd	Zejd
efddZejd
efddZde
de
d
efddZde d
e
dB fddZ!efde de dede
f fddZ"e"ddZ#e"ddZ$e"ddZ%e"dd Z&e"d!d"Z'e"d!d#Z(e"d!d$Z)e"d!d%Z*e"d!d&Z+e"d!d'Z,e"d(d)d*d+ d,Z-ejd
efd-d.Z.ejd
efd/d0Z/ejd
efd1d2Z0ejd
efd3d4Z1ejd
efd5d6Z2ejd
efd7d8Z3ejd
efd9d:Z4ejd
efd;d<Z5ejd
efd=d>Z6d
edB fd?d@Z7dAe8dBe8d
efdCdDZ9		E	EddAe8dBe8dFe8dGe8dHe8dIe dJej:dKedLedB dMedNed
efdOdPZ;e r ej<j=dQg dRdSdTej>dUej>dVej>dWej>dXej>dYej:dZed[e d
ej>fd\d]Z?ej<@dQdTej>dUej>dVej>dWej>dXej>dYej:dZed[e d
ej>fd^d_ZAej<j=d`g dRdSdTej>dUej>dVej>dWej>dYej:d[e d
ej>fdadbZBej<@d`dTej>dUej>dVej>dWej>dYej:d[e d
ej>fdcddZCej<j=deg dRdSdfej>dgej>d
eDej>ej>f fdhdiZEej<@dedfej>dgej>d
eDej>ej>f fdjdkZFdfej>dlej>dmej>dnej>doej>dpej:d[e d
ej>fdqdrZG	ddfej>dlej>dsej>dtej>dpej:duej>dB d
ej>fdvdwZHdfej>dgej>d
eDej>ej>f fdxdyZIe"dzd{ZJejd
efd|d}ZKejd
efd~dZLdedej:dej>dej>fddZMg dZNdS )zoCompatibility wrapper for FlashInfer API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)AnyNoReturn)init_logger)vllm_is_batch_invariant)current_platformFLASHINFER_CUBINS_REPOSITORYzWhttps://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/returnc                   C   s,   t jrdS tjddurdS td dS )z7Return `True` if flashinfer-cubin package is available.Tflashinfer_cubinNz&flashinfer-cubin package was not foundF)envsVLLM_HAS_FLASHINFER_CUBIN	importlibutil	find_speclogger
debug_once r   r   R/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/utils/flashinfer.pyhas_flashinfer_cubin&   s   
r   c                   C   sD   t jddu rtd dS t s tddu r td dS dS )z8Return `True` if flashinfer-python package is available.
flashinferNz2FlashInfer unavailable since package was not foundFnvcczSFlashInfer unavailable since nvcc was not found and not using pre-downloaded cubinsT)r   r   r   r   r   r   shutilwhichr   r   r   r   has_flashinfer1   s   
r   ___c                  O   s   t d)z/Placeholder for unavailable FlashInfer backend.zFlashInfer backend is not available. Please install the package to enable FlashInfer kernels: https://github.com/flashinfer-ai/flashinfer)RuntimeError)r   r   r   r   r   _missingD   s   r   module_namec              	   C   s&   zt | W S  ttfy   Y dS w )zBSafely import a submodule and return it, or None if not available.N)r   import_moduleImportErrorModuleNotFoundError)r   r   r   r   _get_submoduleM   s
   r"   	attr_namefallback_fn.c                    s&   t jfdd  fdd}|S )z5Create a lazy import wrapper for a specific function.c                     s&   t  sd S t} | rt|  d S d S N)r   r"   getattr)mod)r#   r   r   r   	_get_impl[   s   z'_lazy_import_wrapper.<locals>._get_implc                     s*     }|d u r| i |S || i |S r%   r   )argskwargsimpl)r(   r$   r   r   wrapperb   s   z%_lazy_import_wrapper.<locals>.wrapper)	functoolscache)r   r#   r$   r,   r   )r(   r#   r$   r   r   _lazy_import_wrapperV   s   r/   flashinfer.fused_moetrtllm_fp8_block_scale_moetrtllm_fp8_per_tensor_scale_moecutlass_fused_moe$flashinfer.cute_dsl.blockscaled_gemmgrouped_gemm_nt_maskedr   fp4_quantizenvfp4_batched_quantize*silu_and_mul_scaled_nvfp4_experts_quantizescaled_fp4_grouped_quantizenvfp4_block_scale_interleavetrtllm_fp4_block_scale_moezflashinfer.autotunerautotunec                  O   s   t  S r%   )
contextlibnullcontext)r)   r*   r   r   r   <lambda>   s    r?   )r$   c                   C      t  o
tjdduS )z5Return `True` if FlashInfer comm module is available.flashinfer.commNr   r   r   r   r   r   r   r   has_flashinfer_comm   s   rC   c                  C   @   t  sdS g d} | D ]\}}t|}|rt||s dS qdS )z7Return `True` if FlashInfer mnnvl all2all is available.F))rA   Mapping)zflashinfer.comm.mnnvlMnnvlMemory)flashinfer.comm.trtllm_alltoallMnnvlMoe)rG   MoEAlltoallInfoT)rC   r"   hasattrrequired_functionsr   r#   r'   r   r   r   has_flashinfer_all2all      rM   c                   C   r@   )z4Return `True` if FlashInfer MoE module is available.r0   NrB   r   r   r   r   has_flashinfer_moe   s   rO   c                   C   r@   )z:Return ``True`` if FlashInfer cutedsl module is available.zflashinfer.cute_dslNrB   r   r   r   r   has_flashinfer_cutedsl   s   rP   c                  C   rD   )z:Return `True` if FlashInfer TRTLLM fused MoE is available.F))r0   r1   )r0   r2   r0   r;   TrO   r"   rJ   rK   r   r   r   has_flashinfer_trtllm_fused_moe   s   rS   c                  C   rD   )z;Return `True` if FlashInfer CUTLASS fused MoE is available.F))r0   r3   )r   r6   )r   r:   rQ   TrR   rK   r   r   r    has_flashinfer_cutlass_fused_moe   rN   rT   c                  C   rD   )z=Return ``True`` if FlashInfer CUTLASS fused MoE is available.F))r4   r5   )r   r9   )r   &silu_and_scaled_nvfp4_experts_quantizeT)rP   r"   rJ   rK   r   r   r   -has_flashinfer_cutedsl_grouped_gemm_nt_masked   s   rV   c               
   C   s~   t  rdS z tjtdd} | jdk}|rtd |W S td| j |W S  ty> } ztd| W Y d}~d	S d}~ww )
zReturn `True` if NVIDIA's artifactory is accessible.

    This checks connectivity to the kernel inference library artifactory
    which is required for downloading certain cubin kernels like TRTLLM FHMA.
    T   )timeout   z NVIDIA artifactory is accessiblez2NVIDIA artifactory returned failed status code: %dz+Failed to connect to NVIDIA artifactory: %sNF)	r   requestsgetr   status_coder   r   warning_once	Exception)response
accessibleer   r   r   has_nvidia_artifactory   s$   

rb   c                   C   s   t  rdS tdot S )z
    TRTLLM attention is supported if the platform is SM100,
    NVIDIA artifactory is accessible, and batch-invariant mode is not enabled.
    Fd   )r   r   is_device_capability_familyrb   r   r   r   r   supports_trtllm_attention  s   re   c                  C   s   ddl m}  |  }|jjS )a,  
    This function should only be called during initialization stage when vllm config
    is set.
    Return `None` if --attention-config.use_trtllm_attention is not set,
    return `True` if TRTLLM attention is forced to be used,
    return `False` if TRTLLM attention is forced to be not used.
    r   )get_current_vllm_config)vllm.configrf   attention_configuse_trtllm_attention)rf   vllm_configr   r   r   force_use_trtllm_attention!  s   rk   num_qo_headsnum_kv_headsc                 C   s$   t  du rdS t }|o| | dkS )z=Check if the current configuration supports TRTLLM attention.Fr   )rk   re   )rl   rm   
has_trtllmr   r   r   can_use_trtllm_attention/  s   
ro   F
num_tokensmax_seq_lendcp_world_sizekv_cache_dtypeq_dtype
is_prefillforce_use_trtllm	has_sinkshas_specc                 C   s   |dur|sdS |dkrt d dS t s|rt d dS | | dkr.|r,t d dS |
r9|s9t d d	S |t krFt d
 d	S |	rOt d d	S |du rs|rb|dk}|r`t d |S |dkoi|dk}|rqt d |S t d d	S )z*Return `True` if TRTLLM attention is used.NF   zcTrtllm does not support returning LSE and as a result does not support DCP, reverting to FlashInferzkTRTLLM attention is not supported on this platform, but --attention-config.use_trtllm_attention is set to 1r   zTRTLLM attention is not supported for this combination of query and key heads, but --attention-config.use_trtllm_attention is set to 1z:Using TRTLLM attention (enabled for speculative decoding).Tz,Using TRTLLM attention (query is quantized).z6Using TRTLLM attention (required for attention sinks).autoz/Using TRTLLM prefill attention (auto-detected).   z.Using TRTLLM decode attention (auto-detected).zLUsing TRTLLM attention (--attention-config.use_trtllm_attention is set to 1))r   r]   re   	info_oncer   	fp8_dtype)rl   rm   rp   rq   rr   rs   rt   ru   rv   rw   rx   
use_trtllmr   r   r   ri   7  sT   




ri   zvllm::flashinfer_mm_fp4cuda)mutates_argsdevice_typesABA_scaleB_scaleg_scaledtypeuse_8x4_sf_layoutbackendc           	      C   s&   ddl m} || |||||d||d	S )Nr   )mm_fp4   )
block_sizer   r   )r   r   )	r   r   r   r   r   r   r   r   flashinfer_mm_fp4_r   r   r   flashinfer_mm_fp4  s   r   c                 C   s    t j| jd |jd || jdS )Nr   ry   r   devicetorchemptyshaper   )r   r   r   r   r   r   r   r   r   r   r   flashinfer_mm_fp4_fake  s    r   zvllm::bmm_fp8c                 C   s    ddl m} || ||||d |S )Nr   )bmm_fp8)r   r   )r   r   r   r   r   r   bmm_fp8_r   r   r   r     s   r   c                 C   s(   t j| jd | jd |jd || jdS )Nr   ry      r   r   )r   r   r   r   r   r   r   r   r   bmm_fp8_fake  s   r   zvllm::flashinfer_nvfp4_quantizeaa_global_sfc                 C   s*   ddl m} ddl m} || ||jddS )Nr   )SfLayout)nvfp4_quantizeF)sfLayout
do_shuffle)r   r   r   
layout_8x4)r   r   r   nvfp4_quantize_r   r   r   flashinfer_nvfp4_quantize  s
   
r   c                 C   s^   | j \}}dd }||d}|d }||d}tj||d tj| jdtj||tj| jdfS )Nc                 S   s   | | d | | S )Nry   r   )xyr   r   r   r?     s    z0flashinfer_nvfp4_quantize_fake.<locals>.<lambda>   r      r   r   )r   r   r   uint8r   )r   r   mnround_up	rounded_mscale_n	rounded_nr   r   r   flashinfer_nvfp4_quantize_fake  s   


r   bblock_scale_ablock_scale_balpha	out_dtypec              
   C   s   | j dkr
|j dksJ |j dkr|j dksJ | ddkr&|ddks(J | jd |jd ks4J |dkrD|tj}|tj}|dkrQ| jd dkrQdnd	}t| | || ||||d
S )Nr   ry   cutlasstrtllmr       TF)r   r   )ndimstrider   viewr   r   r   t)r   r   r   r   r   r   r   r   r   r   r   flashinfer_scaled_fp4_mm  s$   	 r   scale_ascale_bbiasc                 C   s  | j dkr
|j dksJ | jd |jd ksJ | dkr$| dks&J | jtjkr2|jtjks4J | jjdkr@|jjdksBJ |jtjkrN|jtjksPJ |jjdkr\|jjdks^J t	| 
d|
d|||d| jd |jd }|d ur|| }|S )Nr   ry   r   r   rz   )r   r   numelr   r   float8_e4m3fnr   typefloat32r   	unsqueezer   )r   r   r   r   r   r   outputr   r   r   flashinfer_scaled_fp8_mm  s(   	r   c                 C   s
   t | |S r%   )r   )r   r   r   r   r   $flashinfer_quant_nvfp4_8x4_sf_layout9  s   
r   flashinfer.gemmfp8_blockscale_gemm_sm90c                   C   s   t  otdottddS )z>Return `True` if FlashInfer block-scale FP8 GEMM is available.Z   r   r   )r   r   is_device_capabilityrJ   r"   r   r   r   r   "has_flashinfer_fp8_blockscale_gemmD  s
   r   c                   C   s   t jot S )z>Return `True` if FlashInfer block-scale FP8 GEMM is supported.)r   #VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFERr   r   r   r   r   +is_flashinfer_fp8_blockscale_gemm_supportedN  s   r   is_flashinfer_supportedoutput_dtypeinputweightc           	      C   sb   | sdS d}d}|j }|j }|tjko.|tjko.|tjko.|jd | dko.|jd | dk}|S )NF@      r   ry   )r   r   bfloat16r   r   )	r   r   r   r   
N_MULTIPLE
K_MULTIPLEweight_dtypeinput_dtypeshould_use_flashinferr   r   r   -should_use_flashinfer_for_blockscale_fp8_gemmW  s    
r   )r   %flashinfer_trtllm_fp8_block_scale_moeflashinfer_cutlass_fused_moe)flashinfer_cutedsl_grouped_gemm_nt_maskedflashinfer_fp4_quantizer8   r9   r:   r;   r<   rO   rC   rM   rT   rV   r   rb   re   ro   ri   r   r   r   flashinfer_fp8_blockscale_gemmr   r   )NFFr%   )O__doc__r=   r-   r   importlib.utilosr   collections.abcr   typingr   r   rZ   r   	vllm.envsr   vllm.loggerr   *vllm.model_executor.layers.batch_invariantr   vllm.platformsr   __name__r   environr[   r   r.   boolr   r   r   strr"   r/   r   *flashinfer_trtllm_fp8_per_tensor_scale_moer   r   r   r7   r8   r9   r:   r;   r<   rC   rM   rO   rP   rS   rT   rV   rb   re   rk   intro   r   ri   library	custom_opTensorr   register_faker   r   r   tupler   r   r   r   r   r   r   r   r   __all__r   r   r   r   <module>   s*  
	






T		

&

	
