o
    
۾iF:                     @   s~  U d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd deZejdefddZejdefddZde	de	de
fddZdaede	f dB ed< da ede	f dB ed< da!ede	f dB ed< da"ede	f dB ed< da#ede	f dB ed< da$ede	f dB ed< da%ede	f dB ed< da&ede	f dB ed< da'ede	f dB ed< d\d d!Z(de)fd"d#Z*ejde+e) fd$d%Z,d&ej-dej-fd'd(Z.d)d* Z/d+d, Z0d-d. Z1d/d0 Z2d1ej-d2e3ej-ej-f d3ej-d4ej-d5ej-d6edej-fd7d8Z4d9ej-d:e)d;e)dej-fd<d=Z5d>ej-d?ej-d3ej-d9ej-d@ej-dAej-dBe)d6edej-fdCdDZ6d&ej-fdEdFZ7d&e)dGe)de)fdHdIZ8d&e)dJe)fdKdLZ9dMdMgZ:ej;dNej<dOe:dPfd&ej-d:e+e) dQede3ej-ej-f fdRdSZ=d&ej-dGej-fdTdUZ>	d]dVej?dWej-dXedB fdYdZZ@g d[ZAdS )^zmCompatibility wrapper for DeepGEMM API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)Enum)AnyNoReturn)logger)get_fp8_min_max)current_platform)has_deep_gemmcdivc                   @   s4   e Zd ZdZdZdZed
ddZeddd	ZdS )DeepGemmQuantScaleFMTr         returnNc                 C   s^   t | dd}|durdS tjot otdu}|s| j| _dS tdr)| j	| _dS | j
| _dS )z>Initialize the oracle decision and store it in the class cache_oracle_cacheNd   )getattrenvsVLLM_USE_DEEP_GEMM_E8M0is_deep_gemm_supported_fp8_gemm_nt_implFLOAT32r   r   is_device_capability_familyUE8M0FLOAT32_CEIL_UE8M0)clscacheduse_e8m0 r   H/home/ubuntu/.local/lib/python3.10/site-packages/vllm/utils/deep_gemm.pyinit_oracle_cache&   s    z'DeepGemmQuantScaleFMT.init_oracle_cachec                 C   s    t | dd}|dusJ d|S )z*Return the pre-initialized oracle decisionr   Nz2DeepGemmQuantScaleFMT oracle cache not initialized)r   )r   r   r   r   r   from_oracle<   s   z!DeepGemmQuantScaleFMT.from_oracler   N)r   r   )	__name__
__module____qualname__r   r   r   classmethodr    r!   r   r   r   r   r      s    r   r   c                  C   s,   t  ot dpt d} tjot o| S )zReturn `True` if DeepGEMM is supported on the current platform.
    Currently, only Hopper and Blackwell GPUs are supported.
    Z   r   )r   is_cudais_device_capabilityr   r   VLLM_USE_DEEP_GEMMr	   )is_supported_archr   r   r   r   D   s
   
r   c                   C   s^   t  s
td dS t  tdu rtjddd dS tjr&tjddd dS tjd	dd dS )
znReturn `True` if vLLM is configured to use DeepGEMM "
    "E8M0 scale on a Hopper or Blackwell-class GPU.
    z>DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.FNz3DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not foundlocal)scopez*DeepGEMM E8M0 enabled on current platform.Tz0DeepGEMM E8M0 disabled on current configuration.)r   r   
debug_once
_lazy_initr   	info_oncer   r   r   r   r   r   is_deep_gemm_e8m0_usedP   s    r1   ___c                  O   s   t d)z-Placeholder for unavailable DeepGEMM backend.zDeepGEMM backend is not available or outdated. Please install or update the `deep_gemm` to a newer version to enable FP8 kernels.)RuntimeError)r2   r3   r   r   r   _missingk   s   r5   .r   _grouped_impl_grouped_masked_impl_fp8_mqa_logits_impl_fp8_paged_mqa_logits_impl#_get_paged_mqa_logits_metadata_impl%_get_mn_major_tma_aligned_tensor_impl,_get_mk_alignment_for_contiguous_layout_impl'_transform_sf_into_required_layout_implc                  C   s   t dus tdus tdus tdus tdus tdus tdus tdur"dS t s'dS d} t	j
| ds;t	jtjdt	j
| < td}t|dda t|ddat|ddat|ddat|ddat|d	dat|d
dat|ddat|ddat  dS )z2Import deep_gemm and resolve symbols on first use.NDG_JIT_CACHE_DIR	deep_gemmfp8_gemm_nt m_grouped_fp8_gemm_nt_contiguousfp8_m_grouped_gemm_nt_maskedfp8_mqa_logitsfp8_paged_mqa_logitsget_paged_mqa_logits_metadataget_mn_major_tma_aligned_tensor&get_mk_alignment_for_contiguous_layout!transform_sf_into_required_layout)r   r6   r7   r8   r9   r:   r<   r=   r	   osenvirongetpathjoinr   VLLM_CACHE_ROOT	importlibimport_moduler   r;   r   r    )DEEP_GEMM_JIT_CACHE_ENV_NAME_dgr   r   r   r/   ~   sF   


r/   c                  C   s   t   td} t|  S )Nr?   )r/   rO   rP   intget_num_sms)rR   r   r   r   rT      s   
rT   c                  C   s"   t   td u r
t S t } | | gS N)r/   r<   r5   )mk_align_sizer   r   r   rG      s
   rG   xc                 C   s   t   tdu r
t S t| S )z6Wrapper for DeepGEMM's get_mn_major_tma_aligned_tensorN)r/   r;   r5   rW   r   r   r    get_col_major_tma_aligned_tensor   s   rY   c                  O   sN   t   td u rt| i |S d|v r|d }|d= nt }t| d| i|S )Nr1   disable_ue8m0_cast)r/   r   r5   r1   )argskwargs	use_ue8m0r   r   r   r@      s   r@   c                  O   2   t   td u rt| i |S t| dt  i|S NrZ   )r/   r6   r5   r1   r[   r\   r   r   r   rA         rA   c                  O   r^   r_   )r/   r7   r5   r1   r`   r   r   r   rB      ra   rB   c                  O   r^   r_   )r/   r=   r5   r1   r`   r   r   r   rH      ra   rH   qkvweightscu_seqlen_kscu_seqlen_keclean_logitsc                 C   s(   t   tdu r
t S t| |||||dS )a%  Compute FP8 MQA logits for a single sequence without KV paging.

    Args:
        q: Query tensor of shape [M, H, D]. Casted to
            `torch.float8_e4m3fn` by caller.
        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N])
            with dtype `torch.float32`.
        weights: weights of shape [M, H], dtype `torch.float32`.
        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
            shape [M], dtype int32.
        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
            shape [M], dtype int32.
        clean_logits: Whether to clean the unfilled logits into `-inf`.

    Returns:
        Logits tensor of shape [M, N], dtype `torch.float32`.
    Nrg   )r/   r8   r5   )rb   rc   rd   re   rf   rg   r   r   r   rC      s   rC   context_lens
block_sizenum_smsc                 C   s    t   tdu r
t S t| ||S )a  Build scheduling metadata for paged MQA logits.

    Args:
        context_lens: Tensor of shape [B], dtype int32; effective context length
            per batch element.
        block_size: KV-cache block size in tokens (e.g., 64).
        num_sms: Number of SMs available. 132 for Hopper

    Returns:
        Backend-specific tensor consumed by `fp8_paged_mqa_logits` to
        schedule work across SMs.
    N)r/   r:   r5   )ri   rj   rk   r   r   r   rE     s   rE   q_fp8kv_cache_fp8block_tablesschedule_metadatamax_model_lenc              
   C   s,   t   tdu r
t S t| |||||||dS )aO  Compute FP8 MQA logits using paged KV-cache.

    Args:
        q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to
            `torch.float8_e4m3fn` by caller.
        kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
            [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
            4 bytes per (block,pos) store the `float` dequant scale.
        weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
        context_lens: Tensor of shape [B], dtype int32; effective context length
            for each batch element.
        block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
            block indices to physical blocks in the paged cache.
        schedule_metadata: Returned by `get_paged_mqa_logits_metadata`;
            used to distribute work across SMs.
        max_model_len: Maximum sequence length used to size the logits output.
        clean_logits: Whether to clean the unfilled logits into `-inf`.

    Returns:
        Logits tensor of shape [B * next_n, max_model_len], dtype
        `torch.float32`.
    Nrh   )r/   r9   r5   )rl   rm   rd   ri   rn   ro   rp   rg   r   r   r   rD   (  s    rD   c              	   C   s   t dt t |  S )Ng       @)torchpowceillog2absrX   r   r   r   _ceil_to_ue8m0W  s   rv   yc                 C   s   t | || S rU   r
   )rW   rw   r   r   r   _align[     rx   element_sizec                 C   s   t | d| S )N   )rx   )rW   rz   r   r   r   get_tma_aligned_size`  ry   r|      T)dynamicbackendFr]   c                 C   s  t  }|  dksJ | j\}}|\}}tjt||t||f| j| jd}| |d |d |f< |	d||
d| |}	|	  jdddd}
t \}}|
| }|r[t|n|}|	d	|  |}||d |d |f  |	|	
d
|	
dfS )Nr   )dtypedevicer   )r      T)dimkeepdimg-C6?g      ?r   )r   	fp8_dtyper   shaperq   zerosrx   r   r   viewsizeru   floatamaxclampr   rv   toview_as
contiguous)rW   rj   r]   r   mnblock_mblock_nx_paddedx_viewx_amaxr2   fp8_maxsfx_scaledr   r   r   per_block_cast_to_fp8h  s"   

 r   c                 C   sB   |   |  } }| |  ||   }d| |   | }d| S )a|  Return a global difference metric for unit tests.

    DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
    error, causing `torch.testing.assert_close` to fail.  Instead of checking
    every element, we compute a cosine-style similarity over the whole tensor
    and report `1 - sim`.  Once kernel accuracy improves this helper can be
    removed.
    r   r   )doublesum)rW   rw   denominatorsimr   r   r   	calc_diff  s   
r   output_dtypeweightsupports_deep_gemmc                 C   sH   |d u rt  }d}d}|o#| tjko#|jd | dko#|jd | dkS )N@   r}   r   r   )r   rq   bfloat16r   )r   r   r   
N_MULTIPLE
K_MULTIPLEr   r   r   "should_use_deepgemm_for_fp8_linear  s   r   )r   r   r@   rA   rB   rC   rD   rE   r   r1   r   rT   r   rY   rG   r"   rU   )B__doc__	functoolsrO   rI   collections.abcr   enumr   typingr   r   rq   	vllm.envsr   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   vllm.utils.import_utilsr	   vllm.utils.math_utilsr   r   cacheboolr   r1   r5   r   __annotations__r6   r7   r8   r9   r:   r;   r<   r=   r/   rS   rT   listrG   TensorrY   r@   rA   rB   rH   tuplerC   rE   rD   rv   rx   r|   DEFAULT_BLOCK_SIZEcompilesimple_compile_backendr   r   r   r   __all__r   r   r   r   <module>   s   )
5			
"
	
/
