o
    .i9                     @   sv  U d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
 ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd deZejdefddZejdefddZde	de	de
fddZdaede	f dB ed< da ede	f dB ed< da!ede	f dB ed< da"ede	f dB ed< da#ede	f dB ed< da$ede	f dB ed< da%ede	f dB ed< da&ede	f dB ed< da'ede	f dB ed< d[d d!Z(de)fd"d#Z*ejde+e) fd$d%Z,d&ej-dej-fd'd(Z.d)d* Z/d+d, Z0d-d. Z1d/d0 Z2d1ej-d2e3ej-ej-f d3ej-d4ej-d5ej-dej-fd6d7Z4d8ej-d9e)d:e)dej-fd;d<Z5d=ej-d>ej-d3ej-d8ej-d?ej-d@ej-dAe)dej-fdBdCZ6d&ej-fdDdEZ7d&e)dFe)de)fdGdHZ8d&e)dIe)fdJdKZ9dLdLgZ:ej;dMej<dNe:dOfd&ej-d9e+e) dPede3ej-ej-f fdQdRZ=d&ej-dFej-fdSdTZ>	d\dUej?dVej-dWedB fdXdYZ@g dZZAdS )]zmCompatibility wrapper for DeepGEMM API changes.

Users of vLLM should always import **only** these wrappers.
    N)Callable)Enum)AnyNoReturn)logger)get_fp8_min_max)current_platform)has_deep_gemmcdivc                   @   s4   e Zd ZdZdZdZed
ddZeddd	ZdS )DeepGemmQuantScaleFMTr         returnNc                 C   s^   t | dd}|durdS tjot otdu}|s| j| _dS tdr)| j	| _dS | j
| _dS )z>Initialize the oracle decision and store it in the class cache_oracle_cacheNd   )getattrenvsVLLM_USE_DEEP_GEMM_E8M0is_deep_gemm_supported_fp8_gemm_nt_implFLOAT32r   r   is_device_capability_familyUE8M0FLOAT32_CEIL_UE8M0)clscacheduse_e8m0 r   Q/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/utils/deep_gemm.pyinit_oracle_cache&   s    z'DeepGemmQuantScaleFMT.init_oracle_cachec                 C   s    t | dd}|dusJ d|S )z*Return the pre-initialized oracle decisionr   Nz2DeepGemmQuantScaleFMT oracle cache not initialized)r   )r   r   r   r   r   from_oracle<   s   z!DeepGemmQuantScaleFMT.from_oracler   N)r   r   )	__name__
__module____qualname__r   r   r   classmethodr    r!   r   r   r   r   r      s    r   r   c                  C   s,   t  ot dpt d} tjot o| S )zReturn `True` if DeepGEMM is supported on the current platform.
    Currently, only Hopper and Blackwell GPUs are supported.
    Z   r   )r   is_cudais_device_capabilityr   r   VLLM_USE_DEEP_GEMMr	   )is_supported_archr   r   r   r   D   s
   
r   c                   C   sR   t  s
td dS t  tdu rtd dS tjr"td dS td dS )znReturn `True` if vLLM is configured to use DeepGEMM "
    "E8M0 scale on a Hopper or Blackwell-class GPU.
    z>DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.FNz3DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not foundz*DeepGEMM E8M0 enabled on current platform.Tz0DeepGEMM E8M0 disabled on current configuration.)r   r   
debug_once
_lazy_initr   	info_oncer   r   r   r   r   r   is_deep_gemm_e8m0_usedP   s   


r/   ___c                  O   s   t d)z-Placeholder for unavailable DeepGEMM backend.zDeepGEMM backend is not available or outdated. Please install or update the `deep_gemm` to a newer version to enable FP8 kernels.)RuntimeError)r0   r1   r   r   r   _missingi   s   r3   .r   _grouped_impl_grouped_masked_impl_fp8_mqa_logits_impl_fp8_paged_mqa_logits_impl#_get_paged_mqa_logits_metadata_impl%_get_mn_major_tma_aligned_tensor_impl,_get_mk_alignment_for_contiguous_layout_impl'_transform_sf_into_required_layout_implc                  C   s   t dus tdus tdus tdus tdus tdus tdus tdur"dS t s'dS d} t	j
| ds;t	jtjdt	j
| < td}t|dda t|ddat|ddat|ddat|ddat|d	dat|d
dat|ddat|ddat  dS )z2Import deep_gemm and resolve symbols on first use.NDG_JIT_CACHE_DIR	deep_gemmfp8_gemm_nt m_grouped_fp8_gemm_nt_contiguousfp8_m_grouped_gemm_nt_maskedfp8_mqa_logitsfp8_paged_mqa_logitsget_paged_mqa_logits_metadataget_mn_major_tma_aligned_tensor&get_mk_alignment_for_contiguous_layout!transform_sf_into_required_layout)r   r4   r5   r6   r7   r8   r:   r;   r	   osenvirongetpathjoinr   VLLM_CACHE_ROOT	importlibimport_moduler   r9   r   r    )DEEP_GEMM_JIT_CACHE_ENV_NAME_dgr   r   r   r-   |   sF   


r-   c                  C   s   t   td} t|  S )Nr=   )r-   rM   rN   intget_num_sms)rP   r   r   r   rR      s   
rR   c                  C   s"   t   td u r
t S t } | | gS N)r-   r:   r3   )mk_align_sizer   r   r   rE      s
   rE   xc                 C   s   t   tdu r
t S t| S )z6Wrapper for DeepGEMM's get_mn_major_tma_aligned_tensorN)r-   r9   r3   rU   r   r   r    get_col_major_tma_aligned_tensor   s   rW   c                  O   sN   t   td u rt| i |S d|v r|d }|d= nt }t| d| i|S )Nr/   disable_ue8m0_cast)r-   r   r3   r/   )argskwargs	use_ue8m0r   r   r   r>      s   r>   c                  O   2   t   td u rt| i |S t| dt  i|S NrX   )r-   r4   r3   r/   rY   rZ   r   r   r   r?         r?   c                  O   r\   r]   )r-   r5   r3   r/   r^   r   r   r   r@      r_   r@   c                  O   r\   r]   )r-   r;   r3   r/   r^   r   r   r   rF      r_   rF   qkvweightscu_seqlen_kscu_seqlen_kec                 C   s$   t   tdu r
t S t| ||||S )a  Compute FP8 MQA logits for a single sequence without KV paging.

    Args:
        q: Query tensor of shape [M, H, D]. Casted to
            `torch.float8_e4m3fn` by caller.
        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N])
            with dtype `torch.float32`.
        weights: weights of shape [M, H], dtype `torch.float32`.
        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
            shape [M], dtype int32.
        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
            shape [M], dtype int32.

    Returns:
        Logits tensor of shape [M, N], dtype `torch.float32`.
    N)r-   r6   r3   )r`   ra   rb   rc   rd   r   r   r   rA      s   rA   context_lens
block_sizenum_smsc                 C   s    t   tdu r
t S t| ||S )a  Build scheduling metadata for paged MQA logits.

    Args:
        context_lens: Tensor of shape [B], dtype int32; effective context length
            per batch element.
        block_size: KV-cache block size in tokens (e.g., 64).
        num_sms: Number of SMs available. 132 for Hopper

    Returns:
        Backend-specific tensor consumed by `fp8_paged_mqa_logits` to
        schedule work across SMs.
    N)r-   r8   r3   )re   rf   rg   r   r   r   rC     s   rC   q_fp8kv_cache_fp8block_tablesschedule_metadatamax_model_lenc              
   C   s,   t   tdu r
t S t| ||||||ddS )a  Compute FP8 MQA logits using paged KV-cache.

    Args:
        q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to
            `torch.float8_e4m3fn` by caller.
        kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
            [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
            4 bytes per (block,pos) store the `float` dequant scale.
        weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
        context_lens: Tensor of shape [B], dtype int32; effective context length
            for each batch element.
        block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
            block indices to physical blocks in the paged cache.
        schedule_metadata: Returned by `get_paged_mqa_logits_metadata`;
            used to distribute work across SMs.
        max_model_len: Maximum sequence length used to size the logits output.

    Returns:
        Logits tensor of shape [B * next_n, max_model_len], dtype
        `torch.float32`.
    NT)clean_logits)r-   r7   r3   )rh   ri   rb   re   rj   rk   rl   r   r   r   rB   "  s   rB   c              	   C   s   t dt t |  S )Ng       @)torchpowceillog2absrV   r   r   r   _ceil_to_ue8m0O  s   rs   yc                 C   s   t | || S rS   r
   )rU   rt   r   r   r   _alignS     ru   element_sizec                 C   s   t | d| S )N   )ru   )rU   rw   r   r   r   get_tma_aligned_sizeX  rv   ry      T)dynamicbackendFr[   c                 C   s  t  }|  dksJ | j\}}|\}}tjt||t||f| j| jd}| |d |d |f< |	d||
d| |}	|	  jdddd}
t \}}|
| }|r[t|n|}|	d	|  |}||d |d |f  |	|	
d
|	
dfS )Nr   )dtypedevicer   )r      T)dimkeepdimg-C6?g      ?r   )r   	fp8_dtyper   shapern   zerosru   r}   r~   viewsizerr   floatamaxclampr   rs   toview_as
contiguous)rU   rf   r[   r   mnblock_mblock_nx_paddedx_viewx_amaxr0   fp8_maxsfx_scaledr   r   r   per_block_cast_to_fp8`  s"   

 r   c                 C   sB   |   |  } }| |  ||   }d| |   | }d| S )a|  Return a global difference metric for unit tests.

    DeepGEMM kernels on Blackwell/B200 currently exhibit noticeable per-element
    error, causing `torch.testing.assert_close` to fail.  Instead of checking
    every element, we compute a cosine-style similarity over the whole tensor
    and report `1 - sim`.  Once kernel accuracy improves this helper can be
    removed.
    r   r   )doublesum)rU   rt   denominatorsimr   r   r   	calc_diffw  s   
r   output_dtypeweightsupports_deep_gemmc                 C   sH   |d u rt  }d}d}|o#| tjko#|jd | dko#|jd | dkS )N@   rz   r   r   )r   rn   bfloat16r   )r   r   r   
N_MULTIPLE
K_MULTIPLEr   r   r   "should_use_deepgemm_for_fp8_linear  s   r   )r   r   r>   r?   r@   rA   rB   rC   r   r/   r   rR   r   rW   rE   r"   rS   )B__doc__	functoolsrM   rG   collections.abcr   enumr   typingr   r   rn   	vllm.envsr   vllm.loggerr   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.platformsr   vllm.utils.import_utilsr	   vllm.utils.math_utilsr   r   cacheboolr   r/   r3   r   __annotations__r4   r5   r6   r7   r8   r9   r:   r;   r-   rQ   rR   listrE   TensorrW   r>   r?   r@   rF   tuplerA   rC   rB   rs   ru   ry   DEFAULT_BLOCK_SIZEcompilesimple_compile_backendr   r   r}   r   __all__r   r   r   r   <module>   s   )
5			


-
