o
    
۾izg                     @   s  U d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZmZmZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ erjd d	lm Z  d d
l!m"Z" ne#Z e#Z"ee$Z%ej&ej'ej(ej)ej*ej+ej+ej+ej,ej-ej+dZ.ej(ej(ej&ej&ej/ej/ej+ej+ej0ej0ej1ej1iZ2ddiZ3edZ4dej5de6fddZ7e j8dej9fddZ:e j8dde;dB fddZ<e j8dd Z=dej9de;fddZ>dej9de;fddZ?dej9d ej9fd!d"Z@d#eej9 fd$d%ZAd&ej5d'e*d(e*ddfd)d*ZB	dd+eCej9B dB d,eCej9B dB dej9fd-d.ZDd/eEeCef deCdB fd0d1ZFd/eEeCef dej9dB fd2d3ZGd4eCd5e deCfd6d7ZHd4eCd5e dej9fd8d9ZId:e;dB ddfd;d<ZJ			=	>dd?e;d@e;dAe;dBe;dCe;d+eCej9B dB d,eCej9B dB d:e;dB dDeCdB dEeCdB deKeLej5 eLej5 f fdFdGZM			=dd?e;d@e;dAe;dBe;dCe;d+eCej9B dB d,eCej9B dB d:e;dB dDeCdB deKeLej5 eLej5 f fdHdIZNdJeLdej9dKeCejOB dLe6dej5f
dMdNZPddOdPeLeLe4  dQe4dejQdRe;dB dejRf
dSdTZSdddUdVdPeLeLe4  dQe4dej9dRe;dB dDeCejOB dB dLe6dej5fdWdXZTejUjVZWeX ZYdYejUjZddfdZd[Z[e[ejU_VG d\d] d]Z\dejUjZfd^d_Z]da^ejUjZdB e_d`< dejUjZdB fdadbZ`e
dcddddeeCdB de;fdfdgZade;fdhdiZbd&edefdjdkZcdlej5eLej5 B eKej5 B e"B dej5eLe B eKe B eB fdmdnZddoej5dej5fdpdqZedreCdseCde6fdtduZfdseCde6fdvdwZgdseCde6fdxdyZhdseCde6fdzd{Zide6fd|d}Zjed~dZk					ddeCdedeLeC dB dedB dedB deCdB deKejldf fddZmdS )    N)Callable
Collection)	lru_cache)TYPE_CHECKINGAnyTypeVar)version)Version)Libraryinfer_schema)init_logger)ModelConfigIntermediateTensors)float32halffloat16bfloat16floatfp8fp8_e4m3fp8_e5m2int8fp8_inc
fp8_ds_mlar   r   Ttreturnc                 C   sZ   |   sdS | j}|  }d}tt|d ddD ]}|| |kr$ dS ||| 9 }qdS )a  
    Check if tensor is contiguous AND has no degenerate strides.

    A degenerate stride occurs when a dimension has size 1 but the stride
    doesn't match the canonical contiguous layout. This can cause issues
    in some CUDA kernels that rely on stride values for memory access.

    For a C-contiguous tensor of shape (d0, d1, ..., dn), the expected
    strides are: stride[i] = product(shape[i+1:]) for all i, with stride[-1]=1.

    Example with torch.Size([16, 1, 8, 32]):
        - Canonical strides: (256, 256, 32, 1)
        - Degenerate strides: (256, 1, 32, 1)  # dim=1 has size=1, allowing
                                                  # non-canonical stride in dim=0
    F   T)is_contiguousshapestriderangelen)r   r!   stridesexpected_stridei r(   J/home/ubuntu/.local/lib/python3.10/site-packages/vllm/utils/torch_utils.pyis_strictly_contiguousB   s   r*   dtypec                 c   s(    t  }t |  dV  t | dS )z0Sets the default torch dtype to the given dtype.N)torchget_default_dtypeset_default_dtype)r+   	old_dtyper(   r(   r)   set_default_torch_dtype`   s
   
r0   num_threadsc              	   c   s    | du r.d} z	t tjd } W n ty   td|  Y n ty-   td|  Y nw t	 }t
|  zdV  W t
| dS t
| w )z
    Sets the default number of threads for PyTorch to the given value.

    `None` means using the value of the environment variable `OMP_NUM_THREADS`
    (or `1` if that is not available).
    Nr   OMP_NUM_THREADSz;OMP_NUM_THREADS is not set; defaulting Torch threads to %d.z;OMP_NUM_THREADS is invalid; defaulting Torch threads to %d.)intosenvironKeyErrorlogger
debug_once
ValueErrorwarning_oncer,   get_num_threadsset_num_threads)r1   old_num_threadsr(   r(   r)   set_default_torch_num_threadsi   s*   
r>   c               
   c   s    ddl m}  |  sdV  dS tjd}dtjd< z6zdV  W n ty? } zdt|v r2d}nt|}t||d}~ww W |du rKtjd= dS |tjd< dS |du r[tjd= w |tjd< w )z%Avoid unexpected CUDA initialization.r   current_platformNCUDA_VISIBLE_DEVICES zNo CUDA GPUs are availablezCUDA initialization is blocked.)	vllm.platformsr@   is_cudar4   r5   get	ExceptionstrRuntimeError)r@   	old_valueeerr_msgr(   r(   r)   guard_cuda_initialization   s.   


rL   c                 C   s   t jg | d S )z'Get the size of the data type in bytes.r+   )r,   tensorelement_sizerM   r(   r(   r)   get_dtype_size   s   rP   c                 C   s   | t jk| j | jd  S )N   )r,   boolis_floating_point
is_complexrM   r(   r(   r)   _get_precision_level   s   rU   	src_dtype	tgt_dtypec                 C   s   | |krdS t | }t |}||k rdS ||krdS | js6| js6t| }t|}|j|jko5|j|jkS t| }t|}|j|jkoQ|j|jkoQ|j|jkS )z[
    Test whether it is lossless to cast a tensor from
    `src_dtype` to `tgt_dtype`.
    TF)	rU   rS   rT   r,   iinfominmaxfinfo
resolution)rV   rW   	src_level	tgt_levelsrc_infotgt_infor(   r(   r)   is_lossless_cast   s&   





ra   dtypesc                    s   t   fdddS )zv
    Get the common `dtype` where all of the other `dtypes` can be
    cast to it without losing any information.
    c                    s   t  fddD S )Nc                 3   s    | ]}t | V  qd S N)ra   ).0dtrM   r(   r)   	<genexpr>   s    z?common_broadcastable_dtype.<locals>.<lambda>.<locals>.<genexpr>)sumrM   rb   rM   r)   <lambda>   s    z,common_broadcastable_dtype.<locals>.<lambda>)key)rZ   rh   r(   rh   r)   common_broadcastable_dtype   s   
rk   rN   lowhighc                 C   s:   ddl m} tj| tjd}||| || | ~d S )Nr   )_custom_opsrM   )vllmrn   r,   
empty_liker   uniform_convert_fp8)rN   rl   rm   ops
tensor_tmpr(   r(   r)   _generate_random_fp8   s
   ru   cache_dtypemodel_dtypec                 C   s   t | tr:| dkr)t |tr|tv rt| }|S t |tjr"|}|S td| | tv r3t|  }|S td|  t | tjrD| }|S td|  )NautozInvalid model dtype: zInvalid kv cache dtype: )
isinstancerG   STR_DTYPE_TO_TORCH_DTYPEr,   r+   r9   )rv   rw   torch_dtyper(   r(   r)   get_kv_cache_torch_dtype   s"   
r|   	quant_cfgc                 C   s   |  dd}|drn|  d| }| dp$|  dp$| dp$|  d}t|trP| ddu rB| d	d
krB| ddkrBd}ntd| tt  dS t|t	rn|
 }|tv rat| S td|tt  dS dS )a  Get the KV cache quantization algorithm string from the quantization config.

    Maps various FP8 format names to vLLM's standard cache dtype strings.
    Returns None if no kv_cache_quant_algo is specified.
    Returns "auto" if the value is not recognized/supported.
    quant_methodrB   modeloptquantizationkv_cache_schemekv_cache_quant_algodynamicFnum_bits   typer   r   zhWARNING: Unknown kv_cache_quant_algo '%s' in model config. Supported values: %s. Falling back to 'auto'.rx   N)rE   
startswithry   dictr7   warninglist#MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAPkeysrG   lower)r}   r~   quantization_innerkv_algokv_algo_lowerr(   r(   r)   get_kv_cache_quant_algo_string  s@   	





r   c                 C   s$   t | }|dur|dkrt| S dS )zKGet the KV cache quantization algorithm dtype from the quantization config.Nrx   )r   rz   )r}   kv_algo_strr(   r(   r)   get_kv_cache_quant_algo_dtype;  s   r   kv_cache_dtypemodel_configc                 C   sL   | dkr| S t |dd}|dur$t |dd}|dur$t|}|dur$|S dS )z}Resolve 'auto' kv_cache_dtype to the actual string value from model config.
    Returns the resolved cache_dtype string.
    rx   	hf_configNquantization_config)getattrr   )r   r   hf_cfgr}   r   r(   r(   r)   resolve_kv_cache_dtype_stringD  s   r   c                 C   s    | dkr|r	|j S tjS t|  S )Nrx   )r+   r,   r   rz   )r   r   r(   r(   r)   kv_cache_dtype_str_to_dtypeY  s   r   seedc                 C   sJ   | d ur!t |  tj |  t|  tj r#tj|  d S d S d S rc   )randomr   npr,   manual_seedcudais_availablemanual_seed_all)r   r(   r(   r)   set_random_seedb  s   


r   r   NHD
num_blocks
block_size
num_layers	num_heads	head_sizedevicecache_layoutc
                    s   t | t||}
| d|||f |	dv sJ |	dkrdnd}t fdd|D }|d }g }g }t|D ]B}tj||
|d	j| }|d
v rN|| | n|dkrZt|| | nt	d| |
|d d df  |
|d d df  q5||fS )NrQ   )r   HNDr   )r   r   rQ         )r   r   r   rQ   r   c                 3   s    | ]} | V  qd S rc   r(   )rd   r'   generic_kv_cache_shaper(   r)   rf   ~      z5create_kv_caches_with_random_flash.<locals>.<genexpr>      ࿩sizer+   r   rx   r   r   r   r   #Does not support key cache of type r   r   )r   r|   tupler#   r,   emptypermuterq   ru   r9   append)r   r   r   r   r   rv   rw   r   r   r   r+   stride_orderkv_cache_allocation_shapescale
key_cachesvalue_caches_key_value_cacher(   r   r)   "create_kv_caches_with_random_flashk  s.   
r   c	                 C   sH  |dkr|d rt d| t| t||}	|d }
dtjg |	d  }| ||| ||f}g }t|D ].}tj||	|d}|dv rL||
 |
 n|dkrXt	||
 |
 nt d| |
| q6| |||f}g }t|D ].}tj||	|d}|dv r||
 |
 n|dkrt	||
 |
 nt d	| |
| qq||fS )
Nr      z6Does not support key cache of type fp8 with head_size r   rM   r   r   r   z%Does not support value cache of type )r9   r   r|   r,   rN   rO   r#   r   rq   ru   r   )r   r   r   r   r   rv   rw   r   r   r+   r   xkey_cache_shaper   r   	key_cachevalue_cache_shaper   value_cacher(   r(   r)   create_kv_caches_with_random  s:   
r   datatarget_device
pin_memoryc                 C   s    t j| ||dd}|j|ddS )z?Asynchronously create a tensor and copy it from host to device.cpu)r+   r   r   T)r   non_blocking)r,   rN   to)r   r+   r   r   r   r(   r(   r)   async_tensor_h2d  s   r   max_lenr   padr   c                C   sl   |du rt tt| dd}tjt| |f||d}t| D ]\}}t||ks)J |||dt|f< q|S )z
    Make a padded array from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    Nr   )defaultrM   )rZ   mapr$   r   full	enumerate)r   r   r+   r   padded_xindblocktbr(   r(   r)   make_ndarray_with_pad  s   r   F)r   r   r   c          	      C   s8   t | }t| |||d}t||}|r| }|S )z
    Make a padded tensor from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    r   )TORCH_DTYPE_TO_NUMPY_DTYPEr   r,   
from_numpyr   r   )	r   r   r+   r   r   r   np_dtyper   rN   r(   r(   r)   make_tensor_with_pad  s   r   streamc                 C   s   | t _t|  d S rc   )_current_stream_tlsvalueprev_set_stream)r   r(   r(   r)   _patched_set_stream  s   r   c                   @   s   e Zd Zdd ZdS )_StreamPlaceholderc                 C   s   dd | _ d S )Nc                   S   s   d S rc   r(   r(   r(   r(   r)   ri     s    z-_StreamPlaceholder.__init__.<locals>.<lambda>)synchronize)selfr(   r(   r)   __init__  s   z_StreamPlaceholder.__init__N)__name__
__module____qualname__r   r(   r(   r(   r)   r     s    r   c                  C   s   ddl m}  ttdrtjdu rA|  s|  r$tj	tj
  tjS |  r/t t_tjS | j}|dur=| t_tjS tdtjS )a  
    replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
    it turns out that `torch.cuda.current_stream()` is quite expensive,
    as it will construct a new stream object at each call.
    here we patch `torch.cuda.set_stream` to keep track of the current stream
    directly, so that we can avoid calling `torch.cuda.current_stream()`.

    the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
    from C/C++ code.
    r   r?   r   NzZFail to set current stream, current platform may not support current_stream with torch API)rC   r@   hasattrr   r   is_rocmrD   r,   r   
set_streamStreamis_cpur   current_streamr9   )r@   r   r(   r(   r)   r     s    

r   _aux_streamc                  C   s*   ddl m}  tdu r|  rtj atS )z5
    Ensures aux_stream is initialized only once
    r   r?   N)rC   r@   r   is_cuda_aliker,   r   r   r?   r(   r(   r)   
aux_streamC  s   
r   r   )maxsizecuda_visible_devicesc                 C   sv   dd l }dd l}ddlm} |j sdS | r't|jdr$|j nd}n|j	 }|dk r7|j
 }|S |}|S )Nr   r?   _device_count_amdsmir   )
torch.cudatorch.versionrC   r@   r   _is_compiledr   r   r   _device_count_nvml_C_cuda_getDeviceCount)r   r,   r@   	raw_countrr(   r(   r)   _cuda_device_count_statelessQ  s   	


r   c                   C   s
   t tjS )zGet number of CUDA devices, caching based on the value of
    CUDA_VISIBLE_DEVICES at the time of call.

    This should be used instead of torch.cuda.device_count()
    unless CUDA_VISIBLE_DEVICES has already been set to the desired
    value.)r   envsrA   r(   r(   r(   r)   cuda_device_count_statelesso  s   

r  c                 C   s*   t | tjr|  dkrtjj| S | S )z
    Create a weak reference to a tensor.
    The new tensor will share the same data as the original tensor,
    but will not keep the original tensor alive.
    This ignores 0-size tensors as those don't allocate any memory.
    r   )ry   r,   Tensornumelrs   r   weak_ref_tensor)rN   r(   r(   r)   r  |  s   r  tensorsc                 C   s   t | tjr
t| S t | trdd | D S t | tr$tdd | D S ddlm} t | |r=|dd | j	 D }|S t
d	)
z
    Convenience function to create weak references to tensors,
    for single tensor, list of tensors or tuple of tensors.
    c                 S   s   g | ]}t |qS r(   r  rd   r   r(   r(   r)   
<listcomp>  s    z$weak_ref_tensors.<locals>.<listcomp>c                 s   s    | ]}t |V  qd S rc   r  r  r(   r(   r)   rf     r   z#weak_ref_tensors.<locals>.<genexpr>r   r   c                 S   s   i | ]	\}}|t |qS r(   r  )rd   rj   valr(   r(   r)   
<dictcomp>  s    z$weak_ref_tensors.<locals>.<dictcomp>zInvalid type for tensors)ry   r,   r  r  r   r   vllm.sequencer   r  itemsr9   )r  r   retr(   r(   r)   weak_ref_tensors  s   



r  
cpu_tensorc                 C   s@   |   sJ dddlm} | rtjj| S tjj| S )zY
    Get an accelerator view of a CPU tensor using Unified Virtual Addressing (UVA).
    zCPU tensor must be pinnedr   r?   )		is_pinnedrC   r@   is_xpur,   rs   r   get_xpu_view_from_cpu_tensorget_cuda_view_from_cpu_tensor)r  r@   r(   r(   r)   $get_accelerator_view_from_cpu_tensor  s
   r  torch_versiontargetc                 C   s   t | t |kS rc   )r   parse)r  r  r(   r(   r)   _is_torch_equal_or_newer  s   r  c                 C   s>   z	t ttj| W S  ty   ttjdt| k Y S w )zCheck if the installed torch version is >= the target version.

    Args:
        target: a version string, like "2.6.0".

    Returns:
        Whether the condition meets.
    r,   )	r  rG   r,   __version__rF   r	   	importlibmetadatar   r  r(   r(   r)   is_torch_equal_or_newer  s
   	r  c                 C   sF   |  ddks	J ttj}t|}|t| ko"t| d |kS )N.rQ   z.1)countrG   r,   r  r   r  )r  r  r(   r(   r)   _is_torch_equal  s   

r!  c                 C   s6   zt | W S  ty   ttjdt| k Y S w )zCheck if the installed torch version is == the target version.

    Args:
        target: a version string, like "2.6.0".

    Returns:
        Whether the condition meets.
    r,   )r!  rF   r	   r  r  r   r  r(   r(   r)   is_torch_equal  s
   	
r"  c                   C   s
   t j S rc   )r,   distributedis_xccl_availabler(   r(   r(   r)   supports_xccl  s   
r%  ro   FRAGMENTr(   op_nameop_funcmutates_args	fake_impl
target_libdispatch_keytags.c           
      C   sx   |du rg }|du rddl m} |j}t||d}|pt}	|	j| | |d |	j| ||d |dur:|	| | dS dS )a  
    `torch.library.custom_op` can have significant overhead because it
    needs to consider complicated dispatching logic. This function
    directly registers a custom op and dispatches it to the CUDA backend.
    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
    for more details.

    By default, the custom op is registered to the vLLM library. If you
    want to register it to a different library, you can pass the library
    object to the `target_lib` argument.

    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
    library object. If you want to bind the operator to a different library,
    make sure the library object is alive when the operator is used.
    Nr   r?   )r)  )r-  )r,  )rC   r@   r,  r   vllm_libdefineimpl_register_fake)
r'  r(  r)  r*  r+  r,  r-  r@   
schema_strmy_libr(   r(   r)   direct_register_custom_op  s   r4  rc   )NNr   r   )NNr   )NNNNr(   )n
contextlibimportlib.metadatar  r4   r   	threadingcollections.abcr   r   	functoolsr   typingr   r   r   numpyr   numpy.typingnptr,   	packagingr   packaging.versionr	   torch.libraryr
   r   	vllm.envsr  vllm.loggerr   vllm.configr   r  r   objectr   r7   r   r   r   r   r   uint8r   float8_e4m3fnrz   float64int32int64r   r   r   r  rR   r*   contextmanagerr+   r0   r3   r>   rL   rP   rU   ra   rk   ru   rG   r|   r   r   r   r   r   r   r   r   r   r   r   r   	DTypeLikeNDArrayr   r   r   r   r   localr   r   r   r   r   r   __annotations__r   r   r  r  r  r  r  r  r!  r"  r%  r.  Tagr4  r(   r(   r(   r)   <module>   s  
 
 

 5	

		

/	

0




.


