o
    .ihd                     @   s  U d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dl	m
Z
 d dlmZmZmZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ erdd dlmZ d d	lm Z  ne!Ze!Z d dl"Z"e"#e$Z%ej&ej'ej(ej)ej*ej+ej+ej+ej,ej-ej+d
Z.ej(ej(ej&ej&ej/ej/ej+ej+ej0ej0ej1ej1iZ2ddiZ3edZ4dej5de6fddZ7e j8dej9fddZ:e j8de;fddZ<e j8dd Z=dej9de;fddZ>dej9de;fddZ?dej9dej9fd d!Z@d"eej9 fd#d$ZAd%ej5d&e*d'e*ddfd(d)ZB	dd*eCej9B dB d+eCej9B dB dej9fd,d-ZDd.eEeCef deCdB fd/d0ZFd.eEeCef dej9dB fd1d2ZGd3eCd4edeCfd5d6ZHd3eCd4edej9fd7d8ZId9e;dB ddfd:d;ZJ			<	=dd>e;d?e;d@e;dAe;dBe;d*eCej9B dB d+eCej9B dB d9e;dB dCeCdB dDeCdB deKeLej5 eLej5 f fdEdFZM			<dd>e;d?e;d@e;dAe;dBe;d*eCej9B dB d+eCej9B dB d9e;dB dCeCdB deKeLej5 eLej5 f fdGdHZNdIeLdej9dJeCejOB dKe6dej5f
dLdMZPddNdOeLeLe4  dPe4dejQdQe;dB dejRf
dRdSZSdddTdUdOeLeLe4  dPe4dej9dQe;dB dCeCejOB dB dKe6dej5fdVdWZTejUjVZWeX ZYdXejUjZddfdYdZZ[e[ejU_VG d[d\ d\Z\dejUjZfd]d^Z]da^ejUjZdB e_d_< dejUjZdB fd`daZ`e
dbdcdddeCdB de;fdedfZade;fdgdhZbd%edefdidjZcdkej5eLej5 B eKej5 B e B dej5eLe B eKe B eB fdldmZddnej5dej5fdodpZedqeCdreCde6fdsdtZfdreCde6fdudvZgdreCde6fdwdxZhdreCde6fdydzZide6fd{d|Zjed}d~Zk					ddeCdedeLeC dB dedB dedB deCdB deKejldf fddZmdS )    N)Callable
Collection)	lru_cache)TYPE_CHECKINGAnyTypeVar)version)Version)Libraryinfer_schema)ModelConfigIntermediateTensors)float32halffloat16bfloat16floatfp8fp8_e4m3fp8_e5m2int8fp8_inc
fp8_ds_mlar   r   Ttreturnc                 C   sZ   |   sdS | j}|  }d}tt|d ddD ]}|| |kr$ dS ||| 9 }qdS )a  
    Check if tensor is contiguous AND has no degenerate strides.

    A degenerate stride occurs when a dimension has size 1 but the stride
    doesn't match the canonical contiguous layout. This can cause issues
    in some CUDA kernels that rely on stride values for memory access.

    For a C-contiguous tensor of shape (d0, d1, ..., dn), the expected
    strides are: stride[i] = product(shape[i+1:]) for all i, with stride[-1]=1.

    Example with torch.Size([16, 1, 8, 32]):
        - Canonical strides: (256, 256, 32, 1)
        - Degenerate strides: (256, 1, 32, 1)  # dim=1 has size=1, allowing
                                                  # non-canonical stride in dim=0
    F   T)is_contiguousshapestriderangelen)r   r    stridesexpected_stridei r'   S/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/utils/torch_utils.pyis_strictly_contiguousC   s   r)   dtypec                 c   (    t  }t |  dV  t | dS )z0Sets the default torch dtype to the given dtype.N)torchget_default_dtypeset_default_dtype)r*   	old_dtyper'   r'   r(   set_default_torch_dtypea   
   
r0   num_threadsc                 c   r+   )zBSets the default number of threads for PyTorch to the given value.N)r,   get_num_threadsset_num_threads)r2   old_num_threadsr'   r'   r(   set_default_torch_num_threadsj   r1   r6   c               
   c   s    ddl m}  |  sdV  dS tjd}dtjd< z6zdV  W n ty? } zdt|v r2d}nt|}t||d}~ww W |du rKtjd= dS |tjd< dS |du r[tjd= w |tjd< w )z%Avoid unexpected CUDA initialization.r   current_platformNCUDA_VISIBLE_DEVICES zNo CUDA GPUs are availablezCUDA initialization is blocked.)	vllm.platformsr8   is_cudaosenvironget	ExceptionstrRuntimeError)r8   	old_valueeerr_msgr'   r'   r(   guard_cuda_initializations   s.   


rF   c                 C   s   t jg | d S )z'Get the size of the data type in bytes.r*   )r,   tensorelement_sizerG   r'   r'   r(   get_dtype_size   s   rJ   c                 C   s   | t jk| j | jd  S )N   )r,   boolis_floating_point
is_complexrG   r'   r'   r(   _get_precision_level   s   rO   	src_dtype	tgt_dtypec                 C   s   | |krdS t | }t |}||k rdS ||krdS | js6| js6t| }t|}|j|jko5|j|jkS t| }t|}|j|jkoQ|j|jkoQ|j|jkS )z[
    Test whether it is lossless to cast a tensor from
    `src_dtype` to `tgt_dtype`.
    TF)	rO   rM   rN   r,   iinfominmaxfinfo
resolution)rP   rQ   	src_level	tgt_levelsrc_infotgt_infor'   r'   r(   is_lossless_cast   s&   





r[   dtypesc                    s   t   fdddS )zv
    Get the common `dtype` where all of the other `dtypes` can be
    cast to it without losing any information.
    c                    s   t  fddD S )Nc                 3   s    | ]}t | V  qd S N)r[   ).0dtrG   r'   r(   	<genexpr>   s    z?common_broadcastable_dtype.<locals>.<lambda>.<locals>.<genexpr>)sumrG   r\   rG   r(   <lambda>   s    z,common_broadcastable_dtype.<locals>.<lambda>)key)rT   rb   r'   rb   r(   common_broadcastable_dtype   s   
re   rH   lowhighc                 C   s:   ddl m} tj| tjd}||| || | ~d S )Nr   )_custom_opsrG   )vllmrh   r,   
empty_liker   uniform_convert_fp8)rH   rf   rg   ops
tensor_tmpr'   r'   r(   _generate_random_fp8   s
   ro   cache_dtypemodel_dtypec                 C   s   t | tr:| dkr)t |tr|tv rt| }|S t |tjr"|}|S td| | tv r3t|  }|S td|  t | tjrD| }|S td|  )NautozInvalid model dtype: zInvalid kv cache dtype: )
isinstancerA   STR_DTYPE_TO_TORCH_DTYPEr,   r*   
ValueError)rp   rq   torch_dtyper'   r'   r(   get_kv_cache_torch_dtype   s"   
rw   	quant_cfgc                 C   s   |  dd}|drn|  d| }| dp$|  dp$| dp$|  d}t|trP| ddu rB| d	d
krB| ddkrBd}ntd| tt  dS t|t	rn|
 }|tv rat| S td|tt  dS dS )a  Get the KV cache quantization algorithm string from the quantization config.

    Maps various FP8 format names to vLLM's standard cache dtype strings.
    Returns None if no kv_cache_quant_algo is specified.
    Returns "auto" if the value is not recognized/supported.
    quant_methodr:   modeloptquantizationkv_cache_schemekv_cache_quant_algodynamicFnum_bits   typer   r   zhWARNING: Unknown kv_cache_quant_algo '%s' in model config. Supported values: %s. Falling back to 'auto'.rr   N)r?   
startswithrs   dictloggerwarninglist#MODELOPT_TO_VLLM_KV_CACHE_DTYPE_MAPkeysrA   lower)rx   ry   quantization_innerkv_algokv_algo_lowerr'   r'   r(   get_kv_cache_quant_algo_string   s@   	





r   c                 C   s$   t | }|dur|dkrt| S dS )zKGet the KV cache quantization algorithm dtype from the quantization config.Nrr   )r   rt   )rx   kv_algo_strr'   r'   r(   get_kv_cache_quant_algo_dtype$  s   r   kv_cache_dtypemodel_configc                 C   sL   | dkr| S t |dd}|dur$t |dd}|dur$t|}|dur$|S dS )z}Resolve 'auto' kv_cache_dtype to the actual string value from model config.
    Returns the resolved cache_dtype string.
    rr   	hf_configNquantization_config)getattrr   )r   r   hf_cfgrx   r   r'   r'   r(   resolve_kv_cache_dtype_string-  s   r   c                 C   s    | dkr|r	|j S tjS t|  S )Nrr   )r*   r,   r   rt   )r   r   r'   r'   r(   kv_cache_dtype_str_to_dtypeB  s   r   seedc                 C   sJ   | d ur!t |  tj |  t|  tj r#tj|  d S d S d S r]   )randomr   npr,   manual_seedcudais_availablemanual_seed_all)r   r'   r'   r(   set_random_seedK  s   


r   r   NHD
num_blocks
block_size
num_layers	num_heads	head_sizedevicecache_layoutc
                    s   t | t||}
| d|||f |	dv sJ |	dkrdnd}t fdd|D }|d }g }g }t|D ]B}tj||
|d	j| }|d
v rN|| | n|dkrZt|| | nt	d| |
|d d df  |
|d d df  q5||fS )NrK   )r   HNDr   )r   r   rK         )r   r   r   rK   r   c                 3   s    | ]} | V  qd S r]   r'   )r^   r&   generic_kv_cache_shaper'   r(   r`   g      z5create_kv_caches_with_random_flash.<locals>.<genexpr>      ࿩sizer*   r   rr   r   r   r   r   #Does not support key cache of type r   r   )r   rw   tupler"   r,   emptypermuterk   ro   ru   append)r   r   r   r   r   rp   rq   r   r   r   r*   stride_orderkv_cache_allocation_shapescale
key_cachesvalue_caches_key_value_cacher'   r   r(   "create_kv_caches_with_random_flashT  s.   
r   c	                 C   sH  |dkr|d rt d| t| t||}	|d }
dtjg |	d  }| ||| ||f}g }t|D ].}tj||	|d}|dv rL||
 |
 n|dkrXt	||
 |
 nt d| |
| q6| |||f}g }t|D ].}tj||	|d}|dv r||
 |
 n|dkrt	||
 |
 nt d	| |
| qq||fS )
Nr      z6Does not support key cache of type fp8 with head_size r   rG   r   r   r   z%Does not support value cache of type )ru   r   rw   r,   rH   rI   r"   r   rk   ro   r   )r   r   r   r   r   rp   rq   r   r   r*   r   xkey_cache_shaper   r   	key_cachevalue_cache_shaper   value_cacher'   r'   r(   create_kv_caches_with_random|  s:   
r   datatarget_device
pin_memoryc                 C   s    t j| ||dd}|j|ddS )z?Asynchronously create a tensor and copy it from host to device.cpu)r*   r   r   T)r   non_blocking)r,   rH   to)r   r*   r   r   r   r'   r'   r(   async_tensor_h2d  s   r   max_lenr   padr   c                C   sl   |du rt tt| dd}tjt| |f||d}t| D ]\}}t||ks)J |||dt|f< q|S )z
    Make a padded array from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    Nr   )defaultrG   )rT   mapr#   r   full	enumerate)r   r   r*   r   padded_xindblocktbr'   r'   r(   make_ndarray_with_pad  s   r   F)r   r   r   c          	      C   s8   t | }t| |||d}t||}|r| }|S )z
    Make a padded tensor from 2D inputs.

    The padding is applied to the end of each inner list until it reaches
    `max_len`.
    r   )TORCH_DTYPE_TO_NUMPY_DTYPEr   r,   
from_numpyr   r   )	r   r   r*   r   r   r   np_dtyper   rH   r'   r'   r(   make_tensor_with_pad  s   r   streamc                 C   s   | t _t|  d S r]   )_current_stream_tlsvalueprev_set_stream)r   r'   r'   r(   _patched_set_stream  s   r   c                   @   s   e Zd Zdd ZdS )_StreamPlaceholderc                 C   s   dd | _ d S )Nc                   S   s   d S r]   r'   r'   r'   r'   r(   rc     s    z-_StreamPlaceholder.__init__.<locals>.<lambda>)synchronize)selfr'   r'   r(   __init__  s   z_StreamPlaceholder.__init__N)__name__
__module____qualname__r   r'   r'   r'   r(   r     s    r   c                  C   s   ddl m}  ttdrtjdu rA|  s|  r$tj	tj
  tjS |  r/t t_tjS | j}|dur=| t_tjS tdtjS )a  
    replace `torch.cuda.current_stream()` with `vllm.utils.current_stream()`.
    it turns out that `torch.cuda.current_stream()` is quite expensive,
    as it will construct a new stream object at each call.
    here we patch `torch.cuda.set_stream` to keep track of the current stream
    directly, so that we can avoid calling `torch.cuda.current_stream()`.

    the underlying hypothesis is that we do not call `torch._C._cuda_setStream`
    from C/C++ code.
    r   r7   r   NzZFail to set current stream, current platform may not support current_stream with torch API)r;   r8   hasattrr   r   is_rocmr<   r,   r   
set_streamStreamis_cpur   current_streamru   )r8   r   r'   r'   r(   r     s    

r   _aux_streamc                  C   s*   ddl m}  tdu r|  rtj atS )z5
    Ensures aux_stream is initialized only once
    r   r7   N)r;   r8   r   is_cuda_aliker,   r   r   r7   r'   r'   r(   
aux_stream,  s   
r   r   )maxsizecuda_visible_devicesc                 C   sv   dd l }dd l}ddlm} |j sdS | r't|jdr$|j nd}n|j	 }|dk r7|j
 }|S |}|S )Nr   r7   _device_count_amdsmir   )
torch.cudatorch.versionr;   r8   r   _is_compiledr   r   r   _device_count_nvml_C_cuda_getDeviceCount)r   r,   r8   	raw_countrr'   r'   r(   _cuda_device_count_stateless:  s   	


r   c                   C   s
   t tjS )zGet number of CUDA devices, caching based on the value of
    CUDA_VISIBLE_DEVICES at the time of call.

    This should be used instead of torch.cuda.device_count()
    unless CUDA_VISIBLE_DEVICES has already been set to the desired
    value.)r   envsr9   r'   r'   r'   r(   cuda_device_count_statelessX  s   

r   c                 C   s*   t | tjr|  dkrtjj| S | S )z
    Create a weak reference to a tensor.
    The new tensor will share the same data as the original tensor,
    but will not keep the original tensor alive.
    This ignores 0-size tensors as those don't allocate any memory.
    r   )rs   r,   Tensornumelrm   r   weak_ref_tensor)rH   r'   r'   r(   r  e  s   r  tensorsc                 C   s   t | tjr
t| S t | trdd | D S t | tr$tdd | D S ddlm} t | |r=|dd | j	 D }|S t
d	)
z
    Convenience function to create weak references to tensors,
    for single tensor, list of tensors or tuple of tensors.
    c                 S   s   g | ]}t |qS r'   r  r^   r   r'   r'   r(   
<listcomp>  s    z$weak_ref_tensors.<locals>.<listcomp>c                 s   s    | ]}t |V  qd S r]   r  r  r'   r'   r(   r`     r   z#weak_ref_tensors.<locals>.<genexpr>r   r   c                 S   s   i | ]	\}}|t |qS r'   r  )r^   rd   valr'   r'   r(   
<dictcomp>  s    z$weak_ref_tensors.<locals>.<dictcomp>zInvalid type for tensors)rs   r,   r   r  r   r   vllm.sequencer   r  itemsru   )r  r   retr'   r'   r(   weak_ref_tensorsr  s   



r  
cpu_tensorc                 C   s   |   sJ dtjj| S )zQ
    Get a CUDA view of a CPU tensor using Unified Virtual Addressing (UVA).
    zCPU tensor must be pinned)	is_pinnedr,   rm   r   get_cuda_view_from_cpu_tensor)r  r'   r'   r(   r    s   r  torch_versiontargetc                 C   s   t | t |kS r]   )r   parse)r  r  r'   r'   r(   _is_torch_equal_or_newer  s   r  c                 C   s>   z	t ttj| W S  ty   ttjdt| k Y S w )zCheck if the installed torch version is >= the target version.

    Args:
        target: a version string, like "2.6.0".

    Returns:
        Whether the condition meets.
    r,   )	r  rA   r,   __version__r@   r	   	importlibmetadatar   r  r'   r'   r(   is_torch_equal_or_newer  s
   	r  c                 C   sF   |  ddks	J ttj}t|}|t| ko"t| d |kS )N.rK   z.1)countrA   r,   r  r   r  )r  r  r'   r'   r(   _is_torch_equal  s   

r  c                 C   s6   zt | W S  ty   ttjdt| k Y S w )zCheck if the installed torch version is == the target version.

    Args:
        target: a version string, like "2.6.0".

    Returns:
        Whether the condition meets.
    r,   )r  r@   r	   r  r  r   r  r'   r'   r(   is_torch_equal  s
   	
r  c                   C   s   t dotj S )Nz	2.8.0.dev)r  r,   distributedis_xccl_availabler'   r'   r'   r(   supports_xccl  s   r  ri   FRAGMENTr'   op_nameop_funcmutates_args	fake_impl
target_libdispatch_keytags.c           
      C   sx   |du rg }|du rddl m} |j}t||d}|pt}	|	j| | |d |	j| ||d |dur:|	| | dS dS )a  
    `torch.library.custom_op` can have significant overhead because it
    needs to consider complicated dispatching logic. This function
    directly registers a custom op and dispatches it to the CUDA backend.
    See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
    for more details.

    By default, the custom op is registered to the vLLM library. If you
    want to register it to a different library, you can pass the library
    object to the `target_lib` argument.

    IMPORTANT: the lifetime of the operator is tied to the lifetime of the
    library object. If you want to bind the operator to a different library,
    make sure the library object is alive when the operator is used.
    Nr   r7   )r"  )r&  )r%  )r;   r8   r%  r   vllm_libdefineimpl_register_fake)
r   r!  r"  r#  r$  r%  r&  r8   
schema_strmy_libr'   r'   r(   direct_register_custom_op  s   r-  r]   )NNr   r   )NNr   )NNNNr'   )n
contextlibimportlib.metadatar  r=   r   	threadingcollections.abcr   r   	functoolsr   typingr   r   r   numpyr   numpy.typingnptr,   	packagingr   packaging.versionr	   torch.libraryr
   r   	vllm.envsr   vllm.configr   r  r   objectlogging	getLoggerr   r   r   r   r   r   r   uint8r   float8_e4m3fnrt   float64int32int64r   r   r   r   rL   r)   contextmanagerr*   r0   intr6   rF   rJ   rO   r[   re   ro   rA   rw   r   r   r   r   r   r   r   r   r   r   r   r   	DTypeLikeNDArrayr   r   r   r   r   localr   r   r   r   r   r   __annotations__r   r   r   r  r  r  r  r  r  r  r  r'  Tagr-  r'   r'   r'   r(   <module>   s  


 

 5	

		

/	

0




.
	

