o
    
۾i%                     @   s  d dl mZmZ d dlmZ d dlZd dlZd dlm	Z	 d dl
m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlm Z  G dd dZ!de"e# dB de#de#dede$e#e#f f
ddZ%de#dee# de$e#e#f dede#dB f
ddZ&de$e#e#f dej'deddfdd Z(d!e#d"e#d#e d$ed%e"e d&e#d'ede)e$e*ef e$e*ej+f f fd(d)Z,dS )*    )CallableIterable)AnyN)tqdm)
VllmConfig)CUDAGraphMode)graph_captureis_global_first_rank)set_forward_context)AttentionMetadataBuilder)KVCacheConfig)build_attn_metadatabuild_slot_mappings_by_layer)BlockTables)make_num_tokens_across_dp)InputBuffersc                   @   s   e Zd ZdededejfddZdefddZd	e	d
e
e	 de	dB fddZde	dejdedejdB dejdB dedee deddfddZe dejdedejdB dejdB dedee deddfddZde	dejfddZdS )CudaGraphManagervllm_config
uses_mropedevicec                 C   s   || _ |j| _|| _|| _|jj| _| jj| _| jj| _	|j
j| _|j| _| jd us,J | jj| _t| jj| j| j	| j| _i | _tj | _d | _d S N)r   scheduler_configr   r   model_configmax_model_lenmax_num_seqsmax_num_reqsmax_num_batched_tokensmax_num_tokensparallel_configdata_parallel_sizedp_sizecompilation_configcudagraph_modeget_cudagraph_sizescudagraph_capture_sizescudagraph_sizesgraphstorchcudagraph_pool_handlepoolhidden_states)selfr   r   r    r-   V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/gpu/cudagraph_utils.py__init__   s(   





zCudaGraphManager.__init__returnc                 C   s   t | jdkS )Nr   )lenr%   )r,   r-   r-   r.   needs_capture3   s   zCudaGraphManager.needs_capturenum_tokens_after_paddingnum_tokens_per_requestNc                 C   s   t ||| j| jS r   )get_cudagraph_sizer%   r"   )r,   r3   r4   r-   r-   r.   r5   6   s   z#CudaGraphManager.get_cudagraph_size
num_tokensmodelinput_buffersmrope_positionsinputs_embedsblock_tablesattn_metadata_builderskv_cache_configc	              	   C   s  t || j}	|jd | }
|jd | }| jr'|d usJ |d d d |f }|d ur1|d | }t|	||||| j|\}}t| j|}t	|| j
|tj||d ||
||d}| jd u rct|| _W d    n1 smw   Y  || jvsyJ tj }t	|| j
|tj||d. tj|| j ||
||d}|| jd |< W d    n1 sw   Y  W d    n1 sw   Y  || j|< d S )N)r6   cudagraph_runtime_modenum_tokens_across_dpslot_mapping)	input_ids	positionsr:   )minr   rA   rB   r   prepare_inputs_to_capturer   r   r    r
   r   r   NONEr+   r'   
empty_liker&   r(   	CUDAGraphgraphr*   )r,   r6   r7   r8   r9   r:   r;   r<   r=   num_reqsrA   rB   attn_metadataslot_mappingsr?   r+   rH   r-   r-   r.   capture_graphB   st   	

	 zCudaGraphManager.capture_graphc                 C   s&   t | j| j| j|||||||d
 d S )N)r7   r8   r9   r:   r;   r<   r=   )capture_graphsr%   r   rL   )r,   r7   r8   r9   r:   r;   r<   r=   r-   r-   r.   capture   s   
zCudaGraphManager.capturec                 C   s8   || j v sJ | j |   | jd usJ | jd | S r   )r&   replayr+   )r,   r6   r-   r-   r.   run   s   zCudaGraphManager.run)__name__
__module____qualname__r   boolr'   r   r/   r2   intr   r5   nnModuler   Tensorr   listr   r   rL   inference_moderN   rP   r-   r-   r-   r.   r      s`    
	

E	r   capture_sizesr   r   r"   r0   c                    s   |  si S | s
i S t| } |tjkr|n|  fdd| D } | s$i S i }td| d d D ]}| D ]}||kr?|||<  nq3q/|S )Nc                    s   g | ]}| kr|qS r-   r-   .0xupper_boundr-   r.   
<listcomp>       z'get_cudagraph_sizes.<locals>.<listcomp>   )has_full_cudagraphssortedr   FULL_DECODE_ONLYrange)r[   r   r   r"   r%   ir^   r-   r_   r.   r#      s*   
r#   num_tokens_after_dp_paddingr4   r%   c                 C   sN   |  sd S || }|d u rd S tdd |D }|r%| tjkr%d S |S )Nc                 s   s    | ]}|d kV  qdS )rc   Nr-   r\   r-   r-   r.   	<genexpr>   s    z%get_cudagraph_size.<locals>.<genexpr>)re   getany
mixed_moder   FULL)rj   r4   r%   r"   sizeis_mixedr-   r-   r.   r5      s   
r5   r   
capture_fnc                 K   sp   t t|  dd}t rt|dd}t|d |D ]
}||fi | qW d    d S 1 s1w   Y  d S )NT)reversezCapturing CUDA graphs)desc)r   )rf   setvaluesr	   r   r   )r%   r   rr   capture_kwargssizes_to_capturerp   r-   r-   r.   rM      s   "rM   rI   r6   r8   r;   r<   r   r=   c                    s   |  }t j d t jd| }||d< t|}	|	|jd  d < ||j d d < |jd  d  }
||jd  < d|j d <  fdd|jD }|jd d d |f }t	||}t
| ||
|	|j||||d
}||fS )Nrc   )dtyperd   r   c                    s   g | ]}|d   qS r   r-   r\   rI   r-   r.   ra     rb   z-prepare_inputs_to_capture.<locals>.<listcomp>)
r<   rI   r6   query_start_loc_gpuquery_start_loc_cpuseq_lensmax_seq_lenr;   rK   r=   )nparangeint32r'   
from_numpyquery_start_locr}   input_block_tablesrK   r   r   )rI   r6   r8   r;   r<   r   r=   num_tokens_per_reqquery_start_loc_npr|   r   r   rK   slot_mappings_by_layerrJ   r-   rz   r.   rD      s6   	
rD   )-collections.abcr   r   typingr   numpyr   r'   torch.nnrV   r   vllm.configr   vllm.config.compilationr   vllm.distributed.parallel_stater   r	   vllm.forward_contextr
   vllm.v1.attention.backendr   vllm.v1.kv_cache_interfacer   vllm.v1.worker.gpu.attn_utilsr   r   vllm.v1.worker.gpu.block_tabler   vllm.v1.worker.gpu.dp_utilsr   vllm.v1.worker.gpu.input_batchr   r   rY   rU   dictr#   r5   r   rM   tuplestrrX   rD   r-   r-   r-   r.   <module>   s    






