o
    
۾if!                     @   sL  U d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
 eG dd dZee Zeed< d	ed
ededefddZdedededefddZdedededefddZ	d*dedejdedededee eB dB deedB edB f fddZdejd edejfd!d"Zd#ed$e
de
fd%d&Zdee d'e
dee
 fd(d)ZdS )+    )	dataclass)	TypeAliasN)ParallelConfig)CommonAttentionMetadatac                   @   s>   e Zd ZU eed< eed< defddZedefddZ	dS )	UBatchSlicerequest_slicetoken_slicereturnc                 C   s    | j j| j jkp| jj| jjkS N)r   startstopr   self r   O/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/ubatch_utils.pyis_empty   s   zUBatchSlice.is_emptyc                 C   s   | j j| j j S r
   )r   r   r   r   r   r   r   
num_tokens   s   zUBatchSlice.num_tokensN)
__name__
__module____qualname__slice__annotations__boolr   propertyintr   r   r   r   r   r      s   
 r   UBatchSlicesorig_num_tokenspadded_num_tokensnum_ubatchesr	   c                 C   s   || |d  | kS )N   r   )r   r   r   r   r   r   is_last_ubatch_empty    s   r    configr   uniform_decodec                 C   s"   | j sdS |r|| jkS || jkS )NF)use_ubatchingdbo_decode_token_thresholddbo_prefill_token_threshold)r!   r   r"   r   r   r   check_ubatch_thresholds&   s
   

r&   ubatch_slicesnum_total_tokensnum_reqs_paddedc                 C   s<   | d }t |jj|}t |jj|}| d d t||g S )N)r   r   r   r   r   )r'   r(   r)   
last_slicepadded_last_request_slicepadded_last_token_slicer   r   r   _pad_out_ubatch_slices3   s   
r.   should_ubatchnum_scheduled_tokensnum_tokens_paddedsplit_pointc                    s  | sdS  d u rt ||   fddtd|D }tjt|d tjd}tj|tj|dd  d g }d}	||d g }
|
D ],}t|	|}t tj||	d	d
d }t tj||dd
}t||}|	t
|| |}	q@t|||}tdd |D |ksJ ||fS )N)NNc                    s   g | ]} | qS r   r   ).0ir2   r   r   
<listcomp>M   s    z.maybe_create_ubatch_slices.<locals>.<listcomp>r   )dtype)r7   outr   r*   right)sideleftc                 s   s    | ]}|j V  qd S r
   )r   )r3   sr   r   r   	<genexpr>p   s    z-maybe_create_ubatch_slices.<locals>.<genexpr>)r   rangenpzeroslenint32cumsumr   searchsortedappendr   r.   sum)r/   r0   r1   r)   r   r2   token_split_pointscu_num_tokensr'   start_token
all_points	end_tokenr   	req_startreq_stop	req_sliceubatch_slices_paddedr   r5   r   maybe_create_ubatch_slices?   s,   

rP   query_start_locr   c                 C   s   | |j |jd  | |j   S )z
    Creates a new query_start_loc that corresponds to the requests in
    request_slice.

    Note: This function creates a new tensor to hold the new query_start_locs.
    This will break cudagraph compatibility.
    r   )r   r   )rQ   r   r   r   r   slice_query_start_locsu   s   rR   ubatch_sliceattn_metadatac                 C   s  |   rJ d|  d| j}| j}|j}|j}|j}|jd }|jd }|| |  kr8||d  k s=J d J d||| k}	|||d  d k }
t||}t|j|}t|dksgJ dt| |	r|||  }|dd  |8  < |dd  |8  < |j	| }|j
| }|
r||d  |j }|d  |8  < |d  |8  < | }| }|d  |8  < |d  |8  < t| }|j| }|j|j }|j|j }ttt|dd |dd   }|d	kr|j}|j| }|j| }t|||||||||||d
S )z{
    This function creates a new CommonAttentionMetadata that corresponds to
    the requests included in ubatch_slice
    zUbatch slice z	 is emptyr   z*Token slice start outside of first request   z3query_start_loc must have at least 2 elements, got Nr*   r   )rQ   query_start_loc_cpuseq_lensnum_reqsnum_actual_tokensmax_query_lenmax_seq_lenblock_table_tensorslot_mapping_seq_lens_cpu_num_computed_tokens_cpu)r   r   r   rV   r   r   rR   rQ   rA   rW   seq_lens_cpucloner   maxnum_computed_tokens_cputorchabsitemrZ   r\   r]   r   )rS   rT   r   r   
start_locs	first_req	first_toklast_reqlast_toksplits_first_requestsplits_last_requestrV   rQ   tokens_skippedrW   r`   r[   rc   num_requestsrY   rZ   r\   r]   r   r   r   _make_metadata_with_slice   sx   

 




&

rp   common_attn_metadatac                 C   s"   g }| D ]
}| t|| q|S )z
    Creates a new CommonAttentionMetadata instance that corresponds to the
    requests for each UBatchSlice in ubatch_slices.

    Note: This function does not modify common_attn_metadata
    )rE   rp   )r'   rq   resultsrS   r   r   r   split_attn_metadata   s   
rs   r
   )dataclassesr   typingr   numpyr?   rd   vllm.configr   vllm.v1.attention.backendr   r   listr   r   r   r   r    r&   r.   ndarraytuplerP   Tensorr   rR   rp   rs   r   r   r   r   <module>   s   



6

_