o
    
۾iA                     @   s(  d dl mZ d dlmZ d dlZd dlZd dlmZm	Z	 d dl
mZ G dd dZeG dd	 d	Ze	jd
ejfddZdejdejdejdejdejdejdejddfddZe	jd
ejfddZdejdejdejdejdejddfddZe	jd
ejfddZdejdejdejdejdejdejd ejd!ejd"edejfd#d$Ze	jd%d& Zd'ejdejd!ejdejdejdeejejf fd(d)Ze	jd*d+ Zdejdejdejd,ejd-ejd'ejd.ejdejddfd/d0Ze	jd
ejfd1d2Zdejd3ed!ejd4edeejejf f
d5d6ZdS )7    )	dataclass)AnyN)tltritonrandom_uuidc                   @   s$   e Zd ZdededejfddZdS )InputBuffersmax_num_reqsmax_num_tokensdevicec                 C   sj   || _ || _|| _tj|tj|d| _tj|tj|d| _tj|d tj|d| _	tj|tj|d| _
d S )Ndtyper      )r	   r
   r   torchzerosint32	input_idsint64	positionsquery_start_locseq_lens)selfr	   r
   r    r   R/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/gpu/input_batch.py__init__   s   zInputBuffers.__init__N)__name__
__module____qualname__intr   r   r   r   r   r   r   r      s    r   c                   @   s&  e Zd ZU ee ed< eed< ejed< e	j
ed< ejed< ejed< e	j
ed< eed< eed	< eed
< ejed< e	j
ed< ejed< ejed< ejed< ejdB ed< ejdB ed< eeef ed< eeejf ed< ejed< ejed< e	j
ed< eed< ededededejdd f
ddZdS )
InputBatchreq_idsnum_reqsidx_mappingidx_mapping_npexpanded_idx_mappingexpanded_local_posnum_scheduled_tokens
num_tokensnum_tokens_after_paddingnum_draft_tokensr   query_start_loc_npr   r   r   Nmrope_positionsinputs_embedsattn_metadataslot_mappingslogits_indicescu_num_logitscu_num_logits_nphas_structured_output_reqsinput_buffersr   returnc                 C   s\  d|  k r|ksJ  J dd t |D }tj|tjd}tj|tj|d}|}tj|tj|d}	tj||| tjd}
|
d  || 7  < t|
 |ksRJ || |j	d |< |j	|d   || 7  < d|j	|d < |j	d | }tj
|d tjd}d|d< tj|
|dd  d d|jd< tj|d|jd|d  d	 ||j|d d < |jd |d  }|jd |  }|jd |  }|dd  d }tj|d |tjd
}tj|d tjd}| d#i d|d|d|d|d|d|	d|
d|d|ddd|d|d|d|d|dd dd dd dd d|d|d |d!d"S )$Nr   c                 S   s   g | ]}d | dt   qS )req__r   ).0ir   r   r   
<listcomp>\   s    z)InputBatch.make_dummy.<locals>.<listcomp>)r   r   r   )out)dimr;   )r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r   r*   r   r   r   r+   r,   r-   r.   r/   r0   r1   r2   Fr   )rangenparanger   r   r   fullr   sumr   emptycumsumr   r   zero_r   )clsr!   r'   r3   r   r    r#   r"   r$   r%   r&   r   r*   r   r   r   r/   r0   r1   r   r   r   
make_dummyS   s   
	
zInputBatch.make_dummy)r   r   r   liststr__annotations__r   r   Tensorr>   ndarraydictr   boolclassmethodr   r   rF   r   r   r   r   r       sH   
 












r   
BLOCK_SIZEc	                 C   s  t d}	t ||	 }
t ||
 }t ||
 }||kr d S t ||	 }t ||	 d }|| }||
|  }td||D ]%}|t d| }||k }t j|| | |d}t j| | | ||d q@|| }||k rt || }t ||
 | d S d S Nr   r   )mask)r   
program_idloadr=   r?   store)input_ids_ptrnext_prefill_tokens_ptridx_mapping_ptrquery_start_loc_ptrprefill_token_ids_ptrprefill_token_ids_strideprefill_lens_ptrnum_computed_tokens_ptrrO   	batch_idxreq_state_idxprefill_lennum_computedquery_start	query_end	query_lenprefill_ptrr8   blockrQ   tokensnext_pos
next_tokenr   r   r   _prepare_prefill_inputs_kernel   s(   
ri   r   next_prefill_tokensr"   r   prefill_token_idsr_   num_computed_tokensr4   c                 C   s4   |j d }t|f | |||||d||dd	 d S )Nr      rO   )shaperi   stride)r   rj   r"   r   rk   r_   rl   r!   r   r   r   prepare_prefill_inputs   s   
	
rq   c                 C   s  t d}t dd }||kr2t |||D ]}	|	t d| }
|
|k }t j||
 d|d qd S t || }t || }t || }t || d }|| }|| }t || | t d||D ]}	|	t d| }
|
|k }||
 }t j| | |
 ||d qgd S rP   )r   rR   num_programsr=   r?   rT   rS   )pos_ptrseq_lens_ptrrW   rX   r\   r	   rO   req_idr!   r8   re   rQ   r^   rl   startendrc   seq_lenposr   r   r   _prepare_pos_seq_lens_kernel   s*   

rz   ry   r   c              	   C   s4   | j d }t|d f ||| |||j d dd d S )Nr   r   rm   rn   )ro   rz   )r"   r   rl   ry   r   r!   r   r   r   prepare_pos_seq_lens   s   

r{   c                 C   s&  t d}t || }t || }t || d }|| }|d }t d|
}t || d }|| }t j|	| | || ||k d t || }t || }||kr[d S t || }t | | | | |dkr||k }t j|||  | |d}t j| | | | ||d d S d S rP   r   rR   rS   r?   rT   )rU   rW   last_sampled_tokens_ptrrX   rt   prefill_len_ptrdraft_tokens_ptrdraft_tokens_stridecu_num_logits_ptrlogits_indices_ptrrO   r]   r^   cu_num_logits_startcu_num_logits_end
num_logitsr)   re   rb   logits_startrx   r_   last_token_idrQ   draft_tokensr   r   r   (_combine_sampled_and_draft_tokens_kernel  s@   


r   last_sampled_tokensr   r0   r   c	                 C   s`   |j d }	|j d }
tj|tj| jd}t|	f | |||||||d||t|
d d |S )Nr   r:   r   r   rn   )	ro   r   rB   r   r   r   rp   r   next_power_of_2)r   r"   r   r   r   r_   r   r0   r   r!   num_speculative_stepsr/   r   r   r    combine_sampled_and_draft_tokensH  s*   

r   c                 C   s   t d}t || }t || }t || }	||	k }
t | | }t |
d|}t | | | t || }t || d }|| }|| }t |
d|}t || | d S Nr   r   )r   rR   rS   whererT   )num_sampled_ptrnum_rejected_ptrrt   r   rW   r~   r]   r^   rx   r_   is_chunked_prefillingnum_sampledr   
logits_endr   num_rejectedr   r   r   $_get_num_sampled_and_rejected_kernelm  s   
	r   r   c                 C   s4   |j d }t| }t|f | ||||| | |fS )Nr   )ro   r   
empty_liker   )r   r   r0   r"   r_   r!   r   r   r   r   get_num_sampled_and_rejected  s   

r   c
                 C   s  t d}
t | |
 }t ||
 }|dkr,t ||
|  | d }t || | t|D ]$}t ||
|  | }|||  | }t |}|d7 }t || q0t |	|
 }t |	|
 d }|| }t ||
 }t || }||| 7 }t || | d S r   )r   rR   rS   rT   r=   )rW   r\   r}   output_bin_counts_ptroutput_bin_counts_stridesampled_tokens_ptrsampled_tokens_strider   r   rX   ru   r^   r   token_idr8   	token_ptrcountra   rb   rc   r   r`   r   r   r   _post_update_kernel  s,   

r   output_bin_countssampled_tokensr   c           	      C   s>   | j d }t|f | ||||d||d|||dd d S )Nr   r   )	num_warps)ro   r   rp   )	r"   rl   r   r   r   r   r   r   r!   r   r   r   post_update  s   

r   c                 C   s   t d}t || }t || d }|| }t d|}	|	|k }
t | | }t j|| |	 ||
d t j|| |	 |	|
d d S rP   r|   )rW   expanded_idx_mapping_ptrexpanded_local_pos_ptrr   rO   req_idx	start_idxend_idxr'   re   rQ   r^   r   r   r   _expand_idx_mapping_kernel  s   
r   total_num_logitsmax_expand_lenc                 C   sN   | j d }| |}tj|tj| jd}t|f | |||t|d ||fS )Nr   r   rn   )	ro   	new_emptyr   rB   r   r   r   r   r   )r"   r   r0   r   r!   r$   r%   r   r   r   expand_idx_mapping  s   


r   )dataclassesr   typingr   numpyr>   r   vllm.triton_utilsr   r   
vllm.utilsr   r   r   jit	constexprri   rJ   rq   rz   r{   r   r   r   r   tupler   r   r   r   r   r   r   r   r   <module>   s   x	$
$
8	

%


)

"