o
    
۾iq^                     @   s   d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d
ZG dd dZdS )    )castN)LoRARequest)SamplingType)&length_from_prompt_token_ids_or_embeds)swap_dict_values)LogprobsTensors)MultiGroupBlockTable)CachedRequestStategh㈵>c                   @   s  e Zd Zdedededejdededee dee fd	d
Ze	dee
 fddZ	d7dddedB ddfddZde
dedB fddZdededdfddZdee ddfddZdejfddZd ejd!ejdeeed"f eed"f ee f fd#d$Ze	defd%d&Ze	defd'd(Ze	defd)d*Ze	defd+d,Ze	defd-d.Ze	defd/d0Ze	defd1d2Ze	dedB fd3d4Ze	defd5d6Z dS )8
InputBatchmax_num_reqsmax_model_lenmax_num_batched_tokensdevice
pin_memory
vocab_sizeblock_sizeskernel_block_sizesc	           	   	   C   s  || _ || _|| _|| _|| _|| _g | _i | _tj	||fdtj
dd| _| j | _tj	|tj
d| _tj	|tj
d| _tj	|fdtj
|d| _| j | _t|||||||d| _tj|ftj|d| _tj|ftjd|d| _| j | _t | _t | _tj|ftj|d| _tj|ftjd|d| _| j | _t | _ tj|ftj
|d| _!tj|ftj
d|d| _"| j" | _#t | _$tj|ftj|d| _%tj|ftjd|d| _&| j& | _'t | _(tj|ftj)|d| _*tj|ftj)d|d| _+| j+ | _,t | _-tj|ftj)|d| _.tj|ftj)d|d| _/| j/ | _0t | _1tj|ftj)|d| _2tj|ftj)d|d| _3| j3 | _4t | _5i | _6tj	| j ftj7d| _8i | _9i | _:i | _;i | _<i | _=d g| | _>t | _?d | _@d | _Ai | _Bg | _Cd S )NcpuFr   dtyper   )r   )r   r   r   r   r   r   r   r   r   )r   r   r   )Dr   r   r   r   r   r   _req_idsreq_id_to_indextorchzerosint32token_ids_cpu_tensornumpytoken_ids_cpunpnum_tokens_no_specnum_prompt_tokensnum_computed_tokens_cpu_tensornum_computed_tokens_cpur   block_tableemptyfloat32temperaturetemperature_cpu_tensortemperature_cpusetgreedy_reqsrandom_reqstop_ptop_p_cpu_tensor	top_p_cpu
top_p_reqstop_ktop_k_cpu_tensor	top_k_cpu
top_k_reqsmin_pmin_p_cpu_tensor	min_p_cpu
min_p_reqsfloatfrequency_penaltiesfrequency_penalties_cpu_tensorfrequency_penalties_cpufrequency_penalties_reqspresence_penaltiespresence_penalties_cpu_tensorpresence_penalties_cpupresence_penalties_reqsrepetition_penaltiesrepetition_penalties_cpu_tensorrepetition_penalties_cpurepetition_penalties_reqs
min_tokensint64request_lora_mappinglora_id_to_request_idslora_id_to_lora_request
generatorsnum_logprobsin_progress_prompt_logprobs_cpu
logit_biashas_allowed_token_idsallowed_token_ids_mask!allowed_token_ids_mask_cpu_tensorbad_words_token_idsreq_output_token_ids)	selfr   r   r   r   r   r   r   r    rU   R/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/tpu_input_batch.py__init__   s   




zInputBatch.__init__returnc                 C   s   t tt | jS N)r   liststrr   rT   rU   rU   rV   req_ids   s   zInputBatch.req_idsNrequestr	   	req_indexc           
      C   sj  |d u r| j }|| jk sJ |j}|t| jkr&| j| | j|j n|| j|< |j| j|< || j|< t	|j
|j}|| j|< |j
| j|d |f< |}|t|j }|j| j|||f< |j| j|< |j| j|< | j|j| |j}|d us~J d|jtjkrd| j|< | j| n|j| j|< | j| |j| j|< |jdk r| j | |j!}d|  k r| j"k rn n| j#| n| j"}|| j$|< |j%| j&|< |j'| j(|< |j%t)kr| j*| |j'dkr| j+| |j,| j-|< |j,dkr| j.| |j/| j0|< |j/dkr| j1| |j2r|j2|j3f| j2|< |j4d ur*|j4| j5|< |j6d ur6|j6| j7|< |j8d urB|j8| j8|< |j9rz| j:| | j;d u rmt<j=| j| j"t<j>| j?d| _@t<j=| j| j"t<j>dd| _;d| j;|< d	| j;| |j9< |jAr|jA| jA|< |jBr|jBjC}	|	| jDvrtE | jD|	< |	| jF|< | jD|	 |j |jB| jG|	< d S d| jF|< d S )
Nz"pooling requests not supported yetg           r   g      ?r   r   TF)Hnum_reqsr   req_idlenr   appendrS   output_token_idsr   r   prompt_token_idsprompt_embedsr!   r   
num_tokensr    num_computed_tokensr#   r$   add_row	block_idssampling_paramssampling_typer   GREEDYr)   r+   addr'   r,   r-   r/   r0   r1   r   r4   r3   r5   r7   frequency_penaltyr<   _SAMPLING_EPSr8   r=   presence_penaltyr@   rA   repetition_penaltyrD   rE   rF   all_stop_token_ids	generatorrK   logprobsrL   rN   allowed_token_idsrO   rQ   r   r   boolr   rP   rR   lora_requestlora_int_idrI   r*   rH   rJ   )
rT   r^   r_   rb   r!   	start_idxend_idxrl   r1   lora_idrU   rU   rV   add_request   s   











zInputBatch.add_requestrb   c                 C   s^  | j |d}|du rdS d| j|< d| j|< | j| | j| | j| | j| | j	| | j
|d | j| | j| | j| | j|d | j|d | j|d | j| }|dkr| j| | t| j| dkr| j| | j| d| j|< d| j|< | j| | jdur| j| d | j|d |S )z<This method must always be followed by a call to condense().Nr   F)r   popr   rS   r+   discardr,   r0   r4   r8   rF   r=   rA   rE   rK   rL   rM   rH   rI   rc   rJ   rN   rO   rQ   fill_rR   )rT   rb   r_   r}   rU   rU   rV   remove_request#  s<   





zInputBatch.remove_requesti1i2c                 C   s  | j | }| j | }| j | | j | | j |< | j |< | j| | j| | j|< | j|< |d ur4|d us6J | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j	| | j	| | j	|< | j	|< | j
| | j
| | j
|< | j
|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j|df  }| j|df | j|df< || j|df< t| j|| t| j|| t| j|| | j| | j| | j|< | j|< | j| | j| | j|< | j|< | jd ur\| j| | j| 	| j|< | j|< | j|| d S )N.)r   rS   r   r    r!   r#   r)   r/   r3   r<   r@   rD   r7   r   copyr   rK   rF   rR   rH   rN   rQ   r$   swap_row)rT   r   r   	old_id_i1	old_id_i2tmprU   rU   rV   swap_statesJ  sl   

""""zInputBatch.swap_statesempty_req_indicesc                 C   sZ  | j }|dkr| j  | j  dS |t| d }|r||v r*|d8 }||v s"| }||kr3n| j| }| j| }|dusCJ || j|< d| j|< || j|< d| j|< || j|< | j| }| j|d|f | j|d|f< | j| | j|< | j	| | j	|< | j
| | j
|< | j|| | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j|d}|dur|| j|< | j|d}	|	dur|	| j|< | j| | j|< | j| | j|< | jdur| j| | j|< | j|d}
|
dur|
| j|< |d8 }|s| j| j d= | j| j d= dS )zMove non-empty requests down into lower, empty indices.

        Args:
          empty_req_indices: empty batch indices, sorted descending.
        r   Nr`   )ra   r   clearrS   rc   r   r   r    r   r!   r#   r$   move_rowr)   r/   r3   r<   r@   rD   r7   rK   rF   rH   rN   rQ   rR   )rT   r   ra   last_req_indexempty_indexrb   re   rh   ru   	min_tokenrR   rU   rU   rV   condense  s   




















GzInputBatch.condensec                 C   s   | j d | j  }tj| j|fdtj| jd}| }| jd | jd |f |d d < t	| jD ]}| j
||| j | d f< q1|j| jddS )Nr   r   T)r   non_blocking)r!   ra   maxr   r%   rG   r   r   r   ranger   tor   )rT   max_prompt_lenprompt_token_ids_cpu_tensorrf   irU   rU   rV   _make_prompt_token_ids_tensor  s    z(InputBatch._make_prompt_token_ids_tensornum_scheduled_tokensnum_sampled_tokens.c                 C   s>   | j d| j }t|}t||}t| j }|||fS )a  
        Given the num_scheduled_tokens for each request in the batch, return
        datastructures used to activate the current LoRAs.
        Returns:
            1. prompt_lora_mapping: A tuple of size self.num_reqs where,
               prompt_lora_mapping[i] is the LoRA id to use for the ith prompt.
            2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens)
               where, token_lora_mapping[i] is the LoRA id to use for ith token.
            3. lora_requests: Set of relevant LoRA requests.
        N)rH   ra   tuplerepeatr*   rJ   values)rT   r   r   req_lora_mappingprompt_lora_mappingtoken_lora_mappingactive_lora_requestsrU   rU   rV   make_lora_inputs  s   
zInputBatch.make_lora_inputsc                 C   s
   t | jS rY   )rc   r   r\   rU   rU   rV   ra     s   
zInputBatch.num_reqsc                 C      t | jdkS Nr   )rc   r,   r\   rU   rU   rV   
all_greedy     zInputBatch.all_greedyc                 C   r   r   )rc   r+   r\   rU   rU   rV   
all_random   r   zInputBatch.all_randomc                 C   r   r   )rc   r0   r\   rU   rU   rV   no_top_p$  r   zInputBatch.no_top_pc                 C   r   r   )rc   r4   r\   rU   rU   rV   no_top_k(  r   zInputBatch.no_top_kc                 C   r   r   )rc   r8   r\   rU   rU   rV   no_min_p,  r   zInputBatch.no_min_pc                 C   s*   t | jdkot | jdkot | jdkS r   )rc   rA   r=   rE   r\   rU   rU   rV   no_penalties0  s
   zInputBatch.no_penaltiesc                 C   s   | j r
t| j  S d S rY   )rL   r   r   r\   rU   rU   rV   max_num_logprobs8  s   zInputBatch.max_num_logprobsc                 C   r   r   )rc   rO   r\   rU   rU   rV   no_allowed_token_ids<  r   zInputBatch.no_allowed_token_idsrY   )!__name__
__module____qualname__intr   r   rx   rZ   rW   propertyr[   r]   r~   r   r   r   Tensorr   r   ndarrayr   r*   r   r   ra   r   r   r   r   r   r   r   r   rU   rU   rU   rV   r
      st    	
 
v'MZ 
r
   )typingr   r   r   r   vllm.lora.requestr   vllm.sampling_paramsr   
vllm.utilsr   vllm.utils.collection_utilsr   vllm.v1.outputsr   vllm.v1.worker.block_tabler   vllm.v1.worker.gpu_input_batchr	   rq   r
   rU   rU   rU   rV   <module>   s   