o
    
۾i9                     @   s   d dl mZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZmZ d dlmZ d dlm Z  d dl!m"Z" eG dd dZ#G dd dZ$dS )    )	dataclass)castN)LoRARequest)MultiModalFeatureSpec)PoolingParams)SamplingParamsSamplingType)&length_from_prompt_token_ids_or_embeds)swap_dict_values)LogprobsTensors)PoolingMetadataPoolingStates)BatchUpdateBuilderLogitsProcessorsMoveDirectionality)SamplingMetadata)
copy_slice)MultiGroupBlockTablec                   @   s"  e Zd ZU eed< ee dB ed< ee ed< edB ed< e	j
dB ed< eee df ed< eed	< ee ed
< dZe	jdB ed< dZedB ed< dZe	jdB ed< dZedB ed< dZe	jdB ed< dZeed< dZedB ed< dZedB ed< dd ZedefddZdedefddZdS )CachedRequestStatereq_idNprompt_token_idsmm_featuressampling_params	generator.	block_idsnum_computed_tokensoutput_token_idsmrope_positionsmrope_position_deltaxdrope_positionslora_requestprompt_embedsr   prev_num_draft_lenpooling_paramspooling_statesc                 C   s*   t | j| j| _| jd urt | _d S d S N)r	   r   r!   num_prompt_tokensr#   r   r$   self r)   R/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/gpu_input_batch.py__post_init__8   s   
z CachedRequestState.__post_init__returnc                 C   s   | j t| j S r%   )r&   lenr   r'   r)   r)   r*   
num_tokens@   s   zCachedRequestState.num_tokensidxc                 C   sV   || j k r| jd u rtd| d| j| S || j  t| jk r)| j|| j   S dS )NzTried to access token index zG, but that token was provided via prompt_embeds, and its ID is unknown.)r&   r   
ValueErrorr-   r   )r(   r/   r)   r)   r*   get_token_idD   s   



zCachedRequestState.get_token_id)__name__
__module____qualname__str__annotations__listintr   r   torch	Generatortupler   Tensorr   r   r    r   r!   r"   r#   r   r$   r   r+   propertyr.   r2   r)   r)   r)   r*   r      s*   
 r   c                   @   sf  e Zd Z						dSdedededejded	ed
ee dee dee dB dedB dedededefddZ	e
dee fddZdddefddZdddefddZdedeeee f ddfddZd ededB fd!d"Zd#ed$eddfd%d&ZdTd'd(Zd)d* Zdefd+d,Zdee fd-d.Zdee fd/d0Zdefd1d2Zdejfd3d4Zd5e j!d6e j!de"e"ed7f e"ed7f e#e$ f fd8d9Z%d:ejd;ej&ddfd<d=Z'dTd>d?Z(d@eee  ddfdAdBZ)e
defdCdDZ*e
defdEdFZ+e
defdGdHZ,e
defdIdJZ-e
defdKdLZ.e
defdMdNZ/e
dedB fdOdPZ0e
defdQdRZ1dS )U
InputBatchNF   max_num_reqsmax_model_lenmax_num_batched_tokensdevice
pin_memory
vocab_sizeblock_sizeskernel_block_sizesmax_num_blocks_per_reqlogitsprocs!logitsprocs_need_output_token_idsis_spec_decodeis_pooling_modelcp_kv_cache_interleave_sizec                 C   sn  || _ || _|| _|| _|| _|| _|| _|| _g | _i | _	t
j||fdt
jdd| _| j | _t
j||fdtdd| _| j | _i | _tj|tjd| _tj|tjd| _t
j|fdt
j|d| _| j | _t||||||||	|d	| _t
j|ft
j|d| _t
j|ft
jd|d| _| j | _t  | _!t  | _"t
j|ft
j|d| _#t
j|ft
jd|d| _$| j$ | _%t  | _&t
j|ft
j|d| _'t
j|ft
jd|d| _(| j( | _)t  | _*t
j|ft
j+|d| _,t
j|ft
j+d|d| _-| j- | _.t  | _/t
j|ft
j+|d| _0t
j|ft
j+d|d| _1| j1 | _2t  | _3t
j|ft
j+|d| _4t
j|ft
j+d|d| _5| j5 | _6t  | _7t
j8|ft
j9d|d| _:| j: | _;tj| jftj9d| _<i | _=i | _>i | _?i | _@i | _AtB | _Ct  | _Dd | _Ed | _Fi | _Gtj|td| _Hg | _I|
ptJ | _K|| _Ldd	 tM|D | _N| O | _Pi | _Qi | _Rd | _Sd | _Td | _Ud | _Vd S )
NcpuFrD   dtyperE   )rQ   )	rA   rB   rC   rE   rD   rG   rH   max_num_blocksrN   rQ   rD   )rQ   rD   rE   c                 S   s   g | ]}g qS r)   r)   ).0_r)   r)   r*   
<listcomp>   s    z'InputBatch.__init__.<locals>.<listcomp>)WrM   rL   rA   rB   rC   rD   rE   rF   _req_idsreq_id_to_indexr:   zerosint32token_ids_cpu_tensornumpytoken_ids_cpuboolis_token_ids_tensoris_token_idsreq_prompt_embedsnpnum_tokens_no_specr&   num_computed_tokens_cpu_tensornum_computed_tokens_cpur   block_tableemptyfloat32temperaturetemperature_cpu_tensortemperature_cpusetgreedy_reqsrandom_reqstop_ptop_p_cpu_tensor	top_p_cpu
top_p_reqstop_ktop_k_cpu_tensor	top_k_cpu
top_k_reqsfloatfrequency_penaltiesfrequency_penalties_cpu_tensorfrequency_penalties_cpufrequency_penalties_reqspresence_penaltiespresence_penalties_cpu_tensorpresence_penalties_cpupresence_penalties_reqsrepetition_penaltiesrepetition_penalties_cpu_tensorrepetition_penalties_cpurepetition_penalties_reqsonesint64num_accepted_tokens_cpu_tensornum_accepted_tokens_cpurequest_lora_mappinglora_id_to_request_idslora_id_to_lora_request
generatorsnum_logprobsin_progress_prompt_logprobs_cpur   batch_update_builderhas_allowed_token_idsallowed_token_ids_mask!allowed_token_ids_mask_cpu_tensorbad_words_token_ids!logits_processing_needs_token_idsreq_output_token_idsr   rJ   rK   rangespec_token_ids_make_sampling_metadatasampling_metadatar#   r$   prev_sampled_token_idsprev_req_id_to_indexsampled_token_ids_cpuasync_copy_ready_event)r(   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r)   r)   r*   __init__R   s   





zInputBatch.__init__r,   c                 C   s   t tt | jS r%   )r   r8   r6   rW   r'   r)   r)   r*   req_ids  s   zInputBatch.req_idsrequestr   c                 C   sT   | j   }du r| j}|| jk sJ d| j _|jr(| j j||j|j|j	f |S )zfTrack add-request operations for logits processors.
        Not applicable to pooling models.
        NT)
r   pop_removednum_reqsrA   batch_changedr   addedappendr   r   )r(   r   new_req_indexr)   r)   r*   _register_add_request  s   	z InputBatch._register_add_requestc                 C   s  |  |}|j}|t| jkr#| j| | j|j | jg  n|| j|< |j| j|< | j|   || j	|< t
|j|j}|| j|< |}|t|j }|jd urh|j| j|d |f< d| j|d |f< n	d| j|d |f< |jd ur||j| j|< |j| j|||f< d| j|||f< |j| j|< |j| j|< | j|j| |j }r|jtjkrd| j|< | j| n|j| j|< | j | |j!| j"|< |j!dk r| j#| |j$}d|  k r| j%k rn n| j&| n| j%}|| j'|< |j(| j)|< |j(dkr	| j*| |j+| j,|< |j+dkr| j-| |j.| j/|< |j.dkr-| j0| |j1d ur9|j1| j2|< |j3d urN|j3dkrH| j%n|j3| j4|< |j5r| j6| | j7d u ryt8j9| j:| j%t8j;| j<d| _=t8j9| j:| j%t8j;d	d| _7d| j7|< d| j7| |j5< |j>r|j>| j>|< n%|j? }	r|j@}
|
d usJ |	| j?|< |
| j@|< |	jA| jB|< ntCd
d| jD|< |jEr|jEjF}|| jGvrtH | jG|< || jI|< | jG| |j |jE| jJ|< |S d| jI|< |S )NTFg        r@   r   g      ?r0   rS   rO   zUnrecognized request type)Kr   r   r-   rW   r   r   r   r   clearrX   r	   r   r!   r&   r]   r`   ra   r.   rc   r   re   rf   add_rowr   r   sampling_typer   GREEDYrk   rm   addri   rn   ro   rq   rr   rs   rF   rv   ru   frequency_penaltyrz   r{   presence_penaltyr~   r   repetition_penaltyr   r   r   r   logprobsr   allowed_token_idsr   r   r:   rY   rA   r^   rD   r   r   r#   r$   requires_token_idsr   NotImplementedErrorr   r    lora_int_idr   rl   r   r   )r(   r   	req_indexr   r&   	start_idxend_idxr   rs   r#   r$   lora_idr)   r)   r*   add_request0  s   















zInputBatch.add_requestscheduled_spec_tokensc           
      C   sv   |j }| j| }| j| }|  ||d}t|}||_|s"d S | j| }|| }	|| j|||	f< |	| d S )Nr)   )
r   rX   r   r   getr-   r"   rc   r]   extend)
r(   r   r   r   r   cur_spec_token_idsr   num_spec_tokensstart_indexend_token_indexr)   r)   r*   update_req_spec_token_ids  s   


z$InputBatch.update_req_spec_token_idsr   c                 C   s  | j |d}|du rdS | j| d| j|< d| j|< | j|   | j| }|dkrF| j	| }|
| |sA| j	|= | j|= d| j|< | jrY| j|d | j|d |S | j
| | j
| | j
| | j
| | j
| | j
| | j
| | j|d | j|d | j|d | jdur| j|d | j
| | jdur| j| d | j|d |S )zThis method must always be followed by a call to condense().

        Args:
          req_id: request to remove

        Returns:
          Removed request index, or `None` if `req_id` not recognized
        Nr   F)rX   popr   removed_appendrW   r   r   r   r   r   discardr   rM   r#   r$   rm   rn   rr   rv   r{   r   r   r   r   r   r   r   r   fill_r   )r(   r   r   r   lora_req_idsr)   r)   r*   remove_request  sH   








zInputBatch.remove_requesti1i2c                 C   s^  | j | }| j | }| j | | j | | j |< | j |< | j| | j| | j|< | j|< | j| | j| | j|< | j|< |d urE|d usGJ | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j|df  }| j|df | j|df< || j|df< | j	||gdf | j	||gdf< | j
|}| j
|}|d ur|| j
|< n| j
|d  |d ur|| j
|< n| j
|d  | j|| | j| | j| | j|< | j|< | jrd S | jj||tjf | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< | j| | j| | j|< | j|< t| j|| t| j|| | j d ur| j | | j | 	| j |< | j |< d S d S )N.)!rW   r   r   rX   rc   r&   re   r]   copyr`   ra   r   r   rf   swap_rowr   rM   r   movedr   r   SWAPrk   rq   ru   rz   r~   r   r   r
   r   r   r   )r(   r   r   	old_id_i1	old_id_i2tmp	embeds_i1	embeds_i2r)   r)   r*   swap_states  s   

"
 ""zInputBatch.swap_statesc           
      C   s  | j }| jj }sdS |dkr | j  | j  | j  dS |t| d }|rp||v r7|d8 }||v s/| j }|dusBJ ||krHn(| j	  | j| }| j| }|dus]J || j|< d| j|< || j|< d| j|< || j
|< | j| t| j|  }| j| | j| | j|< | j|< | j|   | j|d|f | j|d|f< | j|d|f | j|d|f< || jv r| j|| j|< | j| | j|< | j| | j|< | j| | j|< | j|| | j| | j|< | jr|d8 }q(| jj||tjf | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j| | j|< | j | | j |< | j!|d}|durK|| j!|< | j"durY| j"| | j"|< | j#|d}	|	durj|	| j#|< |d8 }|s+| j|d= | j|d= | j|d= dS )a0  Slide non-empty requests down into lower, empty indices.

        Any consecutive empty indices at the very end of the list are not
        filled.

        Returns:
          swaps: list of (from,to) swap tuples for moved requests
          empty_req_indices: indices not filled by condensation
        Nr   r@   )$r   r   removedrW   r   r   r   r-   peek_removedr   rX   rc   r]   r`   ra   r   r&   re   rf   move_rowr   rM   r   r   r   UNIDIRECTIONALrk   rq   ru   rz   r~   r   r   r   r   r   )
r(   r   empty_req_indiceslast_req_indexempty_indexr   r   r.   r   r   r)   r)   r*   condenser  s   





























dzInputBatch.condensec                 C   s^   | j r| j }|r|  | _dS | j| j}| jjD ]}|	| q|r-|  | _dS dS )z-Apply any batch updates to sampling metadata.N)
rM   r   resetr   r   get_and_resetr   rJ   allupdate_state)r(   r   batch_update
logit_procr)   r)   r*   refresh_metadata  s   

zInputBatch.refresh_metadatac                 C   sV  | j }| jst| j| j|}nd }| jst| j| j| | js't| j	| j
| | jsBt| j| j| t| j| j| t| j| j| | j pN| jd |  }|rU|  nd }| j pbt| jpb| j}|rotttt  | jng }d }| js| jd us}J t| j| j| | jd | }tdi d|d| jd| j d| jrd n| jd | d| jrd nB| j
d | d| j!d| j"d|d	| jd | d
| jd | d| jd | d|d| j#d| jd|d| jd| j$S d| j!d| j"d|d	| jd | d
| jd | d| jd | d|d| j#d| jd|d| jd| j$S )Nri   
all_greedy
all_randomro   rs   r   max_num_logprobsr   rx   r|   r   r   r   no_penaltiesr   r   rJ   r)   )%r   r   r   rj   ri   no_top_prp   ro   no_top_krt   rs   r   ry   rx   r}   r|   r   r   r   any_make_prompt_token_ids_tensorr^   r   rK   r   r8   r9   r   no_allowed_token_idsr   r   r   r   r   r   r   rJ   )r(   r   ri   needs_prompt_token_idsr   needs_output_token_idsr   r   r)   r)   r*   r     s   


		
	
z"InputBatch._make_sampling_metadatac                    ,   t  jt  jksJ  fdd jD S )Nc                       g | ]} j | qS r)   )r#   rT   r   r'   r)   r*   rV   \      z1InputBatch.get_pooling_params.<locals>.<listcomp>)r-   r   r#   r'   r)   r'   r*   get_pooling_paramsZ     zInputBatch.get_pooling_paramsc                    r   )Nc                    r   r)   )r$   r   r'   r)   r*   rV   `  r   z1InputBatch.get_pooling_states.<locals>.<listcomp>)r-   r   r$   r'   r)   r'   r*   get_pooling_states^  r   zInputBatch.get_pooling_statesc                 C   s6   |   }|  }tt| jd | j | jj||dS )N)prompt_lensr   r#   r$   )	r   r   r   r:   
from_numpyr&   r   r   r   )r(   r#   r$   r)   r)   r*   get_pooling_metadatab  s   zInputBatch.get_pooling_metadatac                 C   s   | j }| jd |  }tj| j |fdtj| jd}| }| jd |d |f |d d < t	|D ]}| j
||| j| d f< q1|j| jddS )NrO   rP   T)rD   non_blocking)r   r&   maxr:   rg   r   rE   r\   r]   r   rF   torD   )r(   r   max_prompt_lenprompt_token_ids_cpu_tensorr   ir)   r)   r*   r   m  s   z(InputBatch._make_prompt_token_ids_tensornum_scheduled_tokensnum_sampled_tokens.c                 C   sD   | j d| j }t||}t||}t| j }|||fS )a<  
        Given the num_scheduled_tokens for each request in the batch, return
        datastructures used to activate the current LoRAs.
        Returns:
            1. prompt_lora_mapping: A tuple of size np.sum(num_sampled_tokens)
               where, prompt_lora_mapping[i] is the LoRA id to use for the ith
               sampled token.
            2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens)
               where, token_lora_mapping[i] is the LoRA id to use for ith token.
            3. lora_requests: Set of relevant LoRA requests.
        N)r   r   r<   repeatrl   r   values)r(   r   r   req_lora_mappingprompt_lora_mappingtoken_lora_mappingactive_lora_requestsr)   r)   r*   make_lora_inputs~  s   
zInputBatch.make_lora_inputsr   r   c                 C   s(   | j jr|| _|| _dS d| _d| _dS )z
        In async scheduling case, store ref to sampled_token_ids_cpu
        tensor and corresponding copy-ready event. Used to repair
        output_token_ids prior to sampling, if needed by logits processors.
        N)r   r   r   r   )r(   r   r   r)   r)   r*   set_async_sampled_token_ids  s
   


z&InputBatch.set_async_sampled_token_idsc                 C   s  | j j}| jdu s|sdS | jdusJ d}t| jD ]f\}}| j|}|du r*q|| }|r6|d dkr7q|du rL| jdusBJ | j  | j	 }|| }|sSq|d dkr]t
|n|d}|d}	t
||	 }
t||
}||d= |	| }|||	|< qdS )z
        In async scheduling case, update output_token_ids in sampling metadata
        from prior steps sampled token ids once they've finished copying to CPU.
        This is called right before they are needed by the logits processors.
        Nr0   )r   r   r   r   	enumerater   r   r   synchronizetolistr-   indexmin)r(   r   sampled_token_idsr
  r   
prev_indexr   new_idsnum_sampled_idsfirst_placeholdernum_placeholdersnum_to_replace	end_indexr)   r)   r*   update_async_output_token_ids  s6   




z(InputBatch.update_async_output_token_idsdraft_token_idsc                 C   s   |r| j sdS | jj }dur<t| j|D ](\}}|r;| j |}|dur;|| }|r;|t|d= |  || qdS dS )z
        In async scheduling case, update spec_token_ids in sampling metadata with
        real draft token ids from prior step. This is called right before they are
        needed by the rejection sampler for penalty/bad_words computation.
        N)	r   r   r   zipr   r   r-   r   r   )r(   r  r   r   spec_idsr  	draft_idsr)   r)   r*   update_async_spec_token_ids  s   

z&InputBatch.update_async_spec_token_idsc                 C   s
   t | jS r%   )r-   rX   r'   r)   r)   r*   r     s   
zInputBatch.num_reqsc                 C      t | jdkS Nr   )r-   rn   r'   r)   r)   r*   r        zInputBatch.all_greedyc                 C   r  r  )r-   rm   r'   r)   r)   r*   r     r  zInputBatch.all_randomc                 C   r  r  )r-   rr   r'   r)   r)   r*   r     r  zInputBatch.no_top_pc                 C   r  r  )r-   rv   r'   r)   r)   r*   r     r  zInputBatch.no_top_kc                 C   s*   t | jdkot | jdkot | jdkS r  )r-   r   r{   r   r'   r)   r)   r*   r     s
   zInputBatch.no_penaltiesc                 C   s   | j r
t| j  S d S r%   )r   r   r   r'   r)   r)   r*   r      s   zInputBatch.max_num_logprobsc                 C   r  r  )r-   r   r'   r)   r)   r*   r     r  zInputBatch.no_allowed_token_ids)NNFFFr@   )r,   N)2r3   r4   r5   r9   r:   rD   r^   r8   r   r   r>   r6   r   r   r   r   dictr   r   r   r   r   r   r   r   r   r   r   r   r   r=   r   rb   ndarrayr<   rl   r   r  Eventr  r  r  r   r   r   r   r   r   r   r   r)   r)   r)   r*   r?   Q   s    	


 ?
 
6
g T 


(r?   )%dataclassesr   typingr   r\   rb   r:   vllm.lora.requestr   vllm.multimodal.inputsr   vllm.pooling_paramsr   vllm.sampling_paramsr   r   
vllm.utilsr	   vllm.utils.collection_utilsr
   vllm.v1.outputsr   vllm.v1.pool.metadatar   r   vllm.v1.sample.logits_processorr   r   r   vllm.v1.sample.metadatar   vllm.v1.utilsr   vllm.v1.worker.block_tabler   r   r?   r)   r)   r)   r*   <module>   s&   3