o
    -ig                     @   sx  d dl Z d dlmZmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z< e+e=Z>G dd deZ?G dd de?Z@dS )    N)AsyncGeneratorMapping)AnyFinalcast)Request)Response)assert_neveroverride)EngineClient)ChatTemplateContentFormatOption)RequestLogger)ErrorResponse	UsageInfo)EmbeddingServeContextOpenAIServingServeContext)OpenAIServingModels)EmbeddingBytesResponseEmbeddingChatRequestEmbeddingCompletionRequestEmbeddingRequestEmbeddingResponseEmbeddingResponseData)RenderConfig)TokensPrompt)init_logger)EmbeddingRequestOutputPoolingOutputPoolingRequestOutputRequestOutput)PoolingParams)merge_async_iterators)
chunk_list)
EmbedDTypeEncodingFormat
Endiannessencode_pooling_bytesencode_pooling_outputc                       s@  e Zd Z fddZedededB fddZdede	fd	d
Z
ededeeB eB fddZdefddZdefddZdedee dedeeedf  fddZdee dedef fddZdedededeeef dB dedeeeB df fddZedededB f fdd ZedededB f fd!d"Z  Z S )#EmbeddingMixinc                    sJ   t  j|i | | jj}t|o|j| _|r |jr |j| _d S d | _d S N)super__init__model_configpooler_configboolenable_chunked_processingsupports_chunked_processingmax_embed_len)selfargskwargsr.   	__class__ c/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/entrypoints/pooling/embed/serving.pyr,   9   s   zEmbeddingMixin.__init__ctxreturnNc              
      s   t t|}zK| |j|_t|jtr:| j|j| j|jj	|jj
p#|j
|j|jj|jj|jjdI d H \}|_W d S |  }|j|jj| |jdI d H |_W d S  ttfyq } ztd | t|W  Y d }~S d }~ww )N)chat_templatechat_template_content_formatadd_generation_promptcontinue_final_messageadd_special_tokens)prompt_or_promptsconfigz$Error in preprocessing prompt inputs)r   r   _maybe_get_adaptersrequestlora_request
isinstancer   _preprocess_chatrenderermessagesr<   r=   r>   r?   r@   engine_prompts_get_completion_rendererrender_promptinput_build_render_config
ValueError	TypeErrorlogger	exceptioncreate_error_responsestr)r3   r:   _rH   er8   r8   r9   _preprocessH   s6   


zEmbeddingMixin._preprocessrD   c                 C   s.   |  |rd }n| jp| j}t||j|jdS )N)
max_lengthtruncate_prompt_tokensr@   )_should_use_chunked_processingr2   max_model_lenr   rY   r@   )r3   rD   rX   r8   r8   r9   rN   g   s   
z#EmbeddingMixin._build_render_configc                    s   t tt  j jj jj jj fdd}dtdt	f fdd}dks4dkr7| S d	ks?dkrF|dkd
S t
 d S )Nc               	      sr   g } d}t D ]\}}t|t|dd}|j}| | |t|7 }qt||d}t j j	 j
| |dS )Nr   )encoding_formatembed_dtype
endianness)index	embedding)prompt_tokenstotal_tokensidcreatedmodeldatausage)	enumerater   r(   prompt_token_idsappendlenr   r   
request_idcreated_time
model_name)itemsnum_prompt_tokensidx	final_resitemrj   rh   r:   r]   r\   r^   final_res_batch_checkedr8   r9   encode_float_base64   s4   	
z;EmbeddingMixin._build_response.<locals>.encode_float_base64
bytes_onlyr;   c              	      sH   t d\}}}| rd ndt j j j||di}t||dS )N)pooling_outputsr]   r^   metadatarc   )contentheaders)r'   jsondumpsrm   rn   ro   r   )rx   r{   rp   rh   r|   )r:   r]   r^   rv   r8   r9   encode_bytes   s$   z4EmbeddingMixin._build_response.<locals>.encode_bytesfloatbase64bytes)rx   )r   listr   final_res_batchrD   r\   r]   r^   r/   r   r	   )r3   r:   rw   r   r8   ru   r9   _build_responset   s    zEmbeddingMixin._build_responsec                 C   s   | j jS )z?Get the model's effective maximum sequence length for chunking.)r-   r[   )r3   r8   r8   r9   _get_max_position_embeddings   s   z+EmbeddingMixin._get_max_position_embeddingsc                 C   s   t |ttfo	| jS )z<Check if chunked processing should be used for this request.)rF   r   r   r1   )r3   rD   r8   r8   r9   rZ      s   z-EmbeddingMixin._should_use_chunked_processing	token_ids
prompt_idxc                    s   g }|   }tt||D ]5\}}	|j d| d| }
t|	d}| j|
|||jd | jj|||
|j|t	|j
ddd}|| q|S )z1Process a single prompt using chunked processing.z-prompt--chunk-)rj   paramsrE   priorityr   rE   trace_headersr   )r   ri   r#   rm   r   _log_inputsrE   engine_clientencodegetattrrD   rk   )r3   r:   r   pooling_paramsr   r   
generatorsmax_pos_embeddings	chunk_idxchunk_tokenschunk_request_idchunk_engine_promptoriginal_generatorr8   r8   r9   _process_chunked_request   s0   	
	z'EmbeddingMixin._process_chunked_request	input_ids
input_textc                    s   t |}t|ttfrT| |}|  }| jdurd}| j}nd}| j}d}	d}
||kr6t|	j	|||d||krN|rDt
d|| n
t|
j	d||dt||d	S t |||S )
z>Override to support chunked processing for embedding requests.Nzmaximum embedding input lengthzmaximum context lengthzThis model's {length_type} is {max_length_value} tokens. However, you requested {token_num} tokens in the input for embedding generation. Please reduce the length of the input.zThis model's {length_type} is {max_length_value} tokens. However, you requested {token_num} tokens in the input for embedding generation. Please reduce the length of the input or enable chunked processing.)length_typemax_length_value	token_numzOInput length %s exceeds max_position_embeddings %s, will use chunked processingz"maximum position embeddings length)promptrj   )rl   rF   r   r   rZ   r   r2   r[   rO   formatrQ   infor   r+   _validate_input)r3   rD   r   r   r   enable_chunkedr   r   r   validation_error_msgchunked_processing_error_msgr6   r8   r9   r      sJ   


zEmbeddingMixin._validate_inputengine_promptr   r   prompt_indexc              
      sJ   |j  d| }| j||||jd | jj||||j|t|jdddS )zACreate a generator for a single prompt using standard processing.-r   r   r   r   )rm   r   rE   r   r   r   rD   )r3   r:   r   r   r   r   request_id_itemr8   r8   r9   _create_single_prompt_generatorB  s    	z.EmbeddingMixin._create_single_prompt_generatorc              
      s  t t|}| |j}|st |I dH S g }z|jdu r!dn	| |jjI dH }| 	|}t
|tr8|W S z	|d| j W n ty[ } z| t|W  Y d}~W S d}~ww |jdu rg| dW S |  }t|jD ]5\}}	d|	v r|	d }
t|
|kr| ||
|||I dH }|| qp| ||	|||I dH }|| qpt| |_W dS  ty } z| t|W  Y d}~S d}~ww )z'Override to support chunked processing.NembedEngine prompts not availablerj   )r   r   rZ   rD   r+   _prepare_generatorsraw_request_get_trace_headersr|   _create_pooling_paramsrF   r   verifyr-   rO   rS   rT   rJ   r   ri   rl   r   extendr   rk   r"   result_generator	Exception)r3   r:   use_chunkedr   r   r   rV   r   ir   rj   chunk_generators	generatorr6   r8   r9   r   ^  s^   









z"EmbeddingMixin._prepare_generatorsc              
      s  t t|}z|jdu r| dW S | |j}|s&t j|dI dH W S |jdu r1| dW S i }i }|j2 z3 dH W \}}d|j	v r|j	
d}zt||dd  }W n ttfyf   |}Y nw ||vrzdd	d	|j	
dd	 d
||< || }	t|ts| dt|j   W S t|jdr|jj}
nt|jdr|jj}
n| dt|jj   W S t|
tjstj|
tjd}
|jdu r| d  W S t|j}|
jtjd| }|	d du r||	d< n|	d  |7  < |	d  |7  < |	d  d7  < q8|j	
d}zt|d }W n ttfy   |}Y nw t t|||< q86 g }t|j}t|D ]}||v r|| }	|	d }|	d }|durt|tjrt|ttfr|d	kr|| }t |d}|j| }d|vr| d| d  W S |d }t|	d ||d	dd}|!| q4| d|   W S ||v r|!t t||  q4| d|   W S t t"t#tB  ||_$W dS  t%y } z| t&|W  Y d}~S d}~ww )zCollect and aggregate batch results
        with support for chunked processing.

        For chunked requests, performs online aggregation to
        minimize memory usage.
        For regular requests, collects results normally.
        Nr   )r:   zResult generator not availabler   r   r      r   )weighted_sumtotal_weightchunk_countrm   z9Expected PoolingRequestOutput for chunked embedding, got rg   r`   zUnsupported output type: )dtypez6prompt_token_ids cannot be None for chunked processingr   r   r   )rg   rj   zChunked prompt z does not contain token IDsrm   T)rm   rj   outputsnum_cached_tokensfinishedz&Failed to aggregate chunks for prompt zResult not found for prompt )'r   r   rJ   rS   rZ   rD   r+   _collect_batchr   rm   splitintr_   rO   
IndexErrorrF   r   type__name__hasattrr   rg   r`   torchTensortensorfloat32rj   rl   toranger   r   rk   r   r    r   r   rT   )r3   r:   r   prompt_aggregatorsshort_prompts_results
result_idxresultpartsr   
aggregatorembedding_dataweightweighted_embeddingr   num_promptsr   r   final_embeddingpooling_output_dataoriginal_promptoriginal_token_idspooling_request_outputrV   r6   r8   r9   r     s   













Q












zEmbeddingMixin._collect_batch)!r   
__module____qualname__r,   r
   r   r   rW   r   r   rN   r   r   r   r   r   r/   rZ   r   r   r   r   r   rT   r   r   r!   r   r    r   r   r   __classcell__r8   r8   r6   r9   r)   8   st    
J
-K
Gr)   c                       s   e Zd ZdZddddedededB dedB d	ed
e	de	ddf fddZ
	ddededB deeB f fddZedee deeB f fddZdededB f fddZ  ZS )OpenAIServingEmbeddingembdF)trust_request_chat_templatelog_error_stackr   modelsrequest_loggerNr<   r=   r   r   r;   c                   s*   t  j||||d || _|| _|| _d S )N)r   r   r   r   )r+   r,   r<   r=   r   )r3   r   r   r   r<   r=   r   r   r6   r8   r9   r,   \  s   
zOpenAIServingEmbedding.__init__rD   r   c                    sP   | j  }| j d| ||j }t||||| j| jd}t 	|I dH S )z
        Embedding API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/embeddings/create
        for the API specification. This API mimics the OpenAI Embedding API.
        r   )rD   r   ro   rm   r<   r=   N)
r   ro   request_id_prefix_base_request_idrm   r   r<   r=   r+   handle)r3   rD   r   ro   rm   r:   r6   r8   r9   create_embeddingr  s   
	z'OpenAIServingEmbedding.create_embeddingr:   c              
      sb   t  |}t|tr|S z
|d| j W |S  ty0 } z| t|W  Y d }~S d }~ww )Nr   )	r+   r   rF   r   r   r-   rO   rS   rT   )r3   r:   r   rV   r6   r8   r9   r     s   
z-OpenAIServingEmbedding._create_pooling_paramsc                    sF   t |jtr| j|jj|jj| jd}|d ur|S t |I d H S )N)request_chat_templatechat_template_kwargsr   )	rF   rD   r   _validate_chat_templater<   r   r   r+   rW   )r3   r:   error_check_retr6   r8   r9   rW     s   z"OpenAIServingEmbedding._preprocessr*   )r   r   r   r   r   r   r   rT   r   r/   r,   r   r   r   r   r   r
   r   r!   r   rW   r   r8   r8   r6   r9   r   Y  sP    
	
r   )Ar}   collections.abcr   r   typingr   r   r   r   fastapir   fastapi.responsesr   typing_extensionsr	   r
   vllm.engine.protocolr   vllm.entrypoints.chat_utilsr   vllm.entrypoints.loggerr   'vllm.entrypoints.openai.engine.protocolr   r   &vllm.entrypoints.openai.engine.servingr   r   r   &vllm.entrypoints.openai.models.servingr   'vllm.entrypoints.pooling.embed.protocolr   r   r   r   r   r   vllm.entrypoints.rendererr   vllm.inputs.datar   vllm.loggerr   vllm.outputsr   r   r   r    vllm.pooling_paramsr!   vllm.utils.async_utilsr"   vllm.utils.collection_utilsr#   vllm.utils.serial_utilsr$   r%   r&   r'   r(   r   rQ   r)   r   r8   r8   r8   r9   <module>   s:        %