o
    i=d                     @   s~  U d dl Z d dlmZmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z< e.e=Z>ee$ Z?ee@d< G dd deZAdS )    N)AsyncGeneratorCallableMapping)partial)AnyFinalLiteral	TypeAliascast)Request)assert_never)EngineClient)ChatTemplateContentFormatOption)RequestLogger)ErrorResponse	UsageInfo)OpenAIServingServeContext)OpenAIServingModels)EmbeddingBytesResponseEmbeddingChatRequestEmbeddingCompletionRequestEmbeddingRequestEmbeddingResponseEmbeddingResponseData)encode_pooling_bytesencode_pooling_output_base64encode_pooling_output_float)TokensPrompt)init_logger)PoolingOutputPoolingRequestOutput)PoolingParams)	TokPrompt)merge_async_iterators)
chunk_list)
EmbedDType
EndiannessEmbeddingServeContextc                       s  e Zd ZdZddddedededB dedB d	ed
e	de	ddf fddZ
dededB fddZdee dedededed dededefddZdee dedededed dededefddZdedeeB eB fdd Zdefd!d"Zde	fd#d$Zded%ee d&ed'eeef dB d(edeeedf  fd)d*Zd+ed,ee d-ede f fd.d/Z!ded0e"d&ed'eeef dB d1edeedf fd2d3Z#dededB f fd4d5Z$dededB f fd6d7Z%	d;d+e&d8e'dB deeB fd9d:Z(  Z)S )<OpenAIServingEmbeddingembdF)trust_request_chat_templatelog_error_stackengine_clientmodelsrequest_loggerNchat_templatechat_template_content_formatr+   r,   returnc          	         s^   t  j||||d || _|| _|| _| jj}t|o|j| _	|r*|j
r*|j
| _
d S d | _
d S )N)r-   r.   r/   r,   )super__init__r0   r1   r+   model_configpooler_configboolenable_chunked_processingsupports_chunked_processingmax_embed_len)	selfr-   r.   r/   r0   r1   r+   r,   r6   	__class__ \/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/entrypoints/pooling/embed/serving.pyr4   1   s(   zOpenAIServingEmbedding.__init__ctxc              
      s   zW|  |j|_t|jtr;| j|jj|jj| jd}|d ur#|W S | j	|j|jj
| j| jd dI d H \}|_W d S t|jtrS| j|j|jjd dI d H |_W d S | dW S  ttfyx } ztd | t|W  Y d }~S d }~ww )N)request_chat_templatechat_template_kwargsr+   )default_templatedefault_template_content_formatdefault_template_kwargs)prompt_inputprompt_embedsz#Invalid classification request typez$Error in preprocessing prompt inputs)_maybe_get_adaptersrequestlora_request
isinstancer   _validate_chat_templater0   rB   r+   _preprocess_chatmessagesr1   engine_promptsr   _preprocess_completioninputcreate_error_response
ValueError	TypeErrorlogger	exceptionstr)r;   r@   error_check_ret_er>   r>   r?   _preprocessS   s@   
z"OpenAIServingEmbedding._preprocessfinal_res_batch
request_idcreated_time
model_nameencoding_format)floatbase64embed_dtype
endiannessc                 C   s   t ttgtt tB f |dkrtntt||d}g }	d}
t	|D ]\}}t
|||d}|j}|	| |
t|7 }
q!t|
|
d}t||||	|dS )Nra   )rc   rd   r   )index	embedding)prompt_tokenstotal_tokensidcreatedmodeldatausage)r
   r   r!   listra   rW   r   r   r   	enumerater   prompt_token_idsappendlenr   r   )r;   r\   r]   r^   r_   r`   rc   rd   	encode_fnitemsnum_prompt_tokensidx	final_resitemrq   rn   r>   r>   r?   %request_output_to_embed_json_responsex   s>   

z<OpenAIServingEmbedding.request_output_to_embed_json_response)bytes
bytes_onlyc              	   C   sF   t |||d\}}	}
|dkrd ndt||||	|
di}t||dS )N)pooling_outputsrc   rd   r|   metadatari   )contentheaders)r   jsondumpsr   )r;   r\   r]   r^   r_   r`   rc   rd   r   ru   rn   r   r>   r>   r?   &request_output_to_embed_bytes_response   s$   
z=OpenAIServingEmbedding.request_output_to_embed_bytes_responsec              	   C   s   |j j}|j j}|j j}|dks|dkr#| |j|j|j|j|||S |dks+|dkr:| 	|j|j|j|j|||S t
| d S )Nra   rb   r{   r|   )rI   r`   rc   rd   rz   r\   r]   r^   r_   r   r   )r;   r@   r`   rc   rd   r>   r>   r?   _build_response   s0   

z&OpenAIServingEmbedding._build_responsec                 C   s   | j jS )z?Get the model's effective maximum sequence length for chunking.)r5   max_model_len)r;   r>   r>   r?   _get_max_position_embeddings   s   z3OpenAIServingEmbedding._get_max_position_embeddingsc                 C   s   t |ttfo	| jS )z<Check if chunked processing should be used for this request.)rK   r   r   r9   )r;   rI   r>   r>   r?   _should_use_chunked_processing   s   z5OpenAIServingEmbedding._should_use_chunked_processing	token_idspooling_paramstrace_headers
prompt_idxc              
      s   g }|   }tt||D ]>\}}	|j d| d| }
t|	d}| j|
|||jd |j| j	}|
 }| jj|||
|j|||jjd}|| q|S )z1Process a single prompt using chunked processing.z-prompt--chunk-)rq   paramsrJ   rJ   tokenization_kwargsr   priority)r   rp   r%   r]   r   _log_inputsrJ   rI   build_tok_paramsr5   get_encode_kwargsr-   encoder   rr   )r;   r@   r   r   r   r   
generatorsmax_pos_embeddings	chunk_idxchunk_tokenschunk_request_idchunk_engine_prompt
tok_paramsr   original_generatorr>   r>   r?   _process_chunked_request   s6   	

z/OpenAIServingEmbedding._process_chunked_requestrI   	input_ids
input_textc                    s   t |}t|ttfrT| |}|  }| jdurd}| j}nd}| j}d}	d}
||kr6t|	j	|||d||krN|rDt
d|| n
t|
j	d||dt||d	S t |||S )
z>Override to support chunked processing for embedding requests.Nzmaximum embedding input lengthzmaximum context lengthzThis model's {length_type} is {max_length_value} tokens. However, you requested {token_num} tokens in the input for embedding generation. Please reduce the length of the input.zThis model's {length_type} is {max_length_value} tokens. However, you requested {token_num} tokens in the input for embedding generation. Please reduce the length of the input or enable chunked processing.)length_typemax_length_value	token_numzOInput length %s exceeds max_position_embeddings %s, will use chunked processingz"maximum position embeddings length)promptrq   )rs   rK   r   r   r   r   r:   r   rS   formatrU   infor   r3   _validate_input)r;   rI   r   r   r   enable_chunkedr   r   r   validation_error_msgchunked_processing_error_msgr<   r>   r?   r   '  sJ   


z&OpenAIServingEmbedding._validate_inputengine_promptprompt_indexc           	   	      s\   |j  d| }| j||||jd |j| j}| }| jj||||j|||jj	dS )zACreate a generator for a single prompt using standard processing.-r   r   )
r]   r   rJ   rI   r   r5   r   r-   r   r   )	r;   r@   r   r   r   r   request_id_itemr   r   r>   r>   r?   _create_single_prompt_generatorr  s&   	z6OpenAIServingEmbedding._create_single_prompt_generatorc              
      s8  |  |j}|st |I dH S g }zp|jdu rdn	| |jjI dH }| |}t|t	r3|W S |j
du r>| dW S |  }t|j
D ]5\}}d|v rk|d }	t|	|krk| ||	|||I dH }
||
 qG| |||||I dH }|| qGt| |_W dS  ty } z| |W  Y d}~S d}~ww )z'Override to support chunked processing.NEngine prompts not availablerq   )r   rI   r3   _prepare_generatorsraw_request_get_trace_headersr   _create_pooling_paramsrK   r   rO   rR   r   rp   rs   r   extendr   rr   r$   result_generator	Exception)r;   r@   use_chunkedr   r   r   r   ir   rq   chunk_generators	generatorrZ   r<   r>   r?   r     sN   








z*OpenAIServingEmbedding._prepare_generatorsc              
      s  z|j du r| dW S | |j}|s!t j|dI dH W S |jdu r,| dW S i }i }|j2 z3 dH W \}}d|jv r|jd}zt	||
dd  }W n ttfy`   |}Y nw ||vrtdd	d	|jdd	 d
||< || }	t|ts| dt|j   W S t|jdr|jj}
nt|jdr|jj}
n| dt|jj   W S t|
tjstj|
tjd}
|jdu r| d  W S t|j}|
jtjd| }|	d du r||	d< n|	d  |7  < |	d  |7  < |	d  d7  < q3|jd}zt	|d }W n ttfy   |}Y nw |||< q36 g }t|j }t|D ]}||v r|| }	|	d }|	d }|durt|tjrt|t	tfr|d	kr|| }t|d}|j | }d|vrv| d| d  W S |d }t|	d ||d	dd}|| q+| d|   W S ||v r|||  q+| d|   W S ||_ W dS  t!y } z| |W  Y d}~S d}~ww )zCollect and aggregate batch results
        with support for chunked processing.

        For chunked requests, performs online aggregation to
        minimize memory usage.
        For regular requests, collects results normally.
        Nr   )r@   zResult generator not availabler   r   r      r   )weighted_sumtotal_weightchunk_countr]   z9Expected PoolingRequestOutput for chunked embedding, got rm   rf   zUnsupported output type: )dtypez6prompt_token_ids cannot be None for chunked processingr   r   r   )rm   rq   zChunked prompt z does not contain token IDsr]   T)r]   rq   outputsnum_cached_tokensfinishedz&Failed to aggregate chunks for prompt zResult not found for prompt )"rO   rR   r   rI   r3   _collect_batchr   r]   splitintre   rS   
IndexErrorrK   r!   type__name__hasattrr   rm   rf   torchTensortensorfloat32rq   rs   torangera   r    rr   r\   r   )r;   r@   r   prompt_aggregatorsshort_prompts_results
result_idxresultpartsr   
aggregatorembedding_dataweightweighted_embeddingr\   num_promptsr   r   final_embeddingpooling_output_dataoriginal_promptoriginal_token_idspooling_request_outputrZ   r<   r>   r?   r     s   













O












z%OpenAIServingEmbedding._collect_batchr   c                    sF   | j  }| j d| ||j }t||||d}| |I dH S )z
        Embedding API similar to OpenAI's API.

        See https://platform.openai.com/docs/api-reference/embeddings/create
        for the API specification. This API mimics the OpenAI Embedding API.
        r   )rI   r   r_   r]   N)r.   r_   request_id_prefix_base_request_idr]   r(   handle)r;   rI   r   r_   r]   r@   r>   r>   r?   create_embeddingy  s   
z'OpenAIServingEmbedding.create_embedding)N)*r   
__module____qualname__r   r   r   r   rW   r   r7   r4   r(   r   r[   ro   r!   r   r   r&   r'   r   rz   r   r   r   r   r   r"   r   r   r   objectr   r   r#   r   r   r   r   r   r   __classcell__r>   r>   r<   r?   r)   .   s    
	
"
%	
1	
"

 
1K

 = .r)   )Br   collections.abcr   r   r   	functoolsr   typingr   r   r   r	   r
   r   fastapir   typing_extensionsr   vllm.engine.protocolr   vllm.entrypoints.chat_utilsr   vllm.entrypoints.loggerr   'vllm.entrypoints.openai.engine.protocolr   r   &vllm.entrypoints.openai.engine.servingr   r   &vllm.entrypoints.openai.models.servingr   'vllm.entrypoints.pooling.embed.protocolr   r   r   r   r   r   vllm.entrypoints.pooling.utilsr   r   r   vllm.inputs.datar   vllm.loggerr   vllm.outputsr    r!   vllm.pooling_paramsr"   vllm.renderers.inputsr#   vllm.utils.async_utilsr$   vllm.utils.collection_utilsr%   vllm.utils.serial_utilsr&   r'   r   rU   r(   __annotations__r)   r>   r>   r>   r?   <module>   s4   
 