o
    ip!                     @  s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	 ddl
mZ er<d d	lmZ d d
lmZ d dlmZ eG dd dZeG dd dZdHddZdIddZdJddZdKd d!Zd"ed#fd$ed%fd&ed'fd(ed)fd*ed+fd,ed-fd.ed/fd0ed1fd2ed1fd3ed4fd5ed6fd7ed6fd8ed9fd:ed9fd;ed<fd=ed>ejfgZed?Zd@ZdLdCdDZdMdFdGZdS )N    )annotations)	dataclassN)TYPE_CHECKING)Optional)Message   )ATTR_MODEL_NAME)EngineCoreOutput)RequestState)RequestStateStatsc                   @  s   e Zd ZU dZdZded< dZded< dZded< d	Zd
ed< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dS )RequestDataz>Container for vLLM request data extracted from engine outputs.NOptional[str]promptr   intinput_tokensoutput_tokens stroutput_textfinish_reasonzOptional[int]embedding_dimr   num_embeddings	lora_namenum_cached_tokensOptional[float]temperaturetop_pn
max_tokenszOptional[list[int]]input_)__name__
__module____qualname____doc__r   __annotations__r   r   r   r   r   r   r   r   r   r   r   r   r    r%   r%   \/home/ubuntu/.local/lib/python3.10/site-packages/ddtrace/contrib/internal/vllm/extractors.pyr      s    
 r   c                   @  sN   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded< dS )	LatencyMetricsz5Computed latency metrics from vLLM RequestStateStats.Nr   time_to_first_tokentime_in_queuetime_in_model_prefilltime_in_model_decodetime_in_model_inference)
r    r!   r"   r#   r(   r$   r)   r*   r+   r,   r%   r%   r%   r&   r'   '   s   
 r'   returntuple[int, Optional[int]]c                 C  sp   | du st | jdkrdS t | jdkrdt| jd fS t| jd t| jd }}|dkr4d|fS ||fS )z:Extract (num_embeddings, embedding_dim) from torch.Tensor.Nr   )r   Nr   )lenshaper   )tensorfirstlastr%   r%   r&   get_embedding_shape2   s   r5   	req_state'RequestState'engine_core_output'EngineCoreOutput'c              
   C  s   |j du}| j}|s%|du r%| jr%| jr%t| jdd}|r%|j| jdd}t|| jp+d| j|j	| j
| j| j| jd}|jrCt|j|_|rXt|j \}}||_||_| j|_|S | jr`| jj|_|S )zExtract request data from engine-side structures.

    Args:
        req_state: RequestState from OutputProcessor.request_states
        engine_core_output: EngineCoreOutput from engine_core

    Returns:
        RequestData for LLMObs tagging
    N	tokenizerF)skip_special_tokensr   )r   r   r   r   r   r   r   r   )pooling_outputr   prompt_token_idsdetokenizergetattrdecoder   
prompt_lenr   r   r   r   r   max_tokens_paramr   r   r5   r   r   r   r   )r6   r8   is_embeddingprompt_textr:   datanum_embemb_dimr%   r%   r&   extract_request_data@   s6   


rH   r   c                 C  s   t | tdS )z7Extract injected model name (set by traced_engine_init)N)r?   r   )instancer%   r%   r&   get_model_nameq   s   rJ   statsOptional['RequestStateStats']Optional[LatencyMetrics]c                 C  s   | sdS t  }| jrt| j|_| j}| j}| j}| j}|r'|r't|| |_|r2|r2t|| |_	|rA|rA||krAt|| |_
|rL|rLt|| |_|S )ztExtract latency metrics from vLLM RequestStateStats.

    Single source of truth for latency calculation logic.
    N)r'   first_token_latencyfloatr(   	queued_tsscheduled_tsfirst_token_tslast_token_tsr)   r*   r+   r,   )rK   metricsqueued	scheduledfirst_token
last_tokenr%   r%   r&   extract_latency_metricsv   s$   rY   z<|start_header_id|>z?<\|start_header_id\|>(system|user|assistant)<\|end_header_id\|>z<|header_start|>z9<\|header_start\|>(system|user|assistant)<\|header_end\|>z<|start_of_role|>zF<\|start_of_role\|>(system|user|assistant|documents?)<\|end_of_role\|>z<start_of_turn>z"<start_of_turn>(system|user|model)z<beginning_of_sentence>z'<beginning_of_sentence>(system|user|ai)z<|im_start|>z%<\|im_start\|>(system|user|assistant)z	<|User|>:z<\|(User|Assistant)\|>:z
<|system|>z<\|(system|user|assistant)\|>z<|user|>u   <｜u   <｜(User|Assistant)｜>z<_user>z<_(system|user|bot)>z	<_system>z<#user#>z<#(system|user|bot)#>z
<#system#>z### Instructionz!### (Instruction|Response|Input):zUser:z"^(System|User|Assistant|Falcon): ?u   <\|im_end\|>|<\|eot_id\|>|<\|end\|>|<\|eot\|>|<\|eom\|>|<\|end_of_text\|>|<end_of_turn>|<end_of_sentence>|<\|eos\|>|<｜end▁of▁sentence｜>)	assistantmodelaibotresponsefalconr   list[Message]c                 C  sB   | sg S t D ]\}}|| v rt| |}|r|  S qtd| dgS )z2Parse a formatted prompt into structured messages.r   rolecontent)_ROLE_PATTERNS_parse_with_patternr   )r   markerpatternmessagesr%   r%   r&   parse_prompt_to_messages   s   
ri   r   c                 C  s   t || }|sg S g }t|D ]O\}}|d}|sq| }| }|d t|k r5||d   nt| }	t	d| ||	 
d }
|tv rW|
sW|t|d krWq|t||
d q|S )z+Parse prompt using a specific role pattern.r   r   :ra   )listfinditer	enumerategrouplowerendr0   start_END_MARKERSsublstripstrip_ASSISTANT_ROLESappendr   )r   role_patternmatchesrh   imatch
role_matchrb   rq   rp   rc   r%   r%   r&   re      s    
(re   )r-   r.   )r6   r7   r8   r9   r-   r   )r-   r   )rK   rL   r-   rM   )r   r   r-   r`   )r   r   r-   r`   )
__future__r   dataclassesr   retypingr   r   ddtrace.llmobs.typesr   
_constantsr   vllm.v1.engine.corer	   vllm.v1.engine.output_processorr
   vllm.v1.statsr   r   r'   r5   rH   rJ   rY   compile	MULTILINErd   rr   rv   ri   re   r%   r%   r%   r&   <module>   sT    



1
%"
