o
    
۾ia                     @   s  U d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d d	lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d d
lm&Z' d dlm(Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZDmEZE d dlFmGZGmHZHmIZImJZJmKZKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW erd dlXZXd dlYZYneWdeZ dZYeWdeZ dZXe?e[Z\de]fd d!Z^G d"d# d#e_Z`d$d%d&d'ZaG d(d) d)e9d*d+ZbG d,d- d-e9d*d+ZcG d.d/ d/e9d*d+ZdG d0d1 d1e9d*d+ZeG d2d3 d3e9d*d+ZfG d4d5 d5e9d*d+ZgG d6d7 d7e4ZhG d8d9 d9e9d*d+ZiG d:d; d;e9d*d+ZjG d<d= d=e9d*d+ZkG d>d? d?e9d*d+ZlG d@dA dAe9d*d+Zme'ecB e B egB e!B eiB ejB edB eeB ekB elB e]B emB Z&eendB< G dCdD dDe9d*d+Zoe)eoB e0B Z(eendE< G dFdG dGe9d*d+ZpedH ZqedI ZredJ ZsedKZtG dLdM dMeIZudNeveXjw dOeQfdPdQZxdReveye]dSf  dOeQfdTdUZzdVe]dReve dOeQfdWdXZ{G dYdZ dZeeet Z|d[eve}e~e]dB f  dOeQd\eve] fd]d^Zd_eye]eve}e~e]dB f  f dOeQd`eye]eve] f dae}eDeEf fdbdcZG ddde dee|e}e~e]dB f  ZG dfdg dge|e
e}e~e]dB f   ZG dhdi dieZG djdk dkeZG dldm dmeZdnee]B dB fdodpZd*dqdnee]B dB dredae]dB fdsdtZeeZd*dqdnee]B dB dredae]dB fdudvZdweye]evf dxeve] dae]fdydzZdweye]evf dxeve] d{edae]fd|d}Zeee"ZeeedZeeeeZeee Zeee!ZeeeiZeeemZe6ejZe6ecjZe6egjZe6e-jZe]eye]e]f B e+B ehB Zeend~< dd dd dd dd dd dd dd dd dd dd dd dd dd dZeye]ee&gef f end< de&dae}e]ef fddZdZde]dee& de|ded{edaevep fddZde&deded{edaedB f
ddZeeeZeee%Zde(de|derd{edaevep f
ddZdevep dadfddZdeve( de=derdae}evep eDdB eEdB f fddZdeve( de=derdae}evep eDdB eEdB f fddZdevep fddZdde]fddZdS )    N)ABCabstractmethod)Counterdefaultdict)	AwaitableCallableIterable)cached_property	lru_cachepartial)
accumulate)Path)TYPE_CHECKINGAnyGenericLiteral	TypeAliasTypeVarcast)#ChatCompletionAssistantMessageParam#ChatCompletionContentPartImageParam(ChatCompletionContentPartInputAudioParam%ChatCompletionContentPartRefusalParam"ChatCompletionContentPartTextParamChatCompletionFunctionToolParam"ChatCompletionMessageToolCallParamChatCompletionToolMessageParam)ChatCompletionContentPartParam)ChatCompletionMessageParam)
InputAudio)ResponseInputImageParam)Message)Image)	BaseModel
ConfigDictTypeAdapter)Required	TypedDict)envs)ModelConfig)init_logger)SupportsMultiModal)MULTIMODAL_REGISTRYMultiModalDataDictMultiModalUUIDDict)MultiModalBatchedFieldMultiModalFlatFieldMultiModalSharedFieldVisionChunkVisionChunkImageVisionChunkVideo)MEDIA_CONNECTOR_REGISTRYMediaConnector)BaseMultiModalProcessorrandom_uuid)
is_list_of)
LazyLoadertransformerstorchnamec                 C   s<   | dkrddl m} tjdtdd |S tdtd| )	Nresolve_hf_chat_templater   )resolve_chat_templatez`vllm.entrypoints.chat_utils.resolve_hf_chat_template` has been moved to `vllm.renderers.hf.resolve_chat_template`. The old name will be removed in v0.16.   )
stacklevelzmodule z has no attribute )vllm.renderers.hfr@   warningswarnDeprecationWarningAttributeError__name__)r>   r@    rI   O/home/ubuntu/.local/lib/python3.10/site-packages/vllm/entrypoints/chat_utils.py__getattr__E   s   rK   c                   @   s   e Zd ZdZdS )ChatTemplateResolutionErrorzRaised when chat template resolution fails.

    This is a subclass of ValueError for backward compatibility with
    existing exception handlers.
    N)rH   
__module____qualname____doc__rI   rI   rI   rJ   rL   V   s    rL   z<##IMAGE##>z<##AUDIO##>z<##VIDEO##>)imageaudiovideoc                   @      e Zd ZU ee ed< dS )AudioURLurlNrH   rM   rN   r&   str__annotations__rI   rI   rI   rJ   rT   e      
 rT   F)totalc                   @   *   e Zd ZU ee ed< eed  ed< dS )#ChatCompletionContentPartAudioParam	audio_urltypeN)rH   rM   rN   r&   rT   rX   r   rI   rI   rI   rJ   r\   l      
 r\   c                   @   F   e Zd ZU eeeef B dB ed< 	 eed  ed< 	 edB ed< dS ))ChatCompletionContentPartImageEmbedsParamNimage_embedsr^   uuidrH   rM   rN   rW   dictrX   r&   r   rI   rI   rI   rJ   ra   s      
 ra   c                   @   r`   ))ChatCompletionContentPartAudioEmbedsParamNaudio_embedsr^   rc   rd   rI   rI   rI   rJ   rg      rf   rg   c                   @   rS   )VideoURLrU   NrV   rI   rI   rI   rJ   ri      rY   ri   c                   @   r[   )#ChatCompletionContentPartVideoParam	video_urlr^   N)rH   rM   rN   r&   ri   rX   r   rI   rI   rI   rJ   rj      r_   rj   c                   @   s&   e Zd ZU dZejed< eddZdS )PILImagez#
    A PIL.Image.Image object.
    	image_pilT)arbitrary_types_allowedN)rH   rM   rN   rO   r"   rX   r$   model_configrI   rI   rI   rJ   rl      s   
 
rl   c                   @   s*   e Zd ZU dZedB ed< edB ed< dS )(CustomChatCompletionContentPILImageParamzA simpler version of the param that only accepts a PIL image.

    Example:
    {
        "image_pil": ImageAsset('cherry_blossom').pil_image
    }
    Nrm   rc   )rH   rM   rN   rO   rl   rX   rW   rI   rI   rI   rJ   rp      
   
 rp   c                   @   *   e Zd ZU dZedB ed< edB ed< dS )+CustomChatCompletionContentSimpleImageParamzA simpler version of the param that only accepts a plain image_url.
    This is supported by OpenAI API, although it is not documented.

    Example:
    {
        "image_url": "https://example.com/image.jpg"
    }
    N	image_urlrc   rH   rM   rN   rO   rW   rX   rI   rI   rI   rJ   rs      s
   
 	rs   c                   @   s   e Zd ZU dZedB ed< dS )+CustomChatCompletionContentSimpleAudioParamzA simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "audio_url": "https://example.com/audio.mp3"
    }
    Nr]   ru   rI   rI   rI   rJ   rv      s   
 rv   c                   @   rr   )+CustomChatCompletionContentSimpleVideoParamzA simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "video_url": "https://example.com/video.mp4"
    }
    Nrk   rc   ru   rI   rI   rI   rJ   rw      rq   rw   c                   @   s:   e Zd ZU dZee ed< 	 eed< 	 eed  ed< dS )!CustomThinkCompletionContentParamzA Think Completion Content Param that accepts a plain text and a boolean.

    Example:
    {
        "thinking": "I am thinking about the answer",
        "closed": True,
        "type": "thinking"
    }
    thinkingclosedr^   N)	rH   rM   rN   rO   r&   rW   rX   boolr   rI   rI   rI   rJ   rx      s   
 
rx   r   c                   @   sz   e Zd ZU dZee ed< 	 eee B ed< 	 eed< 	 edB ed< 	 e	e
 dB ed< 	 edB ed< 	 ee dB ed	< dS )
 CustomChatCompletionMessageParamz0Enables custom roles in the Chat Completion API.rolecontentr>   Ntool_call_id
tool_calls	reasoningtools)rH   rM   rN   rO   r&   rW   rX   listr   r   r   r   rI   rI   rI   rJ   r|     s    
 r|   r   c                   @   s   e Zd ZU ee ed< 	 edB eeeef  B ed< 	 edB ed< 	 edB ed< 	 ee	 dB ed< 	 edB ed< 	 edB ed< 	 ee
 dB ed	< dS )
ConversationMessager}   Nr~   r   r>   r   r   reasoning_contentr   )rH   rM   rN   r&   rW   rX   r   re   r   r   r   rI   rI   rI   rJ   r   5  s"   
 r   )autostringopenai)r   r   )rP   rQ   rR   rb   rh   vision_chunk_Tc                   @   s   e Zd ZdS )_BatchedSingleItemFieldN)rH   rM   rN   rI   rI   rI   rJ   r   ]  s    r   tensorsmm_processorc                    s   | d }|j jj }t| dkr.|jdkr.|jd dkr.|jd |kr.td t	ddS |j t
 fdd| D r?t S d	d
 | D }dgt|fdd
tt|D }t|dS )Nr         zBatched multi-modal embedding inputs are deprecated for Chat API. Please pass a separate content part for each multi-modal item.)
batch_sizec                 3   s    | ]}|j  kV  qd S N)shape).0t)first_shaperI   rJ   	<genexpr>u  s    z _detect_field.<locals>.<genexpr>c                 S   s   g | ]}t |qS rI   )len)r   tensorrI   rI   rJ   
<listcomp>x      z!_detect_field.<locals>.<listcomp>c                    s$   g | ]}t  |  |d   fqS )r   )slice)r   i)
slice_idxsrI   rJ   r   z  s    )slices)infoctxro   get_inputs_embeds_sizer   ndimr   loggerwarningr   allr/   r   ranger0   )r   r   
first_itemhidden_sizesize_per_itemr   rI   )r   r   rJ   _detect_fielda  s&   




r   
data_itemsztorch.Tensorc                    s    si S t  d  tfdd dd  D rtd fddD  fdd D }z6t|i fd	dD fd
dD }|D ] jfdd D dd|< qSW |S  t	yy   t
d Y |S w )Nr   c                 3   s     | ]}t |  kV  qd S r   )setkeysr   item)
first_keysrI   rJ   r     s    z _merge_embeds.<locals>.<genexpr>r   zCAll dictionaries in the list of embeddings must have the same keys.c                    s&   i | ]  t  fd dD qS )c                       g | ]}|  qS rI   rI   r   keyrI   rJ   r     r   ,_merge_embeds.<locals>.<dictcomp>.<listcomp>)r   )r   )r   r   r   rJ   
<dictcomp>  s    z!_merge_embeds.<locals>.<dictcomp>c                    s.   i | ]\ } |j  fd dD ddqS )c                    r   rI   rI   r   r   rI   rJ   r     r   r   F
pin_memory)_reduce_data)r   field)r   r   rJ   r     s    c                    s   i | ]}| | j qS rI   )r   r   r   )parsed_configsrI   rJ   r         c                    s.   g | ]} | | krt  | ts|qS rI   )
isinstancer   r   )fieldsparsed_fieldsrI   rJ   r     s    z!_merge_embeds.<locals>.<listcomp>c                    r   rI   rI   r   r   rI   rJ   r     r   Fr   zKError when parsing merged embeddings. Falling back to auto-detected fields.)r   r   any
ValueErroritems_get_mm_fields_configr<   BatchFeaturer   	Exceptionr   	exception)r   r   data_mergedkeys_to_updaterI   )r   r   r   r   r   r   r   rJ   _merge_embeds  sD   
	
r   modalityc                    s|   t |dkr|S tdd |D r|S t|tjr.|  d  fdd|D }t||  S t|tr8t||S tt|)Nr   c                 s   s    | ]}|d u V  qd S r   rI   r   rI   rI   rJ   r         z#_get_embeds_data.<locals>.<genexpr>_embedsc                    s   g | ]} |iqS rI   rI   r   
embeds_keyrI   rJ   r     r   z$_get_embeds_data.<locals>.<listcomp>)	r   r   r:   r=   Tensorr   re   NotImplementedErrorr^   )r   r   r   
dict_itemsrI   r   rJ   _get_embeds_data  s   


r   c                       s   e Zd ZdZdef fddZedefddZe	defdd	Z
edee fd
dZe	dd Ze	dd Ze	dd Zedd ZdedededB fddZedddZ  ZS )BaseMultiModalItemTrackerz
    Tracks multi-modal items in a given request and ensures that the number
    of multi-modal items in a given request does not exceed the configured
    maximum per prompt.
    ro   c                    s@   t    || _tttt f t| _tttt f t| _d S r   )	super__init___model_configr   rW   r   r   _items_by_modality_modality_order)selfro   	__class__rI   rJ   r     s   
z"BaseMultiModalItemTracker.__init__returnc                 C   s   t | jjddS )zDCheck if model uses unified vision_chunk modality for images/videos.use_unified_vision_chunkF)getattrr   	hf_configr   rI   rI   rJ   !use_unified_vision_chunk_modality  s   z;BaseMultiModalItemTracker.use_unified_vision_chunk_modalityc                 C   s   | j S r   )r   r   rI   rI   rJ   ro     s   z&BaseMultiModalItemTracker.model_configc                 C   s$   ddl m} || j}ttt |S )Nr   )get_model_cls) vllm.model_executor.model_loaderr   ro   r   r^   r+   )r   r   	model_clsrI   rI   rJ   r     s   
z#BaseMultiModalItemTracker.model_clsc                 C      | j jS r   )r   allowed_local_media_pathr   rI   rI   rJ   r        z2BaseMultiModalItemTracker.allowed_local_media_pathc                 C   r   r   )r   allowed_media_domainsr   rI   rI   rJ   r     r   z/BaseMultiModalItemTracker.allowed_media_domainsc                 C   s   t S r   )r,   r   rI   rI   rJ   mm_registry     z%BaseMultiModalItemTracker.mm_registryc                 C   s   | j | jS r   )r   create_processorro   r   rI   rI   rJ   r     s   z&BaseMultiModalItemTracker.mm_processorr   r   Nc                 C   s   | dd}|}| jo|dv }|rd}t| j| d }n	t| j| d }| jj}|dur>|jr>||dkr>|dr>n| j	j
|| |rY| j| | | jd | n| j| | | j||S )z
        Add a multi-modal item to the current prompt and returns the
        placeholder string to use, if any.

        An optional uuid can be added which serves as a unique identifier of the
        media.
        r    )rR   rP   r   r   Nr   )replacer   r   r   ro   multimodal_configenable_mm_embedsget_limit_per_promptendswithr   r   validate_num_itemsappendr   r   get_placeholder_str)r   r   r   input_modalityoriginal_modalityuse_vision_chunk	num_items	mm_configrI   rI   rJ   add  s.   zBaseMultiModalItemTracker.addBaseMultiModalContentParserc                 C      t r   r   r   rI   rI   rJ   create_parser(  r   z'BaseMultiModalItemTracker.create_parserr   r   )rH   rM   rN   rO   r)   r   r	   r{   r   propertyro   r^   r+   r   r   r   r   r   ModalityStrr   rW   r   r   r   __classcell__rI   rI   r   rJ   r     s(    	



/r   vision_chunk_itemsvision_chunks_modality_orderc                 C   s~  dd | D }t | t |ksJ dt |  dt | dg }d}t|| D ]\}\}}|dkrJt|drD|j}	|td|	|d	 q'|| q'|d
krt|dr|d urz@|p\t }
t|trmt |dkrm|d }n|}|	|}t
|D ]\}}|td|d |
 d| ||d d qx|d7 }W q' ty } ztd| || W Y d }~q'd }~ww || q'||fS )Nc                 S      g | ]\}}|qS rI   rI   r   datarc   rI   rI   rJ   r   4  r   z/_resolve_vision_chunk_items.<locals>.<listcomp>zvision_chunk items (z) and modality_order (z) must have same lengthr   rP   media)r^   rP   rc   rR   split_video_chunksr   video_chunk-prompt)r^   r  rc   	video_idxr  z Failed to split video chunks: %s)r   ziphasattrr	  r   r3   r9   r   tupler
  	enumerater4   r   r   r   )r  r   r  vision_chunks_uuidsprocessed_chunksr  inner_modalityr  rc   
image_data
video_uuid
video_datavideo_chunksr   vcerI   rI   rJ   _resolve_vision_chunk_items-  sZ   



	
r  items_by_modalitymodality_orderr   c                 C   s  d| v rd| v rt dd| v rd| v rt di }i }d| v r:tddd | d D ||d< d	d | d D |d< d| v rTd
d | d D |d< dd | d D |d< d| v rrtddd | d D ||d< dd | d D |d< d| v rdd | d D |d< dd | d D |d< d| v rdd | d D |d< dd | d D |d< d| v rt| d ||dg \}}||d< ||d< ||fS )NrP   rb   z4Mixing raw image and embedding inputs is not allowedrQ   rh   z4Mixing raw audio and embedding inputs is not allowedc                 S      g | ]\}}|qS rI   rI   r  rI   rI   rJ   r   z  r   z"_resolve_items.<locals>.<listcomp>c                 S   r  rI   rI   r  rI   rI   rJ   r   }  r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   rR   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   r   )r   r   r  get)r  r   r  mm_datamm_uuidsr  vision_chunk_uuidsrI   rI   rJ   _resolve_itemsk  sL   
r$  c                   @   4   e Zd ZdeedB edB f fddZdddZdS )	MultiModalItemTrackerr   Nc                 C   s    | j sdS tt| j | j| jS )NNN)r   r$  re   r   r   r   rI   rI   rJ   resolve_items  s
   z#MultiModalItemTracker.resolve_itemsr   c                 C      t | S r   )MultiModalContentParserr   rI   rI   rJ   r        z#MultiModalItemTracker.create_parserr   rH   rM   rN   r  r-   r.   r(  r   rI   rI   rI   rJ   r&    s
    

r&  c                   @   r%  )	AsyncMultiModalItemTrackerr   Nc                    s6   | j sdS dd | j  D I d H }t|| j| jS )Nr'  c                    s$   i | ]\}}|t j| I d H qS r   )asynciogather)r   r   corosrI   rI   rJ   r     s
    z<AsyncMultiModalItemTracker.resolve_items.<locals>.<dictcomp>)r   r   r$  r   r   )r   resolved_items_by_modalityrI   rI   rJ   r(    s   
z(AsyncMultiModalItemTracker.resolve_itemsr   c                 C   r)  r   )AsyncMultiModalContentParserr   rI   rI   rJ   r     r+  z(AsyncMultiModalItemTracker.create_parserr   r,  rI   rI   rI   rJ   r-    s
    
r-  c                       sh  e Zd Zd! fddZdededB fddZdeeef fd	d
Z	e
d"dedB dedB ddfddZe
	d"deeeef B dB dedB ddfddZe
	d"dejdB dedB ddfddZe
d"dedB dedB ddfddZe
	d"dedB dedB ddfddZe
	d"deeeef B dB dedB ddfddZe
d"dedB dedB ddfdd Z  ZS )#r   r   Nc                    s   t    tt| _d S r   )r   r   r   r   _placeholder_storager   r   rI   rJ   r     s   
z$BaseMultiModalContentParser.__init__r   placeholderc                 C   s$   t | }|r| j| | d S d S r   )MODALITY_PLACEHOLDERS_MAPr3  r   )r   r   r4  mod_placeholderrI   rI   rJ   _add_placeholder  s   z,BaseMultiModalContentParser._add_placeholderc                 C   s
   t | jS r   )re   r3  r   rI   rI   rJ   mm_placeholder_storage  s   
z2BaseMultiModalContentParser.mm_placeholder_storagert   rc   c                 C   r   r   r   )r   rt   rc   rI   rI   rJ   parse_image  r   z'BaseMultiModalContentParser.parse_imagerb   c                 C   r   r   r   )r   rb   rc   rI   rI   rJ   parse_image_embeds     z.BaseMultiModalContentParser.parse_image_embedsrm   c                 C   r   r   r   )r   rm   rc   rI   rI   rJ   parse_image_pil     z+BaseMultiModalContentParser.parse_image_pilr]   c                 C   r   r   r   )r   r]   rc   rI   rI   rJ   parse_audio  r   z'BaseMultiModalContentParser.parse_audioinput_audioc                 C   r   r   r   )r   r?  rc   rI   rI   rJ   parse_input_audio  r=  z-BaseMultiModalContentParser.parse_input_audiorh   c                 C   r   r   r   )r   rh   rc   rI   rI   rJ   parse_audio_embeds  r;  z.BaseMultiModalContentParser.parse_audio_embedsrk   c                 C   r   r   r   )r   rk   rc   rI   rI   rJ   parse_video  r   z'BaseMultiModalContentParser.parse_video)r   Nr   )rH   rM   rN   r   r  rW   r7  re   r   r8  r   r9  r:  r"   r<  r>  r   r@  rA  rB  r  rI   rI   r   rJ   r     s\    ""*r   c                       s:  e Zd Zdeddf fddZedefddZddedB d	edB ddfd
dZ		ddee
eef B dB d	edB ddfddZ	ddee
eef B dB d	edB ddfddZ	ddejdB d	edB ddfddZddedB d	edB ddfddZ	ddedB d	edB ddfddZddedB d	edB ddfddZ  ZS )r*  trackerr   Nc                    D   t    || _| jjj}t|dd }tjtj	||j
|jd| _d S Nmedia_io_kwargs)rF  r   r   r   r   _trackerro   r   r   r5   loadr(   VLLM_MEDIA_CONNECTORr   r   
_connectorr   rC  r   rF  r   rI   rJ   r     s   

z MultiModalContentParser.__init__c                 C   r   r   rH  ro   r   rI   rI   rJ   ro     r   z$MultiModalContentParser.model_configrt   rc   c                 C   6   |r| j |nd }| jd||f}| d| d S NrP   )rK  fetch_imagerH  r   r7  )r   rt   rc   rP   r4  rI   rI   rJ   r9       z#MultiModalContentParser.parse_imagerb   c                    s    j  }|jstdt|tr% fdd| D } jd||f}t|t	r9 j
|} jd||f}|d u rF jdd |f} d| d S )N9You must set `--enable-mm-embeds` to input `image_embeds`c                       i | ]\}}| j |qS rI   rK  fetch_image_embeddingr   kvr   rI   rJ   r   #      z>MultiModalContentParser.parse_image_embeds.<locals>.<dictcomp>rb   rP   )ro   get_multimodal_configr   r   r   re   r   rH  r   rW   rK  rU  r7  )r   rb   rc   r   embedsr4  	embeddingrI   r   rJ   r:    s    



z*MultiModalContentParser.parse_image_embedsrh   c                    s    j  }|jstdt|tr& fdd| D } jd||f}nt|t	r; j
|} jd||f}n	 jdd |f} d| d S )N9You must set `--enable-mm-embeds` to input `audio_embeds`c                    rS  rI   rK  fetch_audio_embeddingrV  r   rI   rJ   r   >  rY  z>MultiModalContentParser.parse_audio_embeds.<locals>.<dictcomp>rh   rQ   )ro   rZ  r   r   r   re   r   rH  r   rW   rK  r_  r7  )r   rh   rc   r   r[  r4  r\  rI   r   rJ   rA  2  s   



z*MultiModalContentParser.parse_audio_embedsrm   c                 C   s"   | j d||f}| d| d S rO  )rH  r   r7  )r   rm   rc   r4  rI   rI   rJ   r<  K  s   z'MultiModalContentParser.parse_image_pilr]   c                 C   rN  NrQ   )rK  fetch_audiorH  r   r7  )r   r]   rc   rQ   r4  rI   rI   rJ   r>  Q  rQ  z#MultiModalContentParser.parse_audior?  c                 C   H   |r| dd}| dd}|rd| d| }nd }nd }| ||S Nr  r   formatzdata:audio/z;base64,r   r>  r   r?  rc   
audio_dataaudio_formatr]   rI   rI   rJ   r@  W     z)MultiModalContentParser.parse_input_audiork   c                 C   s8   |r	| j j|dnd }| jd||f}| d| d S )N)rk   rR   )rK  fetch_videorH  r   r7  )r   rk   rc   rR   r4  rI   rI   rJ   rB  g  s   z#MultiModalContentParser.parse_videor   )rH   rM   rN   r&  r   r  r)   ro   rW   r9  re   r:  rA  r"   r<  r>  r   r@  rB  r  rI   rI   r   rJ   r*    sN     	


 
(r*  c                       s  e Zd Zdeddf fddZedefddZdedB d	edB fd
dZ	d$dedB d	edB ddfddZ
	d$deeeef B dB d	edB ddfddZ	d$deeeef B dB d	edB ddfddZ	d$dejdB d	edB ddfddZdedB d	edB fddZd$dedB d	edB ddfddZ	d$dedB d	edB ddfddZdedB d	edB fd d!Zd$dedB d	edB ddfd"d#Z  ZS )%r2  rC  r   Nc                    rD  rE  rG  rL  r   rI   rJ   r   o  s   

z%AsyncMultiModalContentParser.__init__c                 C   r   r   rM  r   rI   rI   rJ   ro   |  r   z)AsyncMultiModalContentParser.model_configrt   rc   c                    $   |r| j |I d H nd }||fS r   )rK  fetch_image_async)r   rt   rc   rP   rI   rI   rJ   _image_with_uuid_async     z3AsyncMultiModalContentParser._image_with_uuid_asyncc                 C   *   |  ||}| jd|}| d| d S rO  )rm  rH  r   r7  )r   rt   rc   coror4  rI   rI   rJ   r9       z(AsyncMultiModalContentParser.parse_imagerb   c                        j  }|jstdtjttjt	t
tjf B d B t
d B f   }t|t	r9 fdd| D }|||f t|t
rK j|}|||f |d u rV|d |f  jd|} d| d S )NrR  c                    rS  rI   rT  rV  r   rI   rJ   r     rY  zCAsyncMultiModalContentParser.parse_image_embeds.<locals>.<dictcomp>rb   rP   )ro   rZ  r   r   r.  Futurer  r=   r   re   rW   r   r   
set_resultrK  rU  rH  r   r7  )r   rb   rc   r   futurer[  r\  r4  rI   r   rJ   r:    (   
"


z/AsyncMultiModalContentParser.parse_image_embedsrh   c                    rr  )Nr]  c                    rS  rI   r^  rV  r   rI   rJ   r     rY  zCAsyncMultiModalContentParser.parse_audio_embeds.<locals>.<dictcomp>rh   rQ   )ro   rZ  r   r   r.  rs  r  r=   r   re   rW   r   r   rt  rK  r_  rH  r   r7  )r   rh   rc   r   ru  r[  r\  r4  rI   r   rJ   rA    rv  z/AsyncMultiModalContentParser.parse_audio_embedsrm   c                 C   s^   t jttjd B td B f   }|r|||f n|d |f | jd|}| d| d S rO  )	r.  rs  r  r"   rW   rt  rH  r   r7  )r   rm   rc   ru  r4  rI   rI   rJ   r<    s   z,AsyncMultiModalContentParser.parse_image_pilr]   c                    rk  r   )rK  fetch_audio_async)r   r]   rc   rQ   rI   rI   rJ   _audio_with_uuid_async  rn  z3AsyncMultiModalContentParser._audio_with_uuid_asyncc                 C   ro  r`  )rx  rH  r   r7  )r   r]   rc   rp  r4  rI   rI   rJ   r>    rq  z(AsyncMultiModalContentParser.parse_audior?  c                 C   rb  rc  re  rf  rI   rI   rJ   r@    ri  z.AsyncMultiModalContentParser.parse_input_audiork   c                    rk  r   )rK  fetch_video_async)r   rk   rc   rR   rI   rI   rJ   _video_with_uuid_async  rn  z3AsyncMultiModalContentParser._video_with_uuid_asyncc                 C   ro  )NrR   )rz  rH  r   r7  )r   rk   rc   rp  r4  rI   rI   rJ   rB    rq  z(AsyncMultiModalContentParser.parse_videor   )rH   rM   rN   r-  r   r  r)   ro   rW   rm  r9  re   r:  rA  r"   r<  rx  r>  r   r@  rz  rB  r  rI   rI   r   rJ   r2  n  sT     	
#
#
 
(r2  chat_templatec                    s    du rdS t  tr  stdt  trLd}t fdd|D sFt  sHddlm} |  }| sJtd  d	  d
| dS dS dS t	t
  d)z5Raises if the provided chat template appears invalid.Nz-the supplied chat template path doesn't exist{}
c                 3       | ]}| v V  qd S r   rI   r   cr{  rI   rJ   r     r   z)validate_chat_template.<locals>.<genexpr>r   CHAT_TEMPLATES_DIRz#The supplied chat template string (z/) appears path-like, but doesn't exist! Tried:  and z" is not a valid chat template type)r   r   existsFileNotFoundErrorrW   r   /vllm.transformers_utils.chat_templates.registryr  r   	TypeErrorr^   )r{  JINJA_CHARSr  builtin_template_pathrI   r  rJ   validate_chat_template  s.   


r  
is_literalr  c                   sB   d u rd S |rt  trtd S zt }| W  d    W S 1 s(w   Y  W d S  ty } zet  tr= d}t fdd|D sddlm} |  }z!t|}| W  d    W W  Y d }~S 1 sow   Y  W n ty   d  d  d	| d
| }t	||w t
 ddW  Y d }~S d }~ww )Nz<chat_template is expected to be read directly from its valuer|  c                 3   r}  r   rI   r~  r  rI   rJ   r   :  r   z&_load_chat_template.<locals>.<genexpr>r   r  zThe supplied chat template (z=) looks like a file path, but it failed to be opened. Tried: r  z
. Reason: Tr  )r   r   r  openreadOSErrorr   r  r  r   _load_chat_template)r{  r  fr  r  r  r  msgrI   r  rJ   r  "  sJ   

(

2
r  c                C   s   t | |dS )Nr  )_cached_load_chat_template)r{  r  rI   rI   rJ   load_chat_templateU  s   r  placeholder_storagetextsc                 C   s6   t |D ]\}}|| v r| | d||< qd|S )Nr   
)r  popjoin)r  r  idxelemrI   rI   rJ   _get_interleaved_text_prompt]  s
   
r  interleave_stringsc                 C   s   t dd |  D }|rt| |}nd|}g }|D ]1}||  ||8  < || dk rCtd| td| td| d|	|g||   q|rXd||g S d|S )	z;Combine multimodal prompts for a multimodal language model.c                 S   s   g | ]	}|D ]}|qqS rI   rI   )r   r  rX  rI   rI   rJ   r   v  s    z4_get_full_multimodal_text_prompt.<locals>.<listcomp>r  r   zPlaceholder count is negative! Ensure that the 'interleave_strings' flag is disabled (current value: %s) when manually placing image placeholders.zInput prompt: %szFound more 'zA' placeholders in input prompt than actual multimodal data items.)
r   valuesr  r  countr   errordebugr   extend)r  r  r  placeholder_countstext_promptmissing_placeholdersr4  rI   rI   rJ    _get_full_multimodal_text_prompti  s,   


r  _ContentPartc                 C      t | dd S Ntext_TextParserr   partrI   rI   rJ   <lambda>      r  c                 C   r  )Nry   )_ThinkParserr   r  rI   rI   rJ   r    r  c                 C   r  r  r  r  rI   rI   rJ   r    r  c                 C   r  r  r  r  rI   rI   rJ   r    r  c                 C   r  )Nrt   )_ResponsesInputImageParserr   r  rI   rI   rJ   r    r  c                 C      t | di dd S )Nrt   rU   )_ImageParserr   r  rI   rI   rJ   r    r   c                 C   r  )Nrb   )_ImageEmbedsParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrh   )_AudioEmbedsParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrm   )_PILImageParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nr]   rU   )_AudioParserr   r  rI   rI   rJ   r    r   c                 C   r  )Nr?  )_InputAudioParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrefusal)_RefusalParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrk   rU   )_VideoParserr   r  rI   rI   rJ   r    r   )r  ry   
input_textoutput_textinput_imagert   rb   rh   rm   r]   r?  r  rk   MM_PARSER_MAPr  c                 C   s  t | tsJ | dd}| dd}t |tr;|tv r;|du r;t| | }|dkr7| dddkr7td ||fS |du sC|durd| v ratt| }|dd}t |tr]|dd}d|fS d	| v rttt	| }|d	d}d	|fS d
| v rtt
| }|d
d}d
|fS d| v rtt| }|dd}	d|	fS d| v rtt| }|dd}
t |
tr|
dd}
d|
fS | ddurttttf | }d|fS d| v rtt| }|dd}t |tr|dd}d|fS tdt |tstd|dfS )a  
    Parses a given multi-modal content part based on its type.

    Args:
        part: A dict containing the content part, with a potential 'type' field.

    Returns:
        A tuple (part_type, content) where:
        - part_type: Type of the part (e.g., 'text', 'image_url').
        - content: Parsed content (e.g., text, image URL).

    Raises:
        ValueError: If the 'type' field is missing and no direct URL is found.
    r^   Nrc   rt   detailr   zB'image_url.detail' is currently not supported and will be ignored.rU   rm   rb   rh   r]   r?  rk   z(Missing 'type' field in multimodal part.z(Invalid 'type' field in multimodal part.zunknown part_type content)r   re   r   rW   r  r   r   r   rs   rp   ra   rg   rv   rw   r   )r  	part_typerc   r~   image_paramsrt   rm   rb   audio_paramsrh   r]   input_audio_paramsvideo_paramsrk   rI   rI   rJ   #_parse_chat_message_content_mm_part  st   





r  )r  r  r}   parts
mm_tracker
wrap_dictsc                C   s   t t  }| }|D ]}t||||d}|r|| q|r&t| |dgS tt t |}	| }
|
r:t	|
|	|}nd
|	}t| |dgS )Nr  r  )r}   r~   r  )r   r  r    _parse_chat_message_content_partr   r   r   rW   r8  r  r  )r}   r  r  r  r  r~   	mm_parserr  	parse_resr  r8  r  rI   rI   rJ   !_parse_chat_message_content_parts*  s,   


r  r  c                C   s  t | tr| S t| \}}|tv r|du rtd| | dS |dv r0tt|}|r.d|dS |S | dd}|dur>t|}d}|dkrY|durNttj|nd}	|	|	| d}n|d	v rktt|}|
|| d}ny|d
kr|dur~tttttf B |nd}||| d}n[|dkr|durtttttf B |nd}||| d}n=|dkrtt|}||| d}n+|dkrtt|}
||
| d}n|dkrtt|}||| d}ntd| |rd|iS |rt| S dS )a|  Parses a single part of a conversation. If wrap_dicts is True,
    structured dictionary pieces for texts and images will be
    wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
    {"type": "image"}, respectively. Otherwise multimodal data will be
    handled by mm_parser, and texts will be returned as strings to be joined
    with multimodal placeholders.
    NzKSkipping multimodal part '%s' (type: '%s') with empty / unparsable content.)r  r  r  r  ry   r  r^   r  rc   rm   rP   )rt   r  rb   rh   rQ   r]   r?  rk   rR   zUnknown part type: r^   )r   rW   r  PART_TYPES_TO_SKIP_NONE_CONTENTr   r   r   r   r"   r<  r9  re   r:  rA  r>  r   r@  rB  r   r5  )r  r  r  r  r  r~   str_contentrc   r   image_contentdict_contentrI   rI   rJ   r  O  sn   



""



r  messagecontent_formatc           
      C   s  | d }|  d}|  d}|d u rg }nt|tr!td|dg}t||||dk|d}|D ]^}|dkr]t| }	d	|	v rJ|	d	 d urJt|	d	 |d	< |d ur\tt||d< tt||d
< n|dkrot| }	d|	v ro|	d |d< d| v rt| d tr| d |d< |dkr|  dd |d< q.|S )Nr}   r~   r   r  r  r   r  	assistantr   r   toolr   r>   	developerr   )	r   r   rW   r   r  _AssistantParserr   r   _ToolParser)
r  r  r  r  r}   r~   r   result
result_msg
parsed_msgrI   rI   rJ   _parse_chat_message_content  sF   


r  messagesc                 C   s   | D ]I}|d dkrKd|v rK| d}t|tsqt|dkr&|dd  q|D ]"}|d  d }rDt|ttfsCt||d d< q(i |d d< q(qd S )Nr}   r  r   r   function	arguments)r   r   r   r   r  re   jsonloads)r  r  r   r   r~   rI   rI   rJ   _postprocess_messages  s    

r  ro   c           	      C   sd   g }t |}| D ]}t||||dko|jd uo|jjd}|| qt| | \}}|||fS Nr   )r  )r&  r  r   interleave_mm_stringsr  r  r(  	r  ro   r  conversationr  r  sub_messagesr!  r"  rI   rI   rJ   parse_chat_messages  s    	
r  c           	         sl   g }t |}| D ]}t||||dko|jd uo|jjd}|| q	t| | I d H \}}|||fS r  )r-  r  r   r  r  r  r(  r  rI   rI   rJ   parse_chat_messages_async  s"   	
r  r  c                 C   sD   d}| D ]}|d dkr| d}||d urtt|nd7 }q|S )Nr   r}   r  r   )r   r   r   )r  r  r  r   rI   rI   rJ   get_history_tool_calls_cnt3  s   
r  randomid_typec                 C   s$   | dkrd| d| S dt   S )Nkimi_k2z
functions.:zchatcmpl-tool-r8   )r  	func_namer  rI   rI   rJ   make_tool_call_id<  s   r  )r  NN)r.  r  rD   abcr   r   collectionsr   r   collections.abcr   r   r   	functoolsr	   r
   r   	itertoolsr   pathlibr   typingr   r   r   r   r   r   r   openai.types.chatr   r   r   r   r   r   r   r   r   $OpenAIChatCompletionContentPartParamr    OpenAIChatCompletionMessageParam@openai.types.chat.chat_completion_content_part_input_audio_paramr   openai.types.responsesr    openai_harmonyr!   OpenAIHarmonyMessagePILr"   pydanticr#   r$   r%   typing_extensionsr&   r'   vllmr(   vllm.configr)   vllm.loggerr*   vllm.model_executor.modelsr+   vllm.multimodalr,   r-   r.   vllm.multimodal.inputsr/   r0   r1   r2   r3   r4   vllm.multimodal.mediar5   r6   vllm.multimodal.processingr7   
vllm.utilsr9   vllm.utils.collection_utilsr:   vllm.utils.import_utilsr;   r=   r<   globalsrH   r   rW   rK   r   rL   r5  rT   r\   ra   rg   ri   rj   rl   rp   rs   rv   rw   rx   rX   r|   r   ChatTemplateContentFormatOptionChatTemplateContentFormatr  r   r   r   r   r   re   r   r   r   r  objectr  r$  r&  r-  r   r*  r2  r  r{   r  r  r  r  r  r  r  r  r  r  r  r  validate_pythonr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rI   rI   rI   rJ   <module>   s(  
$(
 
			




3
d
>

 1
=p "

0






7









 

`
%

R

2
!
!	