o
    -i;                     @   s  U d dl Z d dlZd dlZd dlmZmZ d dlmZmZ d dl	m
Z
mZmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d d	lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d d
lm&Z' d dlm(Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZDmEZE d dlFmGZGmHZHmIZImJZJmKZKmLZL d dlMmNZN d dlOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW erd dlXZXd dlYZYneWdeZ dZYeWdeZ dZXe?e[Z\de]fd d!Z^G d"d# d#e_Z`d$d%d&d'ZaG d(d) d)e9d*d+ZbG d,d- d-e9d*d+ZcG d.d/ d/e9d*d+ZdG d0d1 d1e9d*d+ZeG d2d3 d3e9d*d+ZfG d4d5 d5e9d*d+ZgG d6d7 d7e4ZhG d8d9 d9e9d*d+ZiG d:d; d;e9d*d+ZjG d<d= d=e9d*d+ZkG d>d? d?e9d*d+ZlG d@dA dAe9d*d+Zme'ecB e B egB e!B eiB ejB edB eeB ekB elB e]B emB Z&eendB< G dCdD dDe9d*d+Zoe)eoB e0B Z(eendE< G dFdG dGe9d*d+ZpedH ZqedI ZredJ ZsedKZtG dLdM dMeIZudNeveXjw dOeNfdPdQZxdReveye]dSf  dOeNfdTdUZzdVe]dReve dOeNfdWdXZ{dYeEdZeDd[eEfd\d]Z|dZeDd[eve] fd^d_Z}G d`da daeeet Z~dbeye]eveee]dB f  f dOeNdceye]eve] f d[eeDeEf fdddeZG dfdg dge~eee]dB f  ZG dhdi die~e
eee]dB f   ZG djdk dkeZG dldm dmeZG dndo doeZdpee]B dB fdqdrZd*dsdpee]B dB dted[e]dB fdudvZeeZd*dsdpee]B dB dted[e]dB fdwdxZdyeye]evf dzeve] d[e]fd{d|Zdyeye]evf dzeve] d}ed[e]fd~dZeee"ZeeedZeeeeZeee Zeee!ZeeeiZeeemZe6ejZe6ecjZe6egjZe6e-jZe]eye]e]f B e+B ehB Zeend< dd dd dd dd dd dd dd dd dd dd dd dd dd dZeye]ee&gef f end< de&d[ee]ef fddZdZde]dee& de~ded}ed[evep fddZde&deded}ed[edB f
ddZeeeZeee%Zde(de~derd}ed[evep f
ddZdevep d[dfddZdeve( de=derd[eevep eDdB eEdB f fddZdeve( de=derd[eevep eDdB eEdB f fddZdevep fddZdde]fddZdS )    N)ABCabstractmethod)Counterdefaultdict)	AwaitableCallableIterable)cached_property	lru_cachepartial)
accumulate)Path)TYPE_CHECKINGAnyGenericLiteral	TypeAliasTypeVarcast)#ChatCompletionAssistantMessageParam#ChatCompletionContentPartImageParam(ChatCompletionContentPartInputAudioParam%ChatCompletionContentPartRefusalParam"ChatCompletionContentPartTextParamChatCompletionFunctionToolParam"ChatCompletionMessageToolCallParamChatCompletionToolMessageParam)ChatCompletionContentPartParam)ChatCompletionMessageParam)
InputAudio)ResponseInputImageParam)Message)Image)	BaseModel
ConfigDictTypeAdapter)Required	TypedDict)envs)ModelConfig)init_logger)SupportsMultiModal)MULTIMODAL_REGISTRYMultiModalDataDictMultiModalUUIDDict)MultiModalBatchedFieldMultiModalFlatFieldMultiModalSharedFieldVisionChunkVisionChunkImageVisionChunkVideo)BaseMultiModalProcessor)MEDIA_CONNECTOR_REGISTRYMediaConnectorrandom_uuid)
is_list_of)
LazyLoadertransformerstorchnamec                 C   s<   | dkrddl m} tjdtdd |S tdtd| )	Nresolve_hf_chat_templater   )resolve_chat_templatez`vllm.entrypoints.chat_utils.resolve_hf_chat_template` has been moved to `vllm.renderers.hf.resolve_chat_template`. The old name will be removed in v0.16.   )
stacklevelzmodule z has no attribute )vllm.renderers.hfr@   warningswarnDeprecationWarningAttributeError__name__)r>   r@    rI   X/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/entrypoints/chat_utils.py__getattr__E   s   rK   c                   @   s   e Zd ZdZdS )ChatTemplateResolutionErrorzRaised when chat template resolution fails.

    This is a subclass of ValueError for backward compatibility with
    existing exception handlers.
    N)rH   
__module____qualname____doc__rI   rI   rI   rJ   rL   V   s    rL   z<##IMAGE##>z<##AUDIO##>z<##VIDEO##>)imageaudiovideoc                   @      e Zd ZU ee ed< dS )AudioURLurlNrH   rM   rN   r&   str__annotations__rI   rI   rI   rJ   rT   e      
 rT   F)totalc                   @   *   e Zd ZU ee ed< eed  ed< dS )#ChatCompletionContentPartAudioParam	audio_urltypeN)rH   rM   rN   r&   rT   rX   r   rI   rI   rI   rJ   r\   l      
 r\   c                   @   F   e Zd ZU eeeef B dB ed< 	 eed  ed< 	 edB ed< dS ))ChatCompletionContentPartImageEmbedsParamNimage_embedsr^   uuidrH   rM   rN   rW   dictrX   r&   r   rI   rI   rI   rJ   ra   s      
 ra   c                   @   r`   ))ChatCompletionContentPartAudioEmbedsParamNaudio_embedsr^   rc   rd   rI   rI   rI   rJ   rg      rf   rg   c                   @   rS   )VideoURLrU   NrV   rI   rI   rI   rJ   ri      rY   ri   c                   @   r[   )#ChatCompletionContentPartVideoParam	video_urlr^   N)rH   rM   rN   r&   ri   rX   r   rI   rI   rI   rJ   rj      r_   rj   c                   @   s&   e Zd ZU dZejed< eddZdS )PILImagez#
    A PIL.Image.Image object.
    	image_pilT)arbitrary_types_allowedN)rH   rM   rN   rO   r"   rX   r$   model_configrI   rI   rI   rJ   rl      s   
 
rl   c                   @   s*   e Zd ZU dZedB ed< edB ed< dS )(CustomChatCompletionContentPILImageParamzA simpler version of the param that only accepts a PIL image.

    Example:
    {
        "image_pil": ImageAsset('cherry_blossom').pil_image
    }
    Nrm   rc   )rH   rM   rN   rO   rl   rX   rW   rI   rI   rI   rJ   rp      
   
 rp   c                   @   *   e Zd ZU dZedB ed< edB ed< dS )+CustomChatCompletionContentSimpleImageParamzA simpler version of the param that only accepts a plain image_url.
    This is supported by OpenAI API, although it is not documented.

    Example:
    {
        "image_url": "https://example.com/image.jpg"
    }
    N	image_urlrc   rH   rM   rN   rO   rW   rX   rI   rI   rI   rJ   rs      s
   
 	rs   c                   @   s   e Zd ZU dZedB ed< dS )+CustomChatCompletionContentSimpleAudioParamzA simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "audio_url": "https://example.com/audio.mp3"
    }
    Nr]   ru   rI   rI   rI   rJ   rv      s   
 rv   c                   @   rr   )+CustomChatCompletionContentSimpleVideoParamzA simpler version of the param that only accepts a plain audio_url.

    Example:
    {
        "video_url": "https://example.com/video.mp4"
    }
    Nrk   rc   ru   rI   rI   rI   rJ   rw      rq   rw   c                   @   s:   e Zd ZU dZee ed< 	 eed< 	 eed  ed< dS )!CustomThinkCompletionContentParamzA Think Completion Content Param that accepts a plain text and a boolean.

    Example:
    {
        "thinking": "I am thinking about the answer",
        "closed": True,
        "type": "thinking"
    }
    thinkingclosedr^   N)	rH   rM   rN   rO   r&   rW   rX   boolr   rI   rI   rI   rJ   rx      s   
 
rx   r   c                   @   sz   e Zd ZU dZee ed< 	 eee B ed< 	 eed< 	 edB ed< 	 e	e
 dB ed< 	 edB ed< 	 ee dB ed	< dS )
 CustomChatCompletionMessageParamz0Enables custom roles in the Chat Completion API.rolecontentr>   Ntool_call_id
tool_calls	reasoningtools)rH   rM   rN   rO   r&   rW   rX   listr   r   r   r   rI   rI   rI   rJ   r|     s    
 r|   r   c                   @   s   e Zd ZU ee ed< 	 edB eeeef  B ed< 	 edB ed< 	 edB ed< 	 ee	 dB ed< 	 edB ed< 	 edB ed< 	 ee
 dB ed	< dS )
ConversationMessager}   Nr~   r   r>   r   r   reasoning_contentr   )rH   rM   rN   r&   rW   rX   r   re   r   r   r   rI   rI   rI   rJ   r   5  s"   
 r   )autostringopenai)r   r   )rP   rQ   rR   rb   rh   vision_chunk_Tc                   @   s   e Zd ZdS )_BatchedSingleItemFieldN)rH   rM   rN   rI   rI   rI   rJ   r   ]  s    r   tensorsmm_processorc                    s   | d }|j jj }t| dkr.|jdkr.|jd dkr.|jd |kr.td t	ddS |j t
 fdd| D r?t S d	d
 | D }dgt|fdd
tt|D }t|dS )Nr         zBatched multi-modal embedding inputs are deprecated for Chat API. Please pass a separate content part for each multi-modal item.)
batch_sizec                 3   s    | ]}|j  kV  qd S N)shape).0t)first_shaperI   rJ   	<genexpr>u  s    z _detect_field.<locals>.<genexpr>c                 S   s   g | ]}t |qS rI   )len)r   tensorrI   rI   rJ   
<listcomp>x      z!_detect_field.<locals>.<listcomp>c                    s$   g | ]}t  |  |d   fqS )r   )slice)r   i)
slice_idxsrI   rJ   r   z  s    )slices)infoctxro   get_inputs_embeds_sizer   ndimr   loggerwarningr   allr/   r   ranger0   )r   r   
first_itemhidden_sizesize_per_itemr   rI   )r   r   rJ   _detect_fielda  s&   




r   
data_itemsztorch.Tensorc                    s    si S t  d  tfdd dd  D rtd fddD  fdd D }z6t|i fd	dD fd
dD }|D ] jfdd D dd|< qSW |S  t	yy   t
d Y |S w )Nr   c                 3   s     | ]}t |  kV  qd S r   )setkeysr   item)
first_keysrI   rJ   r     s    z _merge_embeds.<locals>.<genexpr>r   zCAll dictionaries in the list of embeddings must have the same keys.c                    s&   i | ]  t  fd dD qS )c                       g | ]}|  qS rI   rI   r   keyrI   rJ   r     r   ,_merge_embeds.<locals>.<dictcomp>.<listcomp>)r   )r   )r   r   r   rJ   
<dictcomp>  s    z!_merge_embeds.<locals>.<dictcomp>c                    s.   i | ]\ } |j  fd dD ddqS )c                    r   rI   rI   r   r   rI   rJ   r     r   r   F
pin_memory)_reduce_data)r   field)r   r   rJ   r     s    c                    s   i | ]}| | j qS rI   )r   r   r   )parsed_configsrI   rJ   r         c                    s.   g | ]} | | krt  | ts|qS rI   )
isinstancer   r   )fieldsparsed_fieldsrI   rJ   r     s    z!_merge_embeds.<locals>.<listcomp>c                    r   rI   rI   r   r   rI   rJ   r     r   Fr   zKError when parsing merged embeddings. Falling back to auto-detected fields.)r   r   any
ValueErroritems_get_mm_fields_configr<   BatchFeaturer   	Exceptionr   	exception)r   r   data_mergedkeys_to_updaterI   )r   r   r   r   r   r   r   rJ   _merge_embeds  sD   
	
r   modalityc                    s|   t |dkr|S tdd |D r|S t|tjr.|  d  fdd|D }t||  S t|tr8t||S tt|)Nr   c                 s   s    | ]}|d u V  qd S r   rI   r   rI   rI   rJ   r         z#_get_embeds_data.<locals>.<genexpr>_embedsc                    s   g | ]} |iqS rI   rI   r   
embeds_keyrI   rJ   r     r   z$_get_embeds_data.<locals>.<listcomp>)	r   r   r:   r=   Tensorr   re   NotImplementedErrorr^   )r   r   r   
dict_itemsrI   r   rJ   _get_embeds_data  s   


r   mm_uuidsmm_datareturnc                 C   sf   | d}|du r| S t| }g }|D ]}t|tsJ | d}|dur*|| q|r1||d< |S )as  Rebuild mm_uuids after vision_chunk processing.

    When videos are split into chunks, the original UUIDs need to be updated
    to reflect the new UUIDs generated for each chunk.

    Args:
        mm_uuids: Original UUIDs dictionary
        mm_data: Processed multimodal data with vision_chunk items

    Returns:
        Updated UUIDs dictionary with chunk UUIDs
    r   Nrc   )getre   r   append)r   r   vision_chunks	new_uuidsvision_chunk_uuidsr   uuid_valrI   rI   rJ   rebuild_mm_uuids_from_mm_data  s   


r   c                 C   s   |  d}|du rg S tt}|D ]#}t|tsJ | ddkr4| dd}| dd}|| | qg }t| D ]}|d||  q=|S )	a   Build video prompts from vision_chunk data.

    Collects prompts from video chunks and groups them by video_idx.

    Args:
        mm_data: Processed multimodal data with vision_chunk items

    Returns:
        List of video prompts, one per video.
    r   Nr^   video_chunk	video_idxr   prompt )	r   r   r   r   re   r   sortedr   join)r   r   video_prompts_dictr   r   r   video_promptsrI   rI   rJ    build_video_prompts_from_mm_data  s   
r   c                       s   e Zd ZdZdef fddZedefddZe	defdd	Z
edee fd
dZe	dd Ze	dd Ze	dd Zedd ZdedededB fddZedddZ  ZS )BaseMultiModalItemTrackerz
    Tracks multi-modal items in a given request and ensures that the number
    of multi-modal items in a given request does not exceed the configured
    maximum per prompt.
    ro   c                    s@   t    || _tttt f t| _tttt f t| _d S r   )	super__init___model_configr   rW   r   r   _items_by_modality_modality_order)selfro   	__class__rI   rJ   r     s   
z"BaseMultiModalItemTracker.__init__r   c                 C   s   t | jjddS )zDCheck if model uses unified vision_chunk modality for images/videos.use_unified_vision_chunkF)getattrr   	hf_configr   rI   rI   rJ   !use_unified_vision_chunk_modality!  s   z;BaseMultiModalItemTracker.use_unified_vision_chunk_modalityc                 C   s   | j S r   )r   r   rI   rI   rJ   ro   &  s   z&BaseMultiModalItemTracker.model_configc                 C   s$   ddl m} || j}ttt |S )Nr   )get_model_cls) vllm.model_executor.model_loaderr   ro   r   r^   r+   )r   r   	model_clsrI   rI   rJ   r   *  s   
z#BaseMultiModalItemTracker.model_clsc                 C      | j jS r   )r   allowed_local_media_pathr   rI   rI   rJ   r   1     z2BaseMultiModalItemTracker.allowed_local_media_pathc                 C   r   r   )r   allowed_media_domainsr   rI   rI   rJ   r   5  r   z/BaseMultiModalItemTracker.allowed_media_domainsc                 C   s   t S r   )r,   r   rI   rI   rJ   mm_registry9     z%BaseMultiModalItemTracker.mm_registryc                 C   s   | j | jS r   )r   create_processorro   r   rI   rI   rJ   r   =  s   z&BaseMultiModalItemTracker.mm_processorr   r   Nc                 C   s   | dd}|}| jo|dv }|rd}t| j| d }n	t| j| d }| j|| |r@| j| | | jd | n| j| | | j	||S )z
        Add a multi-modal item to the current prompt and returns the
        placeholder string to use, if any.

        An optional uuid can be added which serves as a unique identifier of the
        media.
        r   r   )rR   rP   r   r   )
replacer   r   r   r   validate_num_itemsr   r   r   get_placeholder_str)r   r   r   input_modalityoriginal_modalityuse_vision_chunk	num_itemsrI   rI   rJ   addA  s   zBaseMultiModalItemTracker.addBaseMultiModalContentParserc                 C      t r   r   r   rI   rI   rJ   create_parserf  r   z'BaseMultiModalItemTracker.create_parserr   r  )rH   rM   rN   rO   r)   r   r	   r{   r   propertyro   r^   r+   r   r   r   r   r   ModalityStrr   rW   r  r   r
  __classcell__rI   rI   r   rJ   r     s(    	



%r   items_by_modalityvision_chunk_modality_orderc                 C   sF  d| v rd| v rt dd| v rd| v rt di }i }d| v r:tddd | d D ||d< d	d | d D |d< d| v rTd
d | d D |d< dd | d D |d< d| v rrtddd | d D ||d< dd | d D |d< d| v rdd | d D |d< dd | d D |d< d| v rdd | d D |d< dd | d D |d< d| v r| d }|dg }dd | d D |d< dd t|D }t|t|ksJ dt| dt| dg }d}	t|D ]\}
\}}||
 }|\}}|t|d k r|nd }|dkr"t|dr|j}|td||d q|| q|dkrt|dr|d urzD|p8t	 }t
|trKt|dkrK|d }n|}||}t|D ]\}
}|td|d | d|
 |	|d  d! qV|	d7 }	W q ty } ztd"| || W Y d }~qd }~ww || q||d< ||fS )#NrP   rb   z4Mixing raw image and embedding inputs is not allowedrQ   rh   z4Mixing raw audio and embedding inputs is not allowedc                 S      g | ]\}}|qS rI   rI   r   datarc   rI   rI   rJ   r   z  r   z"_resolve_items.<locals>.<listcomp>c                 S      g | ]\}}|qS rI   rI   r  rI   rI   rJ   r   }  r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   rR   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   c                 S   r  rI   rI   r  rI   rI   rJ   r     r   r   c                 S   r  rI   rI   r  rI   rI   rJ   r     s    c                 S   s    g | ]\}}|d ur||fqS r   rI   )r   idxr   rI   rI   rJ   r     s
    zvision_chunk items (z) and modality_order (z) must have same lengthr   media)r^   rP   rc   split_video_chunksr   r   -r   )r^   r   rc   r   r   z Failed to split video chunks: %s)r   r   r   	enumerater   hasattrr  r   r3   r9   r   tupler  r4   r   r   r   )r  r   r  r   r   vision_chunk_itemsmodality_orderfiltered_itemsprocessed_chunksr   r   r  r   inner_modalityr  rc   r   
image_data
video_uuid
video_datavideo_chunksvcerI   rI   rJ   _resolve_itemsk  s   





	
r'  c                   @   4   e Zd ZdeedB edB f fddZdddZdS )	MultiModalItemTrackerr   Nc                 C   s    | j sdS tt| j | j| jS )NNN)r   r'  re   r   r   r   rI   rI   rJ   resolve_items  s
   z#MultiModalItemTracker.resolve_itemsr  c                 C      t | S r   )MultiModalContentParserr   rI   rI   rJ   r
       z#MultiModalItemTracker.create_parserr  rH   rM   rN   r  r-   r.   r+  r
  rI   rI   rI   rJ   r)    s
    

r)  c                   @   r(  )	AsyncMultiModalItemTrackerr   Nc                    s6   | j sdS dd | j  D I d H }t|| j| jS )Nr*  c                    s$   i | ]\}}|t j| I d H qS r   )asynciogather)r   r   corosrI   rI   rJ   r     s
    z<AsyncMultiModalItemTracker.resolve_items.<locals>.<dictcomp>)r   r   r'  r   r   )r   resolved_items_by_modalityrI   rI   rJ   r+    s   
z(AsyncMultiModalItemTracker.resolve_itemsr  c                 C   r,  r   )AsyncMultiModalContentParserr   rI   rI   rJ   r
    r.  z(AsyncMultiModalItemTracker.create_parserr  r/  rI   rI   rI   rJ   r0    s
    
r0  c                       sh  e Zd Zd! fddZdededB fddZdeeef fd	d
Z	e
d"dedB dedB ddfddZe
	d"deeeef B dB dedB ddfddZe
	d"dejdB dedB ddfddZe
d"dedB dedB ddfddZe
	d"dedB dedB ddfddZe
	d"deeeef B dB dedB ddfddZe
d"dedB dedB ddfdd Z  ZS )#r  r   Nc                    s   t    tt| _d S r   )r   r   r   r   _placeholder_storager   r   rI   rJ   r     s   
z$BaseMultiModalContentParser.__init__r   placeholderc                 C   s$   t | }|r| j| | d S d S r   )MODALITY_PLACEHOLDERS_MAPr6  r   )r   r   r7  mod_placeholderrI   rI   rJ   _add_placeholder  s   z,BaseMultiModalContentParser._add_placeholderc                 C   s
   t | jS r   )re   r6  r   rI   rI   rJ   mm_placeholder_storage  s   
z2BaseMultiModalContentParser.mm_placeholder_storagert   rc   c                 C   r  r   r	  )r   rt   rc   rI   rI   rJ   parse_image  r   z'BaseMultiModalContentParser.parse_imagerb   c                 C   r  r   r	  )r   rb   rc   rI   rI   rJ   parse_image_embeds     z.BaseMultiModalContentParser.parse_image_embedsrm   c                 C   r  r   r	  )r   rm   rc   rI   rI   rJ   parse_image_pil     z+BaseMultiModalContentParser.parse_image_pilr]   c                 C   r  r   r	  )r   r]   rc   rI   rI   rJ   parse_audio!  r   z'BaseMultiModalContentParser.parse_audioinput_audioc                 C   r  r   r	  )r   rB  rc   rI   rI   rJ   parse_input_audio%  r@  z-BaseMultiModalContentParser.parse_input_audiorh   c                 C   r  r   r	  )r   rh   rc   rI   rI   rJ   parse_audio_embeds+  r>  z.BaseMultiModalContentParser.parse_audio_embedsrk   c                 C   r  r   r	  )r   rk   rc   rI   rI   rJ   parse_video3  r   z'BaseMultiModalContentParser.parse_video)r   Nr   )rH   rM   rN   r   r  rW   r:  re   r   r;  r   r<  r=  r"   r?  rA  r   rC  rD  rE  r  rI   rI   r   rJ   r    s\    ""*r  c                       s:  e Zd Zdeddf fddZedefddZddedB d	edB ddfd
dZ		ddee
eef B dB d	edB ddfddZ	ddee
eef B dB d	edB ddfddZ	ddejdB d	edB ddfddZddedB d	edB ddfddZ	ddedB d	edB ddfddZddedB d	edB ddfddZ  ZS )r-  trackerr   Nc                    D   t    || _| jjj}t|dd }tjtj	||j
|jd| _d S Nmedia_io_kwargs)rI  r   r   r   r   _trackerro   multimodal_configr   r6   loadr(   VLLM_MEDIA_CONNECTORr   r   
_connectorr   rF  rL  rI  r   rI   rJ   r   9  s   

z MultiModalContentParser.__init__c                 C   r   r   rK  ro   r   rI   rI   rJ   ro   G  r   z$MultiModalContentParser.model_configrt   rc   c                 C   6   |r| j |nd }| jd||f}| d| d S NrP   )rO  fetch_imagerK  r  r:  )r   rt   rc   rP   r7  rI   rI   rJ   r<  K     z#MultiModalContentParser.parse_imagerb   c                    s    j  }|jstdt|tr% fdd| D } jd||f}t|t	r9 j
|} jd||f}|d u rF jdd |f} d| d S )N9You must set `--enable-mm-embeds` to input `image_embeds`c                       i | ]\}}| j |qS rI   rO  fetch_image_embeddingr   kvr   rI   rJ   r   ]      z>MultiModalContentParser.parse_image_embeds.<locals>.<dictcomp>rb   rP   )ro   get_multimodal_configenable_mm_embedsr   r   re   r   rK  r  rW   rO  rY  r:  )r   rb   rc   	mm_configembedsr7  	embeddingrI   r   rJ   r=  Q  s    



z*MultiModalContentParser.parse_image_embedsrh   c                    s    j  }|jstdt|tr& fdd| D } jd||f}nt|t	r; j
|} jd||f}n	 jdd |f} d| d S )N9You must set `--enable-mm-embeds` to input `audio_embeds`c                    rW  rI   rO  fetch_audio_embeddingrZ  r   rI   rJ   r   x  r]  z>MultiModalContentParser.parse_audio_embeds.<locals>.<dictcomp>rh   rQ   )ro   r^  r_  r   r   re   r   rK  r  rW   rO  re  r:  )r   rh   rc   r`  ra  r7  rb  rI   r   rJ   rD  l  s   



z*MultiModalContentParser.parse_audio_embedsrm   c                 C   s"   | j d||f}| d| d S rS  )rK  r  r:  )r   rm   rc   r7  rI   rI   rJ   r?    s   z'MultiModalContentParser.parse_image_pilr]   c                 C   rR  NrQ   )rO  fetch_audiorK  r  r:  )r   r]   rc   rQ   r7  rI   rI   rJ   rA    rU  z#MultiModalContentParser.parse_audiorB  c                 C   H   |r| dd}| dd}|rd| d| }nd }nd }| ||S Nr  r   formatzdata:audio/z;base64,r   rA  r   rB  rc   
audio_dataaudio_formatr]   rI   rI   rJ   rC       z)MultiModalContentParser.parse_input_audiork   c                 C   s8   |r	| j j|dnd }| jd||f}| d| d S )N)rk   rR   )rO  fetch_videorK  r  r:  )r   rk   rc   rR   r7  rI   rI   rJ   rE    s   z#MultiModalContentParser.parse_videor   )rH   rM   rN   r)  r   r  r)   ro   rW   r<  re   r=  rD  r"   r?  rA  r   rC  rE  r  rI   rI   r   rJ   r-  8  sN     	


 
(r-  c                       s  e Zd Zdeddf fddZedefddZdedB d	edB fd
dZ	d$dedB d	edB ddfddZ
	d$deeeef B dB d	edB ddfddZ	d$deeeef B dB d	edB ddfddZ	d$dejdB d	edB ddfddZdedB d	edB fddZd$dedB d	edB ddfddZ	d$dedB d	edB ddfddZdedB d	edB fd d!Zd$dedB d	edB ddfd"d#Z  ZS )%r5  rF  r   Nc                    rG  rH  rJ  rP  r   rI   rJ   r     s   

z%AsyncMultiModalContentParser.__init__c                 C   r   r   rQ  r   rI   rI   rJ   ro     r   z)AsyncMultiModalContentParser.model_configrt   rc   c                    $   |r| j |I d H nd }||fS r   )rO  fetch_image_async)r   rt   rc   rP   rI   rI   rJ   _image_with_uuid_async     z3AsyncMultiModalContentParser._image_with_uuid_asyncc                 C   *   |  ||}| jd|}| d| d S rS  )rs  rK  r  r:  )r   rt   rc   coror7  rI   rI   rJ   r<       z(AsyncMultiModalContentParser.parse_imagerb   c                        j  }|jstdtjttjt	t
tjf B d B t
d B f   }t|t	r9 fdd| D }|||f t|t
rK j|}|||f |d u rV|d |f  jd|} d| d S )NrV  c                    rW  rI   rX  rZ  r   rI   rJ   r     r]  zCAsyncMultiModalContentParser.parse_image_embeds.<locals>.<dictcomp>rb   rP   )ro   r^  r_  r   r1  Futurer  r=   r   re   rW   r   r   
set_resultrO  rY  rK  r  r:  )r   rb   rc   r`  futurera  rb  r7  rI   r   rJ   r=    (   
"


z/AsyncMultiModalContentParser.parse_image_embedsrh   c                    rx  )Nrc  c                    rW  rI   rd  rZ  r   rI   rJ   r     r]  zCAsyncMultiModalContentParser.parse_audio_embeds.<locals>.<dictcomp>rh   rQ   )ro   r^  r_  r   r1  ry  r  r=   r   re   rW   r   r   rz  rO  re  rK  r  r:  )r   rh   rc   r`  r{  ra  rb  r7  rI   r   rJ   rD    r|  z/AsyncMultiModalContentParser.parse_audio_embedsrm   c                 C   s^   t jttjd B td B f   }|r|||f n|d |f | jd|}| d| d S rS  )	r1  ry  r  r"   rW   rz  rK  r  r:  )r   rm   rc   r{  r7  rI   rI   rJ   r?    s   z,AsyncMultiModalContentParser.parse_image_pilr]   c                    rq  r   )rO  fetch_audio_async)r   r]   rc   rQ   rI   rI   rJ   _audio_with_uuid_async  rt  z3AsyncMultiModalContentParser._audio_with_uuid_asyncc                 C   ru  rf  )r~  rK  r  r:  )r   r]   rc   rv  r7  rI   rI   rJ   rA    rw  z(AsyncMultiModalContentParser.parse_audiorB  c                 C   rh  ri  rk  rl  rI   rI   rJ   rC     ro  z.AsyncMultiModalContentParser.parse_input_audiork   c                    rq  r   )rO  fetch_video_async)r   rk   rc   rR   rI   rI   rJ   _video_with_uuid_async0  rt  z3AsyncMultiModalContentParser._video_with_uuid_asyncc                 C   ru  )NrR   )r  rK  r  r:  )r   rk   rc   rv  r7  rI   rI   rJ   rE  6  rw  z(AsyncMultiModalContentParser.parse_videor   )rH   rM   rN   r0  r   r  r)   ro   rW   rs  r<  re   r=  rD  r"   r?  r~  rA  r   rC  r  rE  r  rI   rI   r   rJ   r5    sT     	
#
#
 
(r5  chat_templatec                    s    du rdS t  tr  stdt  trLd}t fdd|D sFt  sHddlm} |  }| sJtd  d	  d
| dS dS dS t	t
  d)z5Raises if the provided chat template appears invalid.Nz-the supplied chat template path doesn't exist{}
c                 3       | ]}| v V  qd S r   rI   r   cr  rI   rJ   r   H  r   z)validate_chat_template.<locals>.<genexpr>r   CHAT_TEMPLATES_DIRz#The supplied chat template string (z/) appears path-like, but doesn't exist! Tried:  and z" is not a valid chat template type)r   r   existsFileNotFoundErrorrW   r   /vllm.transformers_utils.chat_templates.registryr  r   	TypeErrorr^   )r  JINJA_CHARSr  builtin_template_pathrI   r  rJ   validate_chat_template=  s.   


r  
is_literalr  c                   sB   d u rd S |rt  trtd S zt }| W  d    W S 1 s(w   Y  W d S  ty } zet  tr= d}t fdd|D sddlm} |  }z!t|}| W  d    W W  Y d }~S 1 sow   Y  W n ty   d  d  d	| d
| }t	||w t
 ddW  Y d }~S d }~ww )Nz<chat_template is expected to be read directly from its valuer  c                 3   r  r   rI   r  r  rI   rJ   r   t  r   z&_load_chat_template.<locals>.<genexpr>r   r  zThe supplied chat template (z=) looks like a file path, but it failed to be opened. Tried: r  z
. Reason: Tr  )r   r   r  openreadOSErrorr   r  r  r   _load_chat_template)r  r  fr&  r  r  r  msgrI   r  rJ   r  \  sJ   

(

2
r  c                C   s   t | |dS )Nr  )_cached_load_chat_template)r  r  rI   rI   rJ   load_chat_template  s   r  placeholder_storagetextsc                 C   s6   t |D ]\}}|| v r| | d||< qd|S )Nr   
)r  popr   )r  r  r  elemrI   rI   rJ   _get_interleaved_text_prompt  s
   
r  interleave_stringsc                 C   s   t dd |  D }|rt| |}nd|}g }|D ]1}||  ||8  < || dk rCtd| td| td| d|	|g||   qd||g S )	z;Combine multimodal prompts for a multimodal language model.c                 S   s   g | ]	}|D ]}|qqS rI   rI   )r   r  r\  rI   rI   rJ   r     s    z4_get_full_multimodal_text_prompt.<locals>.<listcomp>r  r   zPlaceholder count is negative! Ensure that the 'interleave_strings' flag is disabled (current value: %s) when manually placing image placeholders.zInput prompt: %szFound more 'zA' placeholders in input prompt than actual multimodal data items.)
r   valuesr  r   countr   errordebugr   extend)r  r  r  placeholder_countstext_promptmissing_placeholdersr7  rI   rI   rJ    _get_full_multimodal_text_prompt  s(   

r  _ContentPartc                 C      t | dd S Ntext_TextParserr   partrI   rI   rJ   <lambda>      r  c                 C   r  )Nry   )_ThinkParserr   r  rI   rI   rJ   r    r  c                 C   r  r  r  r  rI   rI   rJ   r    r  c                 C   r  r  r  r  rI   rI   rJ   r    r  c                 C   r  )Nrt   )_ResponsesInputImageParserr   r  rI   rI   rJ   r    r  c                 C      t | di dd S )Nrt   rU   )_ImageParserr   r  rI   rI   rJ   r    r   c                 C   r  )Nrb   )_ImageEmbedsParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrh   )_AudioEmbedsParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrm   )_PILImageParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nr]   rU   )_AudioParserr   r  rI   rI   rJ   r    r   c                 C   r  )NrB  )_InputAudioParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrefusal)_RefusalParserr   r  rI   rI   rJ   r    r  c                 C   r  )Nrk   rU   )_VideoParserr   r  rI   rI   rJ   r    r   )r  ry   
input_textoutput_textinput_imagert   rb   rh   rm   r]   rB  r  rk   MM_PARSER_MAPr  c                 C   s  t | tsJ | dd}| dd}t |tr;|tv r;|du r;t| | }|dkr7| dddkr7td ||fS |du sC|durd| v ratt| }|dd}t |tr]|dd}d|fS d	| v rttt	| }|d	d}d	|fS d
| v rtt
| }|d
d}d
|fS d| v rtt| }|dd}	d|	fS d| v rtt| }|dd}
t |
tr|
dd}
d|
fS | ddurttttf | }d|fS d| v rtt| }|dd}t |tr|dd}d|fS tdt |tstd|dfS )a  
    Parses a given multi-modal content part based on its type.

    Args:
        part: A dict containing the content part, with a potential 'type' field.

    Returns:
        A tuple (part_type, content) where:
        - part_type: Type of the part (e.g., 'text', 'image_url').
        - content: Parsed content (e.g., text, image URL).

    Raises:
        ValueError: If the 'type' field is missing and no direct URL is found.
    r^   Nrc   rt   detailr   zB'image_url.detail' is currently not supported and will be ignored.rU   rm   rb   rh   r]   rB  rk   z(Missing 'type' field in multimodal part.z(Invalid 'type' field in multimodal part.zunknown part_type content)r   re   r   rW   r  r   r   r   rs   rp   ra   rg   rv   rw   r   )r  	part_typerc   r~   image_paramsrt   rm   rb   audio_paramsrh   r]   input_audio_paramsvideo_paramsrk   rI   rI   rJ   #_parse_chat_message_content_mm_part  st   





r  )r  r  r}   parts
mm_tracker
wrap_dictsc                C   s   t t  }| }|D ]}t||||d}|r|| q|r&t| |dgS tt t |}	| }
|
r:t	|
|	|}nd
|	}t| |dgS )Nr  r  )r}   r~   r  )r   r  r
   _parse_chat_message_content_partr   r   r   rW   r;  r  r   )r}   r  r  r  r  r~   	mm_parserr  	parse_resr  r;  r  rI   rI   rJ   !_parse_chat_message_content_partsa  s,   


r  r  c                C   s  t | tr| S t| \}}|tv r|du rtd| | dS |dv r0tt|}|r.d|dS |S | dd}|dur>t|}d}|dkrY|durNttj|nd}	|	|	| d}n|d	v rktt|}|
|| d}ny|d
kr|dur~tttttf B |nd}||| d}n[|dkr|durtttttf B |nd}||| d}n=|dkrtt|}||| d}n+|dkrtt|}
||
| d}n|dkrtt|}||| d}ntd| |rd|iS |rt| S dS )a|  Parses a single part of a conversation. If wrap_dicts is True,
    structured dictionary pieces for texts and images will be
    wrapped in dictionaries, i.e., {"type": "text", "text", ...} and
    {"type": "image"}, respectively. Otherwise multimodal data will be
    handled by mm_parser, and texts will be returned as strings to be joined
    with multimodal placeholders.
    NzKSkipping multimodal part '%s' (type: '%s') with empty / unparsable content.)r  r  r  r  ry   r  r^   r  rc   rm   rP   )rt   r  rb   rh   rQ   r]   rB  rk   rR   zUnknown part type: r^   )r   rW   r  PART_TYPES_TO_SKIP_NONE_CONTENTr   r   r   r   r"   r?  r<  re   r=  rD  rA  r   rC  rE  r   r8  )r  r  r  r  r  r~   str_contentrc   r   image_contentdict_contentrI   rI   rJ   r    sn   



""



r  messagecontent_formatc           
      C   s(  | d }|  d}|  dp|  d}|d u rg }nt|tr&td|dg}t||||dk|d}|D ]^}|d	krbt| }	d
|	v rO|	d
 d urOt|	d
 |d
< |d uratt||d< tt||d< n|dkrtt| }	d|	v rt|	d |d< d| v rt| d tr| d |d< |dkr|  dd |d< q3|S )Nr}   r~   r   r   r  r  r   r  	assistantr   toolr   r>   	developerr   )	r   r   rW   r   r  _AssistantParserr   r   _ToolParser)
r  r  r  r  r}   r~   r   result
result_msg
parsed_msgrI   rI   rJ   _parse_chat_message_content  sF   

r  messagesc                 C   s   | D ]I}|d dkrKd|v rK| d}t|tsqt|dkr&|dd  q|D ]"}|d  d }rDt|ttfsCt||d d< q(i |d d< q(qd S )Nr}   r  r   r   function	arguments)r   r   r   r   r  re   jsonloads)r  r  r   r   r~   rI   rI   rJ   _postprocess_messages  s    

r  ro   c           	      C   sd   g }t |}| D ]}t||||dko|jd uo|jjd}|| qt| | \}}|||fS Nr   )r  )r)  r  rL  interleave_mm_stringsr  r  r+  	r  ro   r  conversationr  r  sub_messagesr   r   rI   rI   rJ   parse_chat_messages(  s    	
r  c           	         sl   g }t |}| D ]}t||||dko|jd uo|jjd}|| q	t| | I d H \}}|||fS r  )r0  r  rL  r  r  r  r+  r  rI   rI   rJ   parse_chat_messages_asyncI  s"   	
r  r  c                 C   sD   d}| D ]}|d dkr| d}||d urtt|nd7 }q|S )Nr   r}   r  r   )r   r   r   )r  r  r  r   rI   rI   rJ   get_history_tool_calls_cntj  s   
r  randomid_typec                 C   s$   | dkrd| d| S dt   S )Nkimi_k2z
functions.:zchatcmpl-tool-r8   )r  	func_namer  rI   rI   rJ   make_tool_call_ids  s   r  )r  NN)r1  r  rD   abcr   r   collectionsr   r   collections.abcr   r   r   	functoolsr	   r
   r   	itertoolsr   pathlibr   typingr   r   r   r   r   r   r   openai.types.chatr   r   r   r   r   r   r   r   r   $OpenAIChatCompletionContentPartParamr    OpenAIChatCompletionMessageParam@openai.types.chat.chat_completion_content_part_input_audio_paramr   openai.types.responsesr    openai_harmonyr!   OpenAIHarmonyMessagePILr"   pydanticr#   r$   r%   typing_extensionsr&   r'   vllmr(   vllm.configr)   vllm.loggerr*   vllm.model_executor.modelsr+   vllm.multimodalr,   r-   r.   vllm.multimodal.inputsr/   r0   r1   r2   r3   r4   vllm.multimodal.processingr5   vllm.multimodal.utilsr6   r7   
vllm.utilsr9   vllm.utils.collection_utilsr:   vllm.utils.import_utilsr;   r=   r<   globalsrH   r   rW   rK   r   rL   r8  rT   r\   ra   rg   ri   rj   rl   rp   rs   rv   rw   rx   rX   r|   r   ChatTemplateContentFormatOptionChatTemplateContentFormatr  r   r   r   r   r   re   r   r   r   r   r   r  objectr'  r)  r0  r  r-  r5  r  r{   r  r  r  r  r  r  r  r  r  r  r  r  validate_pythonr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rI   rI   rI   rJ   <module>   s2  
$(
 
			




3

$
$Z

 k
=p "

0






4









 

`
%

R

2
!
!	