o
    
۾iY                  	   @   s  U d dl mZmZ d dlmZ d dlmZmZmZm	Z	 d dl
mZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d	d
lmZmZmZ d	dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d	dl+m,Z, edZ-edZ.erd dl/m0Z1 nede2 dZ1G dd deee-e.f Z3G dd de3e	e- e-f Z4	d6dej5de6de7dB ddfddZ8G dd de3ej5e9ej5 B ej5f Z:G dd de3ee6ej5f ee6ej5f f Z;G dd  d e4e" Z<G d!d" d"e:Z=G d#d$ d$eZ>G d%d& d&e4e# Z?G d'd( d(e:Z@G d)d* d*e4e$ ZAG d+d, d,e:ZBG d-d. d.e4e ZCed/e3eef d0ZDG d1d2 d2ee6e3eef f ZEee&e ge3eef dB f ZFeeGd3< G d4d5 d5ZHdS )7    )ABCabstractmethod)UserDict)CallableIteratorMappingSequence)TYPE_CHECKINGAnyGenericLiteral
NamedTuple	TypeAlias	TypeGuardTypeVarN)assert_never)
is_list_of)
LazyLoader   )AudioResampler	AudioSpecnormalize_audio)
	AudioItemHfAudioItemHfImageItemHfVideoItem	ImageItemModalityDataMultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems	VideoItem)MediaWithBytes_T_IPILImagez	PIL.Imagec                       s  e Zd ZdZdededdf fddZdefdd	Zdefd
dZ	dede
fddZer5dee
 fddZedefddZedede
fddZdee
 fddZdedefddZdee fddZedeeef fddZedeeef fddZ  ZS )ModalityDataItemszy
    Represents data items for a modality in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    datamodalityreturnNc                    s   t    || _|| _d S N)super__init__r'   r(   )selfr'   r(   	__class__ I/home/ubuntu/.local/lib/python3.10/site-packages/vllm/multimodal/parse.pyr,   7   s   

zModalityDataItems.__init__c                 C   s"   t | j d| jdt|  dS )Nz
(modality=z, len=))type__name__r(   lenr-   r0   r0   r1   __repr__=   s   "zModalityDataItems.__repr__c                 C   s   |   S r*   	get_countr6   r0   r0   r1   __len__@   s   zModalityDataItems.__len__indexc                 C   
   |  |S r*   getr-   r;   r0   r0   r1   __getitem__C      
zModalityDataItems.__getitem__c                 C   s   d S r*   r0   r6   r0   r0   r1   __iter__H   s    zModalityDataItems.__iter__c                 C      t )zGet the number of data items.NotImplementedErrorr6   r0   r0   r1   r9   J      zModalityDataItems.get_countc                 C   rC   )zGet a data item by its index.rD   r?   r0   r0   r1   r>   O   rF   zModalityDataItems.getc                        fddt   D S )zGet all data items.c                       g | ]}  |qS r0   r=   .0idxr6   r0   r1   
<listcomp>V       z-ModalityDataItems.get_all.<locals>.<listcomp>ranger9   r6   r0   r6   r1   get_allT   s   zModalityDataItems.get_allc                 C   r<   r*   r=   r?   r0   r0   r1   get_item_for_hashX   rA   z#ModalityDataItems.get_item_for_hashc                    rG   )Nc                    rH   r0   )rQ   rI   r6   r0   r1   rL   \   rM   z<ModalityDataItems.get_all_items_for_hash.<locals>.<listcomp>rN   r6   r0   r6   r1   get_all_items_for_hash[   s   z(ModalityDataItems.get_all_items_for_hashc                 C   rC   )z)Get the data to pass to the HF processor.rD   r6   r0   r0   r1   get_processor_data^   rF   z$ModalityDataItems.get_processor_datac                 C   rC   )z+Get the data to pass directly to the model.rD   r6   r0   r0   r1   get_passthrough_datac   rF   z&ModalityDataItems.get_passthrough_data)r4   
__module____qualname____doc__r#   strr,   r7   intr:   r$   r@   r	   r   rB   r   r9   r>   listrP   objectrQ   rR   r   rS   rT   __classcell__r0   r0   r.   r1   r&   1   s&     r&   c                   @   s   e Zd ZdZdeee B defddZdefddZdedefd	d
Z	dedeee B fddZ
deeef fddZdeeef fddZdS )ProcessorBatchItemsz6Base class for data items that are arranged in a list.itemr)   c                 C      t |tr|jS |S z&Extract media from wrapper if present.
isinstancer"   mediar-   r^   r0   r0   r1   _unwrapl   s   zProcessorBatchItems._unwrapc                 C   
   t | jS r*   r5   r'   r6   r0   r0   r1   r9   p   rA   zProcessorBatchItems.get_countr;   c                 C      |  | j| S r*   re   r'   r?   r0   r0   r1   r>   s      zProcessorBatchItems.getc                 C   s
   | j | S r*   r'   r?   r0   r0   r1   rQ   v   s   
z%ProcessorBatchItems.get_item_for_hashc                 C   s   | j  d|  iS )Ns)r(   rP   r6   r0   r0   r1   rS   z      z&ProcessorBatchItems.get_processor_datac                 C      i S r*   r0   r6   r0   r0   r1   rT   }      z(ProcessorBatchItems.get_passthrough_dataN)r4   rU   rV   rW   r#   r"   re   rY   r9   r>   rQ   r   rX   r[   rS   rT   r0   r0   r0   r1   r]   i   s    r]   tensorr(   r;   r)   c              	   C   sZ   | j dk s
| j dkr+|durd| dnd}t|  d| d| j  d	t| j dS )
a  Validate tensor ndim for multimodal embeddings.

    Single embeddings should be 2D (seq_len, hidden_size).
    Batched embeddings should be 3D (batch, seq_len, hidden_size).

    Args:
        tensor: The tensor to validate.
        modality: The modality name for error messages (e.g., "image", "audio").
        index: Optional index for list items, included in error messages.
          Nz [] z
 embeddingzL must be 2D (seq_len, hidden_size) or 3D (batch, seq_len, hidden_size), got D tensor with shape )ndim
ValueError
capitalizetupleshape)rp   r(   r;   idx_strr0   r0   r1   validate_embedding_ndim   s   r|   c                	       s   e Zd ZdZ	ddejeej B dededB ddf fddZ	dd	d
Z
deddfddZdejeej B dejfddZdefddZdedejfddZdeeef fddZdeeef fddZdedefddZ  ZS )EmbeddingItemsz
    Base class for data items that are expressed as a batched embedding tensor,
    or a list of embedding tensors (one per item).
    Nr'   r(   expected_hidden_sizer)   c                    s0   t  || |   |d ur| | d S d S r*   )r+   r,   _validate_ndim_validate_hidden_size)r-   r'   r(   r~   r.   r0   r1   r,      s
   zEmbeddingItems.__init__c              
   C   sn   t | jtjrt| j| j dS t| jD ]\}}|jdkr4t| j	  d| d|j dt
|j qdS )z=Validate that embedding tensors have correct ndim (2D or 3D).rq    embedding [z)] must be 2D (seq_len, hidden_size), got ru   N)rb   r'   torchTensorr|   r(   	enumeraterv   rw   rx   ry   rz   )r-   rK   rp   r0   r0   r1   r      s   
zEmbeddingItems._validate_ndimc                 C   s   t | jtjr)| jjd }||kr't| j  d| d| dt| jj dS t	| jD ]%\}}|jd }||krSt| j  d| d| d| dt|j 	q.dS )a8  Validate that embedding hidden dimension matches expected size.

        This validates hidden dimensions to prevent vulnerabilities: Embeddings
        with correct ndim but wrong hidden dimension could bypass initial
        checks and cause crashes during model inference when dimensions don't match.
        z* embedding hidden dimension mismatch: got z, but model expects z. Embedding shape: r   z!] hidden dimension mismatch: got N)
rb   r'   r   r   rz   rw   r(   rx   ry   r   )r-   r~   actual_hidden_sizerK   rp   r0   r0   r1   r      s6   

z$EmbeddingItems._validate_hidden_sizer^   c                 C   r_   r`   ra   rd   r0   r0   r1   re      s   zEmbeddingItems._unwrapc                 C   rf   r*   rg   r6   r0   r0   r1   r9      rA   zEmbeddingItems.get_countr;   c                 C   rh   r*   ri   r?   r0   r0   r1   r>      rj   zEmbeddingItems.getc                 C   rn   r*   r0   r6   r0   r0   r1   rS      ro   z!EmbeddingItems.get_processor_datac                 C   s   | j  d| jiS )N_embeds)r(   r'   r6   r0   r0   r1   rT         z#EmbeddingItems.get_passthrough_dataitem_idxc                 C      t | |S r*   r5   r>   r-   r   r0   r0   r1   get_feature_size      zEmbeddingItems.get_feature_sizer*   )r)   N)r4   rU   rV   rW   r   r   rZ   rX   rY   r,   r   r   r"   re   r9   r>   r   r[   rS   rT   r   r\   r0   r0   r.   r1   r}      s0    	

r}   c                       s   e Zd ZdZdeeejf dedee de	eeejf geee
f f ddf
 fdd	Zdefd
dZdedeeejf fddZdeeef fddZdeeef fddZ  ZS )DictEmbeddingItemsz
    Base class for data items that are expressed as a dictionary of tensors.

    Usually, the dictionary keys correspond to the outputs of HF processor.
    r'   r(   required_fieldsfields_factoryr)   Nc                    s   ddl m} t || ||  }|r't| }d| d| }t|||}	||	  }
|
rEt|	 }d|d|}t||	| _|| _t	
|t||	| _d S )Nr   )BatchFeaturez$The data should contain the fields: z%, but only found the following keys: zrequired_fields=z should be a subset of fields=)%transformers.feature_extraction_utilsr   r+   r,   keyssetrw   fields_configr   r    from_hf_inputsdict_kwargs)r-   r'   r(   r   r   r   missing_required_data_keys	data_keysmsgr   missing_required_fieldsfieldsr.   r0   r1   r,      s,   


zDictEmbeddingItems.__init__c                 C   s   t | j| j S r*   )r5   r   r(   r6   r0   r0   r1   r9     rj   zDictEmbeddingItems.get_countr;   c                 C   s   | j | j |  S r*   )r   r(   get_datar?   r0   r0   r1   r>   "  rm   zDictEmbeddingItems.getc                 C   rn   r*   r0   r6   r0   r0   r1   rS   %  ro   z%DictEmbeddingItems.get_processor_datac                 C   s   | j S r*   rk   r6   r0   r0   r1   rT   (  s   z'DictEmbeddingItems.get_passthrough_data)r4   rU   rV   rW   r   rX   r   r   r   r   r   r,   rY   r9   r>   r[   rS   rT   r\   r0   r0   r.   r1   r      s*    
	&r   c                       s@   e Zd Zdee dB ddf fddZdedefddZ  ZS )	AudioProcessorItemsr'   Nr)   c                        |d u rd g}t  |d d S Naudior+   r,   r-   r'   r.   r0   r1   r,   -     zAudioProcessorItems.__init__r   c                 C   s   |  |}t|S r*   )r>   r5   )r-   r   r   r0   r0   r1   get_audio_length2  s   
z$AudioProcessorItems.get_audio_length)	r4   rU   rV   r   r   r,   rY   r   r\   r0   r0   r.   r1   r   ,      r   c                       >   e Zd Z	ddejeej B dedB ddf fddZ  ZS )AudioEmbeddingItemsNr'   r~   r)   c                       t  |d| d S r   r   r-   r'   r~   r.   r0   r1   r,   8     zAudioEmbeddingItems.__init__r*   	r4   rU   rV   r   r   rZ   rY   r,   r\   r0   r0   r.   r1   r   7      r   c                   @   s   e Zd ZU eed< eed< dS )	ImageSizewidthheightN)r4   rU   rV   rY   __annotations__r0   r0   r0   r1   r   @  s   
 r   c                       s@   e Zd Zdee dB ddf fddZdedefddZ  Z	S )	ImageProcessorItemsr'   Nr)   c                    r   Nimager   r   r.   r0   r1   r,   F  r   zImageProcessorItems.__init__r   c                 C   sT   |  |}t|tjrt|j S t|tjtj	fr$|j
\}}}t||S t| d S r*   r>   rb   r%   Imager   sizenpndarrayr   r   rz   r   r-   r   r   _hwr0   r0   r1   get_image_sizeK  s   


z"ImageProcessorItems.get_image_size)
r4   rU   rV   r   r   r,   rY   r   r   r\   r0   r0   r.   r1   r   E  r   r   c                       r   )ImageEmbeddingItemsNr'   r~   r)   c                    r   r   r   r   r.   r0   r1   r,   X  r   zImageEmbeddingItems.__init__r*   r   r0   r0   r.   r1   r   W  r   r   c                	       sz   e Zd Z	ddee dB deeef eeeef dB  B dB ddf fddZ	de
de
fdd	Zde
defd
dZ  ZS )VideoProcessorItemsNr'   metadatar)   c                    s&   |d u rd g}t  |d || _d S Nvideo)r+   r,   r   )r-   r'   r   r.   r0   r1   r,   a  s   
zVideoProcessorItems.__init__r   c                 C   r   r*   r   r   r0   r0   r1   get_num_framesk  r   z"VideoProcessorItems.get_num_framesc                 C   sX   |  |d }t|tjrt|j S t|tjtj	fr&|j
\}}}t||S t| d S )Nr   r   r   r0   r0   r1   get_frame_sizen  s   

z"VideoProcessorItems.get_frame_sizer*   )r4   rU   rV   r   r   r   rX   r
   rZ   r,   rY   r   r   r   r\   r0   r0   r.   r1   r   `  s    
"
r   c                       r   )VideoEmbeddingItemsNr'   r~   r)   c                    r   r   r   r   r.   r0   r1   r,   {  r   zVideoEmbeddingItems.__init__r*   r   r0   r0   r.   r1   r   z  r   r   c                       s.   e Zd ZdZdee ddf fddZ  ZS )VisionChunkProcessorItemszCProcessor items for vision chunks (unified image and video chunks).r'   r)   Nc                    s   t  |d d S )Nvision_chunkr   r   r.   r0   r1   r,     r   z"VisionChunkProcessorItems.__init__)r4   rU   rV   rW   r   r
   r,   r\   r0   r0   r.   r1   r     s    "r   _D)boundc                   @   sl   e Zd ZdZdddededefddZdeeef fd	d
Z	dede
e ee
e df B defddZdS )MultiModalDataItemsz
    As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
    normalized such that each entry corresponds to a list.
    T)strictr(   r   r)   c                C   s<   || vr|rt |  }td|d| dS | |  S )z
        Get the number of data items belonging to a modality.

        If `strict=False`, return `0` instead of raising [`KeyError`][]
        even if the modality is not found.
        	Modality " not found. Available modalities: r   )r   r   KeyErrorr9   )r-   r(   r   available_modalitiesr0   r0   r1   r9     s   zMultiModalDataItems.get_countc                 C   s   dd |   D S )z3Get the number of items belonging to each modality.c                 S   s   i | ]	\}}||  qS r0   r8   )rJ   mitemsr0   r0   r1   
<dictcomp>  s    z6MultiModalDataItems.get_all_counts.<locals>.<dictcomp>)r   r6   r0   r0   r1   get_all_counts  s   z"MultiModalDataItems.get_all_countstyp.c                 C   s\   || vrt |  }td|d| | | }t||s,td|d| dt| |S )zs
        Get the data items belonging to a modality,
        requiring that they belong to a certain type.
        r   r   z(Invalid type of data items for modality=z. Expected type: z, but found type: )r   r   r   rb   	TypeErrorr3   )r-   r(   r   r   r   r0   r0   r1   	get_items  s"   	
zMultiModalDataItems.get_itemsN)r4   rU   rV   rW   rX   boolrY   r9   r   r   r3   r   ry   r   r0   r0   r0   r1   r     s    r   ModalityDataParserc                       s  e Zd ZdZdddddddedB dedB ded	 d
ededB ddf fddZe	de
deejeej B  fddZde
ded fddZdedeejedB f fddZdedeejeeef dB f fddZdee deeef dB fddZdee deeef dB fddZdee deeef dB fddZ dee deeef dB fd d!Z!de"ee#f fd"d#Z$d$e%de&fd%d&Z'  Z(S )'MultiModalDataParsera  
    Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
    into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].

    Args:
        target_sr (float, optional): Enables automatic resampling of audio
            items to the model's expected sampling rate.
        target_channels (int, optional): Target number of audio channels.
            If provided, normalizes audio to this many channels (e.g., 1 for mono).
            If None, audio channels are passed through unchanged.
        expected_hidden_size (int, optional): Expected hidden dimension for
            embedding inputs. If provided, validates that user-supplied
            embeddings have the correct hidden size to prevent crashes
            during model inference.
    NlibrosaF)	target_srtarget_channelsaudio_resample_methodvideo_needs_metadatar~   r   r   r   )r   scipyr   r~   r)   c                   s.   t    t||d| _|| _|| _|| _d S )N)r   method)r+   r,   r   audio_resamplerr   r   r~   )r-   r   r   r   r   r~   r.   r0   r1   r,     s   
	
zMultiModalDataParser.__init__r'   c                 C   s4   t |tjr|jdkS t|tjr|d jdkS dS )Nrr   r   rq   F)rb   r   r   rv   r   )clsr'   r0   r0   r1   is_embeddings  s
   
z"MultiModalDataParser.is_embeddingsc                 C   s6   t |trt|dkS t |tjtjfr|jdkS dS )Nr   F)rb   rZ   r5   r   r   r   r   r   r   r0   r0   r1   	_is_empty  s
   

zMultiModalDataParser._is_emptyr   c                 C   ^   t |tr|S t |trt|d fS t |tjr|d fS t |tjr)| d fS t	| d S r*   
rb   ry   rZ   r   arrayr   r   r   numpyr   )r-   r   r0   r0   r1   _get_audio_with_sr     

z'MultiModalDataParser._get_audio_with_srr   c                 C   r   r*   r   )r-   r   r0   r0   r1   _get_video_with_metadata  r   z-MultiModalDataParser._get_video_with_metadatac           	      C   s  |d u rt d S | |st|tr| |d rd S | |r&t|| jS t|ts>t|t	j
tjfr9|jdks>t|trB|g}nt|t	j
tjfrSdd |D }n|}tt	j
  }|D ]-}| |\}}|d u rm|}n| jj||d}| jd urt| jd}t||}|| q]t |S )Nr   r   c                 S      g | ]}|qS r0   r0   rJ   elemr0   r0   r1   rL   8      z:MultiModalDataParser._parse_audio_data.<locals>.<listcomp>)orig_sr)r   )r   r   rb   ry   r   r   r~   r   floatr   r   r   r   rv   rZ   r   r   resampler   r   r   append)	r-   r'   
data_items
new_audios	data_itemr   r   	new_audiospecr0   r0   r1   _parse_audio_data  s@   




z&MultiModalDataParser._parse_audio_datac                 C   s   |d u rt d S | |rd S | |rt|| jS t|tjtfs0t|t	j
tjfr7|jdkr7|g}t |S t|t	j
tjfrKdd |D }t |S |}t |S )Nrr   c                 S   r   r0   r0   r   r0   r0   r1   rL   a  r   z:MultiModalDataParser._parse_image_data.<locals>.<listcomp>)r   r   r   r   r~   rb   r%   r   r"   r   r   r   r   rv   )r-   r'   r   r0   r0   r1   _parse_image_dataM  s$   


z&MultiModalDataParser._parse_image_datac                 C   s4  |d u rt d S | |rd S | |rt|| jS t|tjs.t|t	j
tjfr2|jdkr2|g}n"t|t	j
tjfrCdd |D }nt|trRt|dkrR|g}n|}ttt	j
tttf d B f   }g }|D ]&}| |\}}| jr|d u r|td|||f || qh|| qh| jsd }t ||dS )N   c                 S   r   r0   r0   r   r0   r0   r1   rL   |  r   z:MultiModalDataParser._parse_video_data.<locals>.<listcomp>rq   ziVideo metadata is required but not found in mm input. Please check your video input in `multi_modal_data`)r   )r   r   r   r   r~   r   r%   r   rb   r   r   r   r   rv   ry   r5   rZ   r   rX   r
   r   r   rw   r   )r-   r'   r   
new_videosmetadata_lstr   r   r   r0   r0   r1   _parse_video_datag  s@   



 z&MultiModalDataParser._parse_video_datac                 C   s@   |du s	|  |rdS | |rtdt|tr|g}t|S )z9Parse vision chunk data (unified image and video chunks).Nz8Do not support embedding data for vision_chunk right now)r   r   rw   rb   r   r   r   r0   r0   r1   _parse_vision_chunk_data  s   

z-MultiModalDataParser._parse_vision_chunk_datac                 C   s   | j | j| j| jdS )N)r   r   r   r   )r   r   r  r  r6   r0   r0   r1   _get_subparsers  s
   z$MultiModalDataParser._get_subparsersmm_datac                 C   sV   |   }t }| D ]\}}||vrtd| || | }d ur(|||< q|S )NzUnsupported modality: )r  r   r   rw   )r-   r  
subparsersmm_itemskvparsed_datar0   r0   r1   parse_mm_data  s   z"MultiModalDataParser.parse_mm_data))r4   rU   rV   rW   r   rY   r   r   r,   classmethodr[   r   r   r   rZ   r   r   r   ry   r   r   r   r!   r   rX   r
   r   r   r&   r   r   r   r  r  r   r   r  r   r   r
  r\   r0   r0   r.   r1   r     sv    



.

/
r   r*   )Iabcr   r   collectionsr   collections.abcr   r   r   r   typingr	   r
   r   r   r   r   r   r   r   r   r   typing_extensionsr   vllm.utils.collection_utilsr   vllm.utils.import_utilsr   r   r   r   r   inputsr   r   r   r   r   r   r   r   r    r!   rc   r"   r#   r$   	PIL.Imager   r%   globalsr&   r]   r   rX   rY   r|   rZ   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r0   r0   r1   <module>   sb   (08


W<			 8