o
    i                     @   sz  U d dl mZmZ d dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlZd d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ ddl%m&Z& erd dl'Z'd dl(Z'd dl)m*Z* d dl+m,Z, n	e"de- dZ'e.Z,edZ/edej0df Z1ee2d< 	 ee3d ej0de3ej0 e3d f Z4ee2d< 	 ee3e5 ej0df Z6ee2d< 	 ee1de&e1 f Z7ee2d< 	 ee4de8e4e.e9ef f f Z:ee2d< 	 ee6e8ej0e5f df Z;ee2d< 	 e/e3e/dB  B dB Z<ee2d< 	 G dd deZ=G dd  d eZ>e=e>B Z?	 eG d!d" d"ed#d$Z@ee9e<e f ZAee2d%< 	 ee9e3e9dB  e9B f ZBee2d&< 	 e
d'd(G d)d* d*ZCee3d+ e3d de8d, f ZDee2d+< 	 d-eDd.eDd/eEfd0d1ZFd2eDd3e'jGjHd/eDfd4d5ZIe.e9eDf ZJee2d6< 	 d-eJd.eJd/eEfd7d8ZKe
G d9d: d:ZLe
G d;d< d<ZMe
d'd'd=G d>d? d?eZNe
d'd'd=G d@dA dAeNZOe
d'd'd=G dBdC dCeNZPe
d'd'd=G dDdE dEeNZQe
d'd(G dFdG dGZRG dHdI dIee9eMf ZSedJeSeSdB eSdKZTG dLdM dMee9eeT f ZUeUeS eUeSdB  B ZVee2dN< e.e9e3e9 f ZW	 ee9eeC f ZXee2dO< 	 G dPdQ dQe,ZYG dRdS dSeYZZdS )T    )ABCabstractmethod)UserDictdefaultdict)MappingSequence)	dataclass)cached_propertypartial)
accumulate)TYPE_CHECKINGAnyLiteral	TypeAlias	TypedDictUnioncastfinalN)Image)TypeVar)
is_list_of)
LazyLoaderjson_map_leaves   )MediaWithBytes)BatchFeature)_InputOptionstorch_Tr   torch.TensorHfImageItemHfVideoItemHfAudioItem	ImageItem	VideoItem	AudioItemModalityDatac                   @   s2   e Zd ZU dZed ed< eed< edB ed< dS )VisionChunkImagez.Represents an image wrapped as a vision chunk.imagetypeNuuid)__name__
__module____qualname____doc__r   __annotations__r   str r2   r2   L/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/multimodal/inputs.pyr(   o   s
   
 r(   c                   @   sF   e Zd ZU dZed ed< ee ed< edB ed< eed< e	ed< dS )VisionChunkVideoz'Represents a video chunk with metadata.video_chunkr*   Nr+   prompt	video_idx)
r,   r-   r.   r/   r   r0   listr   r1   intr2   r2   r2   r3   r4   w   s   
 r4   c                   @   sH   e Zd ZU dZee ed< 	 ee ed< 	 ee ed< 	 ee	 ed< dS )MultiModalDataBuiltinsz7Type annotations for modality types predefined by vLLM.r)   videoaudiovision_chunkN)
r,   r-   r.   r/   r'   r$   r0   r%   r&   VisionChunkr2   r2   r2   r3   r:      s   
 r:   F)totalMultiModalDataDictMultiModalUUIDDictT)frozenc                   @   s   e Zd ZU dZeed< 	 eed< 	 dZded< 	 edej	dB fdd	Z
defd
dZdededeeef fddZdeeeef  fddZdedefddZdS )PlaceholderRangea  
    Placeholder location information for multi-modal data.

    Example:

    Prompt: `AAAA BBBB What is in these images?`

    Images A and B will have:

    ```
    A: PlaceholderRange(offset=0, length=4)
    B: PlaceholderRange(offset=5, length=4)
    ```
    offsetlengthNztorch.Tensor | Noneis_embedreturnc                 C   s   | j d u rd S | j jddS )Nr   dim)rF   cumsumselfr2   r2   r3   embeds_cumsum   s   zPlaceholderRange.embeds_cumsumc                 C   s   | j d u r| jS t| j d S )N)rM   rE   r9   rK   r2   r2   r3   get_num_embeds   s   
zPlaceholderRange.get_num_embeds	start_idxend_idxc                 C   sJ   | j du r	||fS |dkrt| j |d  nd}t| j |d  }||fS )a  
        Returns the starting and ending indices of the embeddings of encoder outputs
        in the range of [start_idx, end_idx) in the placeholders.

        For example, given:
        PlaceholderRange(offset=2, length=5, is_embed=[False, True, False, True, True])

        If start_idx=3 and end_idx=5, the output is (1, 3) because we want to get
        the second and the third embeddings from the encoder output.
        Nr   r   )rM   r9   )rL   rP   rQ   embeds_start_idxembeds_end_idxr2   r2   r3   get_embeds_indices_in_range   s   
z,PlaceholderRange.get_embeds_indices_in_rangec                 C   s   | j du r| j| j| j d fgS | j  }ttj||dddk }ttj||dddk }tj	||fdd| j }dd |
 D S )	a  Extract the start and end indices of the embedded region in prompt.

        For example, given `PlaceholderRange(offset=2, length=5)` and
        `is_embed = [False, True, False, True, True]`, the output is
        `[(1 + offset, 1 + offset), (3 + offset, 4 + offset)]`.

        Returns:
            A tuple `(start, end)` representing the start and end
            indices (inclusive) of the embedded region.
            Returns full placeholder range if `is_embed` is `None`.
        Nr   )prepend)appendrN   rH   c                 S   s   g | ]}t |qS r2   )tuple).0xr2   r2   r3   
<listcomp>       z9PlaceholderRange.extract_embeds_range.<locals>.<listcomp>)rF   rD   rE   r9   r   nonzerodiff	new_zerosflattenstacktolist)rL   mask_istartsendsrangesr2   r2   r3   extract_embeds_range   s   

z%PlaceholderRange.extract_embeds_rangeotherc                 C   sb   t || jsdS | j| jf|j|jfksdS | jd u r |jd u S |jd u r*| jd u S t| j|jS NF)
isinstance	__class__rD   rE   rF   nested_tensors_equal)rL   rg   r2   r2   r3   __eq__   s   



zPlaceholderRange.__eq__)r,   r-   r.   r/   r9   r0   rF   r	   r   TensorrM   rO   rW   rT   r8   rf   objectboolrl   r2   r2   r2   r3   rC      s(   
 

rC   NestedTensors)r    .abrG   c                 C   s   t | tjrt |tjot| |S t |tjr$t | tjo#t|| S t | tr:t |to9tdd t| |D S t |trPt | toOtdd t|| D S | |kS )ze
    Equality check between
    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects.
    c                 s       | ]
\}}t ||V  qd S Nrk   )rX   a_b_r2   r2   r3   	<genexpr>$      
z'nested_tensors_equal.<locals>.<genexpr>c                 s   rs   rt   ru   )rX   rw   rv   r2   r2   r3   rx   (  ry   )ri   r   rm   equalr8   allziprq   rr   r2   r2   r3   rk     s   

rk   tensorsdevicec                    s    d u r| S t  fdd| S )Nc                    s   t | tjr| j ddS | S )NT)r   non_blocking)ri   r   rm   to)rY   r   r2   r3   <lambda>9  s   
z%_nested_tensors_h2d.<locals>.<lambda>r   )r~   r   r2   r   r3   _nested_tensors_h2d0  s   
r   BatchedTensorInputsc                    s   t  fdd D S )zq
    Equality check between
    [`BatchedTensorInputs`][vllm.multimodal.inputs.BatchedTensorInputs] objects.
    c                 3   s*    | ]}|v ot  | | V  qd S rt   ru   rX   kr}   r2   r3   rx   M  s   ( z(batched_tensors_equal.<locals>.<genexpr>)r{   r}   r2   r}   r3   batched_tensors_equalH  s   r   c                   @   sj   e Zd ZU dZded< 	 eed< 	 eed< 	 eed< 	 dZedB ed< 	 ed	e	d  d
e
e fddZdS )MultiModalFeatureSpecz
    Represents a single multimodal input with its processed data and metadata.

    Used to track multimodal data through processing and caching.
    A request containing multiple multimodal items will have one
    `MultiModalFeatureSpec` per item.
    zMultiModalKwargsItem | Nonedatamodality
identifiermm_positionNmm_hashfeatureskeysc                 C   sZ   t ttt f t}| D ]}|j}|d ur(|D ]}||v r'|| || j qqt|S rt   )r   r1   r8   rp   r   rV   dict)r   r   kwargsfitemr   r2   r2   r3   gather_kwargsq  s   z#MultiModalFeatureSpec.gather_kwargs)r,   r-   r.   r/   r0   r1   rC   r   staticmethodr8   setr   r2   r2   r2   r3   r   P  s   
  r   c                   @   s8   e Zd ZU dZeed< 	 ded< 	 dedefddZd	S )
MultiModalFieldElemz
    Represents a processed keyword argument to pass to a model for a
    [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem].
    r   BaseMultiModalFieldfieldrg   rG   c                 C   sb   t || jsdS | jd u r|jd u }n|jd u r| jd u }nt| j|j}|o0t| jt|ju S rh   )ri   rj   r   rk   r*   r   )rL   rg   
data_equalr2   r2   r3   rl     s   

zMultiModalFieldElem.__eq__N)	r,   r-   r.   r/   rp   r0   rn   ro   rl   r2   r2   r2   r3   r     s   
 	r   )rB   kw_onlyc                
   @   s   e Zd ZU dZdZeed< 	 dd Zede	de	de
d	ee fd
dZedee
 ded	e
fddZddddee dejjded	e
fddZdS )r   z
    Defines how to interpret tensor data belonging to a keyword argument for
    [`MultiModalKwargsItems`][vllm.multimodal.inputs.MultiModalKwargsItems],
    and vice versa.
    Fkeep_on_cpuc                    s&   t t| d dtdtf fdd}|S )N)r   r   rG   c                    s
    | dS )Nr   r2   r   r   r2   r3   factory  s   
z3BaseMultiModalField._field_factory.<locals>.factory)r
   r   rp   )rL   r   r2   r   r3   _field_factory  s   z"BaseMultiModalField._field_factoryr   keyr   rG   c                 C      t )a
  
        Construct
        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
        instances to represent the provided data.

        This is the inverse of
        [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data].
        NotImplementedErrorrL   r   r   r   r2   r2   r3   build_elems  s   zBaseMultiModalField.build_elemsbatch
pin_memoryc                C   r   rt   r   rL   r   r   r2   r2   r3   _reduce_data  s   z BaseMultiModalField._reduce_dataNr   r   elemsr   c                C   st   dd |D }t t|dkrtd||dur| jrd}|r&| jr&d}dd |D }| j||d	}t||d
S )z
        Merge the data from multiple instances of
        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem].

        This is the inverse of
        [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems].
        c                 S   s   g | ]}t |jqS r2   )r*   r   rX   r   r2   r2   r3   rZ         z3BaseMultiModalField.reduce_data.<locals>.<listcomp>r   z#Cannot merge different field_types=NcpuFc                 S   s   g | ]}|j qS r2   r   rX   elemr2   r2   r3   rZ     s    )r   r   )lenr   
ValueErrorr   r   r   )rL   r   r   r   field_typesr   outr2   r2   r3   reduce_data  s   
zBaseMultiModalField.reduce_data)r,   r-   r.   r/   r   ro   r0   r   r   r1   rp   r   r   r   r8   r   r   typesDevicer   r2   r2   r2   r3   r     sD   
 	r   c                	   @   sH   e Zd ZdZdedededee fddZde	e d	e
defd
dZdS )MultiModalBatchedFieldzo
    Info:
        [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
    r   r   r   rG   c                    s   |     fdd|D S )Nc                    s   g | ]} |qS r2   r2   r   field_factoryr2   r3   rZ      r[   z6MultiModalBatchedField.build_elems.<locals>.<listcomp>)r   r   r2   r   r3   r     s   z"MultiModalBatchedField.build_elemsr   r   c                   s   t |dkrTt|tjddrTtttj |}t |dkr%|d d S |d j t	 fdd|D rTtj
t |g|d jR |d j|d j|d}tj||dS |S )	Nr   r{   checkr   c                 3   s    | ]}|j  kV  qd S rt   shaper   first_shaper2   r3   rx         z6MultiModalBatchedField._reduce_data.<locals>.<genexpr>dtyper   r   )r   )r   r   r   rm   r   r8   	unsqueeze
contiguousr   r{   emptyr   r   r`   )rL   r   r   r   r2   r   r3   r     s   
z#MultiModalBatchedField._reduce_dataN)r,   r-   r.   r/   r1   rp   r   r   r   r8   ro   r   r2   r2   r2   r3   r     s$    
	r   c                	   @   sn   e Zd ZU dZee eee  B ed< dZeed< de	de	de
dee fd	d
Zdee
 dede
fddZdS )MultiModalFlatFieldz
    Info:
        [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
    slicesr   rI   r   r   r   rG   c                    sB   |   t| jtddst tjsJ d fdd| jD S )Nr{   r   z,torch.Tensor is required for multiple slicesc                    s   g | ]} t t| qS r2   )r   slice)rX   sr   r   r2   r3   rZ   2  s    z3MultiModalFlatField.build_elems.<locals>.<listcomp>)r   r   r   r   ri   r   rm   r   r2   r   r3   r   '  s   zMultiModalFlatField.build_elemsr   r   c             	      s
  t |dkrt|tjddrtttj |}t |dkr"|d  S | j| jdk t |d j  dtjffdd  |d t	 fdd	|D r{\}}t
fd
d	|D }tjg |||R |d j|d j|d}tj|| j|dS |d j}g }t|D ]#kr|t
fdd	|D  q|tfdd	|D  qtj||d j|d j|d}d}	|D ]5}
g }t|D ]kr|t|	|	|
j   q|td|
j  q|
|t|< |	|
j 7 }	q|S | jdksJ ddd |D S )Nr   r{   r   r   tensorc                    s    | j d   | j  d d  fS Nr   r   )r   rH   r2   r3   _shape_before_afterD  s    z=MultiModalFlatField._reduce_data.<locals>._shape_before_afterc                 3   s    | ]	} |kV  qd S rt   r2   r   )r   r   r2   r3   rx   I  s    z3MultiModalFlatField._reduce_data.<locals>.<genexpr>c                 3       | ]}|j   V  qd S rt   r   r   rH   r2   r3   rx   K  r   r   )rI   r   c                 3   r   rt   r   rX   tdr2   r3   rx   a  r   c                 3   r   rt   r   r   r   r2   r3   rx   c  r   z$dim == 0 is required for nested listc                 S   s   g | ]	}|D ]}|qqS r2   r2   )rX   r   er2   r2   r3   rZ   ~      z4MultiModalFlatField._reduce_data.<locals>.<listcomp>)r   r   r   rm   r   r8   r   rI   r   r{   sumr   r   r   concatndimrangerV   maxzerosr   rW   )rL   r   r   shape_beforeshape_aftershape_concatr   r   	max_sizesconcat_offsetr   r   r2   )r   r   rI   r   r3   r   4  sV   
z MultiModalFlatField._reduce_dataN)r,   r-   r.   r/   r   r   r0   rI   r9   r1   rp   r   r   r8   ro   r   r2   r2   r2   r3   r     s(   
 
r   c                	   @   sR   e Zd ZU dZeed< dedededee	 fddZ
d	ee d
edefddZdS )MultiModalSharedFieldzm
    Info:
        [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
    
batch_sizer   r   r   rG   c                 C   s   |   }||g| j S rt   )r   r   )rL   r   r   r   r   r2   r2   r3   r     s   z!MultiModalSharedField.build_elemsr   r   c                C   s   |d S )Nr   r2   r   r2   r2   r3   r     s   z"MultiModalSharedField._reduce_dataN)r,   r-   r.   r/   r9   r0   r1   rp   r   r   r   r8   ro   r   r2   r2   r2   r3   r     s&   
 
	r   c                   @   s   e Zd ZU edddedefddZe	ddddedee eee  B d	e	defd
dZ
e	ddddeddd	e	defddZedddede	defddZeed< eed< dededee fddZdS )MultiModalFieldConfigFr   r   r   c                C   s   t t|d| dS )aN  
        Defines a field where an element in the batch is obtained by
        indexing into the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Input:
            Data: [[AAAA]
                [BBBB]
                [CCCC]]

        Output:
            Element 1: [AAAA]
            Element 2: [BBBB]
            Element 3: [CCCC]
        ```
        r   r   r   )r   r   )r   r   r2   r2   r3   batched  s   zMultiModalFieldConfig.batchedr   r   rI   c                C   s   t t|||d| dS )a  
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            slices: For each multi-modal item, a slice (dim=0) or a tuple of
                slices (dim>0) that is used to extract the data corresponding
                to it.
            dim: The dimension to extract data, default to 0.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Given:
            slices: [slice(0, 3), slice(3, 7), slice(7, 9)]

        Input:
            Data: [AAABBBBCC]

        Output:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
        ```

        ```
        Given:
            slices: [
                (slice(None), slice(0, 3)),
                (slice(None), slice(3, 7)),
                (slice(None), slice(7, 9))]
            dim: 1

        Input:
            Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]

        Output:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
        ```
        )r   rI   r   r   )r   r   )r   r   rI   r   r2   r2   r3   flat  s   5zMultiModalFieldConfig.flatsize_per_itemr    c                   sV   |j dkrtd|j dgt| fddtt|D }tj| | |dS )a  
        Defines a field where an element in the batch is obtained by
        slicing along the first dimension of the underlying data.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            size_per_item: For each multi-modal item, the size of the slice
                that is used to extract the data corresponding to it.
            dim: The dimension to slice, default to 0.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Given:
            size_per_item: [3, 4, 2]

        Input:
            Data: [AAABBBBCC]

        Output:
            Element 1: [AAA]
            Element 2: [BBBB]
            Element 3: [CC]
        ```

        ```
        Given:
            size_per_item: [3, 4, 2]
            dim: 1

        Input:
            Data: [[A],[A],[A],[B],[B],[B],[B],[C],[C]]

        Output:
            Element 1: [[A],[A],[A]]
            Element 2: [[B],[B],[B],[B]]
            Element 3: [[C],[C]]
        ```

        Info:
            [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
        r   z7size_per_item should be a 1-D tensor, but found shape: r   c                    s6   g | ]}t d d d f  t | |d  f qS r   )r   )rX   irI   
slice_idxsr2   r3   rZ   6  s    z9MultiModalFieldConfig.flat_from_sizes.<locals>.<listcomp>)rI   r   )r   r   r   r   r   r   r   r   )r   r   rI   r   r   r2   r   r3   flat_from_sizes  s    
5
z%MultiModalFieldConfig.flat_from_sizesr   c                C   s   t t||d| dS )a  
        Defines a field where an element in the batch is obtained by
        taking the entirety of the underlying data.

        This means that the data is the same for each element in the batch.

        Args:
            modality: The modality of the multi-modal item that uses this
                keyword argument.
            batch_size: The number of multi-modal items which share this data.
            keep_on_cpu: Whether to keep this field on the CPU for the model inputs.

        Example:

        ```
        Given:
            batch_size: 4

        Input:
            Data: [XYZ]

        Output:
            Element 1: [XYZ]
            Element 2: [XYZ]
            Element 3: [XYZ]
            Element 4: [XYZ]
        ```
        )r   r   r   )r   r   )r   r   r   r2   r2   r3   sharedC  s   #zMultiModalFieldConfig.sharedr   r   r   rG   c                 C   s   | j | j||S rt   )r   r   r   )rL   r   r   r2   r2   r3   r   q  s   z!MultiModalFieldConfig.build_elemsN)r   )r,   r-   r.   r   r1   ro   r   r   r   r9   r   r   r   r   r0   rp   r   r   r2   r2   r2   r3   r     s^   
 =H*r   c                   @   s:   e Zd ZdZed
defddZdeee	f fddZ
d	S )MultiModalKwargsItemz
    A dictionary of processed keyword arguments to pass to the model,
    corresponding to a single item in
    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
    r   nbytesc                 C   s*   t tj| tjdtddd}td|iS )zConvenience class for testing.)r   r   )r   )r   r   dummy)r   r   r   uint8r   r   )r   mm_elemr2   r2   r3   r     s
   zMultiModalKwargsItem.dummyrG   c                 C   s   dd |   D S )Nc                 S   s   i | ]\}}||j qS r2   r   )rX   r   r   r2   r2   r3   
<dictcomp>  s    z1MultiModalKwargsItem.get_data.<locals>.<dictcomp>)itemsrK   r2   r2   r3   get_data  s   zMultiModalKwargsItem.get_dataN)r   )r,   r-   r.   r/   r   r9   r   r   r1   rp   r   r2   r2   r2   r3   r   y  s
    r   _I)defaultc                       sx   e Zd ZdZedddeeef fddZdede	e
 f fd	d
ZdddZddddejjdedefddZ  ZS )MultiModalKwargsItemsa  
    A dictionary of processed multi-modal inputs by modality.

    For example, given a processor that processes
    images into `pixel_values` and `image_grid_thw`,
    and audios into `input_audio_features`,
    a prompt with 2 images and 1 audio will be processed
    into a `MultiModalKwargsItems` with the following structure:

    ```python
    MultiModalKwargsItems(
        {
            "image": [
                # For the first image
                MultiModalKwargsItem({"pixel_values": ..., "image_grid_thw": ...}),
                # For the second imgae
                MultiModalKwargsItem({"pixel_values": ..., "image_grid_thw": ...}),
            ],
            "audio": [
                # For the first audio
                MultiModalKwargsItem({"input_audio_features": ...}),
            ],
        }
    )
    ```

    Unlike HF processing which returns all items
    in a single dictionary with batched keyword arguments,
    we split up the items because some of them may already be cached.
    Also, items from multiple requests may be batched together to improve throughput,
    using the logic defined by the
    [`BaseMultiModalField`][vllm.multimodal.inputs.BaseMultiModalField]
    for each keyword argument.
    	hf_inputsr   config_by_keyc                    s  t ttt f   tttt f t}| D ]%\}}| |}|d ur<|||}t	|dkr<| |< ||j
 | qt ttt f  }| D ]?\}}	 fdd|	D dd  D }
t	t|
 dkrttd|d|
tt|
 }fdd	t|D ||< qJt|S )
Nr   c                    s   i | ]}| | qS r2   r2   r   )elems_by_keyr2   r3   r     r   z8MultiModalKwargsItems.from_hf_inputs.<locals>.<dictcomp>c                 S      i | ]	\}}|t |qS r2   r   rX   r   vr2   r2   r3   r     r   r   z0Cannot merge different batch sizes for modality=z! Found: batch_sizes=c                    s&   g | ] t  fd d D qS )c                    s   i | ]	\}}||  qS r2   r2   r   r   r2   r3   r     r   zCMultiModalKwargsItems.from_hf_inputs.<locals>.<listcomp>.<dictcomp>)r   r   )rX   )elems_in_modalityr  r3   rZ     s    z8MultiModalKwargsItems.from_hf_inputs.<locals>.<listcomp>)r   r1   r   r   r   r   r   getr   r   r   addr8   r   valuesr   nextiterr   r   )r   r   keys_by_modalityr   configr   r   items_by_modalityr   r   batch_sizesr   r2   )r   r  r3   from_hf_inputs  s2   

z$MultiModalKwargsItems.from_hf_inputsr   rG   c                    s0   || vrt d|dt|   t |S )Nz	Modality z" not found. Available modalities: )KeyErrorr   r   super__getitem__)rL   r   rj   r2   r3   r    s   
z!MultiModalKwargsItems.__getitem__+MultiModalKwargsItems[MultiModalKwargsItem]c                 C   sF   |   D ]\}}t|D ]\}}|d u rtd| d| dqq| S )NzFound empty mm_items[z][])r   	enumerateRuntimeError)rL   r   r   r   r   r2   r2   r3   require_data  s   z"MultiModalKwargsItems.require_dataNFr   r   r   c          	         s   ddl m |  } fdd| D }i }| D ]"\}}t|dkr8dd | D }td|d||d  q|S )	zAConstruct a dictionary of keyword arguments to pass to the model.r   )group_and_batch_mm_itemsc              	      s6   i | ]\}}t |d kr|dd | dD qS )r   c                 S   s   g | ]\}}|qS r2   r2   )rX   _r   r2   r2   r3   rZ     s    z=MultiModalKwargsItems.get_data.<locals>.<dictcomp>.<listcomp>r   r   )rX   r   r   r   r  r   r2   r3   r     s    		z2MultiModalKwargsItems.get_data.<locals>.<dictcomp>c                 S   r   r2   r   )rX   r   batchesr2   r2   r3   r     s    zNSome modalities cannot be merged into a single batch (num_batches_by_modality=)r   )utilsr  r  r   r   r  update)	rL   r   r   r  batches_by_modalityout_datar  r  num_batches_by_modalityr2   r  r3   r     s$   	zMultiModalKwargsItems.get_data)rG   r  )r,   r-   r.   r/   r   r   r1   r   r  r   r   r  r  r   r   r   ro   r   r   __classcell__r2   r2   r  r3   r     s&    #
#
	r   MultiModalKwargsOptionalItemsMultiModalPlaceholderDictc                   @   sJ   e Zd ZU dZed ed< 	 ee ed< 	 eed< 	 e	ed< 	 e
ed< dS )	MultiModalInputsz
    Represents the outputs of
    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor],
    ready to be passed to vLLM internals.
    
multimodalr*   prompt_token_ids	mm_kwargs	mm_hashesmm_placeholdersN)r,   r-   r.   r/   r   r0   r8   r9   r"  MultiModalHashesr#  r2   r2   r2   r3   r$  )  s   
 r$  c                   @   s   e Zd ZU dZee ed< dS )MultiModalEncDecInputsaV  
    Represents the outputs of
    [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor]
    ready to be passed to vLLM internals.

    Note: Even text-only encoder-decoder models are currently implemented
    as multi-modal models for convenience.
    (Example: https://github.com/vllm-project/bart-plugin)
    encoder_prompt_token_idsN)r,   r-   r.   r/   r8   r9   r0   r2   r2   r2   r3   r+  C  s   
 
r+  )[abcr   r   collectionsr   r   collections.abcr   r   dataclassesr   	functoolsr	   r
   	itertoolsr   typingr   r   r   r   r   r   r   r   numpynp	PIL.Imager   typing_extensionsr   vllm.utils.collection_utilsr   vllm.utils.import_utilsr   vllm.utils.jsontreer   mediar   r   torch.types%transformers.feature_extraction_utilsr   vllm.inputs.datar   globalsr   r   ndarrayr!   r0   r8   r"   floatr#   r$   rW   r1   r%   r&   r'   r(   r4   r>   r:   r@   rA   rC   rp   ro   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"  r*  r#  r$  r+  r2   r2   r2   r3   <module>   s   (	 
	
 
d
.
$
M
)
d ] 

