o
    
۾ih&                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZ d dlZd dlm	Z d dlmZ d dlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZmZm Z  e
rad dl!Z"nede# dZ"de$fddZ%dddej&de'de$de$fddZ(dddej&de'de$de$fddZ)ddddejde$de$de$fdd Z*ddddejde$de$de$fd!d"Z+d#dd$ej,de$de$fd%d&Z-d#dd$ej,de$de$fd'd(Z.d)ede/e0e$e'f  fd*d+Z1d,efd-d.Z2dd/d0d1ee d2e"j3j4d3e5fd4d5Z6dd/d0d1ee d2e"j3j4d3e5dee0e'ef  fd6d7Z7dd/d0d8e/e0e$ef  d2e"j3j4d3e5dee0e$e'ef ddf fd9d:Z8	dGd;e$d<e9e$ef dB de0ej&e'e:B f fd=d>Z;	dGd?e$d@e9e$ef dB dejfdAdBZ<	dGdCe$dDe9e$ef dB de0ej,e9e$ef f fdEdFZ=dS )H    N)defaultdict)	GeneratorSequence)groupby)TYPE_CHECKINGAny)Image)
LazyLoader   )MultiModalHasher)BatchedTensorInputsMultiModalFieldElemMultiModalKwargsItemMultiModalPlaceholderDictMultiModalSharedField)AudioMediaIOImageMediaIOMediaConnectorVideoMediaIOtorchnamec                 C   s<   | dkrddl m} tjdtdd |S tdtd| )	NMEDIA_CONNECTOR_REGISTRYr
   )r   z`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. The old name will be removed in v0.17.   )
stacklevelzmodule z has no attribute )mediar   warningswarnDeprecationWarningAttributeError__name__)r   r    r    I/home/ubuntu/.local/lib/python3.10/site-packages/vllm/multimodal/utils.py__getattr__!   s   r"   WAVformataudiosampling_rater%   returnc                C   s   t  }|j| |f|dS )zEncode audio as base64.)audio_format)r   encode_base64)r&   r'   r%   audio_ior    r    r!   encode_audio_base642   s   r,   c                C   4   t | ||d}tjd|  d}d| d| S )zEncode audio as a data URL.r$   .r&   data:;base64,)r,   	mimetypes	types_mapgetlower)r&   r'   r%   	audio_b64mimetyper    r    r!   encode_audio_url=   s   r7   RGBPNG
image_moder%   imager;   c                C   s   t |d}|j| |dS )z
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.
    )r;   )image_format)r   r*   )r<   r;   r%   image_ior    r    r!   encode_image_base64I   s   
r?   c                C   r-   )z|
    Encode a pillow image as a data URL.

    By default, the image is converted into RGB format before being encoded.
    r:   r.   r<   r/   r0   )r?   r1   r2   r3   r4   )r<   r;   r%   	image_b64r6   r    r    r!   encode_image_urlX   s   rA   JPEGframesc                C   s   t  }t|}|j| |dS )N)video_format)r   r   r*   )rC   r%   r>   video_ior    r    r!   encode_video_base64h   s   rF   c                C   sD   t | |d}| dkrd}ntjd|  d}d| d| S )Nr$   jpegz
video/jpegr.   videor/   r0   )rF   r4   r1   r2   r3   )rC   r%   	video_b64r6   r    r    r!   encode_video_urlr   s
   rJ   mm_positionsc                 C   s0   dd |   D }t|dd d}dd |D S )a/  
    Given a `MultiModalPlaceholderDict`, output a sequence of keys to
    sort the dictionary by `offset` (starting index in the input sequence)
    in ascending order.

    Returns:
        A list of `(modality, idx)`, which can be used to access an item
        by `mm_positions[modality][idx]`.
    c                 s   s0    | ]\}}t |D ]
\}}|||fV  q
qd S N)	enumerate).0modalityitemsidxitemr    r    r!   	<genexpr>   s    z'argsort_mm_positions.<locals>.<genexpr>c                 S   s
   | d j S )Nr   )offsetxr    r    r!   <lambda>   s   
 z&argsort_mm_positions.<locals>.<lambda>keyc                 S   s   g | ]	\}}}||fqS r    r    )rN   rO   rQ   _r    r    r!   
<listcomp>   s    z(argsort_mm_positions.<locals>.<listcomp>)rP   sorted)rK   
flat_itemssorted_flat_itemsr    r    r!   argsort_mm_positions   s
   r_   elemc                 C   s   t | jtsd S tj| jdS )N)data)
isinstancefieldr   r   hash_kwargsra   )r`   r    r    r!   _get_group_hash   s   re   Fdevice
pin_memoryrP   rg   rh   c                   sV   t ttt f t}| D ]}| D ]\}}|| | qq fdd| D S )Nc                    s(   i | ]\}}||d  j j| dqS )r   rf   )rc   reduce_data)rN   rY   elemsrf   r    r!   
<dictcomp>   s    z#_batch_mm_items.<locals>.<dictcomp>)r   strlistr   rP   append)rP   rg   rh   rj   rR   rY   r`   r    rf   r!   _batch_mm_items   s   ro   c                c   sp    dd | D }dd t |D }d}|D ]}t| |||  ||d}||fV  ||7 }q|t| ks6J dS )a  
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    Args:
        items: List of `MultiModalKwargsItem`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(num_items, grouped_kwargs)`, where:
        - `kwargs` is a dictionary of keyword arguments to pass to the model;
        - `num_items` is the corresponding number of items.
    c                 S   s.   g | ]}t d d t| dd dD qS )c                 s   s     | ]\}}|t |fV  qd S rL   )re   )rN   rY   r`   r    r    r!   rS      s
    

6group_and_batch_mm_items.<locals>.<listcomp>.<genexpr>c                 S      | d S Nr   r    )kvr    r    r!   rW          z5group_and_batch_mm_items.<locals>.<listcomp>.<lambda>rX   )tupler\   rP   )rN   rR   r    r    r!   r[      s    z,group_and_batch_mm_items.<locals>.<listcomp>c                 S   s"   g | ]\}}t d d |D qS )c                 s   s    | ]}d V  qdS )r
   Nr    )rN   rZ   r    r    r!   rS      s    rp   )sum)rN   rZ   groupr    r    r!   r[      s   " r   rf   N)r   ro   len)rP   rg   rh   	group_idsgroup_sizes	start_idx
group_size
group_datar    r    r!   group_and_batch_mm_items   s   

r~   	mm_kwargsc                c   sR    t | dd dD ]\}}dd |D }t|||dD ]
\}}|||fV  qq	dS )a  
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    To simplify the implementation of `embed_multimodal`, we add another
    restriction that the items in a batch must belong to the same modality.

    Args:
        mm_kwargs: List of `(modality, item)`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(modality, num_items, grouped_kwargs)`, where:
        - `modality` is the modality of the batch;
        - `kwargs` is a dictionary of keyword arguments to pass to the model;
        - `num_items` is the corresponding number of items.
    c                 S   rq   rr   r    rU   r    r    r!   rW      rt   z-group_mm_kwargs_by_modality.<locals>.<lambda>rX   c                 S   s   g | ]\}}|qS r    r    )rN   rZ   rR   r    r    r!   r[     s    z/group_mm_kwargs_by_modality.<locals>.<listcomp>rf   N)r   r~   )r   rg   rh   rO   rw   	items_lst	num_itemsmm_kwargs_batchr    r    r!   group_mm_kwargs_by_modality   s   r   	audio_urlaudio_io_kwargsc                 C   &   |sdnd|i}t |dd}|| S )a+  
    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    Nr&   /media_io_kwargsallowed_local_media_path)r   fetch_audio)r   r   r   media_connectorr    r    r!   r        
r   	image_urlimage_io_kwargsc                 C   r   )a+  
    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    Nr<   r   r   )r   fetch_image)r   r   r   r   r    r    r!   r      r   r   	video_urlvideo_io_kwargsc                 C   r   )a+  
    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    NrH   r   r   )r   fetch_video)r   r   r   r   r    r    r!   r   5  r   r   rL   )>r1   r   collectionsr   collections.abcr   r   	itertoolsr   typingr   r   numpynpnumpy.typingnptPILr   vllm.utils.import_utilsr	   hasherr   inputsr   r   r   r   r   r   r   r   r   r   torch.typesr   globalsrl   r"   ndarrayintr,   r7   r?   rA   NDArrayrF   rJ   rm   ru   r_   re   typesDeviceboolro   r~   r   dictfloatr   r   r   r    r    r    r!   <module>   s  










3
)

