o
    iG                     @   s   d dl mZmZ d dlmZ d dlmZmZ d dlm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZmZmZmZ d dlmZ d	d
lmZ d	dlmZ ddlmZ e
dedZeeZ eG dd dZ!G dd dee	e Z"dS )    )ABCabstractmethod)Mapping)	dataclassfield)GenericTypeVarN)Image)AudioDummyOptionsBaseDummyOptionsImageDummyOptionsVideoDummyOptions)init_logger   )MultiModalDataDict)MultiModalDataItems   )BaseProcessingInfo_I)boundc                   @   s^   e Zd ZU dZeee B ed< eed< e	e
dZeeef ed< e	e
dZeeef ed< dS )ProcessorInputszq
    Represents the keyword arguments to
    [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
    promptmm_items)default_factoryhf_processor_mm_kwargstokenization_kwargsN)__name__
__module____qualname____doc__strlistint__annotations__r   r   dictr   r   objectr    r&   r&   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/multimodal/processing/dummy_inputs.pyr      s   
 r   c                       s:  e Zd ZdZdeddf fddZedeee	f defdd	Z
e	dd
e	deee	f deeef dB defddZ	dd
e	deee	f deeef dB defddZddde	de	dedB deej fddZddde	de	de	dedB deej f
ddZddde	de	de	de	dedB deej fddZ  ZS ) BaseDummyInputsBuilderz_
    Abstract base class that constructs the dummy data to profile
    multi-modal models.
    inforeturnNc                    s   t    || _d S N)super__init__r)   )selfr)   	__class__r&   r'   r-   0   s   

zBaseDummyInputsBuilder.__init__	mm_countsc                 C      t )zD
        Build the text input corresponding to `mm_counts`.
        NotImplementedError)r.   r1   r&   r&   r'   get_dummy_text5   s   z%BaseDummyInputsBuilder.get_dummy_textseq_len
mm_optionsc                 C   r2   )a  
        Build the multimodal input which, after processing, results in
        the maximum possible number of placeholder tokens.

        Args:
            seq_len: Sequence length
            mm_counts: Count of items per modality
            mm_options: Configurable options per modality (optional).
                       If None, use model defaults for backward compatibility.
                       If provided, models can use these to customize dummy
                       data generation.
        r3   )r.   r6   r1   r7   r&   r&   r'   get_dummy_mm_data<   s   z(BaseDummyInputsBuilder.get_dummy_mm_datac                 C   s>   |  |}| |||}| jj|dd}ddi}t|||dS )a,  
        Build the input which, after processing, results in
        the maximum possible number of placeholder tokens.

        Args:
            seq_len: Sequence length
            mm_counts: Count of items per modality
            mm_options: Configurable options per modality (optional)
        F)validate
truncation)r   r   r   )r5   r8   r)   parse_mm_datar   )r.   r6   r1   r7   
dummy_textdummy_mm_datadummy_mm_itemsr   r&   r&   r'   get_dummy_processor_inputsQ   s   
z1BaseDummyInputsBuilder.get_dummy_processor_inputs)	overrideslength
num_audiosr@   c                C   sR   |dkrg S |r|j r|j |krtd|j | t||j }t|f}|g| S )Nr   zOaudio.length override (%d) exceeds model's maximum length (%d), will be ignored)rA   loggerwarningminnpzeros)r.   rA   rB   r@   audior&   r&   r'   _get_dummy_audiosl   s   


z(BaseDummyInputsBuilder._get_dummy_audioswidthheight
num_imagesc                C   s   |dkrg S |r4|j r|j |krtd|j | t||j }|jr4|j|kr.td|j| t||j}tjd||fdd}|g| S )Nr   zMimage.width override (%d) exceeds model's maximum width (%d), will be ignoredzOimage.height override (%d) exceeds model's maximum height (%d), will be ignoredRGB   )color)rJ   rC   rD   rE   rK   r	   new)r.   rJ   rK   rL   r@   imager&   r&   r'   _get_dummy_images   s*   


z(BaseDummyInputsBuilder._get_dummy_images
num_frames
num_videosc                C   s   |dkrg S |rJ|j r|j |krtd|j | t||j }|jr4|j|kr.td|j| t||j}|jrJ|j|krDtd|j| t||j}tj|||dfdtjd}|g| S )Nr   z]video.num_frames override (%d) exceeds model's maximum number of frames (%d), will be ignoredzMvideo.width override (%d) exceeds model's maximum width (%d), will be ignoredzOvideo.height override (%d) exceeds model's maximum height (%d), will be ignored   rN   )dtype)	rS   rC   rD   rE   rJ   rK   rF   fulluint8)r.   rJ   rK   rS   rT   r@   videor&   r&   r'   _get_dummy_videos   s:   	



z(BaseDummyInputsBuilder._get_dummy_videosr+   )r   r   r   r   r   r-   r   r   r    r"   r5   r   r   r8   r   r?   r
   r!   nptNDArrayrI   r   r	   rR   r   rZ   __classcell__r&   r&   r/   r'   r(   *   s|    


 

'r(   )#abcr   r   collections.abcr   dataclassesr   r   typingr   r   numpyrF   numpy.typingr[   PILr	   vllm.config.multimodalr
   r   r   r   vllm.loggerr   inputsr   parser   contextr   r   r   rC   r   r(   r&   r&   r&   r'   <module>   s"   