o
    ei)                     @   s   d Z ddlmZ ddlmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZmZ ddlmZ eeZeG dd deZdgZd	S )
zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
   )BatchFeature)ProcessorMixin)
AddedTokenPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypeauto_docstringlogging)
VideoInputc                $       s   e Zd Zd fdd	Ze																ddedB deeB ee B ee B d	e	d
e	e
B eB de	e
B eB dedB dededB de	dB de	de	de	de	de	de	de
eB dB def"ddZedd Z  ZS )InstructBlipVideoProcessorNc                    sP   t |dstdddd| _|j| jgdd n|j| _|| _t ||| dS )a&  
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
        video_tokenz<video>FT)
normalizedspecial)special_tokensN)hasattrr   r   
add_tokensnum_query_tokenssuper__init__)selfvideo_processor	tokenizerqformer_tokenizerr   kwargs	__class__ /home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr   $   s   
z#InstructBlipVideoProcessor.__init__TF    imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                    sz  |d u r|d u rt di }|d urt|tr|g}nt|ts+t|d ts+t d| jd||||||||	|
||||||d|}|d|d< |d|d< |d urZ|| j8 }| jd||||||||	|
|||||d d|}|d ur| jj	| j d	 }| j|d
|	|
||||d d	|D ]  fdd|  D | < q|
| |d ur| j||d}|
| t||d}|S )Nz3You have to specify at least one of images or text.r    zAInvalid input text. Please provide a string, or a list of strings)r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   	input_idsqformer_input_idsattention_maskqformer_attention_mask   F)r#   r)   r*   r+   r,   r-   r.   r0   c                    s   g | ]}  | qS r   r   ).0samplekvideo_text_encodingr   r   
<listcomp>   s    z7InstructBlipVideoProcessor.__call__.<locals>.<listcomp>)r0   )tensor_typer   )
ValueError
isinstancestrlistr   popr   r   r   contentupdater   r   )r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r   encodingqformer_text_encodingtext_encodingvideo_tokensimage_encodingr   r9   r   __call__3   s   



z#InstructBlipVideoProcessor.__call__c                 C   s$   | j j}| jj}ddg}|| | S )Nr3   r5   )r   model_input_namesr   )r   tokenizer_input_namesvideo_processor_input_namesqformer_input_namesr   r   r   rK      s   z,InstructBlipVideoProcessor.model_input_names)N)NNTFNNr    NNFFFFFTN)__name__
__module____qualname__r   r
   r   r   r   rA   boolr@   r   r   intr	   r   rJ   propertyrK   __classcell__r   r   r   r   r   "   sp    

	

br   N)__doc__image_processing_utilsr   processing_utilsr   tokenization_utils_baser   r   r   r   r   utilsr	   r
   r   video_utilsr   
get_loggerrO   loggerr   __all__r   r   r   r   <module>   s   

{