o
    i+&                     @   s   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZ ddlmZmZ dd	lmZ d
dlmZ eeZG dd deZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)OptionalUnion   )BatchFeature)ProcessorMixin)
AddedTokenPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging)
VideoInput   )AutoTokenizerc                $       s  e Zd ZdZg dZdZdZdZd$ fdd	Z								
														d%de	e
 deeeee ee f dedeeeef deeeef de	e dede	e de	e dedededededede	eeef  def"ddZedd Z fd d!Ze fd"d#Z  ZS )&InstructBlipVideoProcessora  
    Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.

    Args:
        video_processor (`InstructBlipVideoVideoProcessor`):
            An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )video_processor	tokenizerqformer_tokenizerAutoVideoProcessorr   Nc                    sP   t |dstdddd| _|j| jgdd n|j| _|| _t ||| d S )Nvideo_tokenz<video>FT)
normalizedspecial)special_tokens)hasattrr   r   
add_tokensnum_query_tokenssuper__init__)selfr   r   r   r   kwargs	__class__ v/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr   ?   s   
z#InstructBlipVideoProcessor.__init__TFr   imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                    sz  |du r|du rt di }|durt|tr|g}nt|ts+t|d ts+t d| jd||||||||	|
||||||d|}|d|d< |d|d	< |durZ|| j8 }| jd||||||||	|
|||||dd|}|dur| jj	| j d
 }| j|d|	|
||||dd	|D ]  fdd|  D | < q|
| |dur| j||d}|
| t||d}|S )a%  
        This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Nz3You have to specify at least one of images or text.r   zAInvalid input text. Please provide a string, or a list of strings)r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   	input_idsqformer_input_idsattention_maskqformer_attention_mask   F)r'   r-   r.   r/   r0   r1   r2   r4   c                    s   g | ]}  | qS r#   r#   ).0samplekvideo_text_encodingr#   r$   
<listcomp>   s    z7InstructBlipVideoProcessor.__call__.<locals>.<listcomp>)r4   )tensor_typer#   )
ValueError
isinstancestrlistr   popr   r   r   contentupdater   r   )r   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r    encodingqformer_text_encodingtext_encodingvideo_tokensimage_encodingr#   r=   r$   __call__H   s   



z#InstructBlipVideoProcessor.__call__c                 C   s$   | j j}| jj}ddg}|| | S )Nr7   r9   )r   model_input_namesr   )r   tokenizer_input_namesvideo_processor_input_namesqformer_input_namesr#   r#   r$   rO      s   z,InstructBlipVideoProcessor.model_input_namesc                    s   t j|rtd| dt j|dd t j|d}| j| d| jv }|r/| j	d t
 j|fi |}|rC|  jdg7  _|S )NzProvided path (z#) should be a directory, not a fileT)exist_okr   )ospathisfilerB   makedirsjoinr   save_pretrained
attributesremover   )r   save_directoryr    qformer_tokenizer_pathqformer_presentoutputsr!   r#   r$   rY      s   
z*InstructBlipVideoProcessor.save_pretrainedc                    s>   t  j|fi |}t|tr|d }tj|dd}||_|S )Nr   r   )	subfolder)r   from_pretrainedrC   tupler   r   )clspretrained_model_name_or_pathr    	processorr   r!   r#   r$   ra      s   
z*InstructBlipVideoProcessor.from_pretrained)N)NNTFNNr   NNFFFFFTN)__name__
__module____qualname____doc__rZ   video_processor_classtokenizer_classqformer_tokenizer_classr   r   r   r   r
   r	   rE   boolrD   r   r   intr   r   rN   propertyrO   rY   classmethodra   __classcell__r#   r#   r!   r$   r   '   s~    	

h
r   )ri   rT   typingr   r   image_processing_utilsr   processing_utilsr   tokenization_utils_baser   r   r	   r
   r   utilsr   r   video_utilsr   autor   
get_loggerrf   loggerr   __all__r#   r#   r#   r$   <module>   s   
 
1