o
    wiT+                     @   s   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZmZ dd	lmZ d
dlmZ eeZG dd deZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)OptionalUnion   )BatchFeature)ProcessorMixin)
AddedTokenBatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging)
VideoInput   )AutoTokenizerc                $       s  e Zd ZdZg dZdZdZdZd( fdd	Z								
														d)de	de
eeee ee f dede
eeef de
eeef dee dedee dee dededededededee
eef  def"ddZdd Zd d! Zed"d# Z fd$d%Ze fd&d'Z  ZS )*InstructBlipVideoProcessora  
    Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.

    Args:
        video_processor (`InstructBlipVideoVideoProcessor`):
            An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )video_processor	tokenizerqformer_tokenizerAutoVideoProcessorr   Nc                    sP   t |dstdddd| _|j| jgdd n|j| _|| _t ||| d S )Nvideo_tokenz<video>FT)
normalizedspecial)special_tokens)hasattrr   r   
add_tokensnum_query_tokenssuper__init__)selfr   r   r   r   kwargs	__class__ /home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr   @   s   
z#InstructBlipVideoProcessor.__init__TFr   imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                 K   s  |du r|du rt dt }|durt|tr|g}nt|ts,t|d ts,t d| jd||||||||	|
|||||dd|}| jdur{|dur{i }| jj| j d }| j|gt	| ddd}|D ]}d	d
 t
|| || D ||< qgn|}|durtd t||d}|| | jd||||||||	|
||||||d|}|d|d< |d|d< |dur| j||d}|| |S )a%  
        This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Nz3You have to specify at least one of images or text.r   zAInvalid input text. Please provide a string, or a list of strings)r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5      F)r(   r5   c                 S   s   g | ]\}}|| qS r$   r$   ).0img_encodingtxt_encodingr$   r$   r%   
<listcomp>   s    z7InstructBlipVideoProcessor.__call__.<locals>.<listcomp>aK  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.)tensor_type	input_idsqformer_input_idsattention_maskqformer_attention_mask)r5   r$   )
ValueErrorr   
isinstancestrlistr   r   r   contentlenziploggerwarning_oncer   updater   popr   )r    r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r!   encoding_text_encodingtext_encodingvideo_tokensvideo_token_encodingkqformer_text_encodingimage_encodingr$   r$   r%   __call__I   s   


z#InstructBlipVideoProcessor.__call__c                 O      | j j|i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r   batch_decoder    argsr!   r$   r$   r%   rV         z'InstructBlipVideoProcessor.batch_decodec                 O   rU   )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoderW   r$   r$   r%   rZ      rY   z!InstructBlipVideoProcessor.decodec                 C   s"   | j j}| jj}tt|| S N)r   model_input_namesimage_processorrD   dictfromkeys)r    tokenizer_input_namesimage_processor_input_namesr$   r$   r%   r\      s   z,InstructBlipVideoProcessor.model_input_namesc                    s   t j|rtd| dt j|dd t j|d}| j| d| jv }|r/| j	d t
 j|fi |}|rC|  jdg7  _|S )NzProvided path (z#) should be a directory, not a fileT)exist_okr   )ospathisfilerA   makedirsjoinr   save_pretrained
attributesremover   )r    save_directoryr!   qformer_tokenizer_pathqformer_presentoutputsr"   r$   r%   rh      s   
z*InstructBlipVideoProcessor.save_pretrainedc                    s>   t  j|fi |}t|tr|d }tj|dd}||_|S )Nr   r   )	subfolder)r   from_pretrainedrB   tupler   r   )clspretrained_model_name_or_pathr!   	processorr   r"   r$   r%   rp      s   
z*InstructBlipVideoProcessor.from_pretrainedr[   )NNTFNNr   NNFFFFFTN)__name__
__module____qualname____doc__ri   video_processor_classtokenizer_classqformer_tokenizer_classr   r   r   r   r
   rD   boolrC   r	   r   r   intr   r   rT   rV   rZ   propertyr\   rh   classmethodrp   __classcell__r$   r$   r"   r%   r   (   s    	

o
r   )rx   rc   typingr   r   image_processing_utilsr   processing_utilsr   tokenization_utils_baser   r   r	   r
   r   r   utilsr   r   video_utilsr   autor   
get_loggerru   rH   r   __all__r$   r$   r$   r%   <module>   s    
 
G