o
    	۷i"                     @   s   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZ dd	lmZ d
dlmZ eeZG dd de
ddZG dd deZdgZdS )zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)OptionalUnion   )BatchFeature)
ImageInput)ProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)logging   )AutoTokenizerc                
   @   s*   e Zd Zdddddddddd	i dZdS )InstructBlipProcessorKwargsTFr   )	add_special_tokenspaddingstridereturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbose)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults r    r    n/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/instructblip/processing_instructblip.pyr   !   s    
r   F)totalc                
       s   e Zd ZdZg dZdZdZdZd fdd	Z				dde	e
 d	eeeee ee f d
ee defddZedd Z fddZe fddZ  ZS )InstructBlipProcessora  
    Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.

    Args:
        image_processor (`BlipImageProcessor`):
            An instance of [`BlipImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):"
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )image_processor	tokenizerqformer_tokenizer)BlipImageProcessorBlipImageProcessorFastr   Nc                    sP   t |dstdddd| _|j| jgdd n|j| _|| _t ||| d S )Nimage_tokenz<image>FT)
normalizedspecial)special_tokens)hasattrr
   r)   
add_tokensnum_query_tokenssuper__init__)selfr$   r%   r&   r/   kwargs	__class__r    r!   r1   J   s   
zInstructBlipProcessor.__init__imagestextr3   returnc                    s  |du r|du rt d| jtfd| jji|}|d dd}i }|durt|tr0|g}nt|ts@t|d ts@t d| j	|fi |d }	|	d|d	< |	d
|d< |d 
ddurm|d d  | j8  < | j|fi |d }
|dur| jj| j }d|d d< d|d d< d|d d< | j|fi |d  |
D ] fdd|
 D |
< q||
 |dur| j|fi |d }|| t||d}|S )a  
        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Args:
            images (`ImageInput`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
        Nz,You have to specify at least images or text.tokenizer_init_kwargsr   return_tensorsr   zAInvalid input text. Please provide a string, or a list of strings	input_idsqformer_input_idsattention_maskqformer_attention_mask
max_lengthFr   r   
truncationc                    s   g | ]}  | qS r    r    ).0sampleimage_text_encodingkr    r!   
<listcomp>   s    z2InstructBlipProcessor.__call__.<locals>.<listcomp>r   )tensor_type)
ValueError_merge_kwargsr   r%   init_kwargspop
isinstancestrlistr&   getr/   r)   contentupdater$   r   )r2   r6   r7   audiovideosr3   output_kwargsr:   encodingqformer_text_encodingtext_encodingimage_tokensimage_encodingr    rC   r!   __call__T   sH   


zInstructBlipProcessor.__call__c                 C   s$   | j j}| jj}ddg}|| | S )Nr<   r>   )r%   model_input_namesr$   )r2   tokenizer_input_namesimage_processor_input_namesqformer_input_namesr    r    r!   r[      s   z'InstructBlipProcessor.model_input_namesc                    s   t j|rtd| dt j|dd t j|d}| j| d| jv }|r/| j	d t
 j|fi |}|rC|  jdg7  _|S )NzProvided path (z#) should be a directory, not a fileT)exist_okr&   )ospathisfilerH   makedirsjoinr&   save_pretrained
attributesremover0   )r2   save_directoryr3   qformer_tokenizer_pathqformer_presentoutputsr4   r    r!   re      s   
z%InstructBlipProcessor.save_pretrainedc                    s>   t  j|fi |}t|tr|d }tj|dd}||_|S )Nr   r&   )	subfolder)r0   from_pretrainedrL   tupler   r&   )clspretrained_model_name_or_pathr3   	processorr&   r4   r    r!   rm      s   
z%InstructBlipProcessor.from_pretrained)N)NNNN)r   r   r   __doc__rf   image_processor_classtokenizer_classqformer_tokenizer_classr1   r   r   r   r   r   rN   r	   r   r   rZ   propertyr[   re   classmethodrm   __classcell__r    r    r4   r!   r#   2   s2    
C
r#   )rr   r`   typingr   r   image_processing_utilsr   image_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   r   r   utilsr   autor   
get_loggerr   loggerr   r#   __all__r    r    r    r!   <module>   s   
 
