o
    eik                     @   sF  d dl Z d dlmZmZ d dlmZmZmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ eeZ G dd deZ!G dd deZ"G dd deZ#G dd de	Z$G dd deZ%G dd de
Z&G dd deZ'G dd deZ(G dd deZ)g dZ*dS )     N)InstructBlipQFormerConfigInstructBlipVisionConfig)'BaseModelOutputWithVisionQformerOutputs$InstructBlipForConditionalGeneration/InstructBlipForConditionalGenerationModelOutputInstructBlipModelInstructBlipPreTrainedModelInstructBlipQFormerModelInstructBlipVisionModelTransformersKwargs   )PreTrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPooling)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)Unpack)auto_docstringcan_return_tuplelogging   )CONFIG_MAPPING
AutoConfigc                   @      e Zd ZdS )InstructBlipVideoVisionConfigN__name__
__module____qualname__ r   r   }/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   -       r   c                   @   r   )InstructBlipVideoQFormerConfigNr   r   r   r   r   r!   1   r    r!   c                       sD   e Zd ZdZdZddiZeeedZ						d
 fdd		Z
  ZS )InstructBlipVideoConfiga
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PreTrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PreTrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
    ```instructblipvideovideo_token_idvideo_token_index)text_configqformer_configvision_configN    c                    s  |d u rt d  }td nt|tr#|dd}t | di |}|d u r0t }td nt|tr<tdi |}|d u rIt }td nt|trUtdi |}|| _|| _	|| _
|| _|| _| j	j| j
_| jjtv | _d| _d| _t jdi | d S )	NoptzTtext_config is None. Initializing the text config with default values (`OPTConfig`).
model_typez\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.z``vision_config` is `None`. initializing the `InstructBlipVideoVisionConfig` with default values.g      ?g{Gz?r   )r   loggerinfo
isinstancedictgetr!   r   r&   r(   r'   num_query_tokensr%   hidden_sizeencoder_hidden_sizer+   r   use_decoder_only_language_modelinitializer_factorinitializer_rangesuper__init__)selfr(   r'   r&   r1   r%   kwargstext_model_type	__class__r   r   r8   w   s8   	



z InstructBlipVideoConfig.__init__)NNNr)   N)r   r   r   __doc__r+   attribute_mapr   r!   r   sub_configsr8   __classcell__r   r   r<   r   r"   5   s    7r"   c                   @      e Zd ZdZdS ) InstructBlipVideoPreTrainedModel)videotextNr   r   r   input_modalitiesr   r   r   r   rC          rC   c                   @   rB   )InstructBlipVideoVisionModelrD   NrF   r   r   r   r   rI      rH   rI   c                   @   r   )InstructBlipVideoQFormerModelNr   r   r   r   r   rJ      r    rJ   c                   @   r   )4InstructBlipVideoForConditionalGenerationModelOutputNr   r   r   r   r   rK      r    rK   c                    @   s   e Zd Z											ddejdejdejdB dejdB dejdB dejdB d	ejdB d
ejdB dedB dedB dedB dededB dee	 de
eB fddZdS )InstructBlipVideoModelNFpixel_valuesqformer_input_idsqformer_attention_mask	input_idsattention_maskdecoder_input_idsdecoder_attention_maskinputs_embedsoutput_attentionsoutput_hidden_statesreturn_dictinterpolate_pos_encoding	use_cacher:   returnc                 K   s:  |d ur|n| j j}|j\}}}}}||| |||}| j||	|
||d}|d }tj| d d tj|j	d}| j
|jd dd}tj| d d tj|j	d}|d u r^t|}|j|dd}|j|dd}tj||gdd}| j||||||	|
|d}|d d d d |dd d f }| |}||| j j| d}|d u r| j |}|| j jk}|d u rt|}n||  tj| j jtj|j	dk}|d}|d||j	}||j	|j}|||}| j jr| jd|||	|
||d|}n| jd|||||	|
||d	|}t|||d
S )N)rM   rU   rV   rW   rX   r   dtypedevicedim   )rP   rQ   query_embedsencoder_hidden_statesencoder_attention_maskrU   rV   rW   rT   rQ   rU   rV   rW   rY   )rT   rQ   rR   rS   rU   rV   rW   rY   )vision_outputsqformer_outputslanguage_model_outputsr   )configuse_return_dictshapereshapevision_modeltorchonessizelongr^   query_tokensexpand	ones_likerepeat_interleavecatqformerlanguage_projectionr1   language_modelget_input_embeddingsr$   tensorall	unsqueeze	expand_astor]   masked_scatterr4   rK   )r9   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   r:   
batch_sizeframeschannelheightwidthrf   image_embedsimage_attention_maskrr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsspecial_image_maskoutputsr   r   r   forward   s     
$





	zInstructBlipVideoModel.forward)NNNNNNNNNFN)r   r   r   rn   FloatTensor
LongTensorTensorboolr   r   tuplerK   r   r   r   r   r   rL      sV    	
rL   c                "   @   s|  e Zd Zee		ddejdejdejdB dedB de	e
 deeB fd	d
Zdd ZdejdejfddZ												ddejdejdejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dejdB dedB dededB de	e
 deeB f ddZe 						ddejdejdB dejdB dejdB dejdB dejdB dedejfddZdS ) )InstructBlipVideoForConditionalGenerationNFrM   rN   rO   rX   r:   rZ   c              	   K   sh  |j \}}}}	}
||| ||	|
}| jd||dd|}t|j|j|j|j|dd}|d }tj	|
 dd tj|jd}| j|j d dd}tj	|
 dd tj|jd}|du rdt|}|j|dd}|j|dd}tj||gd	d}| jd|||||dd
|}||_|d ddd|
d	ddf }| |}||| jj| d}||_|S )a  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.
        T)rM   rX   rW   N)last_hidden_statepooler_outputhidden_states
attentionsrf   rg   r   r[   r\   r_   ra   )rP   rQ   rb   rc   rd   rW   r   )rk   rl   rm   r   r   r   r   r   rn   ro   rp   rq   r^   rr   rs   rt   ru   rv   rw   rg   rx   ri   r1   )r9   rM   rN   rO   rX   r:   r   r   r   r   r   rf   r   r   rr   r   rg   r   video_featuresr   r   r   get_video_features   sT     
	$
z<InstructBlipVideoForConditionalGeneration.get_video_featuresc                  K   s   t d)Nz=No need to inherit as this architecture only supports videos.)AttributeError)super_kwargsr   r   r   get_image_featuresi  s   z<InstructBlipVideoForConditionalGeneration.get_image_featuresrP   rT   c                 C   s`   |du r||   tj| jjtj|jdk}|d}n|| jjk}|d	|
|j}|S )zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        Nr\   r[   )rz   rn   r{   ri   r$   rq   r^   r|   r}   r~   r   )r9   rP   rT   r   r   r   r   get_placeholder_maskl  s   z>InstructBlipVideoForConditionalGeneration.get_placeholder_maskrQ   rR   rS   rU   rV   labelsrW   rY   c                 K   sP  |dur|n| j j}| j|f|||dd|}|j}|j}|j}|du r+|  |}|du r4t|}|	|j
|j}| j||d}|||}| j jr{| jd|||	|
||d|}|ra|jn|d }d}|durz| jd||| j jjd|}n$| jd|||||	|
|||d	|}|r|jn|d }|r|jn|d	 }t|||||d
S )a  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTrN   rO   rX   rW   rT   re   r   )logitsr   
vocab_size)	rT   rQ   rR   rS   rU   rV   rW   r   rY   ra   )lossr   rf   rg   rh   r   )ri   rj   r   r   rg   rf   rz   rn   rt   r   r^   r]   r   r   r4   ry   r   loss_functionr&   r   r   rK   )r9   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   r   rW   rX   rY   r:   r   r   rg   rf   r   r   r   r   r   r   r   r   {  s~   Q
	
z1InstructBlipVideoForConditionalGeneration.forwardc                 K   s  t | dr	|   |jd }	| j||||dd}
|
j}|du rM|du rG| jjg| jj d }|| jjj	g }t
j|gt
j|jd}||	d}|  |}|du rVt
|}||j|j}| j||d	}|||}||d
}| jjjsy||d< | jjdi ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        hf_device_mapr   Tr   N   r\   ra   r   )rT   rQ   rP   r   )hasattr_preprocess_acceleraterk   r   r   ri   r%   r1   r&   bos_token_idrn   r{   rq   r^   repeatrz   rt   r   r]   r   r   ry   is_encoder_decodergenerate)r9   rM   rN   rO   rP   rQ   rT   rX   generate_kwargsr   r   r   video_tokensstart_tokensr   inputsr   r   r   r   r     s8   
"



z2InstructBlipVideoForConditionalGeneration.generate)NF)NNNNNNNNNNFN)NNNNNF)r   r   r   r   r   rn   r   r   r   r   r   r   r   r   r   r   rK   r   no_gradr   r   r   r   r   r     s    G	

 
r   )r"   r!   r   rI   rC   rJ   rL   r   )+rn   ;transformers.models.instructblip.configuration_instructblipr   r   6transformers.models.instructblip.modeling_instructblipr   r   r   r   r   r	   r
   r   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   models.auto.modeling_autor   processing_utilsr   utilsr   r   r   autor   r   
get_loggerr   r,   r   r!   r"   rC   rI   rJ   rK   rL   r   __all__r   r   r   r   <module>   s.   (
mm  8