from typing import Optional, Union

import torch

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    TransformersKwargs,
)

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...processing_utils import Unpack
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    pass


class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    pass


class InstructBlipVideoConfig(PretrainedConfig):
    r"""
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.
        video_token_index (`int`, *optional*):
            Token index of special video token.
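            Video query embeddings are written into the positions of this placeholder token in the language-model
            prompt, so the prompt must contain one such token per video query embedding.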
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
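
    >>> # Illustrative extra check (not part of the original example): the Q-Former cross-attention width
    >>> # is tied to the vision encoder's hidden size when the composite config is built
    >>> config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size
    True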
    ```"""

    model_type = "instructblipvideo"
    attribute_map = {"video_token_id": "video_token_index"}
    sub_configs = {
        "text_config": AutoConfig,
        "qformer_config": InstructBlipVideoQFormerConfig,
        "vision_config": InstructBlipVideoVisionConfig,
    }

    def __init__(
        self,
        vision_config=None,
        qformer_config=None,
        text_config=None,
        num_query_tokens=32,
        video_token_index=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
        text_model_type = text_config.get("model_type", "opt")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.num_query_tokens = num_query_tokens
        self.video_token_index = video_token_index
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: InstructBlipVideoVisionConfig,
        qformer_config: InstructBlipVideoQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate an [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        """
        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    pass


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    pass


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Fold the frame dimension into the batch dimension before running the vision encoder.
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # Run the Q-Former on learnable query tokens that cross-attend to the per-frame image features.
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        # The Q-Former text prompt is repeated once per sampled frame.
        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # Project the Q-Former output into the language-model space and unfold the frames again.
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Scatter the projected video embeddings into the positions of the video placeholder tokens.
        if input_ids is None:
            special_image_mask = inputs_embeds == self.language_model.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id
        special_image_mask = special_image_mask.unsqueeze(-1)
        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )
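
# Note on shapes (added commentary; the 4-frame figure is an illustrative assumption taken from the
# docstring examples below, not a constraint of the code): every frame contributes `num_query_tokens`
# Q-Former outputs, so the language model receives `num_query_tokens * frames` video embeddings per
# sample. With the default `num_query_tokens=32` and a 4-frame clip that is 32 * 4 = 128 embeddings,
# and the prompt must contain the same number of video placeholder tokens for `masked_scatter` to fill.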


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        r"""
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
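
        Returns:
            `torch.FloatTensor`: Video embeddings of shape `(batch_size, num_query_tokens * frames, hidden_size)`
            that can be fed to the language model. When `return_dict=True`, the vision and Q-Former outputs are
            returned as well.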
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        image_embeds = vision_outputs[0]

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        # The Q-Former prompt is repeated once per sampled frame.
        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        # Videos are the only supported visual input here, so the image path intentionally does nothing.
        return None

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Write the projected video embeddings into the video placeholder positions of the prompt.
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
                use_cache=use_cache,
                **kwargs,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if inputs_embeds is None:
            if input_ids is None:
                # Build a default prompt made of video placeholder tokens followed by the BOS token.
                video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4
                start_tokens = video_tokens + [self.config.text_config.bos_token_id]
                input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
                input_ids = input_ids.repeat(batch_size, 1)
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs


__all__ = [
    "InstructBlipVideoConfig",
    "InstructBlipVideoQFormerConfig",
    "InstructBlipVideoVisionConfig",
    "InstructBlipVideoVisionModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
]
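

# Minimal usage sketch (illustrative; it simply mirrors the docstring examples above and assumes the
# "Salesforce/instructblip-vicuna-7b" checkpoint and a 4-frame clip already decoded into `clip`):
#
#   from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
#
#   processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
#   model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
#   inputs = processor(text="What is happening in the video?", images=clip, return_tensors="pt")
#   generated_ids = model.generate(**inputs, max_length=256)
#   print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())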