o
    wiim                     @   sF  d dl mZmZ d dlZd dlZd dlmZmZ d dlm	Z	m
Z
mZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ eeZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd de
Z&G dd deZ'G dd de	Z(g dZ)dS )     )OptionalUnionN)InstructBlipQFormerConfigInstructBlipVisionConfig)$InstructBlipForConditionalGeneration/InstructBlipForConditionalGenerationModelOutputInstructBlipModelInstructBlipPreTrainedModelInstructBlipQFormerModelInstructBlipVisionModelKwargsForCausalLM   )PretrainedConfig)FlashAttentionKwargs)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)Unpack)logging   )CONFIG_MAPPING
AutoConfigc                   @      e Zd ZdS )InstructBlipVideoVisionConfigN__name__
__module____qualname__ r   r   |/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   .       r   c                   @   r   )InstructBlipVideoQFormerConfigNr   r   r   r   r   r   2   r   r   c                       s^   e Zd ZdZdZddiZeeedZ						d fdd		Z
ed
ededefddZ  ZS )InstructBlipVideoConfiga
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config)
    ```instructblipvideovideo_token_idvideo_token_index)text_configqformer_configvision_configN    c                    s   t  jdi | |d u ri }td |d u ri }td |d u r*i }td tdi || _tdi || _d|v rB|d nd}t| di || _	|| _
|| _| jj| j_| j	jtv | _d| _d| _d S )	NzZvision_config is None. initializing the InstructBlipVideoVisionConfig with default values.z\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.zTtext_config is None. Initializing the text config with default values (`OPTConfig`).
model_typeoptg      ?g{Gz?r   )super__init__loggerinfor   r&   r   r%   r   r$   num_query_tokensr#   hidden_sizeencoder_hidden_sizer(   r   use_decoder_only_language_modelinitializer_factorinitializer_range)selfr&   r%   r$   r.   r#   kwargstext_model_type	__class__r   r   r+   x   s(   	



z InstructBlipVideoConfig.__init__r&   r%   r$   c                 K   s"   | d|  |  |  d|S )a  
        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        )r&   r%   r$   Nr   )to_dict)clsr&   r%   r$   r5   r   r   r    from_vision_qformer_text_configs   s   z8InstructBlipVideoConfig.from_vision_qformer_text_configs)NNNr'   N)r   r   r   __doc__r(   attribute_mapr   r   r   sub_configsr+   classmethodr   r;   __classcell__r   r   r7   r   r    6   s.    7#r    c                   @   r   ) InstructBlipVideoPreTrainedModelNr   r   r   r   r   rA      r   rA   c                   @   r   )InstructBlipVideoVisionModelNr   r   r   r   r   rB      r   rB   c                   @   r   )InstructBlipVideoQFormerModelNr   r   r   r   r   rC      r   rC   c                   @   r   )4InstructBlipVideoForConditionalGenerationModelOutputNr   r   r   r   r   rD      r   rD   c                   @   s   e Zd Z										ddejdejdeej deej deej deej d	eej d
ee dee dee dedee dee	 de
eef fddZdS )InstructBlipVideoModelNFpixel_valuesqformer_input_idsqformer_attention_mask	input_idsattention_maskdecoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_statesreturn_dictinterpolate_pos_encoding	use_cacher5   returnc                 K   s  |
d ur|
n| j j}
|j\}}}}}||| |||}| j|||	|
|d}|d }tj| d d tj|j	d}| j
|jd dd}tj| d d tj|j	d}|d u r^t|}|j|dd}|j|dd}tj||gdd}| j|||||||	|
d}|d d d d |dd d f }| |}||| j j| d}| j |}|d u rt|}|| j jkd|}| ||< | j jr| jd||||	|
|d|}n| jd||||||	|
|d	|}t|||d
S )N)rF   rM   rN   rO   rP   r   dtypedevicedim   )rI   rJ   query_embedsencoder_hidden_statesencoder_attention_maskrM   rN   rO   inputs_embedsrJ   rM   rN   rO   rQ   )r^   rJ   rK   rL   rM   rN   rO   rQ   )vision_outputsqformer_outputslanguage_model_outputsr   )configuse_return_dictshapereshapevision_modeltorchonessizelongrV   query_tokensexpand	ones_likerepeat_interleavecatqformerlanguage_projectionr.   language_modelget_input_embeddingsr"   	unsqueeze	expand_asflattenr1   rD   )r4   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   r5   
batch_sizeframeschannelheightwidthr_   image_embedsimage_attention_maskrk   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr^   special_image_maskoutputsr   r   r   forward   s     
$



	zInstructBlipVideoModel.forward)
NNNNNNNNFN)r   r   r   rg   FloatTensorr   
LongTensorboolr   r   r   tuplerD   r   r   r   r   r   rE      sP    	

rE   c                !   @   st  e Zd Z			ddejdejdeej dee dee f
dd	Z			ddejdejdeej dee dee f
d
dZ												ddejdejdeej deej deej deej deej dee dee deej dee dedee de
e deeef fddZe 					ddejdeej deej deej deej dedejfddZdS ))InstructBlipVideoForConditionalGenerationNFrF   rG   rH   rP   rO   c                 C   s>  |j \}}}}	}
||| ||	|
}| j||dd}|d }tj| dd tj|jd}| j	|j d dd}tj| dd tj|jd}|du rRt
|}|j|dd}|j|dd}tj||gdd}| j|||||dd	}|d ddd|dddf }| |}||| jj| d}|r|||fS |S )
a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        T)rF   rP   rO   r   NrS   rT   rW   rY   )rI   rJ   rZ   r[   r\   rO   )rd   re   rf   rg   rh   ri   rj   rV   rk   rl   rm   rn   ro   rp   rq   rb   r.   )r4   rF   rG   rH   rP   rO   rw   rx   ry   rz   r{   r_   r|   r}   rk   r~   r   r   r   r   r   r   get_video_features(  s<     
$

z<InstructBlipVideoForConditionalGeneration.get_video_featuresc                 C   s   d S )Nr   )r4   rF   rG   rH   rP   rO   r   r   r   get_image_featuresd  s   z<InstructBlipVideoForConditionalGeneration.get_image_featuresrI   rJ   rK   rL   rM   rN   labelsrQ   r5   rR   c                 K   s  |dur|n| j j}| j||||dd\}}}|s| n|}|s%| n|}tj| dd tj|jd}| j	
 |}|du rGt|}t| j dddurg|| j jkd|}| |j||< ntd tj|||jgdd	}tj|||jgdd	}| j jr| j	d||||	||d
|}|r|jn|d }d}|
dur| jd||
| j jjd|}n$| j	d||||||	||
|d	|}|r|jn|d }|r|jn|d }t|||||dS )a0
  
        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTrG   rH   rP   rO   rS   rT   r"   K  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.rY   rW   r]   r   )logitsr   
vocab_size)	r^   rJ   rK   rL   rM   rN   rO   r   rQ   )lossr   r_   r`   ra   r   )rb   rc   r   to_tuplerg   rh   ri   rj   rV   rr   rs   rm   getattrr"   rt   ru   rv   tor,   warning_oncero   r1   r   loss_functionr$   r   r   rD   )r4   rF   rG   rH   rI   rJ   rK   rL   rM   rN   r   rO   rP   rQ   r5   r   r_   r   language_model_attention_maskr^   r   r   r   r   r   r   r   r   n  s   I
	
z1InstructBlipVideoForConditionalGeneration.forwardc                 K   s  t | dr	|   |jd }| j||||dd\}	}
}tj|	 dd tj|	jd}|du r\| j	j
jg}t| j	dddurK| j	jg| j	j d	 | }tj|gtj|jd}||d
}|du ret|}|  |}t| j	dddur|| j	jkd|}|	 |j||< n@td tj|	||	jgd
d}tj|||jgd
d}| jj	js|dd|	jd
  d
 |d< |dd|	jd
  |d< ||d}| jj	js||d< | jjdi ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        hf_device_mapr   Tr   NrS   rT   r"      rY   r   rW   
max_length   
min_length)r^   rJ   rI   r   )hasattr_preprocess_acceleraterd   r   rg   rh   ri   rj   rV   rb   r$   bos_token_idr   r"   r.   tensorrepeatrm   rs   rt   ru   rv   r   r,   r   ro   rr   is_encoder_decodergetgenerate)r4   rF   rG   rH   rI   rJ   rP   generate_kwargsrw   r   r_   r   language_attention_maskstart_tokensr^   r   inputsr   r   r   r   r     sR   





z2InstructBlipVideoForConditionalGeneration.generate)NFF)NNNNNNNNNFN)NNNNF)r   r   r   rg   r   r   r   r   r   r   r   r   r   r   rD   r   no_gradr   r   r   r   r   r   '  s    
@
	


 	r   )r    r   r   rB   rA   rC   rE   r   )*typingr   r   rg   torch.utils.checkpoint;transformers.models.instructblip.configuration_instructblipr   r   6transformers.models.instructblip.modeling_instructblipr   r   r   r	   r
   r   r   configuration_utilsr   modeling_flash_attention_utilsr   models.auto.modeling_autor   processing_utilsr   utilsr   autor   r   
get_loggerr   r,   r   r   r    rA   rB   rC   rD   rE   r   __all__r   r   r   r   <module>   s0   $

}d  ;