import math
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithPast,
    Seq2SeqLMOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
    torch_int,
)
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import OutputRecorder, capture_outputs
from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_instructblipvideo import (
    InstructBlipVideoConfig,
    InstructBlipVideoQFormerConfig,
    InstructBlipVideoVisionConfig,
)


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionEmbeddings(nn.Module):
    def __init__(self, config: InstructBlipVideoVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding

        class_pos_embed = self.position_embedding[:, :1]
        patch_pos_embed = self.position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
        else:
            position_embedding = self.position_embedding
        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings


class InstructBlipVideoQFormerEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and not serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
            embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = embeddings.to(self.layernorm.weight.dtype)
        embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


@auto_docstring
class InstructBlipVideoPreTrainedModel(PreTrainedModel):
    config: InstructBlipVideoConfig
    base_model_prefix = "blip"
    input_modalities = ("video", "text")
    supports_gradient_checkpointing = True

    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    _no_split_modules = [
        "InstructBlipVideoQFormerEmbeddings",
        "InstructBlipVideoAttention",
        "InstructBlipVideoQFormerMultiHeadAttention",
        "InstructBlipVideoQFormerSelfOutput",
    ]

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        factor = self.config.initializer_range
        if isinstance(module, InstructBlipVideoVisionEmbeddings):
            init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
        elif isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
            init.zeros_(module.query_tokens)
        elif isinstance(module, InstructBlipVideoQFormerEmbeddings):
            module.position_ids.copy_(torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InstructBlipVideoAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.is_causal = False
        self.attention_dropout = config.attention_dropout

        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            q_bias = nn.Parameter(torch.zeros(self.embed_dim))
            v_bias = nn.Parameter(torch.zeros(self.embed_dim))
        else:
            q_bias = None
            v_bias = None

        if q_bias is not None:
            qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
            self.qkv.bias = nn.Parameter(qkv_bias)

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, embed_dim = hidden_states.size()

        mixed_qkv = self.qkv(hidden_states)
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
            2, 0, 3, 1, 4
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask=None,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        output = self.projection(attn_output)

        return output, attn_weights


class InstructBlipVideoMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class InstructBlipVideoEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = InstructBlipVideoAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = InstructBlipVideoMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(hidden_states=hidden_states, **kwargs)
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual

        return hidden_states


class InstructBlipVideoEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipVideoEncoderLayer`].

    Args:
        config (`InstructBlipVideoConfig`):
            The corresponding vision configuration for the `InstructBlipVideoEncoder`.
    """

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([InstructBlipVideoEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel):
    main_input_name = "pixel_values"
    config: InstructBlipVideoVisionConfig
    _can_record_outputs = {
        "hidden_states": InstructBlipVideoEncoderLayer,
        "attentions": InstructBlipVideoAttention,
    }

    def __init__(self, config: InstructBlipVideoVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = InstructBlipVideoVisionEmbeddings(config)
        self.encoder = InstructBlipVideoEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @capture_outputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(last_hidden_state=last_hidden_state, pooler_output=pooled_output)

    def get_input_embeddings(self):
        return self.embeddings


class InstructBlipVideoQFormerMultiHeadAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        # If this is instantiated as a cross-attention module, the keys and values come from the
        # encoder; the attention mask needs to hide the encoder's padding tokens.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        attention_scores_dtype = attention_scores.dtype

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model's forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer, attention_probs


class InstructBlipVideoQFormerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipVideoQFormerAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.attention = InstructBlipVideoQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = InstructBlipVideoQFormerSelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        attn_output, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            **kwargs,
        )
        attention_output = self.output(attn_output, hidden_states)
        return attention_output


class InstructBlipVideoQFormerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class InstructBlipVideoQFormerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipVideoQFormerLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InstructBlipVideoQFormerAttention(config)

        self.layer_idx = layer_idx

        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = InstructBlipVideoQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        self.intermediate = InstructBlipVideoQFormerIntermediate(config)
        self.output = InstructBlipVideoQFormerOutput(config)

        self.intermediate_query = InstructBlipVideoQFormerIntermediate(config)
        self.output_query = InstructBlipVideoQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        attention_output = self.attention(hidden_states, attention_mask=attention_mask, **kwargs)

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                query_attention_output = self.crossattention(
                    query_attention_output,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    **kwargs,
                )

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                ).to(layer_output.device)
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )
        return layer_output

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output


class InstructBlipVideoQFormerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [InstructBlipVideoQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPastAndCrossAttentions:
        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            hidden_states = layer_module(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                query_length=query_length,
                **kwargs,
            )

        return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states)


class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    """

    config: InstructBlipVideoQFormerConfig

    _supports_attention_backend = False
    _supports_flash_attn = False
    _supports_sdpa = False
    _supports_flex_attn = False

    _can_record_outputs = {
        "hidden_states": InstructBlipVideoQFormerLayer,
        "attentions": [
            OutputRecorder(InstructBlipVideoQFormerMultiHeadAttention, index=1, layer_name=".attention")
        ],
        "cross_attentions": [
            OutputRecorder(InstructBlipVideoQFormerMultiHeadAttention, index=1, layer_name=".crossattention")
        ],
    }

    def __init__(self, config: InstructBlipVideoQFormerConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InstructBlipVideoQFormerEmbeddings(config)
        self.encoder = InstructBlipVideoQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length];
            # make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for masked positions,
        # this creates a tensor which is 0.0 for positions to attend and -10000.0 for masked positions.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: torch.FloatTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        query_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
        r"""
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        """
        if input_ids is None and query_embeds is None:
            raise ValueError("You have to specify query_embeds when input_ids is None")

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [
                    self.invert_attention_mask(mask) for mask in encoder_attention_mask
                ]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            query_length=query_length,
            **kwargs,
        )
        sequence_output = encoder_outputs.last_hidden_state
        pooled_output = sequence_output[:, 0, :]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )


@dataclass
@auto_docstring(
    custom_intro="""
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    """
)
class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    """

    loss: tuple[torch.FloatTensor] | None = None
    logits: tuple[torch.FloatTensor] | None = None
    vision_outputs: BaseModelOutputWithPooling | None = None
    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None
    language_model_outputs: CausalLMOutputWithPast | Seq2SeqLMOutput | None = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k]
            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@auto_docstring(
    custom_intro="""
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    """
)
class InstructBlipVideoModel(InstructBlipVideoPreTrainedModel):
    main_input_name = "pixel_values"
    _keep_in_fp32_modules = ["query_tokens"]

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVideoVisionModel._from_config(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipVideoQFormerModel._from_config(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModel.from_config(config.text_config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + InstructBlipVideo + `accelerate`
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @merge_with_config_defaults
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.FloatTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        interpolate_pos_encoding: bool = False,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the videos through the vision encoder,
        # flattening frames into the batch dimension
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the queries through the querying transformer
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: use the language model, conditioned on the query outputs and the prompt
        language_model_inputs = self.language_projection(query_output)

        # unbatch the embeddings back; each video gets `num_query_tokens * frames` placeholder positions
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


@dataclass
@auto_docstring
class BaseModelOutputWithVisionQformerOutputs(BaseModelOutputWithPooling):
    r"""
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    """

    vision_outputs: BaseModelOutputWithPooling | None = None
    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None


@auto_docstring(
    custom_intro="""
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """
)
class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin):
    config: InstructBlipVideoConfig
    main_input_name = "pixel_values"
    _keep_in_fp32_modules = ["query_tokens"]

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVideoVisionModel._from_config(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipVideoQFormerModel._from_config(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        if config.use_decoder_only_language_model:
            language_model = AutoModelForCausalLM.from_config(config.text_config)
        else:
            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)

        self.language_model = language_model

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self, modality=None):
        if modality is None:
            return self.language_model.get_encoder()
        return super().get_encoder(modality=modality)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + InstructBlipVideo + `accelerate`
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @merge_with_config_defaults
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.FloatTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        labels: torch.LongTensor | None = None,
        return_dict: bool | None = None,
        interpolate_pos_encoding: bool = False,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        video_features = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            **kwargs,
        )
        language_model_inputs = video_features.last_hidden_state
        qformer_outputs = video_features.qformer_outputs
        vision_outputs = video_features.vision_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
                use_cache=use_cache,
                **kwargs,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=qformer_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor | None = None,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        """
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]

        video_features = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        language_model_inputs = video_features.last_hidden_state

        if input_ids is None and inputs_embeds is None:
            # each video is expanded to `num_query_tokens` placeholders per frame (4 frames)
            video_tokens = [self.config.video_token_id] * self.config.num_query_tokens * 4
            start_tokens = video_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)
        return outputs

    @can_return_tuple
    @auto_docstring
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        interpolate_pos_encoding: bool | None = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithVisionQformerOutputs:
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.
        """
        # step 1: forward the videos through the vision encoder, flattening frames into the batch
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            **kwargs,
        )
        outputs = BaseModelOutputWithVisionQformerOutputs(
            last_hidden_state=vision_outputs.last_hidden_state,
            pooler_output=vision_outputs.pooler_output,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
            vision_outputs=vision_outputs,
            qformer_outputs=None,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the queries through the querying transformer
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
            **kwargs,
        )
        outputs.qformer_outputs = query_outputs

        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project the Q-Former output into the language model's input space
        language_model_inputs = self.language_projection(query_output)

        # unbatch the embeddings back; each video contributes `num_query_tokens * frames` positions
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)
        outputs.last_hidden_state = language_model_inputs
        return outputs


__all__ = [
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
    "InstructBlipVideoVisionModel",
]