o
    	۷iy                    @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0m1Z1 e%2e3Z4G dd de	j5Z6e#G dd deZ7	dIde	j5dej8dej8dej8deej8 de9de9fd d!Z:G d"d# d#e	j5Z;G d$d% d%e	j5Z<G d&d' d'eZ=G d(d) d)e	j5Z>G d*d+ d+e7Z?G d,d- d-e	j5Z@G d.d/ d/e	j5ZAG d0d1 d1e	j5ZBG d2d3 d3e	j5ZCG d4d5 d5e	j5ZDG d6d7 d7eZEG d8d9 d9e	j5ZFG d:d; d;e	j5ZGG d<d= d=e7ZHee#d>d?G d@dA dAe!ZIe#dBd?G dCdD dDe7ZJe#dEd?G dFdG dGe7eZKg dHZLdS )J    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)OutputRecordercheck_model_inputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipVideoConfigInstructBlipVideoQFormerConfigInstructBlipVideoVisionConfigc                       s\   e Zd Zdef fddZdejdededejfdd	Zddej	de
dejfddZ  ZS )!InstructBlipVideoVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
dd| j| _tjd| j| j| jd| _| j| j d | _| jd | _tt	
d| j| j| _d S )Nr#   r   )in_channelsout_channelskernel_sizestrider   )super__init__r(   hidden_size	embed_dim
image_size
patch_sizer   	Parametertorchrandnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingselfr(   	__class__ v/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.pyr.   8   s   
z*InstructBlipVideoVisionEmbeddings.__init__
embeddingsheightwidthreturnc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r#   Ng      ?r   r   r   bicubicF)sizemodealign_cornersdim)shaper;   r4   jit
is_tracingr2   r   reshapepermuter   
functionalinterpolateviewcat)r=   rB   rC   rD   r9   r:   class_pos_embedpatch_pos_embedrL   
new_height	new_widthsqrt_num_positionsr@   r@   rA   interpolate_pos_encodingJ   s(   



z:InstructBlipVideoVisionEmbeddings.interpolate_pos_encodingFpixel_valuesr[   c                 C   s   |j \}}}}| jjj}| |j|d}|ddd}| j|dd|}	t	j
|	|gdd}
|r<| |
||}n| j}|
|d d d |
dd d f | }
|
S )Ndtyper   r#   rF   rK   )rM   r8   weightr^   toflatten	transposer6   expandr4   rU   r[   r;   rH   )r=   r\   r[   
batch_size_rC   rD   target_dtypepatch_embedsclass_embedsrB   r;   r@   r@   rA   forwardr   s   
*z)InstructBlipVideoVisionEmbeddings.forwardF)__name__
__module____qualname__r&   r.   r4   Tensorintr[   FloatTensorboolri   __classcell__r@   r@   r>   rA   r'   7   s    $(r'   c                   @   sB   e Zd ZU eed< dZdZdZdZdZ	dZ
dZg dZdd ZdS ) InstructBlipVideoPreTrainedModelr(   blipT)"InstructBlipVideoQFormerEmbeddingsInstructBlipVideoAttention*InstructBlipVideoQFormerMultiHeadAttention"InstructBlipVideoQFormerSelfOutputc                 C   s   | j j}t|tjtjfr%|jjjd|d |j	dur#|j	j
  dS dS t|tjr6|jjjd|d dS t|tjrK|j	j
  |jjd dS t|trftjj|jd|d tjj|jd|d dS t|ttfru|jj
  dS dS )zInitialize the weights        )meanstdN      ?)r(   initializer_range
isinstancer   Linearr7   r_   datanormal_biaszero_	Embedding	LayerNormfill_r'   inittrunc_normal_r;   r6   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelquery_tokens)r=   modulefactorr@   r@   rA   _init_weights   s"   

z.InstructBlipVideoPreTrainedModel._init_weightsN)rk   rl   rm   r$   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr   r@   r@   r@   rA   rs      s   
 rs   ry   r   querykeyvalueattention_maskscalingdropoutc           
      K   sp   t ||dd| }|d ur|| }tjj|dd}tjj||| jd}t ||}	|	dd }	|	|fS )NrF   rK   )ptrainingr#   r   )	r4   matmulrb   r   rR   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr@   r@   rA   eager_attention_forward   s   
r   c                       sv   e Zd ZdZ fddZdejdedefddZ		dd
ejde	ej de
eje	ej e	e
ej  f fddZ  ZS )rv   z=Multi-headed attention from 'Attention Is All You Need' paperc                    s  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	d| _
|j| _tj| jd| j dd| _|jr]tt| j}tt| j}nd }d }|d uryt|tj|dd|f}t|| j_t| j| j| _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr   )r   )requires_grad)r-   r.   r(   r/   r0   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   r   qkvqkv_biasr3   r4   zerosrU   
zeros_liker   
projection)r=   r(   q_biasv_biasr   r>   r@   rA   r.      s0   

z#InstructBlipVideoAttention.__init__tensorseq_lenbszc                 C   s    | ||| j| jdd S )Nr#   r   )rT   r   r   rb   r   )r=   r   r   r   r@   r@   rA   _shape   s    z!InstructBlipVideoAttention._shapeNhidden_states	head_maskrE   c                 K   s   |  \}}}| |}|||d| j|| j ddddd}|d |d |d }}	}
t}| jjdkr<t| jj }|| ||	|
fd| j	sHdn| j
| jd	|\}}|||d
 }| |}||fS )z#Input shape: Batch x Time x Channelr   r   r   r#      eagerNry   )r   r   r   rF   )rH   r   rP   r   rQ   r   r(   _attn_implementationr   r   r   r   r   r   )r=   r   r   r   r   tgt_lenr0   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   r@   r@   rA   ri      s0   



z"InstructBlipVideoAttention.forwardN)rk   rl   rm   __doc__r.   r4   rn   ro   r   r   tupleri   rr   r@   r@   r>   rA   rv      s    rv   c                       2   e Zd Z fddZdejdejfddZ  ZS )InstructBlipVideoMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )r-   r.   r(   r	   
hidden_actactivation_fnr   r   r/   intermediate_sizefc1fc2r<   r>   r@   rA   r.     s
   
zInstructBlipVideoMLP.__init__r   rE   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   r=   r   r@   r@   rA   ri     s   


zInstructBlipVideoMLP.forwardrk   rl   rm   r.   r4   rn   ri   rr   r@   r@   r>   rA   r     s    r   c                	       sJ   e Zd Zdef fddZedejdejdee	 dej
fdd	Z  ZS )
InstructBlipVideoEncoderLayerr(   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)r-   r.   r/   r0   rv   	self_attnr   r   layer_norm_epslayer_norm1r   mlplayer_norm2r<   r>   r@   rA   r.     s   


z&InstructBlipVideoEncoderLayer.__init__r   r   r   rE   c                 K   sT   |}|  |}| jd||d|\}}|| }|}| |}| |}|| }|S )N)r   r   r@   )r   r   r   r   )r=   r   r   r   residualre   r@   r@   rA   ri   $  s   



z%InstructBlipVideoEncoderLayer.forward)rk   rl   rm   r$   r.   r   r4   rn   r   r   rp   ri   rr   r@   r@   r>   rA   r     s    r   c                
       sV   e Zd ZdZdef fddZe	ddeej	 de
e deeef fd	d
Z  ZS )InstructBlipVideoEncodera"  
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipVideoEncoderLayer`].

    Args:
        config (`InstructBlipVideoConfig`):
            The corresponding vision configuration for the `InstructBlipVideoEncoder`.
    r(   c                    :   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r@   )r   ).0re   r(   r@   rA   
<listcomp>J  s    z5InstructBlipVideoEncoder.__init__.<locals>.<listcomp>F)	r-   r.   r(   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr<   r>   r   rA   r.   G  s   
 
z!InstructBlipVideoEncoder.__init__Nr   r   rE   c                 K   s.   |}| j D ]}||fd|i|}qt|dS )Nr   last_hidden_state)r   r   )r=   inputs_embedsr   r   r   encoder_layerr@   r@   rA   ri   M  s   

z InstructBlipVideoEncoder.forwardr   )rk   rl   rm   r   r$   r.   r   r   r4   rn   r   r   r   r   r   ri   rr   r@   r@   r>   rA   r   =  s    	
r   c                       s   e Zd ZU dZeed< eedZdef fddZ	e
dde		ddeej d	ed
ee deeef fddZdd Z  ZS )InstructBlipVideoVisionModelr\   r(   )r   
attentionsc                    sJ   t  | || _|j}t|| _t|| _tj	||j
d| _|   d S r   )r-   r.   r(   r/   r'   rB   r   encoderr   r   r   post_layernorm	post_init)r=   r(   r0   r>   r@   rA   r.   g  s   

z%InstructBlipVideoVisionModel.__init__F)tie_last_hidden_statesNr[   r   rE   c                 K   sn   |d u rt d| j||d}| jdd|i|}|j}| |}|d d dd d f }| |}t||dS )Nz You have to specify pixel_values)r[   r   r   r   pooler_outputr@   )r   rB   r   r   r   r   )r=   r\   r[   r   r   encoder_outputsr   pooled_outputr@   r@   rA   ri   r  s    

z$InstructBlipVideoVisionModel.forwardc                 C      | j S r   )rB   r=   r@   r@   rA   get_input_embeddings     z1InstructBlipVideoVisionModel.get_input_embeddings)NF)rk   rl   rm   main_input_namer&   r   r   rv   _can_record_outputsr.   r   r   r   r4   rp   rq   r   r   r   r   r   ri   r   rr   r@   r@   r>   rA   r   _  s*   
 
r   c                       sb   e Zd Zd fdd	Zdd Zdd Zdd	 Zd
d Zdd Z				dde	e
 fddZ  ZS )rw   Fc                    s"  t    || _|j|j dkrt|dstd|j|jf |j| _t|j|j | _| j| j | _	t
|j| j	| _|rQt
|j| j	| _t
|j| j	| _nt
|j| j	| _t
|j| j	| _t
|j| _t|dd| _| jdks{| jdkr|j| _t
d|j d	 | j| _d
| _d S )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)position_embedding_typeabsoluterelative_keyrelative_key_queryr   r#   F)r-   r.   r(   r/   r   hasattrr   ro   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   Dropoutattention_probs_dropout_probr   getattrr   max_position_embeddingsr   distance_embeddingsave_attentionr=   r(   is_cross_attentionr>   r@   rA   r.     s.   


z3InstructBlipVideoQFormerMultiHeadAttention.__init__c                 C   
   || _ d S r   attn_gradients)r=   r  r@   r@   rA   save_attn_gradients     
z>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradientsc                 C   r   r   r  r   r@   r@   rA   get_attn_gradients  r   z=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradientsc                 C   r  r   attention_map)r=   r  r@   r@   rA   save_attention_map  r  z=InstructBlipVideoQFormerMultiHeadAttention.save_attention_mapc                 C   r   r   r  r   r@   r@   rA   get_attention_map  r   z<InstructBlipVideoQFormerMultiHeadAttention.get_attention_mapc                 C   s6   |  d d | j| jf }|j| }|ddddS )NrF   r   r   r#   r   )rH   r   r   rT   rQ   )r=   xnew_x_shaper@   r@   rA   transpose_for_scores  s   
z?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scoresNr   c                 K   s  |d u}|r|  | |}|  | |}	|}n|  | |}|  | |}	| |}
|  |
}t||dd}| jdksG| jdkr| d }tj	|tj
|jddd}tj	|tj
|jddd}|| }| || j d }|j|jd}| jdkrtd||}|| }n| jdkrtd||}td	||}|| | }|t| j }|j}|d ur|| }tjdd
||}|r| jr| | || j | |}|d ur|| }t||	}|dddd }| d d | jf }|j| }||fS )NrF   r   r   r   r#   r^   devicer]   zbhld,lrd->bhlrzbhrd,lrd->bhlrrK   r   r   r   )r  r   r   r   r4   r   rb   r   rH   arangelongr  rT   r  r  r`   r^   einsummathsqrtr   r   Softmaxr  r  register_hookr  r   rQ   r   r  )r=   r   r   r   encoder_hidden_statesencoder_attention_maskr   r
  	key_layervalue_layermixed_query_layerquery_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shaper@   r@   rA   ri     sP   








z2InstructBlipVideoQFormerMultiHeadAttention.forwardrj   NNNN)rk   rl   rm   r.   r  r  r  r  r  r   r   ri   rr   r@   r@   r>   rA   rw     s    rw   c                       8   e Zd Z fddZdejdejdejfddZ  ZS )rx   c                    sB   t    t|j|j| _tj|j|jd| _t|j	| _
d S r   )r-   r.   r   r   r/   denser   r   r  hidden_dropout_probr   r<   r>   r@   rA   r.        
z+InstructBlipVideoQFormerSelfOutput.__init__r   input_tensorrE   c                 C   &   |  |}| |}| || }|S r   r7  r   r   r=   r   r:  r@   r@   rA   ri        

z*InstructBlipVideoQFormerSelfOutput.forwardr   r@   r@   r>   rA   rx         $rx   c                       sv   e Zd Zd fdd	Zdd Z				ddejdeej d	eej d
eej deej de	e
 dejfddZ  ZS )!InstructBlipVideoQFormerAttentionFc                    s,   t    t||| _t|| _t | _d S r   )r-   r.   rw   	attentionrx   outputsetpruned_headsr	  r>   r@   rA   r.     s   

z*InstructBlipVideoQFormerAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r#   rK   )lenr   rA  r   r   rD  r   r   r   r   rB  r7  r  union)r=   headsindexr@   r@   rA   prune_heads"  s   z-InstructBlipVideoQFormerAttention.prune_headsNr   r   r   r!  r"  r   rE   c           
      K   s0   | j d|||||d|\}}| ||}	|	S )N)r   r   r   r!  r"  r@   )rA  rB  )
r=   r   r   r   r!  r"  r   r   re   attention_outputr@   r@   rA   ri   4  s   	
z)InstructBlipVideoQFormerAttention.forwardrj   r5  )rk   rl   rm   r.   rI  r4   rn   r   rp   r   r   ri   rr   r@   r@   r>   rA   r@    s,    r@  c                       r   )$InstructBlipVideoQFormerIntermediatec                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r-   r.   r   r   r/   r   r7  r~   r   strr	   intermediate_act_fnr<   r>   r@   rA   r.   J  s
   
z-InstructBlipVideoQFormerIntermediate.__init__r   rE   c                 C   s   |  |}| |}|S r   )r7  rM  r   r@   r@   rA   ri   R  s   

z,InstructBlipVideoQFormerIntermediate.forwardr   r@   r@   r>   rA   rK  I  s    rK  c                       r6  )InstructBlipVideoQFormerOutputc                    sB   t    t|j|j| _tj|j|jd| _t	|j
| _d S r   )r-   r.   r   r   r   r/   r7  r   r   r  r8  r   r<   r>   r@   rA   r.   Y  r9  z'InstructBlipVideoQFormerOutput.__init__r   r:  rE   c                 C   r;  r   r<  r=  r@   r@   rA   ri   _  r>  z&InstructBlipVideoQFormerOutput.forwardr   r@   r@   r>   rA   rN  X  r?  rN  c                       sJ   e Zd Z fddZ					ddee fddZdd	 Zd
d Z  Z	S )InstructBlipVideoQFormerLayerc                    s~   t    |j| _d| _t|| _|| _||j dkr&t|dd| _d| _	nd| _	t
|| _t|| _t
|| _t|| _d S )Nr#   r   T)r
  F)r-   r.   chunk_size_feed_forwardseq_len_dimr@  rA  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionrK  intermediaterN  rB  intermediate_queryoutput_query)r=   r(   rR  r>   r@   rA   r.   g  s   




z&InstructBlipVideoQFormerLayer.__init__Nr   r   c              	   K   s   | j |f||d|}|dkrk|d d d |d d f }	| jr6|d u r(td| j|	f||||d|}	t| j| j| j|	}
|jd |krit| j	| j| j|d d |d d d f 
|
j}tj|
|gdd}
|
S t| j	| j| j|}
|
S )N)r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r   r!  r"  r#   rK   )rA  rU  r   rT  r   feed_forward_chunk_queryrP  rQ  rM   feed_forward_chunkr`   r  r4   rU   )r=   r   r   r   r!  r"  query_lengthr   rJ  query_attention_outputlayer_outputlayer_output_textr@   r@   rA   ri   {  s^   
	z%InstructBlipVideoQFormerLayer.forwardc                 C      |  |}| ||}|S r   )rV  rB  r=   rJ  intermediate_outputr]  r@   r@   rA   rZ       
z0InstructBlipVideoQFormerLayer.feed_forward_chunkc                 C   r_  r   )rW  rX  r`  r@   r@   rA   rY    rb  z6InstructBlipVideoQFormerLayer.feed_forward_chunk_queryNNNNr   )
rk   rl   rm   r.   r   r   ri   rZ  rY  rr   r@   r@   r>   rA   rO  f  s    
8rO  c                       s>   e Zd Z fddZe					ddee fddZ  ZS )	InstructBlipVideoQFormerEncoderc                    r   )Nc                    s   g | ]}t  |qS r@   )rO  )r   rR  r   r@   rA   r         z<InstructBlipVideoQFormerEncoder.__init__.<locals>.<listcomp>F)	r-   r.   r(   r   r   r   r   layerr   r<   r>   r   rA   r.     s   

z(InstructBlipVideoQFormerEncoder.__init__Nr   r   c                 K   sV   t | jjD ]}| j| }	|d ur|| nd }
|	|||
|f||d|}qt|dS )N)r"  r[  r   )r   r(   r   rf  r   )r=   r   r   r   r!  r"  r[  r   ilayer_modulelayer_head_maskr@   r@   rA   ri     s"   

z'InstructBlipVideoQFormerEncoder.forwardrc  )	rk   rl   rm   r.   r   r   r   ri   rr   r@   r@   r>   rA   rd    s    rd  c                       s2   e Zd ZdZ fddZ				dddZ  ZS )	ru   z;Construct the embeddings from word and position embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	tj
|j|jd| _t|j| _| jdt|jddd t|dd| _|| _d S )	N)padding_idxr   position_ids)r#   rF   F)
persistentr   r   )r-   r.   r   r   
vocab_sizer/   pad_token_idword_embeddingsr  position_embeddingsr   r   	layernormr  r8  r   register_bufferr4   r  rc   r  r   r(   r<   r>   r@   rA   r.     s   

z+InstructBlipVideoQFormerEmbeddings.__init__Nr   c                 C   s   |d ur|  d }nd}|d u r | jd d ||| f  }|d urI| |}| jdkr;| ||j}|| }|d urHtj	||fdd}n|}|| j
jj}| 
|}| |}|S )Nr#   r   r   rK   )rH   rk  clonero  r   rp  r`   r  r4   rU   rq  r_   r^   r   )r=   	input_idsrk  query_embedspast_key_values_lengthr(  rB   rp  r@   r@   rA   ri     s$   



z*InstructBlipVideoQFormerEmbeddings.forward)NNNr   )rk   rl   rm   r   r.   ri   rr   r@   r@   r>   rA   ru     s    ru   c                       s  e Zd ZdZdZdZdZdZee	e
dddge	e
dddgdZdef fd	d
Zdd Zdd Zdd Z	d"dejdee dejdedejf
ddZe e						d#dejdeej deej deej deej deej deej dee deeej e f fd d!Z!  Z"S )$InstructBlipVideoQFormerModelz
    Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    Fr#   z
.attention)rH  
layer_namez.crossattention)r   r   cross_attentionsr(   c                    s2   t  | || _t|| _t|| _|   d S r   )r-   r.   r(   ru   rB   rd  r   r   r<   r>   r@   rA   r.   -  s
   

z&InstructBlipVideoQFormerModel.__init__c                 C   s   | j jS r   rB   ro  r   r@   r@   rA   r   7  s   z2InstructBlipVideoQFormerModel.get_input_embeddingsc                 C   s   || j _d S r   rz  r=   r   r@   r@   rA   set_input_embeddings:  s   z2InstructBlipVideoQFormerModel.set_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   rf  rA  rI  )r=   heads_to_prunerf  rG  r@   r@   rA   _prune_heads=  s   z*InstructBlipVideoQFormerModel._prune_headsr   input_shaper  	has_queryrE   c                 C   s   |  dkr|dddddddf }n|  dkr(|ddddddf }ntd| d|j d|j| jd}d| d	 }|S )
a>  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        r   Nr   z!Wrong shape for input_ids (shape z) or attention_mask (shape )r]   r|   g     )rL   r   rM   r`   r^   )r=   r   r  r  r  extended_attention_maskr@   r@   rA   get_extended_attention_maskE  s   	z9InstructBlipVideoQFormerModel.get_extended_attention_maskNrt  rk  ru  r   r!  r"  r   c                    sf  |du r|du rt d|dur|jd nd}	 j|||d}
|
 dd }|\}}|
j}|du r;tj||f|d} |||}|durt|t	rU|d  \}}}n| \}}}||f}t|t	ro fdd	|D }n|du rtj||d} 
|}n 
|}nd} | jj} j|
f|||||	d
|}|j}|dddddf }t||dS )a$  
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        Nz7You have to specify query_embeds when input_ids is Noner#   r   )rt  rk  ru  rF   )r  c                    s   g | ]}  |qS r@   )invert_attention_mask)r   maskr   r@   rA   r     re  z9InstructBlipVideoQFormerModel.forward.<locals>.<listcomp>)r   r   r!  r"  r[  r   )r   rM   rB   rH   r  r4   onesr  r~   listr  get_head_maskr(   r   r   r   r   )r=   rt  r   rk  ru  r   r!  r"  r   r[  embedding_outputr  rd   r(  r  r  encoder_batch_sizeencoder_sequence_lengthre   encoder_hidden_shapeencoder_extended_attention_maskr   sequence_outputr   r@   r   rA   ri   p  sX   

	z%InstructBlipVideoQFormerModel.forwardrj   )NNNNNN)#rk   rl   rm   r   r   r   r   r   rO  r   rw   r   r%   r.   r   r|  r  r4   rn   r   ro   r  rq   r  r   r   
LongTensorr   rp   r   r   r   r   ri   rr   r@   r@   r>   rA   rw    sn    


+	
rw  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                   @   s   e Zd ZU dZdZeeej  e	d< dZ
eeej  e	d< dZeej e	d< dZeeej  e	d< dZeeej  e	d< dee fd	d
ZdS )4InstructBlipVideoForConditionalGenerationModelOutputa  
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsrE   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS )r  r  r  N)r  to_tuple)r   kr   r@   rA   	<genexpr>  s    
zPInstructBlipVideoForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>)r   keysr   r@   r   rA   r    s   z=InstructBlipVideoForConditionalGenerationModelOutput.to_tuple)rk   rl   rm   r   r  r   r   r4   rp   r   r  r  r  r  r   r  r@   r@   r@   rA   r    s   
 r  z`
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    c                #       s  e Zd ZdZdgZdef fddZdd Zdd	 Zd
d Z	dd Z
dejdejfddZee											d"dejdejdeej deej deej deej deej deej dee dee dee dedee dee deeef fd d!Z  ZS )#r   r\   r   r(   c                    s   t  | t|j| _ttd|j	|j
j| _t|j
| _t|j
j|jj| _t|j| _| jjd ur@| j| jj | jjd urN| j| jj |   d S Nr#   )r-   r.   r   vision_configvision_modelr   r3   r4   r   num_query_tokensqformer_configr/   r   rw  qformerr   text_configlanguage_projectionr    from_configlanguage_modelr   extend_keep_in_fp32_modulesr   r<   r>   r@   rA   r.     s   zInstructBlipVideoModel.__init__c                 C   
   | j  S r   r  r   r   r@   r@   rA   r     r  z+InstructBlipVideoModel.get_input_embeddingsc                 C      | j | d S r   r  r|  r{  r@   r@   rA   r|       z+InstructBlipVideoModel.set_input_embeddingsc                 C   ,   | j js| jj| jj_| jj| jj_d S d S r   r(   use_decoder_only_language_modelr  sharedr   embed_tokensdecoderr   r@   r@   rA   _tie_weights	     z#InstructBlipVideoModel._tie_weightsc                 C   P   | j }t|dkrd|vrtj dkrtd t| jdr&d| jj	_
dS dS z
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        r#   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maprE  r4   cudadevice_countloggerwarningr   r  r  io_same_devicer=   r  r@   r@   rA   _preprocess_accelerate     "z-InstructBlipVideoModel._preprocess_acceleratert  r   c                 C   `   |du r||   tj| jjtj|jdk}|d}n|| jjk}|d	|
|j}|S zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        Nr  rF   )r   r4   r   r(   image_token_idr  r  all	unsqueeze	expand_asr`   r=   rt  r   special_image_maskr@   r@   rA   get_placeholder_mask"     z+InstructBlipVideoModel.get_placeholder_maskNFqformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_statesreturn_dictr[   	use_cacher   rE   c                 K   s:  |dur|n| j j}|j\}}}}}||| |||}| j||	|
||d}|d }tj| dd tj|j	d}| j
|jd dd}tj| dd tj|j	d}|du r^t|}|j|dd}|j|dd}tj||gdd}| j||||||	|
|d}|d ddd|dddf }| |}||| j j| d}|du r| j |}|| j jk}|du rt|}n||  tj| j jtj|j	dk}|d}|d||j	}||j	|j}|||}| j jr| jd|||	|
||d	|}n| jd|||||	|
||d
|}t|||dS )a  
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        N)r\   r  r  r  r[   r   rF   r  rK   r#   )rt  r   ru  r!  r"  r  r  r  r   r   r  r  r  r  )r   r   r  r  r  r  r  r  r  r@   )r(   use_return_dictrM   rP   r  r4   r  rH   r  r  r   rc   	ones_likerepeat_interleaverU   r  r  r  r  r   video_token_idr   r  r  r  r`   r^   masked_scatterr  r  )r=   r\   r  r  rt  r   r  r  r   r  r  r  r[   r  r   rd   frameschannelrC   rD   r  image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr  outputsr@   r@   rA   ri   1  s   )  
$





	zInstructBlipVideoModel.forward)NNNNNNNNNFN)rk   rl   rm   r   r  r$   r.   r   r|  r  r  r4   r  rp   r  r   r   r   rn   rq   r   r   r   r   r  ri   rr   r@   r@   r>   rA   r     sj    	

r   a  
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c                %       s   e Zd ZU eed< dZdZdgZdef fddZdd Z	d	d
 Z
dd ZdejfddZdd Zdd Zdd Zdd Z			d2dejdejdeej dee dee f
ddZd ejd!ejfd"d#Zee												d3dejdejdeej d eej d$eej d%eej d&eej d!eej d'ee d(ee d)eej dee ded*ee d+ee deee f f d,d-Z!e" 						d4dejdeej deej d eej d$eej d!eej dedejfd.d/Z#			d2dejdejdeej dee dee f
d0d1Z$  Z%S )5r   r(   r\   Tr   c                    s   t  | t|j| _tt	d|j
|jj| _t|j| _t|jj|jj| _|jr7t|j}nt|j}|jd urI| j|j |jd urU| j|j || _|   d S r  )r-   r.   r   _from_configr  r  r   r3   r4   r   r  r  r/   r   rw  r  r   r  r  r  r!   r  r"   r   r  r  r  r   )r=   r(   r  r>   r@   rA   r.     s   

z2InstructBlipVideoForConditionalGeneration.__init__c                 C   r  r   r  r   r@   r@   rA   r     r  z>InstructBlipVideoForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  r{  r@   r@   rA   r|    r  z>InstructBlipVideoForConditionalGeneration.set_input_embeddingsc                 C   r  r   )r  set_output_embeddings)r=   new_embeddingsr@   r@   rA   r    r  z?InstructBlipVideoForConditionalGeneration.set_output_embeddingsrE   c                 C   r  r   )r  get_output_embeddingsr   r@   r@   rA   r    r  z?InstructBlipVideoForConditionalGeneration.get_output_embeddingsc                 C   r  r   )r  get_encoderr   r@   r@   rA   r    r  z5InstructBlipVideoForConditionalGeneration.get_encoderc                 C   r  r   )r  get_decoderr   r@   r@   rA   r    r  z5InstructBlipVideoForConditionalGeneration.get_decoderc                 C   r  r   r  r   r@   r@   rA   r    r  z6InstructBlipVideoForConditionalGeneration._tie_weightsc                 C   r  r  r  r  r@   r@   rA   r    r  z@InstructBlipVideoForConditionalGeneration._preprocess_accelerateNFr  r  r[   r  c                 C   s   dS )$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        Nr@   )r=   r\   r  r  r[   r  r@   r@   rA   get_image_features
  s   z<InstructBlipVideoForConditionalGeneration.get_image_featuresrt  r   c                 C   r  r  )r   r4   r   r(   r  r  r  r  r  r  r`   r  r@   r@   rA   r    r  z>InstructBlipVideoForConditionalGeneration.get_placeholder_maskr   r  r  r  r  labelsr  r   c                 K   s\  |dur|n| j j}| j||||dd\}}}|s| n|}|s%| n|}|du r1|  |}|du r:t|}||j|j	}| j
||d}|||}| j jr| jd|||	|
||d|}|rg|jn|d }d}|dur| jd||| j jjd|}n$| jd|||||	|
|||d	|}|r|jn|d }|r|jn|d	 }t|||||d
S )a  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTr  r  r[   r  r   r  r   )r  r  rm  )	r   r   r  r  r  r  r  r  r  r#   )r  r  r  r  r  r@   )r(   r  get_video_featuresr  r   r4   r  r`   r  r^   r  r  r  r  r  loss_functionr  rm  r  r  )r=   r\   r  r  rt  r   r  r  r   r  r  r  r  r[   r  r   r  r  r  r  r  r  r  r@   r@   rA   ri   *  sv   S
	
z1InstructBlipVideoForConditionalGeneration.forwardc                 K   s  t | dr	|   |jd }	| j||||dd\}
}}|du rM|du rG| jjg| jj d }|| jjjg }t	j
|gt	j|jd}||	d}|  |}|du rVt	|}|
|j|j}
| j||d	}|||
}||d
}| jjjsy||d< | jjdi ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        r  r   Tr  Nr   r  r#   r  )r   r   rt  r@   )r   r  rM   r  r(   video_token_indexr  r  bos_token_idr4   r   r  r  repeatr   r  r`   r^   r  r  r  is_encoder_decodergenerate)r=   r\   r  r  rt  r   r   r[   generate_kwargsrd   r  r  r  video_tokensstart_tokensr  inputsr  r@   r@   rA   r    s6   
"



z2InstructBlipVideoForConditionalGeneration.generatec                 C   s>  |j \}}}}	}
||| ||	|
}| j||dd}|d }tj| dd tj|jd}| j	|j d dd}tj| dd tj|jd}|du rRt
|}|j|dd}|j|dd}tj||gdd}| j|||||dd	}|d ddd|dddf }| |}||| jj| d}|r|||fS |S )
r  T)r\   r[   r  r   NrF   r  rK   r#   )rt  r   ru  r!  r"  r  )rM   rP   r  r4   r  rH   r  r  r   rc   r  r  rU   r  r  r(   r  )r=   r\   r  r  r[   r  rd   r  r  rC   rD   r  r  r  r   r  r  r  r  r@   r@   rA   r    s<     
$

z<InstructBlipVideoForConditionalGeneration.get_video_features)NFF)NNNNNNNNNNFN)NNNNNF)&rk   rl   rm   r$   r   r   r   r  r.   r   r|  r  r   Moduler  r  r  r  r  r4   rp   r  r   rq   r  r  r   r   r   r   r   r   r  ri   no_gradr  r  rr   r@   r@   r>   rA   r     s   
 

	

 
Ir   )r   rs   rw  r   r   )ry   )Mr  dataclassesr   typingr   r   r   r   r4   r   activationsr	   
generationr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor    r!   r"   configuration_instructblipvideor$   r%   r&   
get_loggerrk   r  r  r'   rs   rn   floatr   rv   r   r   r   r   rw   rx   r@  rK  rN  rO  rd  ru   rw  r  r   r   __all__r@   r@   r@   rA   <module>   s    
J.
L""4z.X'3 - J	   