o
    eix                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlZ	ddl
Z
ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( e)e*Z+eeddG dd deZ,eeddG dd deZ-dd Z.G dd dej/Z0G dd  d ej/Z1		!dGd"ej/d#e
j2d$e
j2d%e
j2d&e
j2dB d'e3dB d(e3d)ee fd*d+Z4G d,d- d-ej/Z5G d.d/ d/ej/Z6G d0d1 d1ej/Z7G d2d3 d3ej/Z8G d4d5 d5ej/Z9G d6d7 d7eZ:G d8d9 d9ej/Z;eG d:d; d;eZ<eG d<d= d=e<Z=G d>d? d?ej/Z>ed@dG dAdB dBe<Z?edCdG dDdE dEe<Z@g dFZAdS )Hz,PyTorch VideoMAE (masked autoencoder) model.    N)Callable)deepcopy)	dataclass)nn)MSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)can_return_tuplemerge_with_config_defaults)capture_outputs   )VideoMAEConfigz[
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sP   e Zd ZU dZdZejdB ed< dZe	ej dB ed< dZ
e	ej dB ed< dS )VideoMAEDecoderOutputz
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    Nlogitshidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler    r'   r'   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/videomae/modeling_videomae.pyr   )   s
   
 r   zb
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
    c                   @   sb   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ej dB ed< dZe
ej dB ed< dS )VideoMAEForPreTrainingOutputz
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    Nlossr   r   r   )r   r    r!   r"   r*   r#   r$   r%   r   r   r&   r   r'   r'   r'   r(   r)   :   s   
 r)   c                    s    fddt fddt| D }t |dddddf |dddddf< t |dddddf |dddddf< t|dS )	z Sinusoid position encoding tablec                    s    fddt D S )Nc              	      s(   g | ]}t d d|d     qS )i'     )nppower).0hid_j)d_hidpositionr'   r(   
<listcomp>U   s   ( zOget_sinusoid_encoding_table.<locals>.get_position_angle_vec.<locals>.<listcomp>)ranger1   )r0   r4   r(   get_position_angle_vecT   s   z;get_sinusoid_encoding_table.<locals>.get_position_angle_vecc                    s   g | ]} |qS r'   r'   )r.   pos_i)r5   r'   r(   r2   W       z/get_sinusoid_encoding_table.<locals>.<listcomp>Nr   r+   r   )r,   arrayr3   sincosr#   r$   	unsqueeze)
n_positionr0   sinusoid_tabler'   )r0   r5   r(   get_sinusoid_encoding_tableP   s
   ..r>   c                       (   e Zd ZdZ fddZdd Z  ZS )VideoMAEEmbeddingsz7
    Construct the patch and position embeddings.

    c                    s8   t    t|| _| jj| _t| j|j| _|| _d S N)	super__init__VideoMAEPatchEmbeddingspatch_embeddingsnum_patchesr>   hidden_sizeposition_embeddingsconfigselfrI   	__class__r'   r(   rC   d   s
   



zVideoMAEEmbeddings.__init__c                 C   sZ   |  |}|| j |j|jdd }|d ur+|j\}}}||  }||d|}|S )NTdevicecopy)rE   rH   detachtype_astorO   shapereshape)rK   pixel_valuesbool_masked_pos
embeddings
batch_size_num_channelsr'   r'   r(   forwardm   s   

zVideoMAEEmbeddings.forwardr   r    r!   r"   rC   r]   __classcell__r'   r'   rL   r(   r@   ^   s    	r@   c                       r?   )rD   aw  
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    c           	         s   t    |j}|j}|j}|j}|j}|j}t|t	j
jr |n||f}t|t	j
jr-|n||f}|| _|| _t|| _|d |d  |d |d   || j  }|| _|| _tj||| j|d |d f| j|d |d fd| _d S )Nr   r   )in_channelsout_channelskernel_sizestride)rB   rC   
image_size
patch_sizer\   rG   
num_framestubelet_size
isinstancecollectionsabcIterableintrF   r   Conv3d
projection)	rK   rI   rd   re   r\   rG   rf   rg   rF   rL   r'   r(   rC      s,   

(z VideoMAEPatchEmbeddings.__init__c              
   C   s   |j \}}}}}|| jkrtd|| jd ks|| jd kr6td| d| d| jd  d| jd  d	|dddd	d
}| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r+   r      )rU   r\   
ValueErrorrd   permutern   flatten	transpose)rK   rW   rZ   rf   r\   heightwidthrY   r'   r'   r(   r]      s   
(zVideoMAEPatchEmbeddings.forwardr^   r'   r'   rL   r(   rD      s    	rD           modulequerykeyvalueattention_maskscalingdropoutkwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )NrQ         r+   r   dim)ptrainingr   )
sizer#   matmulrt   r   
functionalsoftmaxr~   r   
contiguous)
rx   ry   rz   r{   r|   r}   r~   r   attn_weightsattn_outputr'   r'   r(   eager_attention_forward   s   
r   c                       sL   e Zd Zdeddf fddZd	dejdB deejejf fddZ  Z	S )
VideoMAESelfAttentionrI   returnNc                    s
  t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	dd| _tj|j| j	dd| _tj|j| j	dd| _|jr}tt| j	| _tt| j	| _d S d | _d | _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r   Fbias)rB   rC   rG   num_attention_headshasattrrq   rI   rl   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr}   	is_causalr   Linearry   rz   r{   qkv_bias	Parameterr#   zerosq_biasv_biasrJ   rL   r'   r(   rC      s,   


zVideoMAESelfAttention.__init__r   c              
   C   s*  |j \}}}| jd urtj| jddnd }tjj|| jj	|d}tjj|| j
j	| jd}tjj|| jj	| jd}||d| j| jdd}	||d| j| jdd}
||d| j| jdd}t| jjt}|| ||	|
d | j| j| jsydn| jd\}}| d d	 | jf }||}||fS )
NF)requires_grad)inputweightr   rQ   r   r+   rw   )r   r}   r~   )rU   r   r#   
zeros_liker   r   r   linearrz   r   r{   ry   viewr   r   rt   r   get_interfacerI   _attn_implementationr   r   r}   r   r   r   r   rV   )rK   r   rZ   
seq_lengthr[   k_biaskeysvaluesqueries	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper'   r'   r(   r]      s0   

zVideoMAESelfAttention.forwardrA   )
r   r    r!   r   rC   r#   Tensorr&   r]   r_   r'   r'   rL   r(   r      s    .r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
VideoMAESelfOutputz
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rI   c                    s.   t    t|j|j| _t|j| _d S rA   )	rB   rC   r   r   rG   denseDropouthidden_dropout_probr~   rJ   rL   r'   r(   rC        
zVideoMAESelfOutput.__init__r   input_tensorr   c                 C      |  |}| |}|S rA   r   r~   rK   r   r   r'   r'   r(   r]        

zVideoMAESelfOutput.forward
r   r    r!   r"   r   rC   r#   r   r]   r_   r'   r'   rL   r(   r     s    $r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )VideoMAEAttentionrI   c                    s"   t    t|| _t|| _d S rA   )rB   rC   r   	attentionr   outputrJ   rL   r'   r(   rC   !  s   

zVideoMAEAttention.__init__r   r   c                 C   s   |  |\}}| ||}|S rA   )r   r   )rK   r   self_attn_outputr[   r   r'   r'   r(   r]   &  s   zVideoMAEAttention.forward	r   r    r!   r   rC   r#   r   r]   r_   r'   r'   rL   r(   r      s    r   c                       r   )VideoMAEIntermediaterI   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rA   )rB   rC   r   r   rG   intermediate_sizer   rh   
hidden_actstrr   intermediate_act_fnrJ   rL   r'   r(   rC   .  s
   
zVideoMAEIntermediate.__init__r   r   c                 C   r   rA   )r   r   )rK   r   r'   r'   r(   r]   6  r   zVideoMAEIntermediate.forwardr   r'   r'   rL   r(   r   -  s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	VideoMAEOutputrI   c                    s.   t    t|j|j| _t|j| _	d S rA   )
rB   rC   r   r   r   rG   r   r   r   r~   rJ   rL   r'   r(   rC   >  r   zVideoMAEOutput.__init__r   r   r   c                 C   s    |  |}| |}|| }|S rA   r   r   r'   r'   r(   r]   C  s   

zVideoMAEOutput.forwardr   r'   r'   rL   r(   r   =  s    $r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	VideoMAELayerz?This corresponds to the Block class in the timm implementation.rI   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)rB   rC   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormrG   layer_norm_epslayernorm_beforelayernorm_afterrJ   rL   r'   r(   rC   N  s   



zVideoMAELayer.__init__r   r   c                 C   s@   |  |}| |}|| }| |}| |}| ||}|S rA   )r   r   r   r   r   )rK   r   hidden_states_normattention_outputlayer_outputr'   r'   r(   r]   X  s   



zVideoMAELayer.forwardr   r'   r'   rL   r(   r   K  s    
r   c                       6   e Zd Zdef fddZdejdefddZ  Z	S )VideoMAEEncoderrI   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r'   r   r.   r[   rI   r'   r(   r2   n  r7   z,VideoMAEEncoder.__init__.<locals>.<listcomp>F)	rB   rC   rI   r   
ModuleListr3   num_hidden_layerslayergradient_checkpointingrJ   rL   r   r(   rC   k  s   
 
zVideoMAEEncoder.__init__r   r   c                 C   s&   t | jD ]\}}||}qt|dS )Nlast_hidden_state)	enumerater   r
   )rK   r   ilayer_moduler'   r'   r(   r]   q  s   

zVideoMAEEncoder.forward)
r   r    r!   r   rC   r#   r   r
   r]   r_   r'   r'   rL   r(   r   j  s    r   c                   @   sH   e Zd ZU eed< dZdZdZdZddgZ	dZ
dZdZdZeedZd	S )
VideoMAEPreTrainedModelrI   videomaerW   videoTr@   r   )r   r   N)r   r    r!   r   r%   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr'   r'   r'   r(   r   x  s   
 
r   c                       s`   e Zd Z fddZdd Zeedde	ddej	d	ej
dB d
ee defddZ  ZS )VideoMAEModelc                    sT   t  | || _t|| _t|| _|jrd | _n
t	j
|j|jd| _|   d S )Nr   )rB   rC   rI   r@   rY   r   encoderuse_mean_pooling	layernormr   r   rG   r   	post_initrJ   rL   r'   r(   rC     s   

zVideoMAEModel.__init__c                 C   s   | j jS rA   )rY   rE   )rK   r'   r'   r(   get_input_embeddings  s   z"VideoMAEModel.get_input_embeddingsF)tie_last_hidden_statesNrW   rX   r   r   c                 K   s:   |  ||}| |}|j}| jdur| |}t|dS )aB  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import VideoMAEVideoProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> # replace this with your own video file
        >>> video_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )

        >>> video_processor = VideoMAEVideoProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = video_processor(video_path, return_tensors="pt")

        >>> # forward pass
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```Nr   )rY   r   r   r   r
   )rK   rW   rX   r   embedding_outputencoder_outputssequence_outputr'   r'   r(   r]     s   *



zVideoMAEModel.forwardrA   )r   r    r!   rC   r   r   r   r   r#   r$   
BoolTensorr   r   r
   r]   r_   r'   r'   rL   r(   r     s     r   c                       r   )VideoMAEDecoderrI   c                    s   t    |j|j |jd  }t| |j _|j _	|j
 _|j _t fddt|jD | _t|j| _|dkrFt|j|nt | _d| _ | _d S )Nr+   c                    r   r'   r   r   decoder_configr'   r(   r2     r7   z,VideoMAEDecoder.__init__.<locals>.<listcomp>r   F)rB   rC   r\   rg   re   r   decoder_hidden_sizerG   decoder_num_hidden_layersr   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r   r3   decoder_layersr   normr   Identityheadr   rI   )rK   rI   decoder_num_labelsrL   r  r(   rC     s   

zVideoMAEDecoder.__init__r   return_token_numc                 C   sH   | j D ]}||}q|d d | d f }| |}| |}t|dS )N)r   )r  r	  r  r   )rK   r   r  r   r   r'   r'   r(   r]     s   




zVideoMAEDecoder.forward)
r   r    r!   r   rC   r#   r   rl   r]   r_   r'   r'   rL   r(   r    s    r  zb
    The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.
    c                
       sF   e Zd Z fddZeedejdejde	e
 defddZ  ZS )	VideoMAEForPreTrainingc                    st   t  | || _t|| _tj|j|jdd| _	t
tdd|j| _t| jjj|j| _t|| _|   d S )NFr   r   )rB   rC   rI   r   r   r   r   rG   r  encoder_to_decoderr   r#   r   
mask_tokenr>   rY   rF   rH   r  decoderr   rJ   rL   r'   r(   rC     s   

zVideoMAEForPreTraining.__init__rW   rX   r   r   c                 K   s$  | j |fd|i|}|j}| |}|j\}}}|du r!td| j|dd|}	|	 j	|j
dd}	|	|  |d|}
|	| |d|}tj||
 | j| gdd}| ||jd }|j}d}t  | jjd	krt|}n2|j
}|j}ttj	||d
ddddddf }ttj	||d
ddddddf }|| | }|j\}}}}}| jj| jj}}| jjr|||| |||| ||| |}|dddddddd	 }|||| | | | | || | |}||jddd |jdddd  d  }|||| | | | | || | | }nB| jjd	kr*td|||| |||| ||| |}|dddddddd	 }|||| | | | | || | | }|j\}}}|| |d|}W d   n	1 s{w   Y  t! }|||}t"|||j#|j$dS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```rX   Nz!One must provided a boolean mask rQ   TrN   r   r   r   )rO   dtyper   rp      r+         r   )r   keepdim)r   unbiasedr  gư>zQCan't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False.r*   r   r   r   )%r   r   r  rU   rq   rH   expandrS   rR   rT   rO   rV   r#   catr  r  r   no_gradrI   r\   r  	as_tensorr   r   rg   re   norm_pix_lossr   rr   r   meanvarsqrtr   r)   r   r   )rK   rW   rX   r   outputsr   rZ   r[   r\   expanded_position_embeddingspos_emb_visiblepos_emb_maskx_fulldecoder_outputsr   r*   framesrO   r  r  stdtimeru   rv   rg   re   frames_normvideos_patchlabelsloss_fctr'   r'   r(   r]     s   #
&&

J
zVideoMAEForPreTraining.forward)r   r    r!   rC   r   r   r#   r$   r   r   r   r)   r]   r_   r'   r'   rL   r(   r    s    r  z
    VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.
    c                       sT   e Zd Z fddZee		d
dejdB dejdB dee	 de
fdd	Z  ZS )VideoMAEForVideoClassificationc                    sf   t  | |j| _t|| _|jrt|jnd | _	|jdkr(t
|j|jnt | _|   d S )Nr   )rB   rC   
num_labelsr   r   r   r   r   rG   fc_normr   r
  
classifierr   rJ   rL   r'   r(   rC     s   
$z'VideoMAEForVideoClassification.__init__NrW   r,  r   r   c           	      K   s   | j |fi |}|j}| jdur|d}| |}n|dddf }| |}d}|dur;| j||| jfi |}t|||j|j	dS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import VideoMAEVideoProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> # replace this with your own video file
        >>> video_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )

        >>> video_processor = VideoMAEVideoProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = video_processor(video_path, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```Nr   r   r  )
r   r   r0  r  r1  loss_functionrI   r   r   r   )	rK   rW   r,  r   r!  r   r   r   r*   r'   r'   r(   r]     s    )


z&VideoMAEForVideoClassification.forward)NN)r   r    r!   rC   r   r   r#   r   r   r   r   r]   r_   r'   r'   rL   r(   r.    s    r.  )r  r   r   r.  )Nrw   )Br"   collections.abcri   r   rP   r   dataclassesr   numpyr,   r#   r   torch.nnr   activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.constantsr   r   utils.genericr   r   utils.output_capturingr   configuration_videomaer   
get_loggerr   loggerr   r)   r>   Moduler@   rD   r   floatr   r   r   r   r   r   r   r   r   r   r  r  r.  __all__r'   r'   r'   r(   <module>   s   
!<
=G% $N