o
    	۷iە                     @   s  d Z ddlZddlmZ ddlmZ ddlmZm	Z	 ddl
ZddlZddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ e#,e-Z.ee"ddG dd de Z/ee"ddG dd de Z0dd Z1G dd dej2Z3G dd  d ej2Z4	!dFd"ej2d#ej5d$ej5d%ej5d&e	ej5 d'e6d(e6fd)d*Z7G d+d, d,ej2Z8G d-d. d.ej2Z9G d/d0 d0ej2Z:G d1d2 d2ej2Z;G d3d4 d4ej2Z<G d5d6 d6eZ=G d7d8 d8ej2Z>e"G d9d: d:eZ?e"G d;d< d<e?Z@G d=d> d>ej2ZAe"d?dG d@dA dAe?ZBe"dBdG dCdD dDe?ZCg dEZDdS )Gz,PyTorch VideoMAE (masked autoencoder) model.    N)deepcopy)	dataclass)CallableOptional)nn)MSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringlogging)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)can_return_tuplecheck_model_inputs   )VideoMAEConfigz[
    Class for VideoMAEDecoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sP   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dS )VideoMAEDecoderOutputz
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    Nlogitshidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler     r)   r)   d/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/videomae/modeling_videomae.pyr   *   s
   
 r   zb
    Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions.
    c                   @   sb   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dS )VideoMAEForPreTrainingOutputz
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    Nlossr   r   r    )r!   r"   r#   r$   r,   r   r%   r&   r'   r   r   r(   r    r)   r)   r)   r*   r+   ;   s   
 r+   c                    s    fddt fddt| D }t |dddddf |dddddf< t |dddddf |dddddf< t|dS )	z Sinusoid position encoding tablec                    s    fddt D S )Nc              	      s(   g | ]}t d d|d     qS )i'     )nppower).0hid_j)d_hidpositionr)   r*   
<listcomp>V   s   ( zOget_sinusoid_encoding_table.<locals>.get_position_angle_vec.<locals>.<listcomp>)ranger3   )r2   r6   r*   get_position_angle_vecU   s   z;get_sinusoid_encoding_table.<locals>.get_position_angle_vecc                    s   g | ]} |qS r)   r)   )r0   pos_i)r7   r)   r*   r4   X       z/get_sinusoid_encoding_table.<locals>.<listcomp>Nr   r-   r   )r.   arrayr5   sincosr%   r&   	unsqueeze)
n_positionr2   sinusoid_tabler)   )r2   r7   r*   get_sinusoid_encoding_tableQ   s
   ..r@   c                       (   e Zd ZdZ fddZdd Z  ZS )VideoMAEEmbeddingsz7
    Construct the patch and position embeddings.

    c                    s8   t    t|| _| jj| _t| j|j| _|| _d S N)	super__init__VideoMAEPatchEmbeddingspatch_embeddingsnum_patchesr@   hidden_sizeposition_embeddingsconfigselfrK   	__class__r)   r*   rE   e   s
   



zVideoMAEEmbeddings.__init__c                 C   sZ   |  |}|| j |j|jdd }|d ur+|j\}}}||  }||d|}|S )NTdevicecopy)rG   rJ   detachtype_astorQ   shapereshape)rM   pixel_valuesbool_masked_pos
embeddings
batch_size_num_channelsr)   r)   r*   forwardn   s   

zVideoMAEEmbeddings.forwardr!   r"   r#   r$   rE   r_   __classcell__r)   r)   rN   r*   rB   _   s    	rB   c                       rA   )rF   aw  
    Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels,
    height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

    The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width //
    patch_size).

    c           	         s   t    |j}|j}|j}|j}|j}|j}t|t	j
jr |n||f}t|t	j
jr-|n||f}|| _|| _t|| _|d |d  |d |d   || j  }|| _|| _tj||| j|d |d f| j|d |d fd| _d S )Nr   r   )in_channelsout_channelskernel_sizestride)rD   rE   
image_size
patch_sizer^   rI   
num_framestubelet_size
isinstancecollectionsabcIterableintrH   r   Conv3d
projection)	rM   rK   rf   rg   r^   rI   rh   ri   rH   rN   r)   r*   rE      s,   

(z VideoMAEPatchEmbeddings.__init__c              
   C   s   |j \}}}}}|| jkrtd|| jd ks|| jd kr6td| d| d| jd  d| jd  d	|dddd	d
}| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r-   r      )rW   r^   
ValueErrorrf   permuterp   flatten	transpose)rM   rY   r\   rh   r^   heightwidthr[   r)   r)   r*   r_      s   
(zVideoMAEPatchEmbeddings.forwardr`   r)   r)   rN   r*   rF      s    	rF           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )NrS   )dimdtype)ptrainingr   r-   )r%   matmulrv   r   
functionalsoftmaxfloat32rV   r   r   r   
contiguous)
rz   r{   r|   r}   r~   r   r   kwargsattn_weightsattn_outputr)   r)   r*   eager_attention_forward   s   r   c                       sL   e Zd Zdeddf fddZd	deej deejejf fddZ	  Z
S )
VideoMAESelfAttentionrK   returnNc                    s
  t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	dd| _tj|j| j	dd| _tj|j| j	dd| _|jr}tt| j	| _tt| j	| _d S d | _d | _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      Fbias)rD   rE   rI   num_attention_headshasattrrs   rK   rn   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearr{   r|   r}   qkv_bias	Parameterr%   zerosq_biasv_biasrL   rN   r)   r*   rE      s,   


zVideoMAESelfAttention.__init__	head_maskc              
   C   s6  |j \}}}| jd urtj| jddnd }tjj|| jj	|d}tjj|| j
j	| jd}tjj|| jj	| jd}	||d| j| jdd}
||d| j| jdd}|	|d| j| jdd}t}| jjdkrpt| jj }|| ||
||| j| j| jsdn| jd	\}}| d d
 | jf }||}||fS )NF)requires_grad)inputweightr   rS   r   r-   eagerry   )r   r   r   r   )rW   r   r%   
zeros_liker   r   r   linearr|   r   r}   r{   viewr   r   rv   r   rK   _attn_implementationr   r   r   r   r   sizer   rX   )rM   r   r   r\   
seq_lengthr]   k_biaskeysvaluesqueries	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper)   r)   r*   r_      s0   

zVideoMAESelfAttention.forwardrC   )r!   r"   r#   r   rE   r   r%   Tensorr(   r_   ra   r)   r)   rN   r*   r      s    .r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
VideoMAESelfOutputz
    The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rK   c                    s.   t    t|j|j| _t|j| _d S rC   )	rD   rE   r   r   rI   denseDropouthidden_dropout_probr   rL   rN   r)   r*   rE        
zVideoMAESelfOutput.__init__r   input_tensorr   c                 C      |  |}| |}|S rC   r   r   rM   r   r   r)   r)   r*   r_        

zVideoMAESelfOutput.forward)
r!   r"   r#   r$   r   rE   r%   r   r_   ra   r)   r)   rN   r*   r     s    $r   c                       sV   e Zd Zdef fddZdee fddZddej	d	e
ej	 d
ej	fddZ  ZS )VideoMAEAttentionrK   c                    s*   t    t|| _t|| _t | _d S rC   )rD   rE   r   	attentionr   outputsetpruned_headsrL   rN   r)   r*   rE   $  s   


zVideoMAEAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r   )lenr   r   r   r   r   r   r{   r|   r}   r   r   r   union)rM   r   indexr)   r)   r*   prune_heads*  s   zVideoMAEAttention.prune_headsNr   r   r   c                 C   s    |  ||\}}| ||}|S rC   )r   r   )rM   r   r   self_attn_outputr]   r   r)   r)   r*   r_   <  s   zVideoMAEAttention.forwardrC   )r!   r"   r#   r   rE   r   rn   r   r%   r   r   r_   ra   r)   r)   rN   r*   r   #  s    *r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )VideoMAEIntermediaterK   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rC   )rD   rE   r   r   rI   intermediate_sizer   rj   
hidden_actstrr	   intermediate_act_fnrL   rN   r)   r*   rE   D  s
   
zVideoMAEIntermediate.__init__r   r   c                 C   r   rC   )r   r   )rM   r   r)   r)   r*   r_   L  r   zVideoMAEIntermediate.forward	r!   r"   r#   r   rE   r%   r   r_   ra   r)   r)   rN   r*   r   C  s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	VideoMAEOutputrK   c                    s.   t    t|j|j| _t|j| _	d S rC   )
rD   rE   r   r   r   rI   r   r   r   r   rL   rN   r)   r*   rE   T  r   zVideoMAEOutput.__init__r   r   r   c                 C   s    |  |}| |}|| }|S rC   r   r   r)   r)   r*   r_   Y  s   

zVideoMAEOutput.forwardr   r)   r)   rN   r*   r   S  s    $r   c                       sH   e Zd ZdZdef fddZddejdeej dejfd	d
Z	  Z
S )VideoMAELayerz?This corresponds to the Block class in the timm implementation.rK   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)rD   rE   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormrI   layer_norm_epslayernorm_beforelayernorm_afterrL   rN   r)   r*   rE   d  s   



zVideoMAELayer.__init__Nr   r   r   c                 C   sB   |  |}| ||}|| }| |}| |}| ||}|S rC   )r   r   r   r   r   )rM   r   r   hidden_states_normattention_outputlayer_outputr)   r)   r*   r_   n  s   


zVideoMAELayer.forwardrC   )r!   r"   r#   r$   r   rE   r%   r   r   r_   ra   r)   r)   rN   r*   r   a  s    *
r   c                       sB   e Zd Zdef fddZd
dejdeej defdd	Z	  Z
S )VideoMAEEncoderrK   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r)   r   r0   r]   rK   r)   r*   r4     r9   z,VideoMAEEncoder.__init__.<locals>.<listcomp>F)	rD   rE   rK   r   
ModuleListr5   num_hidden_layerslayergradient_checkpointingrL   rN   r   r*   rE     s   
 
zVideoMAEEncoder.__init__Nr   r   r   c                 C   s<   t | jD ]\}}|d ur|| nd }|||}qt|dS )Nlast_hidden_state)	enumerater   r   )rM   r   r   ilayer_modulelayer_head_maskr)   r)   r*   r_     s   
zVideoMAEEncoder.forwardrC   )r!   r"   r#   r   rE   r%   r   r   r   r_   ra   r)   r)   rN   r*   r     s    (r   c                   @   sL   e Zd ZU eed< dZdZdZddgZdZ	dZ
dZdZeedZdd	 Zd
S )VideoMAEPreTrainedModelrK   videomaerY   TrB   r   )r   r    c                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsry   )meanstdNg      ?)rj   r   r   ro   r   datanormal_rK   initializer_ranger   zero_r   fill_)rM   rz   r)   r)   r*   _init_weights  s   
z%VideoMAEPreTrainedModel._init_weightsN)r!   r"   r#   r   r'   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r)   r)   r)   r*   r     s   
 r   c                       sp   e Zd Z fddZdd Zdd Zedde				dd
ej	de
ej de
ej dee def
ddZ  ZS )VideoMAEModelc                    sT   t  | || _t|| _t|| _|jrd | _n
t	j
|j|jd| _|   d S )Nr   )rD   rE   rK   rB   r[   r   encoderuse_mean_pooling	layernormr   r   rI   r   	post_initrL   rN   r)   r*   rE     s   

zVideoMAEModel.__init__c                 C   s   | j jS rC   )r[   rG   )rM   r)   r)   r*   get_input_embeddings  s   z"VideoMAEModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )rM   heads_to_pruner   r   r)   r)   r*   _prune_heads  s   zVideoMAEModel._prune_headsF)tie_last_hidden_statesNrY   rZ   r   r   r   c                 K   sN   |  || jj}| ||}| j||d}|j}| jdur"| |}t|dS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. If `None`, then all patches are considered. Sequence
            length is `(num_frames // tubelet_size) * (image_size // patch_size) ** 2`.

        Examples:

        ```python
        >>> import av
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEModel
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

        >>> # prepare video for the model
        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1568, 768]
        ```r   Nr   )get_head_maskrK   r   r[   r  r   r  r   )rM   rY   rZ   r   r   embedding_outputencoder_outputssequence_outputr)   r)   r*   r_     s   ^


zVideoMAEModel.forward)NN)r!   r"   r#   rE   r
  r  r   r   r%   r&   r   
BoolTensorr   r   r   r   r_   ra   r)   r)   rN   r*   r    s&    r  c                       s6   e Zd Zdef fddZdejdefddZ  Z	S )VideoMAEDecoderrK   c                    s   t    |j|j |jd  }t| |j _|j _	|j
 _|j _t fddt|jD | _t|j| _|dkrFt|j|nt | _d| _ | _d S )Nr-   c                    r   r)   r   r   decoder_configr)   r*   r4   >  r9   z,VideoMAEDecoder.__init__.<locals>.<listcomp>r   F)rD   rE   r^   ri   rg   r   decoder_hidden_sizerI   decoder_num_hidden_layersr   decoder_num_attention_headsr   decoder_intermediate_sizer   r   r   r5   decoder_layersr   normr   Identityheadr   rK   )rM   rK   decoder_num_labelsrN   r  r*   rE   3  s   

zVideoMAEDecoder.__init__r   return_token_numc                 C   sT   | j D ]}||d d}q|dkr|d d | d f }| |}| |}t|dS )Nr  r   )r   )r  r  r  r   )rM   r   r!  r   r   r)   r)   r*   r_   I  s   



zVideoMAEDecoder.forward)
r!   r"   r#   r   rE   r%   r   rn   r_   ra   r)   r)   rN   r*   r  2  s    r  zb
    The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.
    c                       sT   e Zd Z fddZee	ddejdejde	ej
 dee def
d	d
Z  ZS )VideoMAEForPreTrainingc                    st   t  | || _t|| _tj|j|jdd| _	t
tdd|j| _t| jjj|j| _t|| _|   d S )NFr   r   )rD   rE   rK   r  r   r   r   rI   r  encoder_to_decoderr   r%   r   
mask_tokenr@   r[   rH   rJ   r  decoderr	  rL   rN   r)   r*   rE   ^  s   

zVideoMAEForPreTraining.__init__NrY   rZ   r   r   r   c                 K   s&  | j |f||d|}|j}| |}|j\}}}	|du r"td| j|dd|}
|
 j	|j
dd}
|
|  |d|	}|
| |d|	}tj|| | j| gdd}| ||jd }|j}d}t  | jjd	kru|}n2|j
}|j}ttj	||d
ddddddf }ttj	||d
ddddddf }|| | }|j\}}}	}}| jj| jj}}| jjr |||| ||	|| ||| |}|dddddddd	 }|||| | | | | || | |	}||jddd |jdddd  d  }|||| | | | | || | |	 }nB| jjd	kr+td|||| ||	|| ||| |}|dddddddd	 }|||| | | | | || | |	 }|j\}}}	|| |d|	}W d   n	1 s|w   Y  t! }|||}t"|||j#|j$dS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Each video in the
            batch must have the same number of masked patches. Sequence length is `(num_frames // tubelet_size) *
            (image_size // patch_size) ** 2`.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, VideoMAEForPreTraining
        >>> import numpy as np
        >>> import torch

        >>> num_frames = 16
        >>> video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
        >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")

        >>> pixel_values = image_processor(video, return_tensors="pt").pixel_values

        >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
        >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
        >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss = outputs.loss
        ```)rZ   r   Nz!One must provided a boolean mask rS   TrP   r   r   r   )rQ   r   r   rr      r-         r   )r   keepdim)r   unbiasedr)  gư>zQCan't unnormalize non-RGB images. Consider setting config.norm_pix_loss to False.r,   r   r   r    )%r   r   r#  rW   rs   rJ   expandrU   rT   rV   rQ   rX   r%   catr$  r%  r   no_gradrK   r^   r   	as_tensorr   r   ri   rg   norm_pix_lossr   rt   r   r   varsqrtr   r+   r   r    )rM   rY   rZ   r   r   outputsr  r\   r]   r^   expanded_position_embeddingspos_emb_visiblepos_emb_maskx_fulldecoder_outputsr   r,   framesrQ   r   r   r   timerw   rx   ri   rg   frames_normvideos_patchlabelsloss_fctr)   r)   r*   r_   o  s   $
&&

J
zVideoMAEForPreTraining.forwardrC   )r!   r"   r#   rE   r   r   r%   r&   r  r   r   r   r   r+   r_   ra   r)   r)   rN   r*   r"  X  s     r"  z
    VideoMAE Model transformer with a video classification head on top (a linear layer on top of the average pooled hidden
    states of all tokens) e.g. for ImageNet.
    c                       s`   e Zd Z fddZee			ddeej deej deej de	e
 def
d	d
Z  ZS )VideoMAEForVideoClassificationc                    sf   t  | |j| _t|| _|jrt|jnd | _	|jdkr(t
|j|jnt | _|   d S )Nr   )rD   rE   
num_labelsr  r   r  r   r   rI   fc_normr   r  
classifierr	  rL   rN   r)   r*   rE   
  s   
$z'VideoMAEForVideoClassification.__init__NrY   r   r=  r   r   c           
      K   s   | j |fd|i|}|j}| jdur|d}| |}n|dddf }| |}d}	|dur=| j||| jfi |}	t|	||j|j	dS )a!  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import av
        >>> import torch
        >>> import numpy as np

        >>> from transformers import AutoImageProcessor, VideoMAEForVideoClassification
        >>> from huggingface_hub import hf_hub_download

        >>> np.random.seed(0)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # video clip consists of 300 frames (10 seconds at 30 FPS)
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample 16 frames
        >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
        >>> video = read_video_pyav(container, indices)

        >>> image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
        >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

        >>> inputs = image_processor(list(video), return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        ...     logits = outputs.logits

        >>> # model predicts one of the 400 Kinetics-400 classes
        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])
        eating spaghetti
        ```r   Nr   r   r+  )
r   r   rA  r   rB  loss_functionrK   r   r   r    )
rM   rY   r   r=  r   r3  r  r   r   r,   r)   r)   r*   r_     s    \


z&VideoMAEForVideoClassification.forward)NNN)r!   r"   r#   rE   r   r   r   r%   r   r   r   r   r_   ra   r)   r)   rN   r*   r?    s$    r?  )r"  r  r   r?  )ry   )Er$   collections.abcrk   rR   r   dataclassesr   typingr   r   numpyr.   r%   r   torch.nnr   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.constantsr   r   utils.genericr   r   configuration_videomaer   
get_loggerr!   loggerr   r+   r@   ModulerB   rF   r   floatr   r   r   r   r   r   r   r   r   r  r  r"  r?  __all__r)   r)   r)   r*   <module>   s   
!=
=  & ' 