o
    eiQ                     @   s  d dl Zd dl mZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddlm Z  G dd dej!Z"G dd dej!Z#		d7dej!dej$dej$dej$dej$dB de%dB de%dee fddZ&G dd dej!Z'G d d! d!ej!Z(G d"d# d#ej!Z)G d$d% d%ej!Z*G d&d' d'ej!Z+G d(d) d)eZ,eG d*d+ d+eZ-G d,d- d-ej!Z.G d.d/ d/ej!Z/eG d0d1 d1e-Z0ed2d3G d4d5 d5e-Z1g d6Z2dS )8    N)Callable   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )IJepaConfigc                       sB   e Zd ZdZdef fddZddejdedejfd	d
Z	  Z
S )IJepaPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    configc                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesnnConv2d
projection)selfr   r   r   r   r   r#   	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ijepa/modeling_ijepa.pyr       s   
 zIJepaPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingreturnc              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper   
ValueErrorr   r&   flatten	transpose)r'   r,   r-   
batch_sizer   heightwidth
embeddingsr*   r*   r+   forward/   s(   
zIJepaPatchEmbeddings.forwardF)__name__
__module____qualname____doc__r   r   torchTensorboolr:   __classcell__r*   r*   r(   r+   r      s    $r   c                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdejdB dedejfddZ  ZS )IJepaEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fr   use_mask_tokenr.   Nc                    st   t    |rttdd|jnd | _t|| _	| j	j
}ttd||j| _t|j| _|j| _|| _d S )Nr   )r   r   r$   	Parameterr@   zerosr   
mask_tokenr   patch_embeddingsr#   randnposition_embeddingsDropouthidden_dropout_probdropoutr   r   )r'   r   rE   r#   r(   r*   r+   r   E   s   
 

zIJepaEmbeddings.__init__r9   r7   r8   c                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r1   bicubicF)sizemodealign_corners)r2   rK   r@   jit
is_tracingr   r   reshapepermuter$   
functionalinterpolateview)r'   r9   r7   r8   r#   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionsr*   r*   r+   r-   O   s&   




z(IJepaEmbeddings.interpolate_pos_encodingr,   bool_masked_posr-   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r-   r   rO   g      ?)	r2   rI   rH   expand	unsqueezetype_asr-   rK   rN   )r'   r,   ra   r-   r6   _r7   r8   r9   
seq_lengthmask_tokensmaskr*   r*   r+   r:   v   s   


zIJepaEmbeddings.forwardr;   )NF)r<   r=   r>   r?   r   rB   r   r@   rA   intr-   
BoolTensorr:   rC   r*   r*   r(   r+   rD   @   s    
*rD           modulequerykeyvalueattention_maskscalingrN   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )NrO         r1   r   r]   )ptrainingr   )
rQ   r@   matmulr5   r$   rX   softmaxrN   rv   
contiguous)
rl   rm   rn   ro   rp   rq   rN   rr   attn_weightsattn_outputr*   r*   r+   eager_attention_forward   s   
r|   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )IJepaSelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r/   rs   F)bias)r   r   r   num_attention_headshasattrr3   r   ri   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probrq   	is_causalr$   Linearqkv_biasrm   rn   ro   r'   r   r(   r*   r+   r      s"   

zIJepaSelfAttention.__init__hidden_statesr.   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   rO   r   r1   rk   )r   rq   rN   )r2   r   r   rn   rZ   r5   ro   rm   r
   get_interfacer   _attn_implementationr|   r   rq   rv   r   rQ   r   rV   )r'   r   r6   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper*   r*   r+   r:      s*   


zIJepaSelfAttention.forward)
r<   r=   r>   r   r   r@   rA   tupler:   rC   r*   r*   r(   r+   r}      s    (r}   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
IJepaSelfOutputz
    The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S N)	r   r   r$   r   r   denserL   rM   rN   r   r(   r*   r+   r         
zIJepaSelfOutput.__init__r   input_tensorr.   c                 C      |  |}| |}|S r   r   rN   r'   r   r   r*   r*   r+   r:         

zIJepaSelfOutput.forward
r<   r=   r>   r?   r   r   r@   rA   r:   rC   r*   r*   r(   r+   r      s    $r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )IJepaAttentionr   c                    s"   t    t|| _t|| _d S r   )r   r   r}   	attentionr   outputr   r(   r*   r+   r      s   

zIJepaAttention.__init__r   r.   c                 C   s   |  |\}}| ||}|S r   )r   r   )r'   r   self_attn_outputre   r   r*   r*   r+   r:      s   zIJepaAttention.forward	r<   r=   r>   r   r   r@   rA   r:   rC   r*   r*   r(   r+   r          r   c                       r   )IJepaIntermediater   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r$   r   r   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnr   r(   r*   r+   r      s
   
zIJepaIntermediate.__init__r   r.   c                 C   r   r   )r   r   )r'   r   r*   r*   r+   r:     r   zIJepaIntermediate.forwardr   r*   r*   r(   r+   r      s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	IJepaOutputr   c                    s.   t    t|j|j| _t|j| _	d S r   )
r   r   r$   r   r   r   r   rL   rM   rN   r   r(   r*   r+   r     r   zIJepaOutput.__init__r   r   r.   c                 C   s    |  |}| |}|| }|S r   r   r   r*   r*   r+   r:     s   

zIJepaOutput.forwardr   r*   r*   r(   r+   r     s    $r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	
IJepaLayerz?This corresponds to the Block class in the timm implementation.r   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r   r   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r$   	LayerNormr   layer_norm_epslayernorm_beforelayernorm_afterr   r(   r*   r+   r     s   



zIJepaLayer.__init__r   r.   c                 C   s@   |  |}| |}|| }| |}| |}| ||}|S r   )r   r   r   r   r   )r'   r   hidden_states_normattention_outputlayer_outputr*   r*   r+   r:   %  s   



zIJepaLayer.forwardr   r*   r*   r(   r+   r     s    
r   c                   @   sp   e Zd ZU eed< dZdZdZdZddgZ	dZ
dZdZdZeedZe d	ejejB ejB d
dfddZdS )IJepaPreTrainedModelr   ijepar,   )imageTrD   r   )r   
attentionsrl   r.   Nc                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |trTtj|jd| jjd |jdurVt
|j dS dS dS )zInitialize the weightsrk   )meanstdN)r   r$   r   r%   inittrunc_normal_weightr   initializer_ranger   zeros_r   ones_rD   rK   rH   )r'   rl   r*   r*   r+   _init_weightsG  s   


z"IJepaPreTrainedModel._init_weights)r<   r=   r>   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r}   _can_record_outputsr@   no_gradr$   r   r%   r   r   r*   r*   r*   r+   r   6  s    
 &r   c                       s6   e Zd Zdef fddZdejdefddZ  Z	S )IJepaEncoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r*   )r   ).0re   r   r*   r+   
<listcomp>[  s    z)IJepaEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r$   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r(   r   r+   r   X  s   
 
zIJepaEncoder.__init__r   r.   c                 C   s&   t | jD ]\}}||}qt|dS )N)last_hidden_state)	enumerater   r   )r'   r   ilayer_moduler*   r*   r+   r:   ^  s   

zIJepaEncoder.forward)
r<   r=   r>   r   r   r@   rA   r   r:   rC   r*   r*   r(   r+   r   W  s    r   c                       r   )IJepaPoolerr   c                    s,   t    t|j|j| _t|j | _	d S r   )
r   r   r$   r   r   pooler_output_sizer   r   
pooler_act
activationr   r(   r*   r+   r   f  s   
zIJepaPooler.__init__r   r.   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r'   r   first_token_tensorpooled_outputr*   r*   r+   r:   k  s   

zIJepaPooler.forwardr   r*   r*   r(   r+   r   e  r   r   c                       s   e Zd Zddededef fddZdefdd	Zee	dd
e
			ddejdB dejdB dedB dee def
ddZ  ZS )
IJepaModelFr   add_pooling_layerrE   c                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )rE   r   N)r   r   r   rD   r9   r   encoderr$   r   r   r   	layernormr   pooler	post_init)r'   r   r   rE   r(   r*   r+   r   v  s   
zIJepaModel.__init__r.   c                 C   s   | j jS r   )r9   rI   )r'   r*   r*   r+   get_input_embeddings  s   zIJepaModel.get_input_embeddings)tie_last_hidden_statesNr,   ra   r-   rr   c           
      K   s   |du rt d| jjjjj}|j|kr||}| j|||d}| |}|j}| 	|}| j
dur8| 
|nd}	t||	dS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)ra   r-   )r   pooler_output)r3   r9   rI   r&   r   dtypetor   r   r   r   r   )
r'   r,   ra   r-   rr   expected_dtypeembedding_outputencoder_outputssequence_outputr   r*   r*   r+   r:     s   



zIJepaModel.forward)FFNNN)r<   r=   r>   r   rB   r   r   r   r   r   r   r@   rA   rj   r   r   r   r:   rC   r*   r*   r(   r+   r   t  s(    r   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                       sd   e Zd Zdef fddZee			ddejdB dejdB de	dB de
e d	ef
d
dZ  ZS )IJepaForImageClassificationr   c                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S )NF)r   r   )r   r   
num_labelsr   r   r$   r   r   Identity
classifierr   r   r(   r*   r+   r     s
   $z$IJepaForImageClassification.__init__Nr,   labelsr-   rr   r.   c           	      K   sf   | j |fd|i|}|j}| |jdd}d}|dur)| j||| jfi |}t|||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r-   r   rt   N)losslogitsr   r   )	r   r   r   r   loss_functionr   r	   r   r   )	r'   r,   r   r-   rr   outputsr   r   r   r*   r*   r+   r:     s$   z#IJepaForImageClassification.forwardr   )r<   r=   r>   r   r   r   r   r@   rA   rB   r   r   r	   r:   rC   r*   r*   r(   r+   r     s$    r   )r   r   r   )Nrk   )3collections.abcr    r   r@   torch.nnr$    r   r   activationsr   modeling_layersr   modeling_outputsr   r   r	   modeling_utilsr
   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_ijepar   Moduler   rD   rA   floatr|   r}   r   r   r   r   r   r   r   r   r   r   __all__r*   r*   r*   r+   <module>   sh   'W
2 :1