o
    i[                     @   s  d dl Zd dlmZmZmZ d dlZd dlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZ ddl m!Z! G dd dej"Z#G dd dej"Z$	d5dej"dej%dej%dej%deej% de&de&fddZ'G dd dej"Z(G dd dej"Z)G d d! d!ej"Z*G d"d# d#ej"Z+G d$d% d%ej"Z,G d&d' d'eZ-eG d(d) d)eZ.G d*d+ d+ej"Z/G d,d- d-ej"Z0eG d.d/ d/e.Z1ed0d1G d2d3 d3e.Z2g d4Z3dS )6    N)CallableOptionalUnion   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstring	torch_int)can_return_tuplecheck_model_inputs   )IJepaConfigc                       sB   e Zd ZdZdef fddZddejdedejfd	d
Z	  Z
S )IJepaPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    configc                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesnnConv2d
projection)selfr   r   r   r   r    r%   	__class__ e/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/ijepa/modeling_ijepa.pyr      s   
 zIJepaPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingreturnc              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper   
ValueErrorr   r(   flatten	transpose)r)   r.   r/   
batch_sizer   heightwidth
embeddingsr,   r,   r-   forward.   s(   
zIJepaPatchEmbeddings.forwardF)__name__
__module____qualname____doc__r   r   torchTensorboolr<   __classcell__r,   r,   r*   r-   r      s    $r   c                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdeej dedejfddZ  ZS )IJepaEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fr   use_mask_tokenr0   Nc                    st   t    |rttdd|jnd | _t|| _	| j	j
}ttd||j| _t|j| _|j| _|| _d S )Nr   )r   r   r&   	ParameterrB   zerosr    
mask_tokenr   patch_embeddingsr%   randnposition_embeddingsDropouthidden_dropout_probdropoutr   r   )r)   r   rG   r%   r*   r,   r-   r   D   s   
 

zIJepaEmbeddings.__init__r;   r9   r:   c                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r3   bicubicF)sizemodealign_corners)r4   rM   rB   jit
is_tracingr   r   reshapepermuter&   
functionalinterpolateview)r)   r;   r9   r:   r%   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionsr,   r,   r-   r/   N   s&   




z(IJepaEmbeddings.interpolate_pos_encodingr.   bool_masked_posr/   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r/   r   rQ         ?)	r4   rK   rJ   expand	unsqueezetype_asr/   rM   rP   )r)   r.   rc   r/   r8   _r9   r:   r;   
seq_lengthmask_tokensmaskr,   r,   r-   r<   u   s   


zIJepaEmbeddings.forwardr=   )NF)r>   r?   r@   rA   r   rD   r   rB   rC   intr/   r   
BoolTensorr<   rE   r,   r,   r*   r-   rF   ?   s    
*rF           modulequerykeyvalueattention_maskscalingrP   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )NrQ   )r_   dtype)ptrainingr   r3   )rB   matmulr7   r&   rZ   softmaxfloat32torv   rP   rx   
contiguous)
ro   rp   rq   rr   rs   rt   rP   kwargsattn_weightsattn_outputr,   r,   r-   eager_attention_forward   s   r   c                	       sP   e Zd Zdef fddZ	d
dejdeej deejejf fdd	Z	  Z
S )IJepaSelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r1   g      F)bias)r   r   r    num_attention_headshasattrr5   r   rl   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probrt   	is_causalr&   Linearqkv_biasrp   rq   rr   r)   r   r*   r,   r-   r      s"   

zIJepaSelfAttention.__init__Nhidden_states	head_maskr0   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t}| j	j
dkr?t| j	j
 }|| ||||| j| j| jsNdn| jd\}	}
|	 d d | jf }|	|}	|	|
fS )	Nr   rQ   r   r3   eagerrn   )r   rt   rP   ru   )r4   r   r   rq   r\   r7   rr   rp   r   r   _attn_implementationr   r   rt   rx   r   rS   r   rX   )r)   r   r   r8   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper,   r,   r-   r<      s*   


zIJepaSelfAttention.forwardN)r>   r?   r@   r   r   rB   rC   r   tupler<   rE   r,   r,   r*   r-   r      s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
IJepaSelfOutputz
    The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S r   )	r   r   r&   r   r    denserN   rO   rP   r   r*   r,   r-   r         
zIJepaSelfOutput.__init__r   input_tensorr0   c                 C      |  |}| |}|S r   r   rP   r)   r   r   r,   r,   r-   r<         

zIJepaSelfOutput.forward)
r>   r?   r@   rA   r   r   rB   rC   r<   rE   r,   r,   r*   r-   r      s    $r   c                       sV   e Zd Zdef fddZdee fddZddej	d	e
ej	 d
ej	fddZ  ZS )IJepaAttentionr   c                    s*   t    t|| _t|| _t | _d S r   )r   r   r   	attentionr   outputsetpruned_headsr   r*   r,   r-   r      s   


zIJepaAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r_   )lenr   r   r   r   r   r   rp   rq   rr   r   r   r   union)r)   r   indexr,   r,   r-   prune_heads   s   zIJepaAttention.prune_headsNr   r   r0   c                 C   s    |  ||\}}| ||}|S r   )r   r   )r)   r   r   self_attn_outputrh   r   r,   r,   r-   r<     s   zIJepaAttention.forwardr   )r>   r?   r@   r   r   r   rl   r   rB   rC   r   r<   rE   r,   r,   r*   r-   r      s    *r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )IJepaIntermediater   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r&   r   r    intermediate_sizer   r!   
hidden_actstrr   intermediate_act_fnr   r*   r,   r-   r     s
   
zIJepaIntermediate.__init__r   r0   c                 C   r   r   )r   r   )r)   r   r,   r,   r-   r<     r   zIJepaIntermediate.forward	r>   r?   r@   r   r   rB   rC   r<   rE   r,   r,   r*   r-   r     s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	IJepaOutputr   c                    s.   t    t|j|j| _t|j| _	d S r   )
r   r   r&   r   r   r    r   rN   rO   rP   r   r*   r,   r-   r   "  r   zIJepaOutput.__init__r   r   r0   c                 C   s    |  |}| |}|| }|S r   r   r   r,   r,   r-   r<   '  s   

zIJepaOutput.forwardr   r,   r,   r*   r-   r   !  s    $r   c                       sH   e Zd ZdZdef fddZddejdeej dejfd	d
Z	  Z
S )
IJepaLayerz?This corresponds to the Block class in the timm implementation.r   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r   r   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r&   	LayerNormr    layer_norm_epslayernorm_beforelayernorm_afterr   r*   r,   r-   r   1  s   



zIJepaLayer.__init__Nr   r   r0   c                 C   sB   |  |}| ||}|| }| |}| |}| ||}|S r   )r   r   r   r   r   )r)   r   r   hidden_states_normattention_outputlayer_outputr,   r,   r-   r<   ;  s   


zIJepaLayer.forwardr   )r>   r?   r@   rA   r   r   rB   rC   r   r<   rE   r,   r,   r*   r-   r   .  s    *
r   c                   @   sf   e Zd ZU eed< dZdZdZddgZdZ	dZ
dZdZeedZdeejejejf d	d
fddZd
S )IJepaPreTrainedModelr   ijepar.   TrF   r   )r   
attentionsro   r0   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trotjj|jjt	j
d| jjd|jj|j_|jdurq|jj  dS dS dS )zInitialize the weightsrn   )meanstdNrd   )r!   r&   r   r'   inittrunc_normal_weightdatar|   rB   r{   r   initializer_rangerv   r   zero_r   fill_rF   rM   rJ   )r)   ro   r,   r,   r-   _init_weights\  s0   




z"IJepaPreTrainedModel._init_weights)r>   r?   r@   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   r&   r   r'   r   r   r,   r,   r,   r-   r   L  s   
 &r   c                       sB   e Zd Zdef fddZd
dejdeej defdd	Z	  Z
S )IJepaEncoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r,   )r   ).0rh   r   r,   r-   
<listcomp>w  s    z)IJepaEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r&   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r*   r   r-   r   t  s   
 
zIJepaEncoder.__init__Nr   r   r0   c                 C   s<   t | jD ]\}}|d ur|| nd }|||}qt|dS )N)last_hidden_state)	enumerater   r   )r)   r   r   ilayer_modulelayer_head_maskr,   r,   r-   r<   z  s   
zIJepaEncoder.forwardr   )r>   r?   r@   r   r   rB   rC   r   r   r<   rE   r,   r,   r*   r-   r   s  s    (r   c                       r   )IJepaPoolerr   c                    s,   t    t|j|j| _t|j | _	d S r   )
r   r   r&   r   r    pooler_output_sizer   r   
pooler_act
activationr   r*   r,   r-   r     s   
zIJepaPooler.__init__r   r0   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r)   r   first_token_tensorpooled_outputr,   r,   r-   r<     s   

zIJepaPooler.forwardr   r,   r,   r*   r-   r     s    r   c                       s   e Zd Zddededef fddZdefdd	Zd
ee	e
e	 f fddZedde				ddeej deej deej dee dee defddZ  ZS )
IJepaModelFr   add_pooling_layerrG   c                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )rG   r   N)r   r   r   rF   r;   r   encoderr&   r   r    r   	layernormr   pooler	post_init)r)   r   r   rG   r*   r,   r-   r     s   
zIJepaModel.__init__r0   c                 C   s   | j jS r   )r;   rK   )r)   r,   r,   r-   get_input_embeddings  s   zIJepaModel.get_input_embeddingsheads_to_prunec                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r)   r   r   r   r,   r,   r-   _prune_heads  s   zIJepaModel._prune_heads)tie_last_hidden_statesNr.   rc   r   r/   r~   c                 K   s   |du rt d| || jj}| jjjjj}|j|kr!|	|}| j|||d}| j
||d}|j}	| |	}	| jdurB| |	nd}
t|	|
dS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rc   r/   )r   )r   pooler_output)r5   get_head_maskr   r   r;   rK   r(   r   rv   r|   r   r   r   r   r	   )r)   r.   rc   r   r/   r~   expected_dtypeembedding_outputencoder_outputssequence_outputr   r,   r,   r-   r<     s   


zIJepaModel.forward)FFNNNN)r>   r?   r@   r   rD   r   r   r   dictrl   listr   r   r   r   rB   rC   rm   r   r   r	   r<   rE   r,   r,   r*   r-   r     s.    r   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                       sp   e Zd Zdef fddZee				ddeej	 deej	 deej	 dee
 d	ee d
efddZ  ZS )IJepaForImageClassificationr   c                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S )NF)r   r   )r   r   
num_labelsr   r   r&   r   r    Identity
classifierr   r   r*   r,   r-   r     s
   $z$IJepaForImageClassification.__init__Nr.   r   labelsr/   r~   r0   c           
      K   sh   | j |f||d|}|j}| |jdd}d}	|dur*| j||| jfi |}	t|	||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r/   r   r   N)losslogitsr   r   )	r   r   r	  r   loss_functionr   r
   r   r   )
r)   r.   r   r
  r/   r~   outputsr  r  r  r,   r,   r-   r<     s&   z#IJepaForImageClassification.forwardr  )r>   r?   r@   r   r   r   r   r   rB   rC   rD   r   r   r
   r<   rE   r,   r,   r*   r-   r    s*    r  )r   r   r  )rn   )4collections.abcr"   typingr   r   r   rB   torch.nnr&   activationsr   modeling_layersr   modeling_outputsr   r	   r
   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   utils.genericr   r   configuration_ijepar   Moduler   rF   rC   floatr   r   r   r   r   r   r   r   r   r   r   r  __all__r,   r,   r,   r-   <module>   s`   'X
4&I3