o
    i                     @   s   d dl mZmZ d dlZd dlmZ d dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZmZ G d
d deZeG dd deZG dd deeZeddG dd deeZg dZdS )    )OptionalUnionN)IJepaConfig   )BaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelViTPreTrainedModelc                	       st   e Zd Zddededdf fddZdejd	ed
edejfddZ			ddejde
ej dedejfddZ  ZS )IJepaEmbeddingsFconfiguse_mask_tokenreturnNc                    s6   t  || | `| jj}ttd||j	| _
d S )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__ d/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/ijepa/modular_ijepa.pyr      s   zIJepaEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   bicubicF)sizemodealign_corners)shaper    r   jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateview)r!   r&   r'   r(   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encoding   s&   




z(IJepaEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posr=   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r=   r   r)         ?)	r.   r   
mask_tokenexpand	unsqueezetype_asr=   r    dropout)r!   r>   r?   r=   
batch_size_r'   r(   r&   
seq_lengthmask_tokensmaskr$   r$   r%   forward=   s   


zIJepaEmbeddings.forward)F)NF)__name__
__module____qualname__r   boolr   r   Tensorintr=   r   
BoolTensorrK   __classcell__r$   r$   r"   r%   r      s    *r   c                   @   s.   e Zd Zdeejejejf ddfddZdS )IJepaPreTrainedModelmoduler   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trotjj|jjt	j
d| jjd|jj|j_|jdurq|jj  dS dS dS )zInitialize the weightsg        )meanstdNr@   )
isinstancer   LinearConv2dinittrunc_normal_weightdatator   float32r   initializer_rangedtypebiaszero_	LayerNormfill_r   r    rA   )r!   rU   r$   r$   r%   _init_weightsZ   s0   




z"IJepaPreTrainedModel._init_weights)	rL   rM   rN   r   r   rY   rZ   re   rg   r$   r$   r$   r%   rT   X   s    &rT   c                       s,   e Zd Zddededef fddZ  ZS )
IJepaModelFr   add_pooling_layerr   c                    s$   t  | || _t||d| _dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r&   )r!   r   ri   r   r"   r$   r%   r   r   s   zIJepaModel.__init__)FF)rL   rM   rN   r   rO   r   rS   r$   r$   r"   r%   rh   q   s    $rh   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                       sh   e Zd Zdef fddZ				ddeej deej deej dee d	e	e
 d
efddZ  ZS )IJepaForImageClassificationr   c                    s&   t  | t|dd| _|   d S )NF)ri   )r   r   rh   ijepa	post_init)r!   r   r"   r$   r%   r      s   z$IJepaForImageClassification.__init__Nr>   	head_masklabelsr=   kwargsr   c           
      K   sh   | j |f||d|}|j}| |jdd}d}	|dur*| j||| jfi |}	t|	||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )rn   r=   r   )r9   N)losslogitshidden_states
attentions)	rl   last_hidden_state
classifierrV   loss_functionr   r   rs   rt   )
r!   r>   rn   ro   r=   rp   outputssequence_outputrr   rq   r$   r$   r%   rK      s&   z#IJepaForImageClassification.forward)NNNN)rL   rM   rN   r   r   r   r   rP   rO   r   r	   r   rK   rS   r$   r$   r"   r%   rk   ~   s&    rk   )rT   rh   rk   )typingr   r   r   torch.nnr   -transformers.models.ijepa.configuration_ijepar   modeling_outputsr   r   processing_utilsr   utilsr	   r
   r   vit.modeling_vitr   r   r   r   r   rT   rh   rk   __all__r$   r$   r$   r%   <module>   s"    J*