o
    ei                     @   s   d dl Z d dlmZ d dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZmZ G d
d deZeG dd deZG dd deeZeddG dd deeZg dZdS )    N)IJepaConfig   )initialization)BaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelViTPreTrainedModelc                	       st   e Zd Zddededdf fddZdejd	ed
edejfddZ			ddejdej
dB dedejfddZ  ZS )IJepaEmbeddingsFconfiguse_mask_tokenreturnNc                    s6   t  || | `| jj}ttd||j	| _
d S )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__ e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ijepa/modular_ijepa.pyr      s   zIJepaEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   bicubicF)sizemodealign_corners)shaper   r   jit
is_tracing
patch_sizer
   reshapepermuter   
functionalinterpolateview)r    r%   r&   r'   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionsr#   r#   r$   interpolate_pos_encoding   s&   




z(IJepaEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posr<   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r<   r   r(   g      ?)	r-   r   
mask_tokenexpand	unsqueezetype_asr<   r   dropout)r    r=   r>   r<   
batch_size_r&   r'   r%   
seq_lengthmask_tokensmaskr#   r#   r$   forward<   s   


zIJepaEmbeddings.forward)F)NF)__name__
__module____qualname__r   boolr   r   Tensorintr<   
BoolTensorrI   __classcell__r#   r#   r!   r$   r      s    *r   c                   @   s4   e Zd Ze dejejB ejB ddfddZ	dS )IJepaPreTrainedModelmoduler   Nc                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |trTtj|jd| jjd |jdurVt
|j dS dS dS )zInitialize the weightsg        )meanstdN)
isinstancer   LinearConv2dinittrunc_normal_weightr   initializer_rangebiaszeros_	LayerNormones_r   r   r?   )r    rS   r#   r#   r$   _init_weightsY   s   


z"IJepaPreTrainedModel._init_weights)
rJ   rK   rL   r   no_gradr   rW   rX   r_   ra   r#   r#   r#   r$   rR   W   s    &rR   c                       s,   e Zd Zddededef fddZ  ZS )
IJepaModelFr   add_pooling_layerr   c                    s$   t  | || _t||d| _dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r%   )r    r   rd   r   r!   r#   r$   r   j   s   zIJepaModel.__init__)FF)rJ   rK   rL   r   rM   r   rQ   r#   r#   r!   r$   rc   i   s    $rc   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                       s\   e Zd Zdef fddZ			ddejdB dejdB dedB dee	 d	e
f
d
dZ  ZS )IJepaForImageClassificationr   c                    s&   t  | t|dd| _|   d S )NF)rd   )r   r   rc   ijepa	post_init)r    r   r!   r#   r$   r      s   z$IJepaForImageClassification.__init__Nr=   labelsr<   kwargsr   c           	      K   sf   | j |fd|i|}|j}| |jdd}d}|dur)| j||| jfi |}t|||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r<   r   )r8   N)losslogitshidden_states
attentions)	rg   last_hidden_state
classifierrT   loss_functionr   r   rm   rn   )	r    r=   ri   r<   rj   outputssequence_outputrl   rk   r#   r#   r$   rI      s$   z#IJepaForImageClassification.forward)NNN)rJ   rK   rL   r   r   r   rN   rM   r   r   r   rI   rQ   r#   r#   r!   r$   rf   v   s     rf   )rR   rc   rf   )r   torch.nnr   -transformers.models.ijepa.configuration_ijepar    r   rY   modeling_outputsr   r   processing_utilsr   utilsr   r	   r
   vit.modeling_vitr   r   r   r   r   rR   rc   rf   __all__r#   r#   r#   r$   <module>   s"    J(