o
    wi)%                     @   s   d dl mZmZ d dlZd dlmZ d dlmZmZmZ d dl	m
Z
 ddlmZ ddlmZ ddlmZmZ d	d
lmZmZmZ G dd deZeG dd deZG dd deeZeddG dd deeZg dZdS )    )OptionalUnionN)BCEWithLogitsLossCrossEntropyLossMSELoss)IJepaConfig   )ImageClassifierOutput)PreTrainedModel)auto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelc                	       st   e Zd Zddededdf fddZdejd	ed
edejfddZ			ddejde
ej dedejfddZ  ZS )IJepaEmbeddingsFconfiguse_mask_tokenreturnNc                    s6   t  || | `| jj}ttd||j	| _
d S )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__ d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/ijepa/modular_ijepa.pyr      s   zIJepaEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   bicubicF)sizemodealign_corners)shaper    r   jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateview)r!   r&   r'   r(   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encoding   s&   




z(IJepaEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posr=   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r=   r   r)         ?)	r.   r   
mask_tokenexpand	unsqueezetype_asr=   r    dropout)r!   r>   r?   r=   
batch_size_r'   r(   r&   
seq_lengthmask_tokensmaskr$   r$   r%   forward>   s   


zIJepaEmbeddings.forward)F)NF)__name__
__module____qualname__r   boolr   r   Tensorintr=   r   
BoolTensorrK   __classcell__r$   r$   r"   r%   r      s    *r   c                   @   sV   e Zd ZeZdZdZdZddgZdZ	dZ
dZdZdeejejejf ddfd	d
ZdS )IJepaPreTrainedModelijepar>   Tr   
IJepaLayermoduler   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trotjj|jjt	j
d| jjd|jj|j_|jdurq|jj  dS dS dS )zInitialize the weightsg        )meanstdNr@   )
isinstancer   LinearConv2dinittrunc_normal_weightdatator   float32r   initializer_rangedtypebiaszero_	LayerNormfill_r   r    rA   )r!   rW   r$   r$   r%   _init_weightse   s0   




z"IJepaPreTrainedModel._init_weights)rL   rM   rN   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr   r   r[   r\   rg   ri   r$   r$   r$   r%   rT   Y   s    &rT   c                       s,   e Zd Zddededef fddZ  ZS )
IJepaModelFr   add_pooling_layerr   c                    s$   t  | || _t||d| _dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r&   )r!   r   rt   r   r"   r$   r%   r   }   s   zIJepaModel.__init__)FF)rL   rM   rN   r   rO   r   rS   r$   r$   r"   r%   rs   |   s    $rs   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                       s   e Zd Zdef fddZ							ddeej deej deej dee d	ee d
ee dee de	e
ef fddZ  ZS )IJepaForImageClassificationr   c                    s&   t  | t|dd| _|   d S )NF)rt   )r   r   rs   rU   	post_init)r!   r   r"   r$   r%   r      s   z$IJepaForImageClassification.__init__Nr>   	head_masklabelsoutput_attentionsoutput_hidden_statesr=   return_dictr   c                 C   sv  |dur|n| j j}| j||||||d}|d }	| |	jdd}
d}|dur||
j}| j jdu rX| jdkr>d| j _n| jdkrT|j	t
jksO|j	t
jkrTd| j _nd| j _| j jdkrvt }| jdkrp||
 | }n+||
|}n%| j jdkrt }||
d	| j|d	}n| j jdkrt }||
|}|s|
f|dd  }|dur|f| S |S t||
|j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)rx   rz   r{   r=   r|   r   r   )r9   
regressionsingle_label_classificationmulti_label_classificationr)   )losslogitshidden_states
attentions)r   use_return_dictrU   
classifierrX   ra   deviceproblem_type
num_labelsrd   r   longrQ   r   squeezer   r6   r   r	   r   r   )r!   r>   rx   ry   rz   r{   r=   r|   outputssequence_outputr   r   loss_fctoutputr$   r$   r%   rK      sP   	

"


z#IJepaForImageClassification.forward)NNNNNNN)rL   rM   rN   r   r   r   r   rP   rO   r   tupler	   rK   rS   r$   r$   r"   r%   rv      s4    
	rv   )rT   rs   rv   )typingr   r   r   torch.nnr   r   r   r   -transformers.models.ijepa.configuration_ijepar   modeling_outputsr	   modeling_utilsr
   utilsr   r   vit.modeling_vitr   r   r   r   rT   rs   rv   __all__r$   r$   r$   r%   <module>   s$    J"J