o
    wim                     @   s  d dl Zd dlmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ e e!Z"G dd dej#Z$G dd dej#Z%eG dd deZ&	d4dej#dej'dej'dej'deej' de(de(fddZ)G dd dej#Z*G dd  d ej#Z+G d!d" d"ej#Z,G d#d$ d$ej#Z-G d%d& d&ej#Z.G d'd( d(eZ/G d)d* d*ej#Z0G d+d, d,ej#Z1eG d-d. d.e&Z2ed/d0G d1d2 d2e&Z3g d3Z4dS )5    N)CallableOptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )IJepaConfigc                       s<   e Zd ZdZ fddZd
dejdedejfdd	Z  Z	S )IJepaPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesnnConv2d
projection)selfconfigr   r   r   r   r$   	__class__ e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/ijepa/modeling_ijepa.pyr   !   s   
 zIJepaPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingreturnc              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper   
ValueErrorr   r'   flatten	transpose)r(   r.   r/   
batch_sizer   heightwidth
embeddingsr,   r,   r-   forward0   s(   
zIJepaPatchEmbeddings.forwardF)
__name__
__module____qualname____doc__r   torchTensorboolr<   __classcell__r,   r,   r*   r-   r      s    $r   c                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdeej dedejfddZ  ZS )IJepaEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fr)   use_mask_tokenr0   Nc                    st   t    |rttdd|jnd | _t|| _	| j	j
}ttd||j| _t|j| _|j| _|| _d S )Nr   )r   r   r%   	ParameterrB   zerosr   
mask_tokenr   patch_embeddingsr$   randnposition_embeddingsDropouthidden_dropout_probdropoutr   r)   )r(   r)   rG   r$   r*   r,   r-   r   F   s   
 

zIJepaEmbeddings.__init__r;   r9   r:   c                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r3   bicubicF)sizemodealign_corners)r4   rM   rB   jit
is_tracingr   r   reshapepermuter%   
functionalinterpolateview)r(   r;   r9   r:   r$   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionsr,   r,   r-   r/   P   s&   




z(IJepaEmbeddings.interpolate_pos_encodingr.   bool_masked_posr/   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r/   r   rQ         ?)	r4   rK   rJ   expand	unsqueezetype_asr/   rM   rP   )r(   r.   rc   r/   r8   _r9   r:   r;   
seq_lengthmask_tokensmaskr,   r,   r-   r<   w   s   


zIJepaEmbeddings.forwardr=   NF)r>   r?   r@   rA   r   rD   r   rB   rC   intr/   r   
BoolTensorr<   rE   r,   r,   r*   r-   rF   A   s    
*rF   c                   @   sV   e Zd ZeZdZdZdZddgZdZ	dZ
dZdZdeejejejf ddfd	d
ZdS )IJepaPreTrainedModelijepar.   TrF   
IJepaLayermoduler0   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trotjj|jjt	j
d| jjd|jj|j_|jdurq|jj  dS dS dS )zInitialize the weights        )meanstdNrd   )r    r%   Linearr&   inittrunc_normal_weightdatatorB   float32r)   initializer_rangedtypebiaszero_	LayerNormfill_rF   rM   rJ   )r(   rr   r,   r,   r-   _init_weights   s0   




z"IJepaPreTrainedModel._init_weights)r>   r?   r@   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr   r%   rv   r&   r   r   r,   r,   r,   r-   ro      s    &ro   rs   rr   querykeyvalueattention_maskscalingrP   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )NrQ   )r_   r~   )ptrainingr   r3   )rB   matmulr7   r%   rZ   softmaxr|   r{   r~   rP   r   
contiguous)
rr   r   r   r   r   r   rP   kwargsattn_weightsattn_outputr,   r,   r-   eager_attention_forward   s   r   c                
       sv   e Zd Zdeddf fddZdejdejfddZ		dd
eej de	de
eejejf eej f fddZ  ZS )IJepaSelfAttentionr)   r0   Nc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r1   g      F)r   )r   r   r   num_attention_headshasattrr5   r)   rm   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr%   rv   qkv_biasr   r   r   r(   r)   r*   r,   r-   r      s"   

zIJepaSelfAttention.__init__xc                 C   s6   |  d d | j| jf }||}|ddddS )NrQ   r   r3   r   r   )rS   r   r   r\   rY   )r(   r   new_x_shaper,   r,   r-   transpose_for_scores   s   
z'IJepaSelfAttention.transpose_for_scoresF	head_maskoutput_attentionsc              
   C   s   |  | |}|  | |}|  | |}t}| jjdkr4| jjdkr.|r.td nt	| jj }|| ||||| j
| j| jsCdn| jd\}}	| d d | jf }
||
}|rc||	f}|S |f}|S )Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rs   )r   r   rP   r   )r   r   r   r   r   r)   _attn_implementationloggerwarning_oncer   r   r   r   r   rS   r   rX   )r(   hidden_statesr   r   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputsr,   r,   r-   r<      s4   

zIJepaSelfAttention.forwardrl   )r>   r?   r@   r   r   rB   rC   r   r   rD   r   tupler<   rE   r,   r,   r*   r-   r      s    r   c                       sF   e Zd ZdZdeddf fddZdejdejdejfd	d
Z  Z	S )IJepaSelfOutputz
    The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r)   r0   Nc                    s.   t    t|j|j| _t|j| _d S N)	r   r   r%   rv   r   denserN   rO   rP   r   r*   r,   r-   r        
zIJepaSelfOutput.__init__r   input_tensorc                 C      |  |}| |}|S r   r   rP   r(   r   r   r,   r,   r-   r<        

zIJepaSelfOutput.forward)
r>   r?   r@   rA   r   r   rB   rC   r<   rE   r,   r,   r*   r-   r     s    $r   c                       s~   e Zd Zdeddf fddZdee ddfddZ			dd
ej	de
ej	 dedeeej	ej	f eej	 f fddZ  ZS )IJepaAttentionr)   r0   Nc                    s*   t    t|| _t|| _t | _d S r   )r   r   r   	attentionr   outputsetpruned_headsr   r*   r,   r-   r   $  s   


zIJepaAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r_   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r(   r   indexr,   r,   r-   prune_heads*  s   zIJepaAttention.prune_headsFr   r   r   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )r(   r   r   r   self_outputsattention_outputr   r,   r,   r-   r<   <  s   zIJepaAttention.forwardrl   )r>   r?   r@   r   r   r   rm   r   rB   rC   r   rD   r   r   r<   rE   r,   r,   r*   r-   r   #  s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	IJepaIntermediater)   r0   Nc                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r%   rv   r   intermediate_sizer   r    
hidden_actstrr	   intermediate_act_fnr   r*   r,   r-   r   K  s
   
zIJepaIntermediate.__init__r   c                 C   r   r   )r   r   )r(   r   r,   r,   r-   r<   S  r   zIJepaIntermediate.forward	r>   r?   r@   r   r   rB   rC   r<   rE   r,   r,   r*   r-   r   J  s    r   c                       sB   e Zd Zdeddf fddZdejdejdejfdd	Z  ZS )
IJepaOutputr)   r0   Nc                    s.   t    t|j|j| _t|j| _	d S r   )
r   r   r%   rv   r   r   r   rN   rO   rP   r   r*   r,   r-   r   [  r   zIJepaOutput.__init__r   r   c                 C   s    |  |}| |}|| }|S r   r   r   r,   r,   r-   r<   `  s   

zIJepaOutput.forwardr   r,   r,   r*   r-   r   Z  s    $r   c                       sl   e Zd ZdZdeddf fddZ		ddejd	eej d
e	de
eejejf eej f fddZ  ZS )rq   z?This corresponds to the Block class in the timm implementation.r)   r0   Nc                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r   r   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r%   r   r   layer_norm_epslayernorm_beforelayernorm_afterr   r*   r,   r-   r   l  s   



zIJepaLayer.__init__Fr   r   r   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S )N)r   r   r   )r   r   r   r   r   )r(   r   r   r   self_attention_outputsr   r   layer_outputr,   r,   r-   r<   v  s   


zIJepaLayer.forwardrl   )r>   r?   r@   rA   r   r   rB   rC   r   rD   r   r   r<   rE   r,   r,   r*   r-   rq   i  s    rq   c                       sb   e Zd Zdeddf fddZ				ddejd	eej d
ededede	e
ef fddZ  ZS )IJepaEncoderr)   r0   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r,   )rq   ).0rh   r)   r,   r-   
<listcomp>  s    z)IJepaEncoder.__init__.<locals>.<listcomp>F)	r   r   r)   r%   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r*   r   r-   r     s   
 
zIJepaEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ](\}}	|r||f }|d ur$|| nd }
|	||
|}|d }|r9||d f }q|rA||f }|sOtdd |||fD S t|||dS )Nr,   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r,   )r   vr,   r,   r-   	<genexpr>  s    z'IJepaEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	enumerater   r   r   )r(   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputsr,   r,   r-   r<     s(   

zIJepaEncoder.forward)NFFT)r>   r?   r@   r   r   rB   rC   r   rD   r   r   r   r<   rE   r,   r,   r*   r-   r     s&    	
r   c                       s*   e Zd Zdef fddZdd Z  ZS )IJepaPoolerr)   c                    s,   t    t|j|j| _t|j | _	d S r   )
r   r   r%   rv   r   pooler_output_sizer   r	   
pooler_act
activationr   r*   r,   r-   r     s   
zIJepaPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r(   r   first_token_tensorpooled_outputr,   r,   r-   r<     s   

zIJepaPooler.forward)r>   r?   r@   r   r   r<   rE   r,   r,   r*   r-   r     s    r   c                       s   e Zd Zddededef fddZdefdd	Zd
ee	e
e	 f ddfddZe							ddeej deej deej dee dee dee dee deeef fddZ  ZS )
IJepaModelFr)   add_pooling_layerrG   c                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )rG   r   N)r   r   r)   rF   r;   r   encoderr%   r   r   r   	layernormr   pooler	post_init)r(   r)   r   rG   r*   r,   r-   r     s   
zIJepaModel.__init__r0   c                 C   s   | j jS r   )r;   rK   )r(   r,   r,   r-   get_input_embeddings  s   zIJepaModel.get_input_embeddingsheads_to_pruneNc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r(   r  r   r   r,   r,   r-   _prune_heads  s   zIJepaModel._prune_headsr.   rc   r   r   r   r/   r   c                 C   s
  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| || j j}| jjj	j
j}|j|kr?||}| j|||d}	| j|	||||d}
|
d }| |}| jdurd| |nd}|s{|durp||fn|f}||
dd  S t|||
j|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rc   r/   )r   r   r   r   r   r   )r   pooler_outputr   r   )r)   r   r   use_return_dictr5   get_head_maskr   r;   rK   r'   ry   r~   r{   r   r  r  r   r   r   )r(   r.   rc   r   r   r   r/   r   expected_dtypeembedding_outputencoder_outputssequence_outputr   head_outputsr,   r,   r-   r<     s@   


zIJepaModel.forward)FFNNNNNNN)r>   r?   r@   r   rD   r   r   r  dictrm   listr  r   r   rB   rC   rn   r   r   r   r<   rE   r,   r,   r*   r-   r     s:    
	r   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                       s   e Zd Zdeddf fddZe							ddeej deej deej d	ee	 d
ee	 dee	 dee	 de
eef fddZ  ZS )IJepaForImageClassificationr)   r0   Nc                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S )NF)r   r   )r   r   
num_labelsr   rp   r%   rv   r   Identity
classifierr  r   r*   r,   r-   r   :  s
   $z$IJepaForImageClassification.__init__r.   r   labelsr   r   r/   r   c                 C   sv  |dur|n| j j}| j||||||d}|d }	| |	jdd}
d}|dur||
j}| j jdu rX| jdkr>d| j _n| jdkrT|j	t
jksO|j	t
jkrTd| j _nd| j _| j jdkrvt }| jdkrp||
 | }n+||
|}n%| j jdkrt }||
d	| j|d	}n| j jdkrt }||
|}|s|
f|dd  }|dur|f| S |S t||
|j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r/   r   r   r   r   
regressionsingle_label_classificationmulti_label_classificationrQ   )losslogitsr   r   )r)   r	  rp   r  rt   r{   deviceproblem_typer  r~   rB   longrm   r   squeezer   r\   r   r   r   r   )r(   r.   r   r  r   r   r/   r   r   r  r  r  loss_fctr   r,   r,   r-   r<   F  sP   	

"


z#IJepaForImageClassification.forwardr  )r>   r?   r@   r   r   r   r   rB   rC   rD   r   r   r   r<   rE   r,   r,   r*   r-   r  +  s6    
	r  )ro   r   r  )rs   )5collections.abcr!   typingr   r   r   rB   torch.nnr%   r   r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   configuration_ijepar   
get_loggerr>   r   Moduler   rF   ro   rC   floatr   r   r   r   r   r   rq   r   r   r   r  __all__r,   r,   r,   r-   <module>   s`   
'Q)
>'*+]R