o
    i+p                     @   s`  d Z ddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% e&e'Z(G dd dej)Z*G dd dej)Z+	dDdej)dej,dej,dej,deej, de-de-fddZ.G dd  d ej)Z/G d!d" d"ej)Z0G d#d$ d$ej)Z1G d%d& d&ej)Z2dEd(ej,d)e-d*e3d+ej,fd,d-Z4G d.d/ d/ej)Z5G d0d1 d1ej)Z6G d2d3 d3ej)Z7G d4d5 d5eZ8G d6d7 d7ej)Z9eG d8d9 d9eZ:eG d:d; d;e:Z;ed<d=G d>d? d?e:Z<ed@d=G dAdB dBe:e Z=g dCZ>dS )FzPyTorch DINOv2 model.    N)CallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringlogging	torch_int)BackboneMixin)can_return_tuplecheck_model_inputs   )Dinov2Configc                       sj   e Zd ZdZdeddf fddZdejded	edejfd
dZ	ddejde
ej dejfddZ  ZS )Dinov2EmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    configreturnNc                    s   t    ttdd|j| _|jrtt	d|j| _
t|| _| jj}ttd|d |j| _t|j| _|j| _|j| _|| _d S )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenuse_mask_tokenzeros
mask_tokenDinov2PatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer   )selfr   r*   	__class__ ^/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/dinov2/modeling_dinov2.pyr   +   s   


zDinov2Embeddings.__init__
embeddingsheightwidthc                 C   s  |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}|j	}t
jj|tj|	|
fdd	d
j|d}|dddddd|}tj||fddS )a-  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicF)sizemodealign_cornersdtypedim)shaper+   r!   jit
is_tracingr/   r   reshapepermuter?   r   
functionalinterpolatetofloat32viewcat)r0   r5   r6   r7   r*   num_positionsclass_pos_embedpatch_pos_embedrA   
new_height	new_widthsqrt_num_positionstarget_dtyper3   r3   r4   interpolate_pos_encoding9   s.   




z)Dinov2Embeddings.interpolate_pos_encodingpixel_valuesbool_masked_posc           
      C   s   |j \}}}}| jjjj}| |j|d}|d ur/| jr/t|	d| j
|j	d|}| j|dd}	tj|	|fdd}|| ||| }| |}|S )Nr>   r8   r   r   r@   )rB   r)   
projectionweightr?   rI   r%   r!   where	unsqueezer'   r$   expandrL   rT   r.   )
r0   rU   rV   
batch_size_r6   r7   rS   r5   
cls_tokensr3   r3   r4   forwarda   s   
zDinov2Embeddings.forwardN)__name__
__module____qualname____doc__r   r   r!   TensorintrT   r   r_   __classcell__r3   r3   r1   r4   r   &   s
    *(r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r(   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r   r   
image_sizer/   num_channelsr#   
isinstancecollectionsabcIterabler*   r   Conv2drW   )r0   r   rj   r/   rk   r#   r*   r1   r3   r4   r   ~   s   
 zDinov2PatchEmbeddings.__init__rU   r   c                 C   sH   |j d }|| jkrtd| j d| d| |ddd}|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r9   )rB   rk   
ValueErrorrW   flatten	transpose)r0   rU   rk   r5   r3   r3   r4   r_      s   

zDinov2PatchEmbeddings.forward)	ra   rb   rc   rd   r   r!   re   r_   rg   r3   r3   r1   r4   r(   w   s    r(           modulequerykeyvalueattention_maskscalingr.   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )Nr8   )rA   r?   )ptrainingr   r9   )r!   matmulrt   r   rG   softmaxrJ   rI   r?   r.   r~   
contiguous)
rv   rw   rx   ry   rz   r{   r.   kwargsattn_weightsattn_outputr3   r3   r4   eager_attention_forward   s   r   c                	       sP   e Zd Zdef fddZ	d
dejdeej deejejf fdd	Z	  Z
S )Dinov2SelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads rq   g      Fbias)r   r   r#   num_attention_headshasattrrr   r   rf   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr{   	is_causalr   Linearqkv_biasrw   rx   ry   r0   r   r1   r3   r4   r      s"   

zDinov2SelfAttention.__init__Nhidden_states	head_maskr   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t}| j	j
dkr?t| j	j
 }|| ||||| j| j| jsNdn| jd\}	}
|	 d d | jf }|	|}	|	|
fS )	Nr   r8   r   r9   eagerru   )r   r{   r.   r|   )rB   r   r   rx   rK   rt   ry   rw   r   r   _attn_implementationr   r   r{   r~   r   r;   r   rE   )r0   r   r   r\   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper3   r3   r4   r_      s*   


zDinov2SelfAttention.forwardr`   )ra   rb   rc   r   r   r!   re   r   tupler_   rg   r3   r3   r1   r4   r      s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
Dinov2SelfOutputz
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S r`   )	r   r   r   r   r#   denser,   r-   r.   r   r1   r3   r4   r      s   
zDinov2SelfOutput.__init__r   input_tensorr   c                 C   s   |  |}| |}|S r`   )r   r.   )r0   r   r   r3   r3   r4   r_      s   

zDinov2SelfOutput.forward)
ra   rb   rc   rd   r   r   r!   re   r_   rg   r3   r3   r1   r4   r      s    $r   c                       sV   e Zd Zdef fddZdee fddZddej	d	e
ej	 d
ej	fddZ  ZS )Dinov2Attentionr   c                    s*   t    t|| _t|| _t | _d S r`   )r   r   r   	attentionr   outputsetpruned_headsr   r1   r3   r4   r      s   


zDinov2Attention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r@   )lenr   r   r   r   r   r   rw   rx   ry   r   r   r   union)r0   r   indexr3   r3   r4   prune_heads  s   zDinov2Attention.prune_headsNr   r   r   c                 C   s    |  ||\}}| ||}|S r`   )r   r   )r0   r   r   self_attn_outputr]   r   r3   r3   r4   r_     s   zDinov2Attention.forwardr`   )ra   rb   rc   r   r   r   rf   r   r!   re   r   r_   rg   r3   r3   r1   r4   r      s    *r   c                       4   e Zd Zd fddZdejdejfddZ  ZS )	Dinov2LayerScaler   Nc                    s(   t    t|jt|j | _d S r`   )	r   r   r   r    layerscale_valuer!   onesr#   lambda1r   r1   r3   r4   r     s   
zDinov2LayerScale.__init__hidden_statec                 C   s
   || j  S r`   )r   r0   r   r3   r3   r4   r_   #  s   
zDinov2LayerScale.forwardr   Nra   rb   rc   r   r!   re   r_   rg   r3   r3   r1   r4   r     s    r   Finput	drop_probr~   r   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    ru   r   r   )r   )r?   device)rB   ndimr!   randr?   r   floor_div)r   r   r~   	keep_probrB   random_tensorr   r3   r3   r4   	drop_path(  s   
r   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )Dinov2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S r`   )r   r   r   )r0   r   r1   r3   r4   r   @  s   

zDinov2DropPath.__init__r   c                 C   s   t || j| jS r`   )r   r   r~   )r0   r   r3   r3   r4   r_   D  s   zDinov2DropPath.forwardc                 C   s   d| j  S )Nzp=)r   r0   r3   r3   r4   
extra_reprG  s   zDinov2DropPath.extra_reprr`   )ra   rb   rc   rd   r   floatr   r!   re   r_   strr   rg   r3   r3   r1   r4   r   =  s
    r   c                       r   )		Dinov2MLPr   Nc                    sn   t    |j }}t|j|j }tj||dd| _t|j	t
r(t|j	 | _n|j	| _tj||dd| _d S )NTr   )r   r   r#   rf   	mlp_ratior   r   fc1rl   
hidden_actr   r   
activationfc2r0   r   in_featuresout_featureshidden_featuresr1   r3   r4   r   L  s   

zDinov2MLP.__init__r   c                 C   s"   |  |}| |}| |}|S r`   )r   r   r   r   r3   r3   r4   r_   W  s   


zDinov2MLP.forwardr   r   r3   r3   r1   r4   r   K  s    r   c                       r   )	Dinov2SwiGLUFFNr   Nc                    sl   t    |j }}t|j|j }t|d d d d d }tj|d| dd| _tj||dd| _d S )Nr9   r         Tr   )	r   r   r#   rf   r   r   r   
weights_inweights_outr   r1   r3   r4   r   _  s   

zDinov2SwiGLUFFN.__init__r   c                 C   s6   |  |}|jddd\}}tj|| }| |S )Nr9   r8   r@   )r   chunkr   rG   silur   )r0   r   x1x2hiddenr3   r3   r4   r_   h  s   

zDinov2SwiGLUFFN.forwardr   r   r3   r3   r1   r4   r   ^  s    	r   c                       sN   e Zd ZdZdeddf fddZ	ddejdeej dejfd	d
Z	  Z
S )Dinov2LayerzCThis corresponds to the Block class in the original implementation.r   r   Nc                    s   t    tj|j|jd| _t|| _t	|| _
|jdkr#t|jnt | _tj|j|jd| _|jr;t|| _nt|| _t	|| _d S )Nepsru   )r   r   r   	LayerNormr#   layer_norm_epsnorm1r   r   r   layer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr   mlpr   layer_scale2r   r1   r3   r4   r   r  s   



zDinov2Layer.__init__r   r   c                 C   s^   |  |}| ||}| |}| || }| |}| |}| |}| || }|S r`   )r   r   r   r   r   r   r   )r0   r   r   hidden_states_normself_attention_outputlayer_outputr3   r3   r4   r_     s   




zDinov2Layer.forwardr`   )ra   rb   rc   rd   r   r   r!   re   r   r_   rg   r3   r3   r1   r4   r   o  s    r   c                	       sH   e Zd Zdef fddZ	ddejdeej ded	e	fd
dZ
  ZS )Dinov2Encoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r3   )r   .0r]   r   r3   r4   
<listcomp>  s    z*Dinov2Encoder.__init__.<locals>.<listcomp>F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r1   r   r4   r     s   
 
zDinov2Encoder.__init__NFr   r   output_hidden_statesr   c                 C   sj   |r|gnd }t | jD ]\}}|d ur|| nd }|||}|r&|| qt||r1t|dS d dS )N)last_hidden_stater   )	enumerater   appendr
   r   )r0   r   r   r   all_hidden_statesilayer_modulelayer_head_maskr3   r3   r4   r_     s   


zDinov2Encoder.forward)NF)ra   rb   rc   r   r   r!   re   r   boolr
   r_   rg   r3   r3   r1   r4   r     s    r   c                   @   sb   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdZdeiZdeejejejf dd	fd
dZd	S )Dinov2PreTrainedModelr   dinov2rU   Tr   
attentionsrv   r   Nc                 C   s4  t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trtjj|jjt	j
d| jjd|jj|j_tjj|jjt	j
d| jjd|jj|j_| jjr|jj  dS dS t |tr|jj| jj dS dS )zInitialize the weightsru   )meanstdNg      ?)rl   r   r   rp   inittrunc_normal_rX   datarI   r!   rJ   r   initializer_ranger?   r   zero_r   fill_r   r+   r$   r%   r'   r   r   r   )r0   rv   r3   r3   r4   _init_weights  sB   





z#Dinov2PreTrainedModel._init_weights)ra   rb   rc   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   _can_record_outputsr   r   r   rp   r   r	  r3   r3   r3   r4   r     s   
 &r   c                       s   e Zd Zdef fddZdefddZdeee	e f ddfd	d
Z
edde				ddeej deej deej dee def
ddZ  ZS )Dinov2Modelr   c                    sF   t  | || _t|| _t|| _tj|j	|j
d| _|   d S )Nr   )r   r   r   r   r5   r   encoderr   r   r#   r   	layernorm	post_initr   r1   r3   r4   r     s   

zDinov2Model.__init__r   c                 C      | j jS r`   r5   r)   r   r3   r3   r4   get_input_embeddings     z Dinov2Model.get_input_embeddingsheads_to_pruneNc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )r0   r  r   r   r3   r3   r4   _prune_heads  s   zDinov2Model._prune_headsF)tie_last_hidden_statesrU   rV   r   r   c           
      K   s   |du r| j j}|du rtd| || j j}| j||d}| j|||d}|j}| |}|dddddf }	t	||	|j
dS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.
        Nz You have to specify pixel_values)rV   )r   r   r   )r   pooler_outputr   )r   r   rr   get_head_maskr   r5   r  r   r  r   r   )
r0   rU   rV   r   r   r   embedding_outputencoder_outputssequence_outputpooled_outputr3   r3   r4   r_     s"   
zDinov2Model.forward)NNNN)ra   rb   rc   r   r   r(   r  dictrf   listr  r   r   r   r!   re   r   r   r_   rg   r3   r3   r1   r4   r    s*    r  z
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    )custom_introc                       sj   e Zd Zdeddf fddZee			ddeej	 deej	 deej	 d	e
e def
d
dZ  ZS )Dinov2ForImageClassificationr   r   Nc                    sR   t  | |j| _t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   r9   )r   r   
num_labelsr  r   r   r   r#   r   
classifierr  r   r1   r3   r4   r   +  s   
$z%Dinov2ForImageClassification.__init__rU   r   labelsr   c                 K   s   | j |fd|i|}|j}|dddf }|ddddf }tj||jddgdd}	| |	}
d}|durD| j||
| jfi |}t||
|j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   Nr   r   r@   )losslogitsr   r   )r   r   r!   rL   r  r+  loss_functionr   r   r   r   )r0   rU   r   r,  r   outputsr$  r$   patch_tokenslinear_inputr.  r-  r3   r3   r4   r_   9  s   
z$Dinov2ForImageClassification.forward)NNN)ra   rb   rc   r   r   r   r   r   r!   re   r   r   r   r_   rg   r3   r3   r1   r4   r)  $  s$    r)  zO
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                	       sR   e Zd Z fddZdefddZee	ddej	de
e defd	d
Z  ZS )Dinov2Backbonec                    sj   t    t     fddt jd D | _t | _t | _	t
j j jd| _|   d S )Nc                    s   g | ]} j qS r3   )r#   r   r   r3   r4   r   g  s    z+Dinov2Backbone.__init__.<locals>.<listcomp>r   r   )r   r   _init_backboner   r   num_featuresr   r5   r   r  r   r   r#   r   r  r  r   r1   r   r4   r   c  s   

zDinov2Backbone.__init__r   c                 C   r  r`   r  r   r3   r3   r4   r  p  r  z#Dinov2Backbone.get_input_embeddingsNrU   r   c                 K   s   |du r| j j}| |}| j|dd}|j}g }t| j|D ]F\}}	|| jv re| j jr1| 	|	}	| j j
r`|	ddddf }	|j\}
}}}| j j}|	|
|| || d}	|	dddd }	||	 qtt||rp|d	S dd	S )
a%  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```NT)r   r   r8   r   r   r9   )feature_mapsr   )r   r   r5   r  r   zipstage_namesr   apply_layernormr  reshape_hidden_statesrB   r/   rE   rF   r   r   r	   r   )r0   rU   r   r   r"  r   r   r6  stager   r\   r]   r6   r7   r/   r3   r3   r4   r_   s  s0   



zDinov2Backbone.forwardr`   )ra   rb   rc   r   r(   r  r   r   r!   re   r   r   r	   r_   rg   r3   r3   r1   r4   r3  ]  s    r3  )r)  r  r   r3  )ru   )ru   F)?rd   collections.abcrm   typingr   r   r   r!   r   activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.backbone_utilsr   utils.genericr   r   configuration_dinov2r   
get_loggerra   loggerModuler   r(   re   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r)  r3  __all__r3   r3   r3   r4   <module>   sr   
Q)
5 
*.D3J