o
    eib                     @   sn  d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$ e%e&Z'G dd dej(Z)G dd dej(Z*		dFdej(dej+dej+dej+dej+dB de,dB de,dee fdd Z-G d!d" d"ej(Z.G d#d$ d$ej(Z/G d%d& d&ej(Z0G d'd( d(ej(Z1dGd*ej+d+e,d,e2d-ej+fd.d/Z3G d0d1 d1ej(Z4G d2d3 d3ej(Z5G d4d5 d5ej(Z6G d6d7 d7eZ7G d8d9 d9ej(Z8eG d:d; d;eZ9eG d<d= d=e9Z:ed>d?G d@dA dAe9Z;edBd?G dCdD dDee9Z<g dEZ=dS )HzPyTorch DINOv2 model.    N)Callable)nn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )Dinov2Configc                       sj   e Zd ZdZdeddf fddZdejded	edejfd
dZ	ddejdejdB dejfddZ
  ZS )Dinov2EmbeddingszM
    Construct the CLS token, mask token, position and patch embeddings.
    configreturnNc                    s   t    ttdd|j| _|jrtt	d|j| _
t|| _| jj}ttd|d |j| _t|j| _|j| _|j| _|| _d S )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenuse_mask_tokenzeros
mask_tokenDinov2PatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer   )selfr   r(   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dinov2/modeling_dinov2.pyr   +   s   


zDinov2Embeddings.__init__
embeddingsheightwidthc                 C   s  |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}|j	}t
jj|tj|	|
fdd	d
j|d}|dddddd|}tj||fddS )a-  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicF)sizemodealign_cornersdtypedim)shaper)   r   jit
is_tracingr-   r   reshapepermuter=   r   
functionalinterpolatetofloat32viewcat)r.   r3   r4   r5   r(   num_positionsclass_pos_embedpatch_pos_embedr?   
new_height	new_widthsqrt_num_positionstarget_dtyper1   r1   r2   interpolate_pos_encoding9   s.   




z)Dinov2Embeddings.interpolate_pos_encodingpixel_valuesbool_masked_posc           
      C   s   |j \}}}}| jjjj}| |j|d}|d ur/| jr/t|	d| j
|j	d|}| j|dd}	tj|	|fdd}|| ||| }| |}|S )Nr<   r6   r   r   r>   )r@   r'   
projectionweightr=   rG   r#   r   where	unsqueezer%   r"   expandrJ   rR   r,   )
r.   rS   rT   
batch_size_r4   r5   rQ   r3   
cls_tokensr1   r1   r2   forwarda   s   
zDinov2Embeddings.forwardN)__name__
__module____qualname____doc__r   r   r   TensorintrR   r]   __classcell__r1   r1   r/   r2   r   &   s
    *(r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r&   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r   r   
image_sizer-   num_channelsr!   
isinstancecollectionsabcIterabler(   r   Conv2drU   )r.   r   rh   r-   ri   r!   r(   r/   r1   r2   r   ~   s   
 zDinov2PatchEmbeddings.__init__rS   r   c                 C   sH   |j d }|| jkrtd| j d| d| |ddd}|S )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r7   )r@   ri   
ValueErrorrU   flatten	transpose)r.   rS   ri   r3   r1   r1   r2   r]      s   

zDinov2PatchEmbeddings.forward)	r_   r`   ra   rb   r   r   rc   r]   re   r1   r1   r/   r2   r&   w   s    r&           modulequerykeyvalueattention_maskscalingr,   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nr6         r7   r   r>   )ptrainingr   )
r9   r   matmulrr   r   rE   softmaxr,   r}   
contiguous)
rt   ru   rv   rw   rx   ry   r,   rz   attn_weightsattn_outputr1   r1   r2   eager_attention_forward   s   
r   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )Dinov2SelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads ro   r{   Fbias)r   r   r!   num_attention_headshasattrrp   r   rd   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probry   	is_causalr   Linearqkv_biasru   rv   rw   r.   r   r/   r1   r2   r      s"   

zDinov2SelfAttention.__init__hidden_statesr   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   r6   r   r7   rs   )r   ry   r,   )r@   r   r   rv   rI   rr   rw   ru   r   get_interfacer   _attn_implementationr   r   ry   r}   r   r9   r   rC   )r.   r   rZ   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper1   r1   r2   r]      s*   


zDinov2SelfAttention.forward)
r_   r`   ra   r   r   r   rc   tupler]   re   r1   r1   r/   r2   r      s    (r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
Dinov2SelfOutputz
    The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S r^   )	r   r   r   r   r!   denser*   r+   r,   r   r/   r1   r2   r      s   
zDinov2SelfOutput.__init__r   input_tensorr   c                 C   s   |  |}| |}|S r^   )r   r,   )r.   r   r   r1   r1   r2   r]      s   

zDinov2SelfOutput.forward
r_   r`   ra   rb   r   r   r   rc   r]   re   r1   r1   r/   r2   r      s    $r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Dinov2Attentionr   c                    s"   t    t|| _t|| _d S r^   )r   r   r   	attentionr   outputr   r/   r1   r2   r      s   

zDinov2Attention.__init__r   r   c                 C   s   |  |\}}| ||}|S r^   )r   r   )r.   r   self_attn_outputr[   r   r1   r1   r2   r]     s   zDinov2Attention.forward)	r_   r`   ra   r   r   r   rc   r]   re   r1   r1   r/   r2   r      s    r   c                       4   e Zd Zd fddZdejdejfddZ  ZS )	Dinov2LayerScaler   Nc                    s(   t    t|jt|j | _d S r^   )	r   r   r   r   layerscale_valuer   onesr!   lambda1r   r/   r1   r2   r     s   
zDinov2LayerScale.__init__hidden_statec                 C   s
   || j  S r^   )r   r.   r   r1   r1   r2   r]     s   
zDinov2LayerScale.forwardr   Nr_   r`   ra   r   r   rc   r]   re   r1   r1   r/   r2   r     s    r   Finput	drop_probr}   r   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    rs   r   r   )r   )r=   device)r@   ndimr   randr=   r   floor_div)r   r   r}   	keep_probr@   random_tensorr   r1   r1   r2   	drop_path  s   r   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )Dinov2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S r^   )r   r   r   )r.   r   r/   r1   r2   r   $  s   

zDinov2DropPath.__init__r   c                 C   s   t || j| jS r^   )r   r   r}   )r.   r   r1   r1   r2   r]   (  s   zDinov2DropPath.forwardc                 C   s   d| j  S )Nzp=)r   r.   r1   r1   r2   
extra_repr+  s   zDinov2DropPath.extra_reprr^   )r_   r`   ra   rb   floatr   r   rc   r]   strr   re   r1   r1   r/   r2   r   !  s
    r   c                       r   )		Dinov2MLPr   Nc                    sn   t    |j }}t|j|j }tj||dd| _t|j	t
r(t|j	 | _n|j	| _tj||dd| _d S )NTr   )r   r   r!   rd   	mlp_ratior   r   fc1rj   
hidden_actr   r   
activationfc2r.   r   in_featuresout_featureshidden_featuresr/   r1   r2   r   0  s   

zDinov2MLP.__init__r   c                 C   s"   |  |}| |}| |}|S r^   )r   r   r   r   r1   r1   r2   r]   ;  s   


zDinov2MLP.forwardr   r   r1   r1   r/   r2   r   /  s    r   c                       r   )	Dinov2SwiGLUFFNr   Nc                    sl   t    |j }}t|j|j }t|d d d d d }tj|d| dd| _tj||dd| _d S )Nr7   r         Tr   )	r   r   r!   rd   r   r   r   
weights_inweights_outr   r/   r1   r2   r   C  s   

zDinov2SwiGLUFFN.__init__r   c                 C   s6   |  |}|jddd\}}tj|| }| |S )Nr7   r6   r>   )r   chunkr   rE   silur   )r.   r   x1x2hiddenr1   r1   r2   r]   L  s   

zDinov2SwiGLUFFN.forwardr   r   r1   r1   r/   r2   r   B  s    	r   c                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
Dinov2LayerzCThis corresponds to the Block class in the original implementation.r   r   Nc                    s   t    tj|j|jd| _t|| _t	|| _
|jdkr#t|jnt | _tj|j|jd| _|jr;t|| _nt|| _t	|| _d S )Nepsrs   )r   r   r   	LayerNormr!   layer_norm_epsnorm1r   r   r   layer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr   mlpr   layer_scale2r   r/   r1   r2   r   V  s   



zDinov2Layer.__init__r   c                 C   s\   |  |}| |}| |}| || }| |}| |}| |}| || }|S r^   )r   r   r   r   r   r   r   )r.   r   hidden_states_normself_attention_outputlayer_outputr1   r1   r2   r]   f  s   





zDinov2Layer.forwardr   r1   r1   r/   r2   r   S  s    r   c                       s<   e Zd Zdef fddZd
dejdedefdd	Z	  Z
S )Dinov2Encoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r1   )r   .0r[   r   r1   r2   
<listcomp>  s    z*Dinov2Encoder.__init__.<locals>.<listcomp>F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r/   r   r2   r   }  s   
 
zDinov2Encoder.__init__Fr   output_hidden_statesr   c                 C   sT   |r|gnd }t | jD ]\}}||}|r|| qt||r&t|dS d dS )N)last_hidden_stater   )	enumerater   appendr
   r   )r.   r   r   all_hidden_statesilayer_moduler1   r1   r2   r]     s   

zDinov2Encoder.forward)F)r_   r`   ra   r   r   r   rc   boolr
   r]   re   r1   r1   r/   r2   r   |  s    "r   c                   @   sl   e Zd ZU eed< dZdZdZdZdgZ	dZ
dZdZdZdeiZe dejejB ejB d	d
fddZd
S )Dinov2PreTrainedModelr   dinov2rS   )imageTr   
attentionsrt   r   Nc                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |tr`tj|jd| jjd tj|jd| jjd | jjr^t
|j dS dS t |trpt|j| jj dS dS )zInitialize the weightsrs   )meanstdN)rj   r   r   rn   inittrunc_normal_rV   r   initializer_ranger   zeros_r   ones_r   r)   r"   r#   r%   r   	constant_r   r   )r.   rt   r1   r1   r2   _init_weights  s"   


z#Dinov2PreTrainedModel._init_weights)r_   r`   ra   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   _can_record_outputsr   no_gradr   r   rn   r   r  r1   r1   r1   r2   r     s   
 &r   c                       st   e Zd Zdef fddZdefddZeedde							dd
e
jd	B de
jd	B ded	B defddZ  ZS )Dinov2Modelr   c                    sF   t  | || _t|| _t|| _tj|j	|j
d| _|   d S )Nr   )r   r   r   r   r3   r   encoderr   r   r!   r   	layernorm	post_initr   r/   r1   r2   r     s   

zDinov2Model.__init__r   c                 C      | j jS r^   r3   r'   r   r1   r1   r2   get_input_embeddings     z Dinov2Model.get_input_embeddingsF)tie_last_hidden_statesNrS   rT   r   c           	      K   sr   |du r| j j}|du rtd| j||d}| j||d}|j}| |}|dddddf }t|||jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.
        Nz You have to specify pixel_values)rT   r   r   )r   pooler_outputr   )	r   r   rp   r3   r  r   r  r   r   )	r.   rS   rT   r   rz   embedding_outputencoder_outputssequence_outputpooled_outputr1   r1   r2   r]     s   
zDinov2Model.forward)NNN)r_   r`   ra   r   r   r&   r  r   r   r   r   rc   r   r   r]   re   r1   r1   r/   r2   r    s$    r  z
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    )custom_introc                       s^   e Zd Zdeddf fddZee		ddejdB dejdB de	e
 defd	d
Z  ZS )Dinov2ForImageClassificationr   r   Nc                    sR   t  | |j| _t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   r7   )r   r   
num_labelsr  r   r   r   r!   r   
classifierr  r   r/   r1   r2   r     s   
$z%Dinov2ForImageClassification.__init__rS   labelsrz   c                 K   s   | j |fi |}|j}|dddf }|ddddf }tj||jddgdd}| |}	d}
|durB| j||	| jfi |}
t|
|	|j	|j
dS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r>   )losslogitsr   r   )r   r   r   rJ   r   r   loss_functionr   r   r   r   )r.   rS   r!  rz   outputsr  r"   patch_tokenslinear_inputr#  r"  r1   r1   r2   r]     s   
z$Dinov2ForImageClassification.forward)NN)r_   r`   ra   r   r   r   r   r   rc   r   r   r   r]   re   r1   r1   r/   r2   r    s    r  zO
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                
       sV   e Zd Z fddZdefddZeee	dde	j
dedB defd	d
Z  ZS )Dinov2Backbonec                    s^   t     fddt jd D | _t | _t | _t	j
 j jd| _|   d S )Nc                    s   g | ]} j qS r1   )r!   r   r   r1   r2   r   (  s    z+Dinov2Backbone.__init__.<locals>.<listcomp>r   r   )r   r   r   r   num_featuresr   r3   r   r  r   r   r!   r   r  r  r   r/   r   r2   r   %  s   

zDinov2Backbone.__init__r   c                 C   r  r^   r  r   r1   r1   r2   r  1  r  z#Dinov2Backbone.get_input_embeddingsNrS   r   c                 K   s   |du r| j j}| |}| j|dd}|j}g }t| j|D ]F\}}	|| jv re| j jr1| 	|	}	| j j
r`|	ddddf }	|j\}
}}}| j j}|	|
|| || d}	|	dddd }	||	 qtt||rp|d	S dd	S )
av  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```NTr  r   r6   r   r   r7   )feature_mapsr   )r   r   r3   r  r   zipstage_namesr   apply_layernormr  reshape_hidden_statesr@   r-   rC   rD   r   r   r	   r   )r.   rS   r   rz   r  r   r   r*  stager   rZ   r[   r4   r5   r-   r1   r1   r2   r]   4  s0    



zDinov2Backbone.forwardr^   )r_   r`   ra   r   r&   r  r   r   r   r   rc   r   r	   r]   re   r1   r1   r/   r2   r(    s    r(  )r  r  r   r(  )Nrs   )rs   F)>rb   collections.abcrk   r   r   r    r   r   activationsr   backbone_utilsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_dinov2r   
get_loggerr_   loggerModuler   r&   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r(  __all__r1   r1   r1   r2   <module>   sz   
Q(
3 
)"32L