o
    eidt                  	   @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ eeZG dd dejZejjdd Z dd Z!G dd dejZ"d7dej#de$de%dej#fddZ&G dd dejZ'G d d! d!ejZ(G d"d# d#ejZ)G d$d% d%ejZ*d&d' Z+d(d) Z,G d*d+ d+eZ-G d,d- d-ejZ.eG d.d/ d/eZ/eG d0d1 d1e/Z0ed2d3G d4d5 d5ee/Z1g d6Z2dS )8zPyTorch ViTDet backbone.    N)nn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutputBaseModelOutput)PreTrainedModel)auto_docstringlogging   )VitDetConfigc                       s>   e Zd ZdZ fddZdd Zdejdejfdd	Z  Z	S )
VitDetEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _
|| _|| _|| _|jr]|d }ttd||j| _nd | _tj||||d| _d S )Nr   r   )kernel_sizestride)super__init__pretrain_image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterable
image_sizenum_patches use_absolute_position_embeddingsr   	Parametertorchzerosposition_embeddingsConv2d
projection)selfconfigr   r   r   r   r   num_positions	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vitdet/modeling_vitdet.pyr   )   s   
 zVitDetEmbeddings.__init__c                 C   s   |r|ddddf }|j d }tt|}|| |kr"tdtj s/||ks/||krOtj	j
|d||ddddd||fdd	d
}|ddddS |d||dS )a  
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        Nr   z5Absolute position embeddings must be a square number.r   r      bicubicF)sizemodealign_corners)shapeintmathsqrt
ValueErrorr    jit
is_tracingr   
functionalinterpolatereshapepermute)r%   abs_pos_embeddingshas_cls_tokenheightwidthnum_positionr/   new_abs_pos_embeddingsr*   r*   r+   get_absolute_positions?   s   
z'VitDetEmbeddings.get_absolute_positionspixel_valuesreturnc                 C   s   |j d }|| jkrtd| j d| d| |}| jd urA|dddd}|| | jd|j d |j d  }|dddd}|S )	Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r-   r   T)r2   r   r6   r$   r"   r<   rC   )r%   rD   r   
embeddingsr*   r*   r+   forwarde   s"   



zVitDetEmbeddings.forward)
__name__
__module____qualname____doc__r   rC   r    TensorrH   __classcell__r*   r*   r(   r+   r   #   s
    &r   c                 C   s   t dt| | d }|jd |kr3tjj|d|jd dddd|dd}|d|dd}n|}t	| dddf t||  d }t	|dddf t| | d }|| |d t| | d  }||
  S )	a  
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    r-   r   r   r,   linear)r/   r0   Ng      ?)r3   maxr2   r   r9   r:   r;   r<   r    arangelong)q_sizek_sizerel_posmax_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordsr*   r*   r+   get_rel_pos{   s   $$r[   c                 C   s   |\}}|\}}	t |||}
t ||	|}|j\}}}|||||}td||
}
td||}| |||||	|
dddddddddf  |dddddddddf  ||| ||	 } | S )a  
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r[   r2   r;   r    einsumview)attnqueries	rel_pos_h	rel_pos_wrS   rT   queries_heightqueries_widthkeys_height
keys_widthrelative_heightrelative_width
batch_size_dimr_qrelative_weightr*   r*   r+   !add_decomposed_relative_positions   s      rm   c                       s,   e Zd ZdZd fdd	Zd	ddZ  ZS )
VitDetAttentionz=Multi-head Attention block with relative position embeddings.Nc                    s   t    |j}|j}|| _|| }|d | _tj||d |jd| _	t||| _
|j| _| jrSttd|d  d || _ttd|d  d || _dS dS )z
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        g      r   biasr-   r   r   N)r   r   r   num_attention_heads	num_headsscaler   Linearqkv_biasqkvproj use_relative_position_embeddingsr   r    r!   r`   ra   )r%   r&   
input_sizerj   rr   head_dimr(   r*   r+   r      s   

 $zVitDetAttention.__init__Fc                 C   s&  |j \}}}}| |||| d| jdddddd}|d|| j || dd\}}	}
|| j |	dd }| jrMt	||| j
| j||f||f}|jdd}||
 }||| j||d}|ddddd}||||d}| |}|r||| j|j d |j d }||f}|S |f}|S )	Nr   r,   r-   r   r      )rj   )r2   rv   r;   rr   r<   unbindrs   	transposerx   rm   r`   ra   softmaxr]   rw   )r%   hidden_stateoutput_attentionsrh   r?   r@   ri   rv   r_   keysvaluesattention_scoresattention_probsoutputsr*   r*   r+   rH      s,   ,&
zVitDetAttention.forwardNFrI   rJ   rK   rL   r   rH   rN   r*   r*   r(   r+   rn      s    rn           Finput	drop_probtrainingrE   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   )dtypedevice)r2   ndimr    randr   r   floor_div)r   r   r   	keep_probr2   random_tensoroutputr*   r*   r+   	drop_path  s   r   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )VitDetDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rE   c                    s   t    || _d S r   )r   r   r   )r%   r   r(   r*   r+   r     s   

zVitDetDropPath.__init__hidden_statesc                 C   s   t || j| jS r   )r   r   r   )r%   r   r*   r*   r+   rH     s   zVitDetDropPath.forwardc                 C   s   d| j  S )Nzp=)r   r%   r*   r*   r+   
extra_repr"  s   zVitDetDropPath.extra_reprr   )rI   rJ   rK   rL   floatr   r    rM   rH   strr   rN   r*   r*   r(   r+   r     s
    r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )VitDetLayerNormaL  
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    ư>c                    s@   t    tt|| _tt|| _|| _	|f| _
d S r   )r   r   r   r   r    onesweightr!   rp   epsnormalized_shape)r%   r   r   r(   r*   r+   r   -  s
   
zVitDetLayerNorm.__init__c                 C   sn   |j ddd}|| dj ddd}|| t|| j  }| jd d d d f | | jd d d d f  }|S )Nr   T)keepdimr-   )meanpowr    r5   r   r   rp   )r%   xusr*   r*   r+   rH   4  s
   ,zVitDetLayerNorm.forward)r   r   r*   r*   r(   r+   r   &  s    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )VitDetResBottleneckBlockz
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    c                    s   t    tj||ddd| _t|| _t|j | _	tj||dddd| _
t|| _t|j | _tj||ddd| _t|| _dS )ar  
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        r   Fro   r   )paddingrp   N)r   r   r   r#   conv1r   norm1r   
hidden_actact1conv2norm2act2conv3norm3)r%   r&   in_channelsout_channelsbottleneck_channelsr(   r*   r+   r   B  s   


z!VitDetResBottleneckBlock.__init__c                 C   s&   |}|   D ]}||}q|| }|S r   )children)r%   r   outlayerr*   r*   r+   rH   Z  s
   
z VitDetResBottleneckBlock.forwardr   r*   r*   r(   r+   r   <  s    r   c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  ZS )
	VitDetMlpin_featureshidden_featuresrE   Nc                    sD   t    t||| _t|j | _t||| _t	|j
| _d S r   )r   r   r   rt   fc1r   r   actfc2Dropoutdropout_probdrop)r%   r&   r   r   r(   r*   r+   r   d  s
   
zVitDetMlp.__init__r   c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   )r%   r   r*   r*   r+   rH   k  s   




zVitDetMlp.forward)	rI   rJ   rK   r3   r   r    rM   rH   rN   r*   r*   r(   r+   r   c  s    r   c              	   C   s   | j \}}}}|||  | }|||  | }tj| ddd|d|f} || || }}	| ||| ||	| ||} | dddddd d|||}
|
||	ffS )a  
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    r   r   r   r-   r{      r,   )r2   r   r9   padr]   r<   
contiguous)r   window_sizerh   r?   r@   r   
pad_height	pad_widthpadded_heightpadded_widthwindowsr*   r*   r+   window_partitionu  s   $r   c           
      C   s   |\}}|\}}| j d || | |  }| ||| || ||d}	|	dddddd }	|	|||d}	|	ddd|d|ddf  }	|	S )	aB  
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`tuple[int]`):
            Padded height and width (padded_height, padded_width).
        height_width (`tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    r   r,   r   r   r-   r{   r   N)r2   r]   r<   r   )
r   r   pad_height_widthheight_widthr   r   r?   r@   rh   r   r*   r*   r+   window_unpartition  s   $r   c                       sl   e Zd ZdZ	ddededededd	f
 fd
dZ	dde	j
dedee	j
e	j
f ee	j
 B fddZ  ZS )VitDetLayerzCThis corresponds to the Block class in the original implementation.r   Fr&   drop_path_rater   use_residual_blockrE   Nc           	         s  t    |j}|j}t|ttfr|n||f}|j}t|ttfr$|n||f}|d |d  |d |d  f}tj	||j
d| _t||dkrI|n||fd| _|dkrXt|nt | _tj	||j
d| _t||t||j d| _|| _|| _| jrt||||d d| _d S d S )	Nr   r   )r   )ry   r   )r&   r   r   r-   )r&   r   r   r   )r   r   r   r   r   listtupler   r   	LayerNormlayer_norm_epsr   rn   	attentionr   Identityr   r   r   r3   	mlp_ratiomlpr   r   r   residual)	r%   r&   r   r   r   rj   r   r   ry   r(   r*   r+   r     s0   
 zVitDetLayer.__init__r   r   c           	      C   s   | dddd}|}| |}| jdkr'|jd |jd }}t|| j\}}| j||d}|d }|dd  }| jdkrGt|| j|||f}|| | }|| | | 	| }| dddd}| j
rk| |}|f| }|S )Nr   r-   r   r   )r   )r<   r   r   r2   r   r   r   r   r   r   r   r   )	r%   r   r   shortcutr?   r@   r   self_attention_outputsr   r*   r*   r+   rH     s*   




zVitDetLayer.forward)r   r   Fr   )rI   rJ   rK   rL   r   r   r3   boolr   r    rM   r   rH   rN   r*   r*   r(   r+   r     s,    &r   c                       sR   e Zd Zdeddf fddZ			ddejd	ed
ededee	B f
ddZ
  ZS )VitDetEncoderr&   rE   Nc              	      s   t    || _|j}dd tjd|j|ddD }g }t|D ]}|t	||| ||j
v r1|jnd||jv d q t|| _d| _d S )Nc                 S   s   g | ]}|  qS r*   )item).0r   r*   r*   r+   
<listcomp>  s    z*VitDetEncoder.__init__.<locals>.<listcomp>r   cpu)r   )r   r   r   F)r   r   r&   num_hidden_layersr    linspacer   rangeappendr   window_block_indicesr   residual_block_indicesr   
ModuleListr   gradient_checkpointing)r%   r&   depthr   layersir(   r*   r+   r     s    
	
zVitDetEncoder.__init__FTr   r   output_hidden_statesreturn_dictc           
      C   s   |rdnd }|r
dnd }t | jD ]\}}|r||f }|||}	|	d }|r.||	d f }q|r6||f }|sDtdd |||fD S t|||dS )Nr*   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r*   )r   vr*   r*   r+   	<genexpr>4  s    z(VitDetEncoder.forward.<locals>.<genexpr>last_hidden_stater   
attentions)	enumerater   r   r	   )
r%   r   r   r   r   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputsr*   r*   r+   rH     s&   


zVitDetEncoder.forward)FFT)rI   rJ   rK   r   r   r    rM   r   r   r	   rH   rN   r*   r*   r(   r+   r     s     r   c                   @   sR   e Zd ZU eed< dZdZdZdZg Z	e
 dejejB ejB ddfd	d
ZdS )VitDetPreTrainedModelr&   vitdetrD   )imageTmodulerE   Nc                 C   sl  t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |trItj|jd| jjd dS t |trj| jjrjtj|jd| jjd tj|jd| jjd dS t |tr|j|j|jfD ]}tj|jddd |j	durt|j	d qw|j|jfD ]}t|j t
|j	 qt
|jj t
|jj	 dS dS )zInitialize the weightsr   )r   stdNfan_outrelu)r0   nonlinearityr   )r   r   rt   r#   inittrunc_normal_r   r&   initializer_rangerp   zeros_r   ones_r   r"   rn   rx   r`   ra   r   r   r   r   kaiming_normal_	constant_r   r   r   )r%   r   r   r*   r*   r+   _init_weightsE  s2   



z#VitDetPreTrainedModel._init_weights)rI   rJ   rK   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr    no_gradr   rt   r#   r   r  r*   r*   r*   r+   r   <  s   
 &r   c                       sr   e Zd Zdef fddZdefddZe				ddej	dB d	e
dB d
e
dB de
dB deeB f
ddZ  ZS )VitDetModelr&   c                    s2   t  | || _t|| _t|| _|   d S r   )r   r   r&   r   rG   r   encoder	post_initr%   r&   r(   r*   r+   r   c  s
   

zVitDetModel.__init__rE   c                 C      | j jS r   rG   r$   r   r*   r*   r+   get_input_embeddingsm     z VitDetModel.get_input_embeddingsNrD   r   r   r   c           	      K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| |}| j||||d}|d }|sC|f|dd  S t||j|j	dS )a  
        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```Nz You have to specify pixel_values)r   r   r   r   r   r   )
r&   r   r   use_return_dictr6   rG   r  r	   r   r   )	r%   rD   r   r   r   kwargsembedding_outputencoder_outputssequence_outputr*   r*   r+   rH   p  s*   
zVitDetModel.forward)NNNN)rI   rJ   rK   r   r   r   r  r   r    rM   r   r   r	   rH   rN   r*   r*   r(   r+   r  a  s&    
r  zF
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    )custom_introc                       sb   e Zd Z fddZdefddZe			ddejde	dB d	e	dB d
e	dB de
f
ddZ  ZS )VitDetBackbonec                    sJ   t    t | _t | _ fddt jd D | _| 	  d S )Nc                    s   g | ]} j qS r*   )r   )r   ri   r&   r*   r+   r     s    z+VitDetBackbone.__init__.<locals>.<listcomp>r   )
r   r   r   rG   r   r  r   r   num_featuresr  r  r(   r"  r+   r     s
   

zVitDetBackbone.__init__rE   c                 C   r  r   r  r   r*   r*   r+   r    r  z#VitDetBackbone.get_input_embeddingsNrD   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}| j|d||d}|r1|jn|d }d}	t| j|D ]\}
}|
| j	v rK|	|f7 }	q=|sf|r[|	f|dd  }|S |	f|dd  }|S t
|	|rm|jnd|jdS )a  
        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```NT)r   r   r   r   r*   r-   )feature_mapsr   r   )r&   r  r   r   rG   r  r   zipstage_namesout_featuresr   r   )r%   rD   r   r   r   r  r  r   r   r$  stager   r   r*   r*   r+   rH     s8   


zVitDetBackbone.forward)NNN)rI   rJ   rK   r   r   r  r   r    rM   r   r   rH   rN   r*   r*   r(   r+   r!    s$    
r!  )r  r   r!  )r   F)3rL   collections.abcr   r4   r    r    r   r  activationsr   backbone_utilsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   utilsr   r   configuration_vitdetr   
get_loggerrI   loggerModuler   r7   script_if_tracingr[   rm   rn   rM   r   r   r   r   r   r   r   r   r   r   r   r   r  r!  __all__r*   r*   r*   r+   <module>   sJ   
X
$) ?' P8$GN