o
    eiz                  	   @   sv  d Z ddlZddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ e rPddlmZmZ ndd Zdd ZeeZeeddG dd deZeeddG dd deZeeddG dd deZG dd dej Z!G dd dej Z"G d d! d!ej Z#dGd$ej$d%e%d&e&d'ej$fd(d)Z'G d*d+ d+ej Z(G d,d- d-ej Z)G d.d/ d/ej Z*G d0d1 d1ej Z+G d2d3 d3ej Z,G d4d5 d5ej Z-G d6d7 d7ej Z.G d8d9 d9ej Z/G d:d; d;ej Z0eG d<d= d=eZ1eG d>d? d?e1Z2ed@dG dAdB dBe1Z3edCdG dDdE dEe	e1Z4g dFZ5dS )Hz9PyTorch Dilated Neighborhood Attention Transformer model.    N)	dataclass)nn   )ACT2FN)BackboneMixin)BackboneOutput)PreTrainedModel)ModelOutputOptionalDependencyNotAvailableauto_docstringis_natten_availableloggingrequires_backends   )DinatConfig)
natten2davnatten2dqkrpbc                  O      t  Nr
   argskwargs r   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/dinat/modeling_dinat.pyr   )      r   c                  O   r   r   r   r   r   r   r   r   ,   r   r   zO
    Dinat encoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sr   e Zd ZU dZdZejdB ed< dZe	ejdf dB ed< dZ
e	ejdf dB ed< dZe	ejdf dB ed< dS )DinatEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler    r!   r   r   r   r   r   6   s   
 	r   zW
    Dinat model's outputs that also contains a pooling of the last hidden states.
    c                   @      e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dZe
ejdf dB ed< dS )	DinatModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r    r!   )r"   r#   r$   r%   r   r&   r'   r(   r,   r   r)   r    r!   r   r   r   r   r+   L   s   
 r+   z1
    Dinat outputs for image classification.
    c                   @   r*   )	DinatImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlosslogits.r   r    r!   )r"   r#   r$   r%   r.   r&   r'   r(   r/   r   r)   r    r!   r   r   r   r   r-   e   s   
 r-   c                       s>   e Zd ZdZ fddZdejdB deej fddZ	  Z
S )	DinatEmbeddingsz6
    Construct the patch and position embeddings.
    c                    s4   t    t|| _t|j| _t|j	| _
d S r   )super__init__DinatPatchEmbeddingspatch_embeddingsr   	LayerNorm	embed_dimnormDropouthidden_dropout_probdropoutselfconfig	__class__r   r   r2      s   

zDinatEmbeddings.__init__pixel_valuesNreturnc                 C   s"   |  |}| |}| |}|S r   )r4   r7   r:   )r<   r@   
embeddingsr   r   r   forward   s   


zDinatEmbeddings.forward)r"   r#   r$   r%   r2   r&   r'   r)   TensorrC   __classcell__r   r   r>   r   r0      s    &r0   c                       s:   e Zd ZdZ fddZdejdB dejfddZ  Z	S )	r3   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
    Transformer.
    c              
      sr   t    |j}|j|j}}|| _|dkrntdttj| j|d ddddtj|d |dddd| _	d S )N   z2Dinat only supports patch size of 4 at the moment.   r   r   rG   rG   r   r   )kernel_sizestridepadding)
r1   r2   
patch_sizenum_channelsr6   
ValueErrorr   
SequentialConv2d
projection)r<   r=   rN   rO   hidden_sizer>   r   r   r2      s   

zDinatPatchEmbeddings.__init__r@   NrA   c                 C   s>   |j \}}}}|| jkrtd| |}|dddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   rG   r   r   )shaperO   rP   rS   permute)r<   r@   _rO   heightwidthrB   r   r   r   rC      s   

zDinatPatchEmbeddings.forward)
r"   r#   r$   r%   r2   r&   r'   rD   rC   rE   r   r   r>   r   r3      s    "r3   c                       sL   e Zd ZdZejfdedejddf fddZde	j
de	j
fd	d
Z  ZS )DinatDownsamplerz
    Convolutional Downsampling Layer.

    Args:
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    dim
norm_layerrA   Nc                    s>   t    || _tj|d| ddddd| _|d| | _d S )NrG   rH   rI   rJ   F)rK   rL   rM   bias)r1   r2   r[   r   rR   	reductionr7   )r<   r[   r\   r>   r   r   r2      s   
zDinatDownsampler.__init__input_featurec                 C   s0   |  |dddddddd}| |}|S )Nr   r   r   rG   )r^   rV   r7   )r<   r_   r   r   r   rC      s   "
zDinatDownsampler.forward)r"   r#   r$   r%   r   r5   intModuler2   r&   rD   rC   rE   r   r   r>   r   rZ      s    "
rZ           Finput	drop_probtrainingrA   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    rb   r   r   )r   )dtypedevice)rU   ndimr&   randrf   rg   floor_div)rc   rd   re   	keep_probrU   random_tensoroutputr   r   r   	drop_path   s   ro   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )DinatDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nrd   rA   c                    s   t    || _d S r   )r1   r2   rd   )r<   rd   r>   r   r   r2      s   

zDinatDropPath.__init__r   c                 C   s   t || j| jS r   )ro   rd   re   r<   r   r   r   r   rC      s   zDinatDropPath.forwardc                 C   s   d| j  S )Nzp=)rd   r<   r   r   r   
extra_repr   s   zDinatDropPath.extra_reprr   )r"   r#   r$   r%   floatr2   r&   rD   rC   strrs   rE   r   r   r>   r   rp      s
    rp   c                       B   e Zd Z fddZ	d
dejdedB deej fdd	Z  Z	S )NeighborhoodAttentionc                    s   t    || dkrtd| d| d|| _t|| | _| j| j | _|| _|| _t	
t|d| j d d| j d | _t	j| j| j|jd| _t	j| j| j|jd| _t	j| j| j|jd| _t	|j| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()rG   r   )r]   )r1   r2   rP   num_attention_headsr`   attention_head_sizeall_head_sizerK   dilationr   	Parameterr&   zerosrpbLinearqkv_biasquerykeyvaluer8   attention_probs_dropout_probr:   r<   r=   r[   	num_headsrK   r|   r>   r   r   r2      s   
*zNeighborhoodAttention.__init__Fr   output_attentionsNrA   c                 C   s  |j \}}}| ||d| j| jdd}| ||d| j| jdd}| ||d| j| jdd}|t	| j }t
||| j| j| j}	tjj|	dd}
| |
}
t|
|| j| j}|ddddd }| d d | jf }||}|r||
f}|S |f}|S )	Nr   rG   )r[   r   r   rF   )rU   r   viewry   rz   	transposer   r   mathsqrtr   r   rK   r|   r   
functionalsoftmaxr:   r   rV   
contiguoussizer{   )r<   r   r   
batch_size
seq_lengthrW   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr   r   r   rC     s2   	

zNeighborhoodAttention.forwardF
r"   r#   r$   r2   r&   rD   boolr)   rC   rE   r   r   r>   r   rw      s    rw   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )NeighborhoodAttentionOutputc                    s*   t    t||| _t|j| _d S r   )r1   r2   r   r   denser8   r   r:   r<   r=   r[   r>   r   r   r2   7  s   
z$NeighborhoodAttentionOutput.__init__r   input_tensorrA   c                 C      |  |}| |}|S r   r   r:   )r<   r   r   r   r   r   rC   <  s   

z#NeighborhoodAttentionOutput.forwardr"   r#   r$   r2   r&   rD   rC   rE   r   r   r>   r   r   6  s    $r   c                       rv   )NeighborhoodAttentionModulec                    s,   t    t|||||| _t||| _d S r   )r1   r2   rw   r<   r   rn   r   r>   r   r   r2   D  s   
z$NeighborhoodAttentionModule.__init__Fr   r   NrA   c                 C   s2   |  ||}| |d |}|f|dd   }|S Nr   r   )r<   rn   )r<   r   r   self_outputsattention_outputr   r   r   r   rC   I  s   z#NeighborhoodAttentionModule.forwardr   r   r   r   r>   r   r   C  s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )DinatIntermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S r   )r1   r2   r   r   r`   	mlp_ratior   
isinstance
hidden_actru   r   intermediate_act_fnr   r>   r   r   r2   U  s
   
zDinatIntermediate.__init__r   rA   c                 C   r   r   )r   r   rq   r   r   r   rC   ]     

zDinatIntermediate.forwardr   r   r   r>   r   r   T  s    r   c                       r   )DinatOutputc                    s4   t    tt|j| || _t|j| _	d S r   )
r1   r2   r   r   r`   r   r   r8   r9   r:   r   r>   r   r   r2   d  s   
zDinatOutput.__init__r   rA   c                 C   r   r   r   rq   r   r   r   rC   i  r   zDinatOutput.forwardr   r   r   r>   r   r   c  s    r   c                	       sR   e Zd Zd fdd	Zdd Z	ddejded	B d
eejejf fddZ	  Z
S )
DinatLayerrb   c                    s   t    |j| _|j| _|| _| j| j | _tj||jd| _	t
|||| j| jd| _|dkr4t|nt | _tj||jd| _t||| _t||| _|jdkretj|jtd|f dd| _d S d | _d S )Neps)rK   r|   rb   r   rG   T)requires_grad)r1   r2   chunk_size_feed_forwardrK   r|   window_sizer   r5   layer_norm_epslayernorm_beforer   	attentionrp   Identityro   layernorm_afterr   intermediater   rn   layer_scale_init_valuer}   r&   oneslayer_scale_parameters)r<   r=   r[   r   r|   drop_path_rater>   r   r   r2   p  s$   

zDinatLayer.__init__c           
      C   sd   | j }d}||k s||k r.d }}td|| }td|| }	dd||||	f}tj||}||fS )N)r   r   r   r   r   r   r   )r   maxr   r   pad)
r<   r   rX   rY   r   
pad_valuespad_lpad_tpad_rpad_br   r   r   	maybe_pad  s   zDinatLayer.maybe_padFr   r   NrA   c                 C   s  |  \}}}}|}| |}| |||\}}|j\}	}
}}	| j||d}|d }|d dkp5|d dk}|rJ|d d d |d |d d f  }| jd urV| jd | }|| | }| |}| 	| 
|}| jd urv| jd | }|| | }|r||d f}|S |f}|S )N)r   r   r      r   )r   r   r   rU   r   r   r   ro   r   rn   r   )r<   r   r   r   rX   rY   channelsshortcutr   rW   
height_pad	width_padattention_outputsr   
was_paddedlayer_outputlayer_outputsr   r   r   rC     s,   
$


zDinatLayer.forward)rb   r   )r"   r#   r$   r2   r   r&   rD   r   r)   rC   rE   r   r   r>   r   r   o  s    r   c                       rv   )
DinatStagec                    sf   t     | _| _t fddt|D | _|d ur+|tjd| _	nd | _	d| _
d S )Nc              	      s&   g | ]}t  | | d qS ))r=   r[   r   r|   r   )r   .0ir=   	dilationsr[   r   r   r   r   
<listcomp>  s    z'DinatStage.__init__.<locals>.<listcomp>)r[   r\   F)r1   r2   r=   r[   r   
ModuleListrangelayersr5   
downsamplepointing)r<   r=   r[   depthr   r   r   r   r>   r   r   r2     s   

zDinatStage.__init__Fr   r   NrA   c                 C   sn   |  \}}}}t| jD ]\}}|||}|d }q|}	| jd ur'| |	}||	f}
|r5|
|dd  7 }
|
S r   )r   	enumerater   r   )r<   r   r   rW   rX   rY   r   layer_moduler   !hidden_states_before_downsamplingstage_outputsr   r   r   rC     s   



zDinatStage.forwardr   r   r   r   r>   r   r     s    r   c                       s^   e Zd Z fddZ				ddejdedB dedB d	edB d
edB deeB fddZ	  Z
S )DinatEncoderc                    sh   t    t j_ _dd tjd jt	 jddD t
 fddtjD _d S )Nc                 S   s   g | ]}|  qS r   )item)r   xr   r   r   r     s    z)DinatEncoder.__init__.<locals>.<listcomp>r   cpu)rg   c                    s|   g | ]:}t  t jd |   j|  j|  j| t jd| t jd|d   |jd k r8tnddqS )rG   Nr   )r=   r[   r   r   r   r   r   )	r   r`   r6   depthsr   r   sum
num_levelsrZ   )r   i_layerr=   dprr<   r   r   r     s    
*)r1   r2   lenr   r   r=   r&   linspacer   r   r   r   r   levelsr;   r>   r   r   r2     s   
$

zDinatEncoder.__init__FTr   r   Noutput_hidden_states(output_hidden_states_before_downsamplingreturn_dictrA   c                 C   s  |rdnd }|r
dnd }|rdnd }|r&| dddd}	||f7 }||	f7 }t| jD ]H\}
}|||}|d }|d }|rS|rS| dddd}	||f7 }||	f7 }n|ri|si| dddd}	||f7 }||	f7 }|rs||dd  7 }q+|stdd |||fD S t||||dS )	Nr   r   r   r   rG   c                 s   s    | ]	}|d ur|V  qd S r   r   )r   vr   r   r   	<genexpr>!  s    z'DinatEncoder.forward.<locals>.<genexpr>)r   r   r    r!   )rV   r   r   r)   r   )r<   r   r   r   r   r   all_hidden_statesall_reshaped_hidden_statesall_self_attentionsreshaped_hidden_stater   r   r   r   r   r   r   rC     s<   





zDinatEncoder.forward)FFFT)r"   r#   r$   r2   r&   rD   r   r)   r   rC   rE   r   r   r>   r   r     s&    r   c                   @   s"   e Zd ZU eed< dZdZdZdS )DinatPreTrainedModelr=   dinatr@   )imageN)r"   r#   r$   r   r(   base_model_prefixmain_input_nameinput_modalitiesr   r   r   r   r   +  s
   
 r   c                       sh   e Zd Zd fdd	Zdd Ze				ddejdB dedB d	edB d
edB de	e
B f
ddZ  ZS )
DinatModelTc                    s   t  | t| dg || _t|j| _t|jd| jd   | _	t
|| _t|| _tj| j	|jd| _|r=tdnd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        nattenrG   r   r   N)r1   r2   r   r=   r   r   r   r`   r6   num_featuresr0   rB   r   encoderr   r5   r   	layernormAdaptiveAvgPool1dpooler	post_init)r<   r=   add_pooling_layerr>   r   r   r2   5  s   

zDinatModel.__init__c                 C      | j jS r   rB   r4   rr   r   r   r   get_input_embeddingsK     zDinatModel.get_input_embeddingsNr@   r   r   r   rA   c                 K   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| |}| j||||d}|d }| |}d }	| jd urW| |	dd
dd}	t	|	d}	|se||	f|dd   }
|
S t||	|j|j|jdS )Nz You have to specify pixel_valuesr   r   r   r   r   rG   )r   r,   r   r    r!   )r=   r   r   use_return_dictrP   rB   r  r  r  flattenr   r&   r+   r   r    r!   )r<   r@   r   r   r   r   embedding_outputencoder_outputssequence_outputpooled_outputrn   r   r   r   rC   N  s:   	


zDinatModel.forward)T)NNNN)r"   r#   r$   r2   r  r   r&   r'   r   r)   r+   rC   rE   r   r   r>   r   r  3  s&    r  z
    Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    c                       sj   e Zd Z fddZe					ddejdB dejdB dedB dedB dedB d	e	e
B fd
dZ  ZS )DinatForImageClassificationc                    s\   t  | t| dg |j| _t|| _|jdkr#t| jj|jnt	 | _
|   d S )Nr  r   )r1   r2   r   
num_labelsr  r   r   r   r  r   
classifierr  r;   r>   r   r   r2     s   
"z$DinatForImageClassification.__init__Nr@   labelsr   r   r   rA   c                 K   s   |dur|n| j j}| j||||d}|d }| |}	d}
|dur*| ||	| j }
|s@|	f|dd  }|
dur>|
f| S |S t|
|	|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   rG   )r.   r/   r   r    r!   )	r=   r  r   r  loss_functionr-   r   r    r!   )r<   r@   r  r   r   r   r   r   r  r/   r.   rn   r   r   r   rC     s,   
z#DinatForImageClassification.forward)NNNNN)r"   r#   r$   r2   r   r&   r'   
LongTensorr   r)   r-   rC   rE   r   r   r>   r   r    s*    r  zL
    NAT backbone, to be used with frameworks like DETR and MaskFormer.
    c                       s\   e Zd Z fddZdd Ze			ddejdedB dedB d	edB d
e	f
ddZ
  ZS )DinatBackbonec                    s   t    t| dg t | _t | _ jg fddtt	 j
D  | _i }t| j| jD ]\}}t|||< q2t|| _|   d S )Nr  c                    s   g | ]}t  jd |  qS )rG   )r`   r6   r   r=   r   r   r     s    z*DinatBackbone.__init__.<locals>.<listcomp>)r1   r2   r   r0   rB   r   r  r6   r   r   r   r  zipout_featuresr   r   r5   
ModuleDicthidden_states_normsr  )r<   r=   r   stagerO   r>   r  r   r2     s   

&zDinatBackbone.__init__c                 C   r
  r   r  rr   r   r   r   r    r  z"DinatBackbone.get_input_embeddingsNr@   r   r   r   rA   c                 K   s,  |dur|n| j j}|dur|n| j j}|dur|n| j j}| |}| j||dddd}|j}d}	t| j|D ]A\}
}|
| j	v ry|j
\}}}}|dddd }|||| |}| j|
 |}|||||}|dddd }|	|f7 }	q8|s|	f}|r||jf7 }|S t|	|r|jnd|jd	S )
a  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 512, 7, 7]
        ```NT)r   r   r   r   r   r   rG   r   r   )feature_mapsr   r    )r=   r  r   r   rB   r  r!   r  stage_namesr  rU   rV   r   r   r   r   r   r    )r<   r@   r   r   r   r   r  r   r   r"  r!  hidden_stater   rO   rX   rY   rn   r   r   r   rC     sD   $


zDinatBackbone.forward)NNN)r"   r#   r$   r2   r  r   r&   rD   r   r   rC   rE   r   r   r>   r   r    s$    r  )r  r  r   r  )rb   F)6r%   r   dataclassesr   r&   r   activationsr   backbone_utilsr   modeling_outputsr   modeling_utilsr   utilsr	   r
   r   r   r   r   configuration_dinatr   natten.functionalr   r   
get_loggerr"   loggerr   r+   r-   ra   r0   r3   rZ   rD   rt   r   ro   rp   rw   r   r   r   r   r   r   r   r   r  r  r  __all__r   r   r   r   <module>   sr    
$ FG/FK?d