o
    i{N                  	   @   s  d Z ddlZddlmZ ddlmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ eeZdZdZg dZdZdZd8dej de!de"dej fddZ#G dd dej$Z%G dd dej$Z&G dd dej$Z'G dd  d ej$Z(G d!d" d"ej$Z)G d#d$ d$ej$Z*G d%d& d&ej$Z+G d'd( d(ej$Z,G d)d* d*ej$Z-G d+d, d,ej$Z.G d-d. d.eZ/d/Z0d0Z1ed1e0G d2d3 d3e/Z2ed4e0G d5d6 d6e/Z3g d7Z4dS )9z-PyTorch Visual Attention Network (VAN) model.    N)OrderedDict)OptionalUnion)nn   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )	VanConfigr   z!Visual-Attention-Network/van-base)r   i      r   ztabby, tabby cat        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r#   c/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/deprecated/van/modeling_van.py	drop_path1   s   
r%   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )VanDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r#   r$   r)   H   s   

zVanDropPath.__init__hidden_statesc                 C   s   t || j| jS r'   )r%   r   r   )r*   r-   r#   r#   r$   forwardL   s   zVanDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r*   r#   r#   r$   
extra_reprO   s   zVanDropPath.extra_reprr'   )__name__
__module____qualname____doc__r   floatr)   r   Tensorr.   strr/   __classcell__r#   r#   r+   r$   r&   E   s
    r&   c                	       sJ   e Zd ZdZddedededef fdd	Zd
ejdejfddZ  Z	S )VanOverlappingPatchEmbeddera  
    Downsamples the input using a patchify operation with a `stride` of 4 by default making adjacent windows overlap by
    half of the area. From [PVTv2: Improved Baselines with Pyramid Vision
    Transformer](https://huggingface.co/papers/2106.13797).
    r   r   in_channelshidden_size
patch_sizestridec                    s4   t    tj|||||d d| _t|| _d S )N   )kernel_sizer<   padding)r(   r)   r   Conv2dconvolutionBatchNorm2dnormalization)r*   r9   r:   r;   r<   r+   r#   r$   r)   Z   s
   
z$VanOverlappingPatchEmbedder.__init__r   r   c                 C   s   |  |}| |}|S r'   )rA   rC   )r*   r   hidden_stater#   r#   r$   r.   a   s   

z#VanOverlappingPatchEmbedder.forward)r   r   
r0   r1   r2   r3   intr)   r   r5   r.   r7   r#   r#   r+   r$   r8   S   s     r8   c                       sR   e Zd ZdZ		ddededededef
 fd	d
Zdej	dej	fddZ
  ZS )VanMlpLayerz
    MLP with depth-wise convolution, from [PVTv2: Improved Baselines with Pyramid Vision
    Transformer](https://huggingface.co/papers/2106.13797).
    gelu      ?r9   r:   out_channels
hidden_actdropout_ratec                    sj   t    tj||dd| _tj||dd|d| _t| | _t|| _	tj||dd| _
t|| _d S )Nr   r>      r>   r?   groups)r(   r)   r   r@   in_dense
depth_wiser   
activationDropoutdropout1	out_densedropout2)r*   r9   r:   rJ   rK   rL   r+   r#   r$   r)   m   s   

zVanMlpLayer.__init__rD   r   c                 C   s@   |  |}| |}| |}| |}| |}| |}|S r'   )rQ   rR   rS   rU   rV   rW   r*   rD   r#   r#   r$   r.   }   s   





zVanMlpLayer.forward)rH   rI   )r0   r1   r2   r3   rF   r6   r4   r)   r   r5   r.   r7   r#   r#   r+   r$   rG   g   s     
rG   c                       <   e Zd ZdZdef fddZdejdejfddZ  Z	S )	VanLargeKernelAttentionz-
    Basic Large Kernel Attention (LKA).
    r:   c                    sN   t    tj||dd|d| _tj||ddd|d| _tj||dd	| _d S )
N   r=   rO   r   rN   	   )r>   dilationr?   rP   r   rM   )r(   r)   r   r@   rR   depth_wise_dilated
point_wiser*   r:   r+   r#   r$   r)      s   
z VanLargeKernelAttention.__init__rD   r   c                 C   s"   |  |}| |}| |}|S r'   )rR   r^   r_   rX   r#   r#   r$   r.      s   


zVanLargeKernelAttention.forwardrE   r#   r#   r+   r$   rZ      s    rZ   c                       rY   )	VanLargeKernelAttentionLayerzV
    Computes attention using Large Kernel Attention (LKA) and attends the input.
    r:   c                    s   t    t|| _d S r'   )r(   r)   rZ   	attentionr`   r+   r#   r$   r)      s   
z%VanLargeKernelAttentionLayer.__init__rD   r   c                 C   s   |  |}|| }|S r'   )rb   )r*   rD   rb   attendedr#   r#   r$   r.      s   
z$VanLargeKernelAttentionLayer.forwardrE   r#   r#   r+   r$   ra      s    ra   c                       B   e Zd ZdZddedef fddZdejdejfd	d
Z	  Z
S )VanSpatialAttentionLayerz
    Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention ->
    projection (via conv) + residual connection.
    rH   r:   rK   c              	      sV   t    ttdtj||ddfdt| fg| _t|| _	tj||dd| _
d S )Nconvr   rM   act)r(   r)   r   
Sequentialr   r@   r   pre_projectionra   attention_layerpost_projection)r*   r:   rK   r+   r#   r$   r)      s   


z!VanSpatialAttentionLayer.__init__rD   r   c                 C   s.   |}|  |}| |}| |}|| }|S r'   )ri   rj   rk   r*   rD   residualr#   r#   r$   r.      s   


z VanSpatialAttentionLayer.forward)rH   )r0   r1   r2   r3   rF   r6   r)   r   r5   r.   r7   r#   r#   r+   r$   re      s    re   c                       rd   )VanLayerScalingzT
    Scales the inputs by a learnable parameter initialized by `initial_value`.
    {Gz?r:   initial_valuec                    s(   t    tj|t| dd| _d S )NT)requires_grad)r(   r)   r   	Parameterr   onesweight)r*   r:   rp   r+   r#   r$   r)      s   
zVanLayerScaling.__init__rD   r   c                 C   s   | j dd| }|S )N)rt   	unsqueezerX   r#   r#   r$   r.      s   zVanLayerScaling.forward)ro   )r0   r1   r2   r3   rF   r4   r)   r   r5   r.   r7   r#   r#   r+   r$   rn      s    rn   c                	       sN   e Zd ZdZ		ddedededef fdd	Zd
ej	dej	fddZ
  ZS )VanLayerzv
    Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP).
    r   rI   configr:   	mlp_ratiodrop_path_ratec                    s   t    |dkrt|nt | _t|| _t||j	| _
t||j| _t|| _t||| ||j	|j| _t||j| _d S )Nr   )r(   r)   r&   r   Identityr%   rB   pre_normomalizationre   rK   rb   rn   layer_scale_init_valueattention_scalingpost_normalizationrG   rL   mlpmlp_scaling)r*   rx   r:   ry   rz   r+   r#   r$   r)      s   
zVanLayer.__init__rD   r   c                 C   sl   |}|  |}| |}| |}| |}|| }|}| |}| |}| |}| |}|| }|S r'   )r|   rb   r~   r%   r   r   r   rl   r#   r#   r$   r.      s   







zVanLayer.forward)r   rI   r0   r1   r2   r3   r   rF   r4   r)   r   r5   r.   r7   r#   r#   r+   r$   rw      s    rw   c                       s^   e Zd ZdZ		ddededededed	ed
edef fddZdej	dej	fddZ
  ZS )VanStagez2
    VanStage, consisting of multiple layers.
    r   r   rx   r9   r:   r;   r<   depthry   rz   c	           	         sT   t    t|||| _tj fddt|D  | _tj j	d| _
d S )Nc                    s   g | ]
}t  d qS ))ry   rz   )rw   ).0_rx   rz   r:   ry   r#   r$   
<listcomp>  s    z%VanStage.__init__.<locals>.<listcomp>eps)r(   r)   r8   
embeddingsr   rh   rangelayers	LayerNormlayer_norm_epsrC   )	r*   rx   r9   r:   r;   r<   r   ry   rz   r+   r   r$   r)     s   
zVanStage.__init__rD   r   c                 C   s^   |  |}| |}|j\}}}}|ddd}| |}|||||dddd}|S )Nr=   r   r   rN   )r   r   r   flatten	transposerC   viewpermute)r*   rD   
batch_sizer:   heightwidthr#   r#   r$   r.     s   


zVanStage.forward)r   r   r   r#   r#   r+   r$   r      s,    	r   c                       sX   e Zd ZdZdef fddZ		ddejdee	 d	ee	 d
e
eef fddZ  ZS )
VanEncoderz4
    VanEncoder, consisting of multiple stages.
    rx   c                    s   t    tg | _|j}|j}|j}|j}|j	}dd t
jd|jt|jddD }tt||||||D ])\}\}	}
}}}}|dk}||d  }|rP|j}| jt||||	|
|||d q7d S )Nc                 S   s   g | ]}|  qS r#   )item)r   xr#   r#   r$   r   8  s    z'VanEncoder.__init__.<locals>.<listcomp>r   cpu)r   r   )r;   r<   r   ry   rz   )r(   r)   r   
ModuleListstagespatch_sizesstrideshidden_sizesdepths
mlp_ratiosr   linspacerz   sum	enumeratezipnum_channelsappendr   )r*   rx   r   r   r   r   r   drop_path_rates	num_stager;   r<   r:   r   mlp_expansionrz   is_first_stager9   r+   r#   r$   r)   0  s<   
zVanEncoder.__init__FTrD   output_hidden_statesreturn_dictr   c                 C   s\   |rdnd }t | jD ]\}}||}|r||f }q|s(tdd ||fD S t||dS )Nr#   c                 s   s    | ]	}|d ur|V  qd S r'   r#   )r   vr#   r#   r$   	<genexpr>_  s    z%VanEncoder.forward.<locals>.<genexpr>)last_hidden_stater-   )r   r   tupler   )r*   rD   r   r   all_hidden_statesr   stage_moduler#   r#   r$   r.   P  s   
zVanEncoder.forward)FT)r0   r1   r2   r3   r   r)   r   r5   r   boolr   r   r   r.   r7   r#   r#   r+   r$   r   +  s    #
r   c                   @   s.   e Zd ZU dZeed< dZdZdZdd Z	dS )	VanPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    rx   vanpixel_valuesTc                 C   s   t |tjr*tjj|j| jjd t |tjr&|jdur(tj	|jd dS dS dS t |tj
rBtj	|jd tj	|jd dS t |tjrt|jd |jd  |j }||j }|jjdtd|  |jdurv|jj  dS dS dS )zInitialize the weights)stdNr   g      ?r   g       @)
isinstancer   Linearinittrunc_normal_rt   rx   initializer_rangebias	constant_r   r@   r>   rJ   rP   datanormal_mathsqrtzero_)r*   modulefan_outr#   r#   r$   _init_weightso  s    

z VanPreTrainedModel._init_weightsN)
r0   r1   r2   r3   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   r#   r#   r#   r$   r   d  s   
 r   aE  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VanConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zxThe bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding layer.c                       sl   e Zd Z fddZeeeeee	de
d		ddeej dee dee d	eeef fd
dZ  ZS )VanModelc                    s@   t  | || _t|| _tj|jd |jd| _	| 
  d S )Nru   r   )r(   r)   rx   r   encoderr   r   r   r   	layernorm	post_initr*   rx   r+   r#   r$   r)     s
   
zVanModel.__init__vision)
checkpointoutput_typeconfig_classmodalityexpected_outputNr   r   r   r   c                 C   sx   |d ur|n| j j}|d ur|n| j j}| j|||d}|d }|jddgd}|s4||f|dd   S t|||jdS )Nr   r   r   ru   )dimr   )r   pooler_outputr-   )rx   r   use_return_dictr   meanr	   r-   )r*   r   r   r   encoder_outputsr   pooled_outputr#   r#   r$   r.     s"   zVanModel.forward)NN)r0   r1   r2   r)   r   VAN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr	   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r   FloatTensorr   r   r   r.   r7   r#   r#   r+   r$   r     s*    	

r   z
    VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                       sx   e Zd Z fddZeeeeee	e
d				ddeej deej dee dee d	eeef f
d
dZ  ZS )VanForImageClassificationc                    sJ   t  | t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   ru   )r(   r)   r   r   
num_labelsr   r   r   r{   
classifierr   r   r+   r#   r$   r)     s
   
$z"VanForImageClassification.__init__)r   r   r   r   Nr   labelsr   r   r   c           
      C   s   |dur|n| j j}| j|||d}|r|jn|d }| |}d}|dur.| ||| j }|sD|f|dd  }	|durB|f|	 S |	S t|||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r=   )losslogitsr-   )rx   r   r   r   r   loss_functionr
   r-   )
r*   r   r   r   r   outputsr   r   r   r"   r#   r#   r$   r.     s   
z!VanForImageClassification.forward)NNNN)r0   r1   r2   r)   r   r   r   _IMAGE_CLASS_CHECKPOINTr
   r   _IMAGE_CLASS_EXPECTED_OUTPUTr   r   r   
LongTensorr   r   r   r.   r7   r#   r#   r+   r$   r     s0    
r   )r   r   r   )r   F)5r3   r   collectionsr   typingr   r   r   r   activationsr   modeling_outputsr   r	   r
   modeling_utilsr   utilsr   r   r   r   configuration_vanr   
get_loggerr0   loggerr   r   r   r   r   r5   r4   r   r%   Moduler&   r8   rG   rZ   ra   re   rn   rw   r   r   r   VAN_START_DOCSTRINGr   r   r   __all__r#   r#   r#   r$   <module>   sT   
  ++903