o
    iqo                  	   @   s  d Z ddlZddlZddlmZ ddlZddlZddlmZm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ eeZd=deeef fddZG dd de	j Z!G dd de	j"Z#G dd de	j$Z%G dd de	j&Z'G dd de	j$Z(d>dejde)dedejfd d!Z*G d"d# d#e	j$Z+d?d%d&Z,G d'd( d(e	j$Z-G d)d* d*e	j$Z.G d+d, d,e	j$Z/G d-d. d.e	j$Z0G d/d0 d0e	j$Z1eG d1d2 d2eZ2eG d3d4 d4e2Z3ed5d6G d7d8 d8e2Z4ed9d6G d:d; d;e2eZ5g d<Z6dS )@z9PyTorch BiT model. Also supports backbone for ViT hybrid.    N)Optional)Tensornn   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin   )	BitConfig   returnc                 C   s   d}| du r|d ||d   d } | |fS t | tr_|  } | dkrI|dkrA||d  d dkrA|d ||d   d } | |fS d} d}| |fS | dkrSd} | |fS |d ||d   d } | |fS )	al  
    Utility function to get the tuple padding value given the kernel_size and padding.

    Args:
        padding (Union[`str`, `int`], *optional*):
            Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from
            PyTorch is used.
        kernel_size (`int`, *optional*, defaults to 7):
            Kernel size of the convolution layers.
        stride (`int`, *optional*, defaults to 1):
            Stride value of the convolution layers.
        dilation (`int`, *optional*, defaults to 1):
            Dilation value of the convolution layers.
    FNr      samer   Tvalid)
isinstancestrlower)paddingkernel_sizestridedilationdynamic r   a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/bit/modeling_bit.pyget_padding_value)   s$   
r    c                       s6   e Zd ZdZ						d
 fdd	Zdd	 Z  ZS )WeightStandardizedConv2dzConv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model.

    Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
    Standardization](https://huggingface.co/papers/1903.10520v2)
    r   SAMEFư>c
              
      sT   t ||||d\}}
t j||||||||d |
r"t|||| _nd | _|	| _d S )N)r   r   )r   r   r   groupsbias)r    super__init__DynamicPad2dpadeps)self
in_channelout_channelsr   r   r   r   r$   r%   r*   
is_dynamic	__class__r   r   r'   Y   s   

z!WeightStandardizedConv2d.__init__c              	   C   sj   | j d ur
|  |}tjj| jd| jdd d dd| jd| j}tj	||| j
| j| j| j| j}|S )Nr   T        )trainingmomentumr*   )r)   r   
functional
batch_normweightreshaper-   r*   
reshape_asconv2dr%   r   r   r   r$   )r+   hidden_stater7   r   r   r   forwardv   s   

z WeightStandardizedConv2d.forward)r   r"   r   r   Fr#   __name__
__module____qualname____doc__r'   r<   __classcell__r   r   r/   r   r!   R   s    r!   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	BitGroupNormActivationzQ
    A module that combines group normalization with an activation function.
    h㈵>Tc                    s8   t  j|j|||d |rt|j | _d S t | _d S )N)r*   affine)r&   r'   
num_groupsr   
hidden_act
activationr   Identity)r+   confignum_channelsr*   rE   apply_activationr/   r   r   r'      s   zBitGroupNormActivation.__init__c                 C   s*   t j|| j| j| j| j}| |}|S N)r   r5   
group_normrF   r7   r%   r*   rH   )r+   r;   r   r   r   r<      s   
zBitGroupNormActivation.forward)rD   TTr=   r   r   r/   r   rC      s    rC   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )r(   z
    A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input
    hidden states.
    r   c                    sj   t    t|tr||f}t|tr||f}t|tr ||f}|| _|| _|| _|| _dd }|| _d S )Nc                 S   s0   t t| | d | |d |  d |  dS )Nr   r   )maxmathceil)xr   r   r   r   r   r   compute_padding   s   0z.DynamicPad2d.__init__.<locals>.compute_padding)	r&   r'   r   intr   r   r   valuerS   )r+   r   r   r   rU   rS   r/   r   r   r'      s   




zDynamicPad2d.__init__c                 C   s   |  dd  \}}| || jd | jd | jd }| || jd | jd | jd }|dks4|dkrNtjj||d ||d  |d ||d  g| jd}|S )Nr   r   r   )rU   )	sizerS   r   r   r   r   r5   r)   rU   )r+   inputinput_heightinput_widthpadding_heightpadding_widthr   r   r   r<      s   ""


zDynamicPad2d.forward)r   r=   r   r   r/   r   r(      s    r(   c                       s<   e Zd ZdZ						ddef fd	d
Zdd Z  ZS )BitMaxPool2dz1Tensorflow like 'SAME' wrapper for 2D max poolingNr   Fr   r   r   Tr   c                    s   t |tjjr	|n||f}t |tjjr|n||f}t |tjjr#|n||f}t ||||| |r=t||||| _d S t	 | _d S rM   )
r   collectionsabcIterabler&   r'   r(   r)   r   rI   )r+   r   r   r   	ceil_moder   padding_valueuse_dynamic_paddingr/   r   r   r'      s   
zBitMaxPool2d.__init__c                 C   s*   |  |}tj|| j| j| j| j| jS rM   )	r)   r   r5   
max_pool2dr   r   r   r   rb   r+   hidden_statesr   r   r   r<      s   
zBitMaxPool2d.forward)Nr   Fr^   r   T)r>   r?   r@   rA   rT   r'   r<   rB   r   r   r/   r   r]      s    r]   c                       s8   e Zd ZdZdef fddZdedefddZ  ZS )	BitEmbeddingszL
    BiT Embeddings (stem) composed of a single aggressive convolution.
    rJ   c                    s   t    t|j|jddd|jd| _tdd|jd| _	|jd ur.|j
 dkr.t | _ntjdd	d
| _|jdkrDt||jd| _nt | _|j| _d S )Nr   r   :0yE>)r   r   r*   r   r   )r   r   rd   r"   )r   r   r   r   r2   )r   rU   preactivationrK   )r&   r'   r!   rK   embedding_sizeglobal_paddingconvolutionr]   embedding_dynamic_paddingpoolerupperr   rI   r)   ConstantPad2d
layer_typerC   normr+   rJ   r/   r   r   r'      s"   
	

zBitEmbeddings.__init__pixel_valuesr   c                 C   sH   |j d }|| jkrtd| |}| |}| |}| |}|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)shaperK   
ValueErrorrn   r)   rt   rp   )r+   rv   rK   	embeddingr   r   r   r<     s   





zBitEmbeddings.forward)	r>   r?   r@   rA   r   r'   r   r<   rB   r   r   r/   r   rh      s    rh   r2   FrX   	drop_probr3   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r2   r   r   )r   )dtypedevice)rw   ndimtorchrandr{   r|   floor_div)rX   rz   r3   	keep_probrw   random_tensoroutputr   r   r   	drop_path  s   
r   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )BitDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nrz   r   c                    s   t    || _d S rM   )r&   r'   rz   )r+   rz   r/   r   r   r'   /  s   

zBitDropPath.__init__rg   c                 C   s   t || j| jS rM   )r   rz   r3   rf   r   r   r   r<   3     zBitDropPath.forwardc                 C   s   d| j  S )Nzp=)rz   )r+   r   r   r   
extra_repr6  s   zBitDropPath.extra_reprrM   )r>   r?   r@   rA   r   floatr'   r~   r   r<   r   r   rB   r   r   r/   r   r   ,  s
    r      c                 C   s:   |}t |t| |d  | | }|d|  k r||7 }|S )Nr   g?)rO   rT   )rU   divisor	min_value	new_valuer   r   r   make_div:  s
   r   c                       :   e Zd ZdZ								d fdd	Zd	d
 Z  ZS )BitPreActivationBottleneckLayera  Pre-activation (v2) bottleneck block.
    Follows the implementation of "Identity Mappings in Deep Residual Networks":
    https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua

    Except it puts the stride on 3x3 conv when available.
    N      ?r   r2   Fc              	      s   t    |p|}|p|}t|| }|
r t||||dd| _nd | _t||| _t||dd|jd| _	t||d| _
t||d||d|jd| _t||| _t||dd|jd| _|	d	krdt|	| _d S t | _d S )
NTr   preactr   ri   r*   r   rk   r   )r   r$   r*   r   r   )r&   r'   r   BitDownsampleConv
downsamplerC   norm1r!   rm   conv1norm2conv2norm3conv3r   r   rI   r   )r+   rJ   in_channelsr-   bottle_ratior   r   first_dilationr$   drop_path_rateis_first_layermid_channelsr/   r   r   r'   J  s,   

$z(BitPreActivationBottleneckLayer.__init__c                 C   s^   |  |}|}| jd ur| |}| |}| | |}| | |}| |}|| S rM   )r   r   r   r   r   r   r   r   )r+   rg   hidden_states_preactshortcutr   r   r   r<   v  s   




z'BitPreActivationBottleneckLayer.forwardNr   r   r   Nr   r2   Fr=   r   r   r/   r   r   B  s    ,r   c                       r   )BitBottleneckLayerz\Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT Hybrid.Nr   r   r2   Fc              
      s   t    |p|}|p|}t|| }|
r t||||dd| _nd | _t||dd|jd| _t||d| _	t||d|||d|jd| _
t||d| _t||dd|jd| _t||dd	| _|	d
kret|	nt | _t|j | _d S )NFr   r   ri   r   rk   r   )r   r   r$   r*   r   rK   rL   r   )r&   r'   r   r   r   r!   rm   r   rC   r   r   r   r   r   r   r   rI   r   r   rG   rH   )r+   rJ   r   r-   r   r   r   r   r$   r   r   mid_chsr/   r   r   r'     s<   


zBitBottleneckLayer.__init__c                 C   sp   |}| j d ur|  |}| |}| |}| |}| |}| |}| |}| |}| || }|S rM   )	r   r   r   r   r   r   r   r   rH   )r+   rg   r   r   r   r   r<     s   








zBitBottleneckLayer.forwardr   r=   r   r   r/   r   r     s    1r   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )r   r   Tc                    sH   t    t||d|d|jd| _|rt | _d S t||dd| _d S )Nr   ri   )r   r*   r   Fr   )	r&   r'   r!   rm   convr   rI   rC   rt   )r+   rJ   r   r-   r   r   r/   r   r   r'     s   
zBitDownsampleConv.__init__c                 C   s   |  | |S rM   )rt   r   )r+   rR   r   r   r   r<     r   zBitDownsampleConv.forward)r   T)r>   r?   r@   r'   r<   rB   r   r   r/   r   r     s
    r   c                       s@   e Zd ZdZ		d fdd	Zdd Zded	efd
dZ  ZS )BitStagez7
    A ResNet v2 stage composed by stacked layers.
    r   Nc	                    s   t    |dv rdnd}	|jdkrt}
nt}
|}t | _t|D ]$}| 	|||\}}}| j
t||
|||||||	||d	 |}|}	q"d S )N)r   r   r   r   
bottleneck)r   r   r   r   r   r   )r&   r'   rs   r   r   r   
Sequentiallayersrange_get_updated_hyperparameters
add_moduler   )r+   rJ   r   r-   r   r   depthr   layer_dropoutr   	layer_clsprev_chs	layer_idxr   r   r/   r   r   r'     s8   



zBitStage.__init__c                 C   s0   |r|| }nd}|dkrd}|dk}|||fS )zt
        Get the new hyper-parameters with respect to the previous ones and the index of the current layer.
        r2   r   r   r   )r+   r   r   r   r   r   r   r   r   r     s   

z%BitStage._get_updated_hyperparametersrX   r   c                 C   s$   |}t | jD ]\}}||}q|S rM   )	enumerater   )r+   rX   r;   _layerr   r   r   r<   )  s   
zBitStage.forward)r   N)	r>   r?   r@   rA   r'   r   r   r<   rB   r   r   r/   r   r     s    .r   c                	       sH   e Zd Zdef fddZdd Z	dded	ed
edefddZ	  Z
S )
BitEncoderrJ   c              
      s   t    tg | _|j}d}d}dd tt	d|j
t|j|jD }tt|j|j|D ]-\}\}}}	| |||||\}
}}t|||
||||	d}|
}||9 }| jt|| q3d S )N   r   c                 S   s   g | ]}|  qS r   )tolist).0rR   r   r   r   
<listcomp>;  s    z'BitEncoder.__init__.<locals>.<listcomp>r   )r   r   r   r   )r&   r'   r   
ModuleListstagesrl   r~   r   nplinspacer   sumdepthssplitr   ziphidden_sizesr   r   r   r   )r+   rJ   r   current_strider   layer_dropouts	stage_idxcurrent_depthcurrent_hidden_sizer   r-   r   stager/   r   r   r'   1  s6   
"


zBitEncoder.__init__c                 C   s>   t ||j }|dkrdnd}||jkr||9 }d}|||fS )Nr   r   r   )r   width_factoroutput_stride)r+   r   r   r   r   rJ   r-   r   r   r   r   r   W  s   

z'BitEncoder._get_updated_hyperparametersFTr;   output_hidden_statesreturn_dictr   c                 C   sb   |rdnd }| j D ]}|r||f }||}q	|r||f }|s+tdd ||fD S t||dS )Nr   c                 s   s    | ]	}|d ur|V  qd S rM   r   )r   vr   r   r   	<genexpr>n  s    z%BitEncoder.forward.<locals>.<genexpr>)last_hidden_staterg   )r   tupler   )r+   r;   r   r   rg   stage_moduler   r   r   r<   _  s   



zBitEncoder.forward)FT)r>   r?   r@   r   r'   r   r   boolr   r<   rB   r   r   r/   r   r   0  s    &	r   c                   @   s,   e Zd ZU eed< dZdZdgZdd ZdS )BitPreTrainedModelrJ   bitrv   rh   c                 C   s   t |tjrtjj|jddd d S t |tjrMtjj|jt	dd |j
d urKtj|j\}}|dkr=dt	| nd}tj|j
| | d S d S t |tjtjfrhtj|jd tj|j
d d S d S )Nfan_outrelu)modenonlinearity   )ar   r   )r   r   Conv2dinitkaiming_normal_r7   Linearkaiming_uniform_rP   sqrtr%   _calculate_fan_in_and_fan_outuniform_BatchNorm2d	GroupNorm	constant_)r+   modulefan_inr   boundr   r   r   _init_weights}  s   
z BitPreTrainedModel._init_weightsN)	r>   r?   r@   r   __annotations__base_model_prefixmain_input_name_no_split_modulesr   r   r   r   r   r   v  s   
 r   c                
       sF   e Zd Z fddZe	d
dedee dee defdd	Z	  Z
S )BitModelc                    sd   t  | || _t|| _t|| _|jdkr!t||j	d dnt
 | _t
d| _|   d S )Nrj   r1   rk   )r   r   )r&   r'   rJ   rh   embedderr   encoderrs   rC   r   r   rI   rt   AdaptiveAvgPool2drp   	post_initru   r/   r   r   r'     s   


zBitModel.__init__Nrv   r   r   r   c                 C   s   |d ur|n| j j}|d ur|n| j j}| |}| j|||d}|d }| |}| |}|s;||f|dd   S t|||jdS )Nr   r   r   r   )r   pooler_outputrg   )	rJ   r   use_return_dictr   r   rt   rp   r	   rg   )r+   rv   r   r   embedding_outputencoder_outputsr   pooled_outputr   r   r   r<     s"   


zBitModel.forwardNN)r>   r?   r@   r'   r   r   r   r   r	   r<   rB   r   r   r/   r   r     s    r   z
    BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       s\   e Zd Z fddZe				ddeej deej dee	 dee	 de
f
d	d
Z  ZS )BitForImageClassificationc                    s^   t  | |j| _t|| _tt |jdkr#t|j	d |jnt
 | _|   d S )Nr   r1   )r&   r'   
num_labelsr   r   r   r   Flattenr   r   rI   
classifierr   ru   r/   r   r   r'     s   
$z"BitForImageClassification.__init__Nrv   labelsr   r   r   c           
      C   s   |dur|n| j j}| j|||d}|r|jn|d }| |}d}|dur.| ||| j }|sD|f|dd  }	|durB|f|	 S |	S t|||jdS )a0  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   )losslogitsrg   )rJ   r   r   r   r  loss_functionr
   rg   )
r+   rv   r  r   r   outputsr   r  r  r   r   r   r   r<     s   
z!BitForImageClassification.forward)NNNN)r>   r?   r@   r'   r   r   r~   FloatTensor
LongTensorr   r
   r<   rB   r   r   r/   r   r     s$    r   zL
    BiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                
       sJ   e Zd ZdZ fddZe	ddedee dee de	fd	d
Z
  ZS )BitBackboneFc                    s>   t  | t  | t|| _|jg|j | _|   d S rM   )	r&   r'   _init_backboner   r   rl   r   num_featuresr   ru   r/   r   r   r'     s
   
zBitBackbone.__init__Nrv   r   r   r   c           
      C   s   |dur|n| j j}|dur|n| j j}| j|ddd}|j}d}t| jD ]\}}|| jv r6||| f7 }q&|sF|f}	|rD|	|jf7 }	|	S t||rP|jddS dddS )aN  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("google/bit-50")
        >>> model = AutoBackbone.from_pretrained("google/bit-50")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   r   )feature_mapsrg   
attentions)	rJ   r   r   r   rg   r   stage_namesout_featuresr   )
r+   rv   r   r   r  rg   r  idxr   r   r   r   r   r<     s.   
zBitBackbone.forwardr   )r>   r?   r@   has_attentionsr'   r   r   r   r   r   r<   rB   r   r   r/   r   r
    s    
r
  )r   r   r   r
  )Nr   r   r   )r2   F)r   )7rA   r_   rP   typingr   numpyr   r~   r   r   activationsr   modeling_outputsr   r   r	   r
   modeling_utilsr   utilsr   r   utils.backbone_utilsr   configuration_bitr   
get_loggerr>   loggerr   r   r    r   r!   r   rC   Moduler(   	MaxPool2dr]   rh   r   r   r   r   r   r   r   r   r   r   r   r   r
  __all__r   r   r   r   <module>   sR   
)03 3
DIJF1.>