o
    ei@                     @   sz  d Z ddlZddlZddlmZmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"eG dd deZ#eG dd de#Z$edd G d!d" d"e#Z%ed#d G d$d% d%ee#Z&g d&Z'dS )'zPyTorch ResNet model.    N)Tensornn   )initialization)ACT2FN)BackboneMixin)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging   )ResNetConfigc                       sH   e Zd Z	ddededededef
 fd	d
ZdedefddZ  ZS )ResNetConvLayerr   r   reluin_channelsout_channelskernel_sizestride
activationc                    sV   t    tj|||||d dd| _t|| _|d ur$t| | _	d S t | _	d S )N   F)r   r   paddingbias)
super__init__r   Conv2dconvolutionBatchNorm2dnormalizationr   Identityr   )selfr   r   r   r   r   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/resnet/modeling_resnet.pyr   '   s   
$zResNetConvLayer.__init__inputreturnc                 C   s"   |  |}| |}| |}|S N)r   r    r   r"   r'   hidden_stater%   r%   r&   forward1   s   


zResNetConvLayer.forward)r   r   r   )	__name__
__module____qualname__intstrr   r   r,   __classcell__r%   r%   r#   r&   r   &   s    
r   c                       s8   e Zd ZdZdef fddZdedefddZ  ZS )	ResNetEmbeddingszO
    ResNet Embeddings (stem) composed of a single aggressive convolution.
    configc                    sB   t    t|j|jdd|jd| _tjdddd| _	|j| _d S )N   r   )r   r   r   r   r   )r   r   r   )
r   r   r   num_channelsembedding_size
hidden_actembedderr   	MaxPool2dpoolerr"   r4   r#   r%   r&   r   =   s   
zResNetEmbeddings.__init__pixel_valuesr(   c                 C   s4   |j d }|| jkrtd| |}| |}|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)shaper6   
ValueErrorr9   r;   )r"   r=   r6   	embeddingr%   r%   r&   r,   E   s   



zResNetEmbeddings.forward)	r-   r.   r/   __doc__r   r   r   r,   r2   r%   r%   r#   r&   r3   8   s    r3   c                       sB   e Zd ZdZddededef fddZded	efd
dZ  ZS )ResNetShortCutz
    ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
    downsample the input using `stride=2`.
    r   r   r   r   c                    s0   t    tj||d|dd| _t|| _d S )Nr   F)r   r   r   )r   r   r   r   r   r   r    )r"   r   r   r   r#   r%   r&   r   V   s   
zResNetShortCut.__init__r'   r(   c                 C   s   |  |}| |}|S r)   )r   r    r*   r%   r%   r&   r,   [   s   

zResNetShortCut.forward)r   )	r-   r.   r/   rA   r0   r   r   r,   r2   r%   r%   r#   r&   rB   P   s    rB   c                	       s<   e Zd ZdZddedededef fdd	Zd
d Z  ZS )ResNetBasicLayerzO
    A classic ResNet's residual layer composed by two `3x3` convolutions.
    r   r   r   r   r   r   c                    sf   t    ||kp|dk}|rt|||dnt | _tt|||dt||d d| _t	| | _
d S )Nr   r   r   r   r   rB   r   r!   shortcut
Sequentialr   layerr   r   )r"   r   r   r   r   should_apply_shortcutr#   r%   r&   r   f   s   
zResNetBasicLayer.__init__c                 C   .   |}|  |}| |}||7 }| |}|S r)   rI   rG   r   r"   r+   residualr%   r%   r&   r,   r      


zResNetBasicLayer.forward)r   r   )	r-   r.   r/   rA   r0   r1   r   r,   r2   r%   r%   r#   r&   rC   a   s     rC   c                       sL   e Zd ZdZ				ddededed	ed
edef fddZdd Z  Z	S )ResNetBottleNeckLayera  
    A classic ResNet's bottleneck layer composed by three `3x3` convolutions.

    The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
    convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`. If
    `downsample_in_bottleneck` is true, downsample will be in the first layer instead of the second layer.
    r   r      Fr   r   r   r   	reductiondownsample_in_bottleneckc           	   
      s   t    ||kp|dk}|| }|rt|||dnt | _tt||d|r)|nddt|||s3|nddt||dd d| _t	| | _
d S )Nr   rD   )r   r   )r   r   rF   )	r"   r   r   r   r   rR   rS   rJ   reduces_channelsr#   r%   r&   r      s   
	zResNetBottleNeckLayer.__init__c                 C   rK   r)   rL   rM   r%   r%   r&   r,      rO   zResNetBottleNeckLayer.forward)r   r   rQ   F)
r-   r.   r/   rA   r0   r1   boolr   r,   r2   r%   r%   r#   r&   rP   {   s(    rP   c                       sN   e Zd ZdZ		ddededededef
 fdd	Zd
edefddZ  Z	S )ResNetStagez4
    A ResNet stage composed by stacked layers.
    r   r4   r   r   r   depthc                    s   t     jdkrtnt jdkr|| j jd}n	|| jd}tj|g fddt	|d D R  | _
d S )N
bottleneck)r   r   rS   )r   r   c                    s   g | ]
} j d qS )rE   )r8   ).0_r4   rI   r   r%   r&   
<listcomp>   s    z(ResNetStage.__init__.<locals>.<listcomp>r   )r   r   
layer_typerP   rC   r8   rS   r   rH   rangelayers)r"   r4   r   r   r   rW   first_layerr#   r[   r&   r      s    

zResNetStage.__init__r'   r(   c                 C   s   |}| j D ]}||}q|S r)   )r_   )r"   r'   r+   rI   r%   r%   r&   r,      s   

zResNetStage.forward)r   r   )
r-   r.   r/   rA   r   r0   r   r   r,   r2   r%   r%   r#   r&   rV      s     	rV   c                	       s@   e Zd Zdef fddZ	ddededed	efd
dZ  Z	S )ResNetEncoderr4   c              	      s   t    tg | _| jt||j|jd |j	rdnd|j
d d t|j|jdd  }t||j
dd  D ]\\}}}| jt||||d q9d S )Nr   r   r   )r   rW   )rW   )r   r   r   
ModuleListstagesappendrV   r7   hidden_sizesdownsample_in_first_stagedepthszip)r"   r4   in_out_channelsr   r   rW   r#   r%   r&   r      s   
	 zResNetEncoder.__init__FTr+   output_hidden_statesreturn_dictr(   c                 C   sb   |rdnd }| j D ]}|r||f }||}q	|r||f }|s+tdd ||fD S t||dS )Nr%   c                 s   s    | ]	}|d ur|V  qd S r)   r%   )rY   vr%   r%   r&   	<genexpr>   s    z(ResNetEncoder.forward.<locals>.<genexpr>)last_hidden_statehidden_states)rc   tupler	   )r"   r+   rj   rk   ro   stage_moduler%   r%   r&   r,      s   



zResNetEncoder.forward)FT)
r-   r.   r/   r   r   r   rU   r	   r,   r2   r%   r%   r#   r&   ra      s    ra   c                   @   s:   e Zd ZU eed< dZdZdZddgZe	
 dd Zd	S )
ResNetPreTrainedModelr4   resnetr=   )imager   rB   c                 C   s   t |tjrtj|jddd d S t |tjrKtj|jt	dd |j
d urItjj|j\}}|dkr<dt	| nd}t|j
| | d S d S d|jjv ryt|j t|j
 t|j t|j t|d	d d ur{t|j d S d S d S )
Nfan_outr   )modenonlinearity   )ar   r   	BatchNormnum_batches_tracked)
isinstancer   r   initkaiming_normal_weightLinearkaiming_uniform_mathsqrtr   torch_calculate_fan_in_and_fan_outuniform_r$   r-   ones_zeros_running_meanrunning_vargetattrr{   )r"   modulefan_inrZ   boundr%   r%   r&   _init_weights   s$   
z#ResNetPreTrainedModel._init_weightsN)r-   r.   r/   r   __annotations__base_model_prefixmain_input_nameinput_modalities_no_split_modulesr   no_gradr   r%   r%   r%   r&   rr      s   
 rr   c                
       sH   e Zd Z fddZe		d
dededB dedB defdd	Z  Z	S )ResNetModelc                    s>   t  | || _t|| _t|| _td| _	| 
  d S )N)r   r   )r   r   r4   r3   r9   ra   encoderr   AdaptiveAvgPool2dr;   	post_initr<   r#   r%   r&   r     s   

zResNetModel.__init__Nr=   rj   rk   r(   c           	      K   s|   |d ur|n| j j}|d ur|n| j j}| |}| j|||d}|d }| |}|s6||f|dd   S t|||jdS )Nrj   rk   r   r   )rn   pooler_outputro   )r4   rj   use_return_dictr9   r   r;   r
   ro   )	r"   r=   rj   rk   kwargsembedding_outputencoder_outputsrn   pooled_outputr%   r%   r&   r,     s    	

zResNetModel.forwardNN)
r-   r.   r/   r   r   r   rU   r
   r,   r2   r%   r%   r#   r&   r     s    	r   z
    ResNet Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       s\   e Zd Z fddZe				ddejdB dejdB dedB dedB de	f
d	d
Z
  ZS )ResNetForImageClassificationc                    s^   t  | |j| _t|| _tt |jdkr#t|j	d |jnt
 | _|   d S )Nr   )r   r   
num_labelsr   rs   r   rH   Flattenr   re   r!   
classifierr   r<   r#   r%   r&   r   D  s   
$z%ResNetForImageClassification.__init__Nr=   labelsrj   rk   r(   c                 K   s   |dur|n| j j}| j|||d}|r|jn|d }| |}d}	|dur.| ||| j }	|sD|f|dd  }
|	durB|	f|
 S |
S t|	||jdS )a0  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   )losslogitsro   )r4   r   rs   r   r   loss_functionr   ro   )r"   r=   r   rj   rk   r   outputsr   r   r   outputr%   r%   r&   r,   P  s   
z$ResNetForImageClassification.forward)NNNN)r-   r.   r/   r   r   r   FloatTensor
LongTensorrU   r   r,   r2   r%   r%   r#   r&   r   =  s$    r   zO
    ResNet backbone, to be used with frameworks like DETR and MaskFormer.
    c                
       sL   e Zd ZdZ fddZe		ddededB dedB defd	d
Z	  Z
S )ResNetBackboneFc                    s<   t  | |jg|j | _t|| _t|| _| 	  d S r)   )
r   r   r7   re   num_featuresr3   r9   ra   r   r   r<   r#   r%   r&   r   z  s
   

zResNetBackbone.__init__Nr=   rj   rk   r(   c                 K   s   |dur|n| j j}|dur|n| j j}| |}| j|ddd}|j}d}t| jD ]\}	}
|
| jv r;|||	 f7 }q+|sK|f}|rI||jf7 }|S t	||rU|jddS dddS )ar  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/resnet-50", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```NTr   r%   )feature_mapsro   
attentions)
r4   r   rj   r9   r   ro   	enumeratestage_namesout_featuresr   )r"   r=   rj   rk   r   r   r   ro   r   idxstager   r%   r%   r&   r,     s0   "

zResNetBackbone.forwardr   )r-   r.   r/   has_attentionsr   r   r   rU   r   r,   r2   r%   r%   r#   r&   r   r  s    
r   )r   r   rr   r   )(rA   r   r   r   r    r   r}   activationsr   backbone_utilsr   modeling_outputsr   r	   r
   r   modeling_utilsr   utilsr   r   configuration_resnetr   
get_loggerr-   loggerModuler   r3   rB   rC   rP   rV   ra   rr   r   r   r   __all__r%   r%   r%   r&   <module>   s@   
*&)+/L