o
    wiL                  	   @   s  d Z ddlmZmZ ddlZddlZddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ eeZd.dejdede dejfddZ!G dd dej"Z#G dd dej"Z$G dd dej"Z%G dd dej"Z&G dd dej"Z'G d d! d!ej"Z(eG d"d# d#eZ)eG d$d% d%e)Z*ed&d'G d(d) d)e)Z+ed*d'G d+d, d,e)eZ,g d-Z-dS )/zPyTorch ConvNext model.    )OptionalUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin   )ConvNextConfig        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r$   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/convnext/modeling_convnext.py	drop_path)   s   
r&   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )ConvNextDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r$   r%   r*   A   s   

zConvNextDropPath.__init__hidden_statesc                 C   s   t || j| jS r(   )r&   r   r   r+   r.   r$   r$   r%   forwardE   s   zConvNextDropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r+   r$   r$   r%   
extra_reprH   s   zConvNextDropPath.extra_reprr(   )__name__
__module____qualname____doc__r   floatr*   r   Tensorr0   strr1   __classcell__r$   r$   r,   r%   r'   >   s
    r'   c                       s8   e Zd ZdZd
 fdd	Zdejdejfdd	Z  ZS )ConvNextLayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    ư>channels_lastc                    s`   t    tt|| _tt|| _|| _	|| _
| j
dvr*td| j
 |f| _d S )N)r<   channels_firstzUnsupported data format: )r)   r*   r   	Parameterr   onesweightzerosbiasepsdata_formatNotImplementedErrornormalized_shape)r+   rF   rC   rD   r,   r$   r%   r*   R   s   

zConvNextLayerNorm.__init__xr   c                 C   s   | j dkrtjj|| j| j| j| j}|S | j dkr]|j	}|
 }|jddd}|| djddd}|| t|| j  }|j|d}| jd d d d f | | jd d d d f  }|S )Nr<   r=   r   T)keepdim   )r   )rD   r   r   
functional
layer_normrF   r@   rB   rC   r   r6   meanpowsqrtto)r+   rG   input_dtypeusr$   r$   r%   r0   \   s   
	
,zConvNextLayerNorm.forward)r;   r<   )	r2   r3   r4   r5   r*   r   r7   r0   r9   r$   r$   r,   r%   r:   L   s    
r:   c                       s6   e Zd ZdZ fddZdejdejfddZ  Z	S )ConvNextEmbeddingszThis class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    c                    sL   t    tj|j|jd |j|jd| _t|jd ddd| _	|j| _d S )Nr   kernel_sizestrider;   r=   rC   rD   )
r)   r*   r   Conv2dnum_channelshidden_sizes
patch_sizepatch_embeddingsr:   	layernormr+   configr,   r$   r%   r*   o   s   
zConvNextEmbeddings.__init__pixel_valuesr   c                 C   s4   |j d }|| jkrtd| |}| |}|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rY   
ValueErrorr\   r]   )r+   r`   rY   
embeddingsr$   r$   r%   r0   w   s   



zConvNextEmbeddings.forward
r2   r3   r4   r5   r*   r   FloatTensorr7   r0   r9   r$   r$   r,   r%   rS   j   s    rS   c                       s8   e Zd ZdZd	 fdd	ZdejdejfddZ  Z	S )
ConvNextLayera3  This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    r   c                    s   t    tj||dd|d| _t|dd| _t|d| | _t	|j
 | _td| || _|jdkrAtj|jt| dd	nd | _|d
krOt|| _d S t | _d S )N   r   )rU   paddinggroupsr;   rC      r   T)requires_gradr   )r)   r*   r   rX   dwconvr:   r]   Linearpwconv1r	   
hidden_actactpwconv2layer_scale_init_valuer>   r   r?   layer_scale_parameterr'   Identityr&   )r+   r_   dimr&   r,   r$   r%   r*      s   

$zConvNextLayer.__init__r.   r   c                 C   s|   |}|  |}|dddd}| |}| |}| |}| |}| jd ur-| j| }|dddd}|| | }|S )Nr   rI   r   r   )rl   permuter]   rn   rp   rq   rs   r&   )r+   r.   r   rG   r$   r$   r%   r0      s   






zConvNextLayer.forward)r   rc   r$   r$   r,   r%   re      s    re   c                       s8   e Zd ZdZd
 fdd	Zdejdejfdd	Z  Z	S )ConvNextStagea  ConvNeXT stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextConfig`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
    rI   Nc              	      s   t    |ks|dkr!tt|dddtj|||d| _nt | _p,dg| tj fddt|D  | _	d S )	Nr   r;   r=   rW   rT   r   c                    s   g | ]}t  | d qS ))ru   r&   )re   ).0jr_   drop_path_ratesout_channelsr$   r%   
<listcomp>   s    z*ConvNextStage.__init__.<locals>.<listcomp>)
r)   r*   r   
Sequentialr:   rX   downsampling_layerrt   rangelayers)r+   r_   in_channelsr|   rU   rV   depthr{   r,   rz   r%   r*      s   


zConvNextStage.__init__r.   r   c                 C   s   |  |}| |}|S r(   )r   r   r/   r$   r$   r%   r0      s   

zConvNextStage.forward)rI   rI   rI   Nrc   r$   r$   r,   r%   rw      s    
rw   c                       sN   e Zd Z fddZ		ddejdee dee dee	e
f fd	d
Z  ZS )ConvNextEncoderc              	      s   t    t | _dd tjd|jt|j	dd
|j	D }|jd }t|jD ]$}|j| }t||||dkr;dnd|j	| || d}| j| |}q*d S )	Nc                 S   s   g | ]}|  qS r$   )tolist)rx   rG   r$   r$   r%   r}      s    z,ConvNextEncoder.__init__.<locals>.<listcomp>r   cpu)r   rI   r   )r   r|   rV   r   r{   )r)   r*   r   
ModuleListstagesr   linspacedrop_path_ratesumdepthssplitrZ   r   
num_stagesrw   append)r+   r_   r{   prev_chsiout_chsstager,   r$   r%   r*      s&   

 

zConvNextEncoder.__init__FTr.   output_hidden_statesreturn_dictr   c                 C   sj   |rdnd }t | jD ]\}}|r||f }||}q|r"||f }|s/tdd ||fD S t||dS )Nr$   c                 s   s    | ]	}|d ur|V  qd S r(   r$   )rx   vr$   r$   r%   	<genexpr>   s    z*ConvNextEncoder.forward.<locals>.<genexpr>)last_hidden_stater.   )	enumerater   tupler   )r+   r.   r   r   all_hidden_statesr   layer_moduler$   r$   r%   r0      s   


zConvNextEncoder.forward)FT)r2   r3   r4   r*   r   rd   r   boolr   r   r   r0   r9   r$   r$   r,   r%   r      s    
r   c                   @   s&   e Zd ZeZdZdZdgZdd ZdS )ConvNextPreTrainedModelconvnextr`   re   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjtfr:|j	j
  |jjd dS t |trO|jdurQ|jj| jj dS dS dS )zInitialize the weightsr   )rL   stdNg      ?)
isinstancer   rm   rX   r@   datanormal_r_   initializer_rangerB   zero_	LayerNormr:   fill_re   rs   rr   )r+   moduler$   r$   r%   _init_weights  s   


z%ConvNextPreTrainedModel._init_weightsN)	r2   r3   r4   r   config_classbase_model_prefixmain_input_name_no_split_modulesr   r$   r$   r$   r%   r      s    r   c                       sX   e Zd Z fddZe			d
deej dee dee de	e
ef fdd	Z  ZS )ConvNextModelc                    sJ   t  | || _t|| _t|| _tj|j	d |j
d| _|   d S )Nri   )r)   r*   r_   rS   rb   r   encoderr   r   rZ   layer_norm_epsr]   	post_initr^   r,   r$   r%   r*     s   

zConvNextModel.__init__Nr`   r   r   r   c                 C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}|d }| |ddg}|sC||f|dd   S t|||j	dS )Nz You have to specify pixel_valuesr   r   r   r   r   )r   pooler_outputr.   )
r_   r   use_return_dictra   rb   r   r]   rL   r   r.   )r+   r`   r   r   embedding_outputencoder_outputsr   pooled_outputr$   r$   r%   r0   $  s(   
zConvNextModel.forward)NNN)r2   r3   r4   r*   r   r   r   rd   r   r   r   r   r0   r9   r$   r$   r,   r%   r     s    
r   z
    ConvNext Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                       sd   e Zd Z fddZe				ddeej deej dee	 dee	 de
eef f
d	d
Z  ZS )ConvNextForImageClassificationc                    sR   t  | |j| _t|| _|jdkrt|jd |jnt | _	| 
  d S )Nr   r   )r)   r*   
num_labelsr   r   r   rm   rZ   rt   
classifierr   r^   r,   r$   r%   r*   Q  s   
$z'ConvNextForImageClassification.__init__Nr`   labelsr   r   r   c                 C   sb  |dur|n| j j}| j|||d}|r|jn|d }| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtj	ksG|jtj
krLd| j _nd| j _| j jdkrnt }	| jdkrh|	| | }n+|	||}n%| j jdkrt }	|	|d| j|d}n| j jdkrt }	|	||}|s|f|dd  }
|dur|f|
 S |
S t|||jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   
regressionsingle_label_classificationmulti_label_classificationr   rI   )losslogitsr.   )r_   r   r   r   r   problem_typer   r   r   longintr   squeezer   viewr   r   r.   )r+   r`   r   r   r   outputsr   r   r   loss_fctr#   r$   r$   r%   r0   _  s>   


"


z&ConvNextForImageClassification.forward)NNNN)r2   r3   r4   r*   r   r   r   rd   
LongTensorr   r   r   r   r0   r9   r$   r$   r,   r%   r   J  s$    
r   zQ
    ConvNeXt backbone, to be used with frameworks like DETR and MaskFormer.
    c                
       sJ   e Zd Z fddZe		d
dejdee dee de	fdd	Z
  ZS )ConvNextBackbonec                    s   t  | t  | t|| _t|| _|jd g|j | _i }t	| j
| jD ]\}}t|dd||< q)t|| _|   d S )Nr   r=   )rD   )r)   r*   _init_backbonerS   rb   r   r   rZ   num_featureszip_out_featureschannelsr:   r   
ModuleDicthidden_states_normsr   )r+   r_   r   r   rY   r,   r$   r%   r*     s   

zConvNextBackbone.__init__Nr`   r   r   r   c                 C   s   |dur|n| j j}|dur|n| j j}| |}| j|d|d}|r&|jn|d }d}t| j|D ]\}}	|| jv rG| j	| |	}	||	f7 }q2|sV|f}
|rT|
|f7 }
|
S t
||r_|ddS dddS )ah  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnext-tiny-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnext-tiny-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   r   r$   )feature_mapsr.   
attentions)r_   r   r   rb   r   r.   r   stage_namesout_featuresr   r
   )r+   r`   r   r   r   r   r.   r   r   hidden_stater#   r$   r$   r%   r0     s:   



zConvNextBackbone.forward)NN)r2   r3   r4   r*   r   r   r7   r   r   r
   r0   r9   r$   r$   r,   r%   r     s    r   )r   r   r   r   )r   F).r5   typingr   r   r   torch.utils.checkpointr   torch.nnr   r   r   activationsr	   modeling_outputsr
   r   r   r   modeling_utilsr   utilsr   r   utils.backbone_utilsr   configuration_convnextr   
get_loggerr2   loggerr7   r6   r   r&   Moduler'   r:   rS   re   rw   r   r   r   r   r   __all__r$   r$   r$   r%   <module>   sB   
 , 04FM