o
    	۷itH                  	   @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ eeZd0dejdededejfddZG dd dejZG dd dejZ G dd dej!Z"G dd dejZ#G dd dejZ$G d d! d!ejZ%G d"d# d#ejZ&eG d$d% d%eZ'eG d&d' d'e'Z(ed(d)G d*d+ d+e'Z)ed,d)G d-d. d.e'eZ*g d/Z+dS )1zPyTorch ConvNextV2 model.    )OptionalN)nn   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin)can_return_tuple   )ConvNextV2Config        Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutput r!   h/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/convnextv2/modeling_convnextv2.py	drop_path(   s   
r#   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )ConvNextV2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                    s   t    || _d S N)super__init__r   )selfr   	__class__r!   r"   r'   @   s   

zConvNextV2DropPath.__init__hidden_statesc                 C   s   t || j| jS r%   )r#   r   r   )r(   r+   r!   r!   r"   forwardD   s   zConvNextV2DropPath.forwardc                 C   s   d| j  S )Nzp=)r   )r(   r!   r!   r"   
extra_reprG   s   zConvNextV2DropPath.extra_reprr%   )__name__
__module____qualname____doc__r   floatr'   r   Tensorr,   strr-   __classcell__r!   r!   r)   r"   r$   =   s
    r$   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	ConvNextV2GRNz)GRN (Global Response Normalization) layerdimc                    s>   t    ttddd|| _ttddd|| _d S )Nr   )r&   r'   r   	Parameterr   zerosweightbias)r(   r7   r)   r!   r"   r'   N   s   
zConvNextV2GRN.__init__r+   r   c                 C   sF   t jj|dddd}||jdddd  }| j||  | j | }|S )N   )r   r<   T)ordr7   keepdim)r7   r>   ư>)r   linalgvector_normmeanr:   r;   )r(   r+   global_featuresnorm_featuresr!   r!   r"   r,   S   s   zConvNextV2GRN.forward)
r.   r/   r0   r1   intr'   r   FloatTensorr,   r5   r!   r!   r)   r"   r6   K   s    r6   c                       sB   e Zd ZdZddd fdd
Zdejdejf fd	d
Z  ZS )ConvNextV2LayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    r@   channels_lastepsdata_formatc                   s8   t  j|fd|i| |dvrtd| || _d S )NrK   )rI   channels_firstzUnsupported data format: )r&   r'   NotImplementedErrorrL   )r(   normalized_shaperK   rL   kwargsr)   r!   r"   r'   c   s   
zConvNextV2LayerNorm.__init__featuresr   c                    sJ   | j dkr|dddd}t |}|dddd}|S t |}|S )z
        Args:
            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
        rM   r   r<   r   r   )rL   permuter&   r,   )r(   rQ   r)   r!   r"   r,   i   s   
zConvNextV2LayerNorm.forward	r.   r/   r0   r1   r'   r   r3   r,   r5   r!   r!   r)   r"   rH   ]   s    "rH   c                       s6   e Zd ZdZ fddZdejdejfddZ  Z	S )ConvNextV2EmbeddingszThis class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    c                    sL   t    tj|j|jd |j|jd| _t|jd ddd| _	|j| _d S )Nr   kernel_sizestrider@   rM   rJ   )
r&   r'   r   Conv2dnum_channelshidden_sizes
patch_sizepatch_embeddingsrH   	layernormr(   configr)   r!   r"   r'   }   s   
zConvNextV2Embeddings.__init__pixel_valuesr   c                 C   s4   |j d }|| jkrtd| |}| |}|S )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rY   
ValueErrorr\   r]   )r(   r`   rY   
embeddingsr!   r!   r"   r,      s   



zConvNextV2Embeddings.forward)
r.   r/   r0   r1   r'   r   rG   r3   r,   r5   r!   r!   r)   r"   rT   x   s    rT   c                       s8   e Zd ZdZd	 fdd	ZdejdejfddZ  ZS )
ConvNextV2Layera5  This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: [DwConv, LayerNorm (channels_first), Conv, GELU,1x1 Conv]; all in (N, C,
    H, W) (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    r   c                    s   t    tj||dd|d| _t|dd| _t|d| | _t	|j
 | _td| | _td| || _|dkrAt|| _d S t | _d S )N   r   )rV   paddinggroupsr@   rK      r   )r&   r'   r   rX   dwconvrH   r]   Linearpwconv1r   
hidden_actactr6   grnpwconv2r$   Identityr#   )r(   r_   r7   r#   r)   r!   r"   r'      s   
$zConvNextV2Layer.__init__rQ   r   c                 C   sr   |}|  |}|dddd}| |}| |}| |}| |}| |}|dddd}|| | }|S )Nr   r<   r   r   )ri   rR   r]   rk   rm   rn   ro   r#   )r(   rQ   residualr!   r!   r"   r,      s   





zConvNextV2Layer.forward)r   rS   r!   r!   r)   r"   rc      s    rc   c                       s8   e Zd ZdZd
 fdd	Zdejdejfdd	Z  ZS )ConvNextV2Stagea  ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates(`list[float]`): Stochastic depth rates for each layer.
    r<   Nc              	      s   t    |ks|dkr"tt|dddtj|||dg| _nt | _p-dg| t fddt|D | _d S )	Nr   r@   rM   rJ   rU   r   c                    s   g | ]}t  | d qS ))r7   r#   )rc   ).0jr_   drop_path_ratesout_channelsr!   r"   
<listcomp>   s    z,ConvNextV2Stage.__init__.<locals>.<listcomp>)	r&   r'   r   
ModuleListrH   rX   downsampling_layerrangelayers)r(   r_   in_channelsrw   rV   rW   depthrv   r)   ru   r"   r'      s   


zConvNextV2Stage.__init__rQ   r   c                 C   s,   | j D ]}||}q| jD ]}||}q|S r%   )rz   r|   )r(   rQ   layerr!   r!   r"   r,      s
   



zConvNextV2Stage.forward)r<   r<   r<   NrS   r!   r!   r)   r"   rr      s    
rr   c                       s<   e Zd Z fddZ	d	dejdee defddZ	  Z
S )
ConvNextV2Encoderc              	      s   t    t | _dd tjd|jt|j	dd
|j	D }|jd }t|jD ]$}|j| }t||||dkr;dnd|j	| || d}| j| |}q*d S )	Nc                 S   s   g | ]}|  qS r!   )tolist)rs   xr!   r!   r"   rx      s    z.ConvNextV2Encoder.__init__.<locals>.<listcomp>r   cpu)r   r<   r   )r}   rw   rW   r~   rv   )r&   r'   r   ry   stagesr   linspacedrop_path_ratesumdepthssplitrZ   r{   
num_stagesrr   append)r(   r_   rv   prev_chsiout_chsstager)   r!   r"   r'      s&   

 

zConvNextV2Encoder.__init__Fr+   output_hidden_statesr   c                 C   s@   |r|gnd }| j D ]}||}|d ur|| q
t||dS )N)last_hidden_stater+   )r   r   r   )r(   r+   r   all_hidden_stateslayer_moduler!   r!   r"   r,      s   

zConvNextV2Encoder.forward)F)r.   r/   r0   r'   r   r3   r   boolr   r,   r5   r!   r!   r)   r"   r      s    r   c                   @   s,   e Zd ZU eed< dZdZdgZdd ZdS )ConvNextV2PreTrainedModelr_   
convnextv2r`   rc   c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjtfr:|j	j
  |jjd dS t |trM|jj
  |j	j
  dS dS )zInitialize the weightsr   )rC   stdNg      ?)
isinstancer   rj   rX   r:   datanormal_r_   initializer_ranger;   zero_	LayerNormrH   fill_r6   )r(   moduler!   r!   r"   _init_weights  s   

z'ConvNextV2PreTrainedModel._init_weightsN)	r.   r/   r0   r   __annotations__base_model_prefixmain_input_name_no_split_modulesr   r!   r!   r!   r"   r     s   
 r   c                	       sH   e Zd Z fddZee	d	deej dee	 de
fddZ  ZS )
ConvNextV2Modelc                    sJ   t  | || _t|| _t|| _tj|j	d |j
d| _|   d S )Nr?   rg   )r&   r'   r_   rT   rb   r   encoderr   r   rZ   layer_norm_epsr]   	post_initr^   r)   r!   r"   r'     s   

zConvNextV2Model.__init__Nr`   r   r   c                 C   sb   |d u r| j j}|d u rtd| |}| j||d}|j}| |ddg}t|||j	dS )Nz You have to specify pixel_valuesr   r?   )r   pooler_outputr+   )
r_   r   ra   rb   r   r   r]   rC   r   r+   )r(   r`   r   embedding_outputencoder_outputsr   pooled_outputr!   r!   r"   r,   +  s   
zConvNextV2Model.forwardNN)r.   r/   r0   r'   r   r   r   r   rG   r   r   r,   r5   r!   r!   r)   r"   r     s    r   z
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                	       sN   e Zd ZdZ fddZee	d
deej	 deej
 defdd	Z  ZS ) ConvNextV2ForImageClassificationFc                    sV   t  | |j| _t|| _|jdkr t|jd |j| _nt	 | _| 
  d S )Nr   r?   )r&   r'   
num_labelsr   r   r   rj   rZ   
classifierrp   r   r^   r)   r!   r"   r'   P  s   


z)ConvNextV2ForImageClassification.__init__Nr`   labelsr   c                 K   sP   | j |fi |}|j}| |}d}|dur | j||| jd}t|||jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   pooled_logitsr_   )losslogitsr+   )r   r   r   loss_functionr_   r	   r+   )r(   r`   r   rP   outputsr   r   r   r!   r!   r"   r,   _  s   
z(ConvNextV2ForImageClassification.forwardr   )r.   r/   r0   accepts_loss_kwargsr'   r   r   r   r   rG   
LongTensorr	   r,   r5   r!   r!   r)   r"   r   F  s    r   zT
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                	       sH   e Zd ZdZ fddZee	d
dejde	e
 defdd	Z  ZS )ConvNextV2BackboneFc                    s   t  | t  | t|| _t|| _|jd g|j | _i }t	| j
| jD ]\}}t|dd||< q)t|| _|   d S )Nr   rM   )rL   )r&   r'   _init_backbonerT   rb   r   r   rZ   num_featureszip_out_featureschannelsrH   r   
ModuleDicthidden_states_normsr   )r(   r_   r   r   rY   r)   r!   r"   r'     s   

zConvNextV2Backbone.__init__Nr`   r   r   c           	      C   s   |du r| j j}| |}| j|dd}|j}g }t| j|D ]\}}|| jv r4| j| |}|	| qt
t||r?|dS ddS )ar  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr   )feature_mapsr+   )r_   r   rb   r   r+   r   stage_namesout_featuresr   r   r   tuple)	r(   r`   r   r   r   r+   r   r   hidden_stater!   r!   r"   r,     s"   


zConvNextV2Backbone.forwardr%   )r.   r/   r0   has_attentionsr'   r   r   r   r3   r   r   r   r,   r5   r!   r!   r)   r"   r   y  s    r   )r   r   r   r   )r   F),r1   typingr   r   r   activationsr   modeling_outputsr   r   r   r	   modeling_utilsr
   utilsr   r   utils.backbone_utilsr   utils.genericr   configuration_convnextv2r   
get_loggerr.   loggerr3   r2   r   r#   Moduler$   r6   r   rH   rT   rc   rr   r   r   r   r   r   __all__r!   r!   r!   r"   <module>   sB   
 ,%#),@