o
    ߥi                     @   s   d dl Z d dlZd dlmZ d dlm  mZ d dlmZm	Z	 d dl
mZ G dd dejZG dd dejZG dd	 d	ejZedddZdS )    N)DropPathtrunc_normal_)register_modelc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	Blocka   ConvNeXt Block. There are two equivalent implementations:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
    We use (2) as we find it slightly faster in PyTorch

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
            ư>c                    s   t    tj||dd|d| _t|dd| _t|d| | _t	 | _
td| || _|dkr>tj|t| dd	nd | _|d
krLt|| _d S t | _d S )N      )kernel_sizepaddinggroupsr   )eps   r   T)requires_gradr   )super__init__nnConv2ddwconv	LayerNormnormLinearpwconv1GELUactpwconv2	Parametertorchonesgammar   Identity	drop_path)selfdimr!   layer_scale_init_value	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/vldoc/convnext.pyr      s0   



zBlock.__init__c                 C   s|   |}|  |}|dddd}| |}| |}| |}| |}| jd ur-| j| }|dddd}|| | }|S )Nr      r	      )r   permuter   r   r   r   r   r!   )r"   xinputr'   r'   r(   forward)   s   






zBlock.forward)r   r   __name__
__module____qualname____doc__r   r.   __classcell__r'   r'   r%   r(   r      s    r   c                       sD   e Zd ZdZdg dg dddf fdd	Zd	d
 Zdd Z  ZS )ConvNeXta   ConvNeXt
        A PyTorch impl of : `A ConvNet for the 2020s`  -
          https://arxiv.org/pdf/2201.03545.pdf
    Args:
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
        dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
        drop_path_rate (float): Stochastic depth rate. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
    r	   r	   r	   	   r	   `      i  i   r   r   c           	   
      s(  t    t | _ttj|d dddtd ddd}| j| t	dD ]"tt dddtj d  d	d	d}| j| q*t | _
d
d td|t|D d t	dD ]"tj fddt	| D  }| j
|  | 7  qf| _| | j d S )Nr   r   )r
   strider   channels_first)r   data_formatr	   r*   r)   c                 S   s   g | ]}|  qS r'   )item).0r,   r'   r'   r(   
<listcomp>`   s    z%ConvNeXt.__init__.<locals>.<listcomp>c                    s&   g | ]}t   |  d qS ))r#   r!   r$   )r   )r?   jcurdimsdp_ratesir$   r'   r(   r@   e   s    
)r   r   r   
ModuleListdownsample_layers
Sequentialr   r   appendrangestagesr   linspacesumrD   apply_init_weights)	r"   in_chansdepthsrD   drop_path_rater$   stemdownsample_layerstager%   rB   r(   r   G   s4   



zConvNeXt.__init__c                 C   s8   t |tjtjfrt|jdd tj|jd d S d S )Ng{Gz?)stdr   )	
isinstancer   r   r   r   weightinit	constant_bias)r"   mr'   r'   r(   rP   t   s   zConvNeXt._init_weightsc                 C   s@   g }t dD ]}| j| |}| j| |}|| qt|S )Nr   )rK   rH   rL   rJ   tuple)r"   r,   xsrF   r'   r'   r(   r.   y   s   zConvNeXt.forward)r0   r1   r2   r3   r   rP   r.   r4   r'   r'   r%   r(   r5   9   s    -r5   c                       s.   e Zd ZdZ		d fdd	Zdd Z  ZS )	r   aF   LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
    shape (batch_size, height, width, channels) while channels_first corresponds to inputs
    with shape (batch_size, channels, height, width).
    r   channels_lastc                    sT   t    tt|| _tt|| _|| _	|| _
| j
dvr$t|f| _d S )N)r`   r<   )r   r   r   r   r   r   rY   zerosr\   r   r=   NotImplementedErrornormalized_shape)r"   rc   r   r=   r%   r'   r(   r      s   

zLayerNorm.__init__c                 C   s   | j dkrt|| j| j| j| jS | j dkrN|jddd}|| djddd}|| t	
|| j  }| jd d d d f | | jd d d d f  }|S d S )Nr`   r<   r*   T)keepdimr)   )r=   F
layer_normrc   rY   r\   r   meanpowr   sqrt)r"   r,   usr'   r'   r(   r.      s   

,zLayerNorm.forward)r   r`   r/   r'   r'   r%   r(   r      s    r   Fc                 K   s    t dg dg dd|}|S )Nr6   r8   )rR   rD   r'   )r5   )
pretrainedin_22kkwargsmodelr'   r'   r(   convnext_tiny   s   rp   )FF)osr   torch.nnr   torch.nn.functional
functionalre   timm.models.layersr   r   timm.models.registryr   Moduler   r5   r   rp   r'   r'   r'   r(   <module>   s   ,L 