o
    پiu                     @   s$  d Z ddlmZmZmZmZ ddlZddlmZ ddl	m  m
Z ddlmZmZ ddlmZmZmZmZmZmZmZ ddlmZ ddlmZmZ d	gZG d
d dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd	 d	ejZ#dd Z$dd Z%dd Z&d9ddZ'd:dd Z(ee(d!d"e(d!d"e(d!d"e(d!d"e(d!d"e(d!d"e(d!d"e(d!d#d$d%d&d'Z)ed;d(e#fd)d*Z*ed;d(e#fd+d,Z+ed;d(e#fd-d.Z,ed;d(e#fd/d0Z-ed;d(e#fd1d2Z.ed;d(e#fd3d4Z/ed;d(e#fd5d6Z0ed;d(e#fd7d8Z1dS )<z
CoaT architecture.

Paper: Co-Scale Conv-Attentional Image Transformers - https://arxiv.org/abs/2104.06399

Official CoaT code at: https://github.com/mlpc-ucsd/CoaT

Modified from timm/models/vision_transformer.py
    )ListOptionalTupleUnionNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPath	to_2tupletrunc_normal__assert	LayerNorm   )build_model_with_cfg)register_modelgenerate_default_cfgsCoaTc                       s6   e Zd ZdZ fddZdeeef fddZ  ZS )ConvRelPosEncz+ Convolutional relative position encoding. c           	   	      s   t    t|tr||i}|| _nt|tr|| _nt t | _	g | _
| D ]5\}}d}||d |d   d }tj|  |  ||f||f||f|  d}| j	| | j
| q* fdd| j
D | _dS )aj  
        Initialization.
            Ch: Channels per head.
            h: Number of heads.
            window: Window size(s) in convolutional relative positional encoding. It can have two forms:
                1. An integer of window size, which assigns all attention heads with the same window s
                    size in ConvRelPosEnc.
                2. A dict mapping window size to #attention head splits (
                    e.g. {window size 1: #attention head split 1, window size 2: #attention head split 2})
                    It will apply different window size to the attention head splits.
        r      )kernel_sizepaddingdilationgroupsc                    s   g | ]}|  qS  r   .0xhead_chsr   D/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/coat.py
<listcomp>B       z*ConvRelPosEnc.__init__.<locals>.<listcomp>N)super__init__
isinstanceintwindowdict
ValueErrornn
ModuleList	conv_listhead_splitsitemsConv2dappendchannel_splits)	selfr    	num_headsr(   
cur_windowcur_head_splitr   padding_sizecur_conv	__class__r   r!   r%      s.   



zConvRelPosEnc.__init__sizec                 C   s  |j \}}}}|\}}	t|d||	  kd |d d d d dd d d f }
|d d d d dd d d f }|dd||| ||	}tj|| jdd}g }t| jD ]\}}|	|||  qTtj
|dd}||||||	 dd}|
| }t|d}|S )Nr    dim)r   r   r   r   r   r   )shaper   	transposereshapetorchsplitr2   	enumerater-   r1   catFpad)r3   qvr;   Br4   NCHWq_imgv_img
v_img_listconv_v_img_listiconv
conv_v_imgEV_hatr   r   r!   forwardD   s     zConvRelPosEnc.forward	__name__
__module____qualname____doc__r%   r   r'   rY   __classcell__r   r   r9   r!   r      s    *r   c                       sB   e Zd ZdZ					d fdd	Zdeeef fd	d
Z  ZS )FactorAttnConvRelPosEnczK Factorized attention with convolutional relative position encoding class.    F        Nc                    sh   t    || _|| }|d | _tj||d |d| _t|| _t||| _	t|| _
|| _d S )Ng         )bias)r$   r%   r4   scaler+   LinearqkvDropout	attn_dropproj	proj_dropcrpe)r3   r@   r4   qkv_biasri   rk   shared_crpehead_dimr9   r   r!   r%   \   s   
	

z FactorAttnConvRelPosEnc.__init__r;   c                 C   s   |j \}}}| |||d| j|| j ddddd}|d\}}}	|jdd}
|
dd|	 }|| }| j||	|d	}| j	| | }|dd|||}| 
|}| |}|S )
Nrc   r   r   r      r?   r=   r>   r;   )rA   rg   rC   r4   permuteunbindsoftmaxrB   rl   re   rj   rk   )r3   r   r;   rL   rM   rN   rg   rJ   krK   	k_softmax
factor_attrl   r   r   r!   rY   r   s   .

zFactorAttnConvRelPosEnc.forward)ra   Frb   rb   NrZ   r   r   r9   r!   r`   Z   s    r`   c                       s8   e Zd ZdZd fdd	Zdeeef fddZ  ZS )	
ConvPosEnczy Convolutional Position Encoding.
        Note: This module is similar to the conditional position encoding in CPVT.
    rc   c                    s.   t t|   tj|||d|d |d| _d S )Nr   r   )r   )r$   rx   r%   r+   r0   rj   )r3   r@   ru   r9   r   r!   r%      s    zConvPosEnc.__init__r;   c                 C   s   |j \}}}|\}}t|d||  kd |d d d df |d d dd f }}	|	dd||||}
| |
|
 }|ddd}tj||fdd}|S )Nr   r<   r   r?   )rA   r   rB   viewrj   flattenrD   rG   )r3   r   r;   rL   rM   rN   rO   rP   	cls_token
img_tokensfeatr   r   r!   rY      s   *zConvPosEnc.forward)rc   rZ   r   r   r9   r!   rx      s    rx   c                	       sN   e Zd ZdZdddddejejddf	 fdd	Zdee	e	f fd	d
Z
  ZS )SerialBlockz Serial block class.
        Note: In this implementation, each serial block only contains a conv-attention and a FFN (MLP) module.       @Frb   Nc                    sv   t    |
| _|	|| _t||||||d| _|dkr t|nt | _	|	|| _
t|| }t||||d| _d S )Nr4   rm   ri   rk   rn   rb   in_featureshidden_features	act_layerdrop)r$   r%   cpenorm1r`   factoratt_crper   r+   Identity	drop_pathnorm2r'   r
   mlp)r3   r@   r4   	mlp_ratiorm   rk   ri   r   r   
norm_layer
shared_cpern   mlp_hidden_dimr9   r   r!   r%      s(   


zSerialBlock.__init__r;   c                 C   sV   |  ||}| |}| ||}|| | }| |}| |}|| | }|S N)r   r   r   r   r   r   )r3   r   r;   curr   r   r!   rY      s   


zSerialBlock.forward)r[   r\   r]   r^   r+   GELUr   r%   r   r'   rY   r_   r   r   r9   r!   r~      s    (r~   c                       s   e Zd ZdZg ddddejejdf fdd	Zdede	e
e
f fd	d
Zdede	e
e
f fddZdede	e
e
f fddZdee	e
e
f  fddZ  ZS )ParallelBlockz Parallel block class. Frb   Nc                    s\  t    |	|d | _|	|d | _|	|d | _t|d |||||
d d| _t|d |||||
d d| _t|d |||||
d d| _|dkrOt	|nt
 | _|	|d | _|	|d | _|	|d | _|d |d   krz|d ks}J  J |d |d   kr|d ksJ  J t|d |d  }t|d |||d | _ | _| _d S )Nr   r   rc   r   rb   r   )r$   r%   norm12norm13norm14r`   factoratt_crpe2factoratt_crpe3factoratt_crpe4r   r+   r   r   norm22norm23norm24r'   r
   mlp2mlp3mlp4)r3   dimsr4   
mlp_ratiosrm   rk   ri   r   r   r   shared_crpesr   r9   r   r!   r%      sR   
((zParallelBlock.__init__factorr;   c                 C   s   | j |||dS )z Feature map up-sampling. scale_factorr;   interpolater3   r   r   r;   r   r   r!   upsample     zParallelBlock.upsamplec                 C   s   | j |d| |dS )z Feature map down-sampling.       ?r   r   r   r   r   r!   
downsample#  s   zParallelBlock.downsampler   c                 C   s   |j \}}}|\}}t|d||  kd |ddddddf }	|ddddddf }
|
dd||||}
tj|
|dddd}
|
||ddd}
tj|	|
fdd	}|S )
z Feature map interpolation. r   r<   Nr   Fbilinear)r   recompute_scale_factormodealign_cornersr=   r?   )rA   r   rB   rC   rH   r   rD   rG   )r3   r   r   r;   rL   rM   rN   rO   rP   r{   r|   outr   r   r!   r   '  s    zParallelBlock.interpolatesizesc                 C   st  |\}}}}	|  |}
| |}| |}| j|
|d}
| j||d}| j||	d}| j|d|d}| j|d|	d}| j|d|	d}| j|
d|d}| j|d|d}| j|
d|d}|
| | }
|| | }|| | }|| |
 }|| | }|| | }| 	|}
| 
|}| |}| |
}
| |}| |}|| |
 }|| | }|| | }||||fS )Nrq   g       @)r   r;   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r3   x1x2x3x4r   _S2S3S4cur2cur3cur4upsample3_2upsample4_3upsample4_2downsample2_3downsample3_4downsample2_4r   r   r!   rY   >  s:   








zParallelBlock.forward)r[   r\   r]   r^   r+   r   r   r%   floatr   r'   r   r   r   r   rY   r_   r   r   r9   r!   r      s    <"r   c                       s   e Zd ZdZdddddddd	d
dddddeddddf fdd	Zdd Zejj	dd Z
ejj	d)ddZejj	d*ddZejj	dejfddZd+dedee fdd Zd!d" Zd*d#eejeej f d$efd%d&Zdejfd'd(Z  ZS ),r   z CoaT class.       rc     @      @     rc   rp      rc   r   ra   )rp   rp   rp   rp   Trb   FNtokenc                    sF  t    |dv sJ |pdddd}|_|_ _ d  __|_|_t	|}t
||| d tjd_t
dd	 |D d d  d
 tjd_t
dd	 |D d d
  d tjd_t
dd	 |D d d  d tjd_ttd
d
 d _ttd
d
 d
 _ttd
d
 d _ttd
d
 d _t d dd_t d
 dd_t d dd_t d dd_t d | ||d_t d
 | ||d_t d | ||d_ t d | ||d_!|}|dksJ t"||
||||dt# fdd	t$|d D _%t# fdd	t$|d
 D _&t# fdd	t$|d D _'t# fdd	t$|d D _(|_)j)dkr~t# fdd	t$|D _*nd _*jsj*d ur| d
 _+| d _,nd  _+_,| d _-j)dkr d
  d   kr d ksJ  J tjj.dd
d
d_/t0|_1|dkrt2j|nt3 _4nd _/t0|_1|dkrt2j|nt3 _4t5jdd t5jdd t5jdd t5jdd 6j7 d S )Nr   avgr   rc   )rc         r=   r   )img_size
patch_sizein_chans	embed_dimr   c                 S      g | ]}|d  qS )rp   r   r   r   r   r!   r"     r#   z!CoaT.__init__.<locals>.<listcomp>r   c                 S   r   )ra   r   r   r   r   r!   r"     r#   c                 S   r   )r   r   r   r   r   r!   r"     r#   )r@   ru   )r    r4   r(   rb   )r4   rm   rk   ri   r   r   c              	      0   g | ]}t d d  d  jjdqS )r   r@   r   r   rn   r   )r~   cpe1crpe1r   r   
embed_dimsr   r3   skwargsr   r!   r"         c              	      r   )r   r   r   )r~   cpe2crpe2r   r   r   r!   r"     r   c              	      r   )r   r   r   )r~   cpe3crpe3r   r   r   r!   r"     r   c              	      r   )rc   r   r   )r~   cpe4crpe4r   r   r   r!   r"     r   c              
      s2   g | ]}t d jjjjfd qS ))r   r   r   r   )r   r   r   r   r   r   r   r   r!   r"     s    )in_channelsout_channelsr   {Gz?std)8r$   r%   return_interm_layersout_featuresr   num_featureshead_hidden_sizenum_classesglobal_poolr   r	   r+   r   patch_embed1patch_embed2patch_embed3patch_embed4	ParameterrD   zeros
cls_token1
cls_token2
cls_token3
cls_token4rx   r   r   r   r   r   r   r   r   r   r)   r,   rangeserial_blocks1serial_blocks2serial_blocks3serial_blocks4parallel_depthparallel_blocksr   norm3norm4Conv1d	aggregaterh   	head_droprf   r   headr   apply_init_weights)r3   r   r   r   r   r   serial_depthsr  r4   r   rm   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rater   r   r   crpe_windowr   dprr9   r   r!   r%   c  s   














,$"zCoaT.__init__c                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S d S )Nr   r   r   r   )	r&   r+   rf   r   weightrd   init	constant_r   )r3   mr   r   r!   r    s   zCoaT._init_weightsc                 C   s   h dS )N>   r   r   r   r   r   r3   r   r   r!   no_weight_decay  s   zCoaT.no_weight_decayc                 C   s   |rJ dd S )Nz$gradient checkpointing not supportedr   )r3   enabler   r   r!   set_grad_checkpointing  r   zCoaT.set_grad_checkpointingc                 C   s"   t ddddddddd	d
gd	}|S )Nz#^cls_token1|patch_embed1|crpe1|cpe1z^serial_blocks1\.(\d+)z#^cls_token2|patch_embed2|crpe2|cpe2z^serial_blocks2\.(\d+)z#^cls_token3|patch_embed3|crpe3|cpe3z^serial_blocks3\.(\d+)z#^cls_token4|patch_embed4|crpe4|cpe4z^serial_blocks4\.(\d+))z^parallel_blocks\.(\d+)N)z^norm|aggregate)i )	stem1r   stem2r   stem3r  stem4r  r  )r)   )r3   coarsematcherr   r   r!   group_matcher  s   zCoaT.group_matcherreturnc                 C   s   | j S r   )r
  r  r   r   r!   get_classifier.  s   zCoaT.get_classifierr   r   c                 C   sJ   || _ |d ur|dv sJ || _|dkrt| j|| _d S t | _d S )Nr   r   )r   r   r+   rf   r   r   r
  )r3   r   r   r   r   r!   reset_classifier2  s
   *zCoaT.reset_classifierc                 C   s  |j d }| |}| jj\}}t|| j}| jD ]
}||||fd}qt||||ddddd	 }| 
|}| j
j\}	}
t|| j}| jD ]
}|||	|
fd}qJt|||	|
ddddd	 }| |}| jj\}}t|| j}| jD ]
}||||fd}q{t||||ddddd	 }| |}| jj\}}t|| j}| jD ]
}||||fd}qt||||ddddd	 }| jd u rtj s| jri }d| jv r||d< d| jv r||d< d	| jv r||d	< d
| jv r||d
< |S | |}|S | jD ]6}| ||	|
f| |||f| |||f}}}|||||||f|	|
f||f||fgd\}}}}q	tj s| jri }d| jv rht||||ddddd	 }||d< d| jv rt|||	|
ddddd	 }||d< d	| jv rt||||ddddd	 }||d	< d
| jv rt||||ddddd	 }||d
< |S | |}| |}| |}|||gS )Nr   rq   r=   rc   r   r   x1_noclsx2_noclsx3_noclsx4_nocls)r   )rA   r   	grid_size
insert_clsr   r   
remove_clsrC   rr   
contiguousr   r   r   r   r   r  r   r   r  r  rD   jitis_scriptingr   r   r  r   r   r   r   r  )r3   x0rL   r   H1W1blkr&  r   H2W2r'  r   H3W3r(  r   H4W4r)  feat_outr   r   r!   forward_features9  sz   


$

$

$

$





46$$$$



zCoaT.forward_featuresx_feat
pre_logitsc                 C   s   t |tr4| jd usJ | jdkrtjdd |D dd}ntjdd |D dd}| |jdd}n| jdkrG|d d dd f jddn|d d df }| 	|}|rX|S | 
|S )Nr   c                 S   s*   g | ]}|d d dd f j dddqS )Nr   T)r@   keepdim)meanr   xlr   r   r!   r"     s   * z%CoaT.forward_head.<locals>.<listcomp>r   r?   c                 S   s   g | ]
}|d d df qS )Nr   r   r@  r   r   r!   r"     s    r   )r&   listr  r   rD   rG   stacksqueezer?  r	  r
  )r3   r<  r=  r   r   r   r!   forward_head  s   

6
zCoaT.forward_headc                 C   s2   t j s| jr| |S | |}| |}|S r   )rD   r.  r/  r   r;  rE  )r3   r   r<  r   r   r!   rY     s
   


zCoaT.forward)TFr   )r[   r\   r]   r^   r   r%   r  rD   r.  ignorer  r  r"  r+   Moduler$  r'   r   strr%  r;  r   Tensorr   boolrE  rY   r_   r   r   r9   r!   r   a  sH     )	
$Qc                 C   s*   | | jd dd}tj|| fdd} | S )z Insert CLS token. r   r=   r   r?   )expandrA   rD   rG   )r   r{   
cls_tokensr   r   r!   r+    s   r+  c                 C   s   | ddddddf S )z Remove CLS token. Nr   r   )r   r   r   r!   r,    s   r,  c                 C   s   i }|  d| } |  D ]O\}}|dsV|dr"t|dd d u sV|dr/t|dd d u sV|dr<t|dd d u sV|drIt|dd d u sV|drWt|dd d u rWq|||< q|S )Nmodelr   r   r  r  r  r
  )getr/   
startswithgetattr)
state_dictrN  out_dictru   rK   r   r   r!   checkpoint_filter_fn  s,   

rT  Fc                 K   s0   | dd r
tdtt| |fdti|}|S )Nfeatures_onlyz<features_only not implemented for Vision Transformer models.pretrained_filter_fn)rO  RuntimeErrorr   r   rT  )variant
pretraineddefault_cfgkwargsrN  r   r   r!   _create_coat  s   r\  r<   c                 K   s    | ddd dddt tddd|S )	Nr   )rc   r   r   g?bicubicTzpatch_embed1.projr
  )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizer?  r   
first_conv
classifierr   )r^  r[  r   r   r!   	_cfg_coat  s   rf  ztimm/)	hf_hub_id)rc     rh  r   squash)rg  r_  ra  	crop_mode)zcoat_tiny.in1kzcoat_mini.in1kzcoat_small.in1kzcoat_lite_tiny.in1kzcoat_lite_mini.in1kzcoat_lite_small.in1kzcoat_lite_medium.in1kzcoat_lite_medium_384.in1kr#  c                 K   :   t dg dg ddd}tdd| it |fi |}|S )	Nrp   )   rl  rl  rl  r   r   r   r   r   r   r   r  r  	coat_tinyrY  )ro  r)   r\  rY  r[  	model_cfgrN  r   r   r!   ro    
   ro  c                 K   rk  )	Nrp   )rl     rt  rt  rm  r   rn  	coat_minirY  )ru  rp  rq  r   r   r!   ru    rs  ru  c                 K   sB   t ddg dg ddd|}td	d| it |fi |}|S )
Nrp   )rl  r   r   r   rm  r   rn  
coat_smallrY  r   )rv  rp  rq  r   r   r!   rv    s   rv  c                 K   >   t dg dg dg dd}tdd| it |fi |}|S )	Nrp   )r   r      r   rm  ra   ra   rp   rp   r   r   r  r   coat_lite_tinyrY  )r{  rp  rq  r   r   r!   r{    
   r{  c                 K   rw  )	Nrp   r   rm  ry  rz  coat_lite_minirY  )r}  rp  rq  r   r   r!   r}    r|  r}  c                 K   rw  )	Nrp   r   r   ry  rz  coat_lite_smallrY  )r~  rp  rq  r   r   r!   r~    r|  r~  c                 K   s8   t dg dg dd}tdd| it |fi |}|S )Nrp   r   rx  r   r   rc   r   
   ra   )r   r   r  coat_lite_mediumrY  )r  rp  rq  r   r   r!   r    s
   r  c                 K   s:   t ddg dg dd}tdd| it |fi |}|S )	Nrh  rp   r  r  )r   r   r   r  coat_lite_medium_384rY  )r  rp  rq  r   r   r!   r    rs  r  )FN)r<   rF  )2r^   typingr   r   r   r   rD   torch.nnr+   torch.nn.functional
functionalrH   	timm.datar   r   timm.layersr	   r
   r   r   r   r   r   _builderr   	_registryr   r   __all__rH  r   r`   rx   r~   r   r   r+  r,  rT  r\  rf  default_cfgsro  ru  rv  r{  r}  r~  r  r  r   r   r   r!   <module>   sj    	$B2:   C

