o
    پi2S                     @   sh  d Z ddlZddlmZmZmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ dgZG dd de	jZG dd de	jZ G dd de	jZ!G dd de	jZ"d%ddZ#ee#dde#dde#dddZ$dd Z%d&ddZ&ed&de"fdd Z'ed&de"fd!d"Z(ed&de"fd#d$Z)dS )'a   Transformer in Transformer (TNT) in PyTorch

A PyTorch implement of TNT as described in
'Transformer in Transformer' - https://arxiv.org/abs/2103.00112

The official mindspore code is released and available at
https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/TNT

The official pytorch code is released and available at
https://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/tnt_pytorch
    N)ListOptionalTupleUnionIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)MlpDropPathtrunc_normal__assert	to_2tupleresample_abs_pos_embed   )build_model_with_cfg)feature_take_indices)
checkpoint)generate_default_cfgsregister_modelTNTc                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
	Attentionz Multi-Head Attention
       F        c                    s   t    || _|| _|| }|| _|d | _tj||d |d| _tj|||d| _	tj
|dd| _t||| _tj
|dd| _d S )Ng         biasT)inplace)super__init__
hidden_dim	num_headshead_dimscalennLinearqkvDropout	attn_dropproj	proj_drop)selfdimr   r    qkv_biasr(   r*   r!   	__class__ C/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/tnt.pyr       s   

zAttention.__init__c           
      C   s   |j \}}}| |||d| j| jddddd}|d\}}| |||| jddddd}||dd | j	 }	|	j
dd}	| |	}	|	| dd||d}| |}| |}|S )	Nr   r      r      r,   )shaper%   reshaper    r!   permuteunbindr&   	transposer"   softmaxr(   r)   r*   )
r+   xBNCr%   qkr&   attnr0   r0   r1   forward.   s   *$


zAttention.forward)r   Fr   r   )__name__
__module____qualname____doc__r   rD   __classcell__r0   r0   r.   r1   r      s    r   c                
       sB   e Zd ZdZdddddddejejdf
 fdd	Zd	d
 Z  Z	S )Blockz TNT Block
    r3            @Fr   c                    s  t    ||| _t|||||	|d| _||| _t|t|d |||d| _|| _	| j	rB||| _
tj|| |dd| _d | _n||| | _
tj|| |dd| _||| _||| _t|||||	|d| _|
dkrqt|
nt | _||| _t|t|| |||d| _d S )N)r    r-   r(   r*   r3   )in_featureshidden_featuresout_features	act_layerdropTr   Fr   )r   r   norm_inr   attn_innorm_mlp_inr	   intmlp_inlegacy
norm1_projr#   r$   r)   
norm2_projnorm_outattn_outr
   Identity	drop_pathnorm_mlpmlp)r+   r,   dim_out	num_pixelnum_heads_innum_heads_out	mlp_ratior-   r*   r(   r]   rP   
norm_layerrW   r.   r0   r1   r   B   sX   


	





zBlock.__init__c                 C   s0  ||  | | | }||  | | | }| \}}}| jd u rOtj|d d ddf |d d dd f | 	| 
|||d d gdd}n+tj|d d ddf |d d dd f | | 	| 
|||d d gdd}||  | | | }||  | | | }||fS )Nr   r   r4   r6   )r]   rS   rR   rV   rT   sizerY   torchcatr)   rX   r8   r[   rZ   r_   r^   )r+   pixel_embedpatch_embedr>   r?   r@   r0   r0   r1   rD      s&   
06zBlock.forward)
rE   rF   rG   rH   r#   GELU	LayerNormr   rD   rI   r0   r0   r.   r1   rJ   >   s    CrJ   c                       s   e Zd ZdZ						d fdd		Zddeeeef ef fddZdeeef deeef fddZ	de
jde
jde
jfddZ  ZS )
PixelEmbedz Image to Pixel Embedding
          r2   0   r3   Fc           	         s   t    t|}t|}|d |d  |d |d  f| _| jd | jd  }|| _|| _|| _|| _|| _ fdd|D }|| _	t
j|| jdd d| _| jr\t
j||d| _d S t
j||d| _d S )	Nr   r   c                    s   g | ]	}t |  qS r0   )mathceil).0psstrider0   r1   
<listcomp>   s    z'PixelEmbed.__init__.<locals>.<listcomp>   r2   )kernel_sizepaddingrv   )ry   rv   )r   r   r   	grid_sizeimg_size
patch_sizerW   num_patchesin_dimnew_patch_sizer#   Conv2dr)   Unfoldunfold)	r+   r|   r}   in_chansr   rv   rW   r~   r   r.   ru   r1   r      s    
	"zPixelEmbed.__init__Treturnc                 C   s   |rt | jS | jS N)maxr}   )r+   	as_scalarr0   r0   r1   
feat_ratio   s   
zPixelEmbed.feat_ratior|   c                 C   s$   |d | j d  |d | j d  fS )Nr   r   )r}   )r+   r|   r0   r0   r1   dynamic_feat_size   s   $zPixelEmbed.dynamic_feat_sizer=   	pixel_posc                 C   s2  |j \}}}}t|| jd kd| d| d| jd  d| jd  d	 t|| jd kd| d| d| jd  d| jd  d	 | jre| |}| |}|dd|| j | j	| j
d | j
d }n| |}|dd|| j || jd | jd }| |}|| }||| j | j	ddd}|S )	Nr   zInput image size (*z) doesn't match model (r   z).r   r4   )r7   r   r|   rW   r)   r   r;   r8   r~   r   r   r}   )r+   r=   r   r>   r@   HWr0   r0   r1   rD      s*   ((


*
zPixelEmbed.forward)rn   ro   r2   rp   r3   FT)rE   rF   rG   rH   r   r   r   rU   r   r   rg   TensorrD   rI   r0   r0   r.   r1   rm      s     "$rm   c                       sj  e Zd ZdZdddddddd	d
d	dddddddejd
df fdd	Zdd Zej	j
dd Zej	j
d7ddZej	j
d8ddZej	j
dejfddZd9dedee fdd Z					!	d:d"ejd#eeeee f  d$ed%ed&ed'ed(edeeej eejeej f f fd)d*Z	+		d;d#eeee f d,ed-efd.d/Zd0d1 Zd7d2efd3d4Zd5d6 Z  ZS )<r   zC Transformer in Transformer - https://arxiv.org/abs/2103.00112
    rn   ro   r2     tokeni   rp   rK   r3   rL   Fr   c                    s  t    |dv sJ || _|| _  | _ | _| _d| _d| _t	||||||d| _
| j
j}t| j
dr:| j
 n||| _| j
j}|d |d  }||| | _t||  | _| | _ttdd | _ttd|d  | _ttd||d |d | _tj|d| _dd	 td||D }g }t|D ]}|t| ||	|
|||||| ||d
 qt || _! fdd	t|D | _"| | _#t|| _$|dkrt |nt% | _&t'| jdd t'| jdd t'| jdd | (| j) d S )N r   avgr   F)r|   r}   r   r   rv   rW   r   r   )pc                 S   s   g | ]}|  qS r0   )item)rs   r=   r0   r0   r1   rw     s    z TNT.__init__.<locals>.<listcomp>)r,   r`   ra   rb   rc   rd   r-   r*   r(   r]   re   rW   c                    s    g | ]}t d |  dqS )zblocks.)modulenum_chs	reductiondict)rs   i	embed_dimrr0   r1   rw   &  s    {Gz?std)*r   r   num_classesglobal_poolnum_featureshead_hidden_sizer   num_prefix_tokensgrad_checkpointingrm   ri   r~   hasattrr   r   rX   r#   r$   r)   rY   	Parameterrg   zeros	cls_token	patch_posr   r'   pos_droplinspacerangeappendrJ   
ModuleListblocksfeature_infonorm	head_dropr\   headr   apply_init_weights)r+   r|   r}   r   r   r   r   	inner_dimdepthnum_heads_innernum_heads_outerrd   r-   	drop_ratepos_drop_rateproj_drop_rateattn_drop_ratedrop_path_ratere   first_striderW   r~   r   ra   dprr   r   r.   r   r1   r      sn   

 

zTNT.__init__c                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S d S )Nr   r   r   g      ?)	
isinstancer#   r$   r   weightr   init	constant_rl   )r+   mr0   r0   r1   r   2  s   zTNT._init_weightsc                 C   s   h dS )N>   r   r   r   r0   r+   r0   r0   r1   no_weight_decay;  s   zTNT.no_weight_decayc                 C   s   t dddgd}|S )Nz=^cls_token|patch_pos|pixel_pos|pixel_embed|norm[12]_proj|proj)z^blocks\.(\d+)N)z^norm)i )stemr   r   )r+   coarsematcherr0   r0   r1   group_matcher?  s   zTNT.group_matcherTc                 C   s
   || _ d S r   )r   )r+   enabler0   r0   r1   set_grad_checkpointingJ  s   
zTNT.set_grad_checkpointingr   c                 C   s   | j S r   )r   r   r0   r0   r1   get_classifierN  s   zTNT.get_classifierNr   r   c                 C   sJ   || _ |d ur|dv sJ || _|dkrt| j|| _d S t | _d S )Nr   r   )r   r   r#   r$   r   r\   r   )r+   r   r   r0   r0   r1   reset_classifierR  s
   *zTNT.reset_classifierNCHWr=   indicesreturn_prefix_tokensr   
stop_early
output_fmtintermediates_onlyc                    s  |dv sJ d|dk}g }	t tj|\}
}|j\ }}}|j}|	 j
d}tjj dd|fdd}|j }|}tj sW|s[j}n	jd|d  }t|D ],\}}jr}tj s}t|||\}}n|||\}}||
v r|	|r|n| qhjrfdd	|	D }fd
d	|	D }	|rj||f\ fdd	|	D }	tj s|rtt|	|}	|r|	S |}||	fS )a<   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if an int, if is a sequence, select by matching indices
            return_prefix_tokens: Return both prefix and spatial intermediate tokens
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   NLCz)Output format must be one of NCHW or NLC.r   r4   r   r6   Nc                    s"   g | ]}|d d d j f qS )Nr   r   rs   yr   r0   r1   rw        " z-TNT.forward_intermediates.<locals>.<listcomp>c                    s"   g | ]}|d d  j d f qS r   r   r   r   r0   r1   rw     r   c                    s,   g | ]}|  d dddd qS )r4   r   r2   r   r   )r8   r9   
contiguousr   )r>   r   r   r0   r1   rw     s   , )r   lenr   r7   ri   r   rY   r)   rX   r8   r~   rg   rh   r   expandr   r   jitis_scripting	enumerater   r   r   r   r   r   listzip)r+   r=   r   r   r   r   r   r   r8   intermediatestake_indices	max_index_heightwidthri   rj   r   r   blkprefix_tokensr0   )r>   r   r   r+   r1   forward_intermediatesY  s@   "


zTNT.forward_intermediatesr   
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r   )r   r   r   r#   r\   r   r   )r+   r   r   r   r   r   r0   r0   r1   prune_intermediate_layers  s   
zTNT.prune_intermediate_layersc                 C   s   |j d }| || j}| | | ||| jd}tj	| j
|dd|fdd}|| j }| |}| jD ]}| jrLtj sLt|||\}}q9|||\}}q9| |}|S )Nr   r4   r   r6   )r7   ri   r   rY   r)   rX   r8   r~   rg   rh   r   r   r   r   r   r   r   r   r   r   )r+   r=   r>   ri   rj   r   r0   r0   r1   forward_features  s   
"



zTNT.forward_features
pre_logitsc                 C   sZ   | j r| j dkr|d d | jd f jddn|d d df }| |}|r(|S | |S )Nr   r   r6   r   )r   r   meanr   r   )r+   r=   r   r0   r0   r1   forward_head  s   8
zTNT.forward_headc                 C   s   |  |}| |}|S r   )r   r   )r+   r=   r0   r0   r1   rD     s   

zTNT.forwardFr   r   )NFFFr   F)r   FT)rE   rF   rG   rH   r#   rl   r   r   rg   r   ignorer   r   r   Moduler   rU   r   strr   r   r   r   boolr   r   r   r   r   rD   rI   r0   r0   r.   r1   r      s    T	


 	
K
r   c                 K   s&   | ddd dddt tdddd	d
d|S )Nr   )r2   rn   rn   g?bicubicTzpixel_embed.projr   zarXiv:2103.00112zTransformer in TransformerzMhttps://github.com/huawei-noah/Efficient-AI-Backbones/tree/master/tnt_pytorch)urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizer   r   
first_conv
classifier	paper_ids
paper_name
origin_urlr   )r   kwargsr0   r0   r1   _cfg  s   	r
  ztimm/)	hf_hub_id)ztnt_s_legacy_patch16_224.in1kztnt_s_patch16_224.in1kztnt_b_patch16_224.in1kc           
      C   s`  |  dd  d| v r| }ni }|  D ]\}}|dd}|dd}|dd}|dd	}|d
d}|dd}|dd}|dd}|dd}|dd}|dd}|dd}|dd}|dkr|jjdkr|j\}}}t|d  }}	||	 |ksJ |ddd ||||	}|||< q	 |d j|j	jkrt
|d |jjd d!|d< |S )"Nouter_tokensr   	outer_pos	inner_posr   rj   ri   
proj_norm1rX   
proj_norm2rY   inner_norm1rR   
inner_attnrS   inner_norm2rT   	inner_mlprV   outer_norm1rZ   
outer_attnr[   outer_norm2r^   	outer_mlpr_   Fg      ?r   r   r   )new_sizer   )popitemsreplaceri   rW   r7   rU   r9   r8   r   r   r{   )

state_dictmodelout_dictrB   r&   r>   r?   r@   r   r   r0   r0   r1   checkpoint_filter_fn  s@   

r   Fc                 K   s2   | dd}tt| |ftt|ddd|}|S )Nout_indicesr2   getter)r!  feature_cls)pretrained_filter_fnfeature_cfg)r  r   r   r   r   )variant
pretrainedr	  r!  r  r0   r0   r1   _create_tnt  s   
r(  r   c              	   K   s8   t dddddddd}tdd
| it |fi |}|S )Nro        rK      FT)r}   r   r   r   r   r-   rW   tnt_s_legacy_patch16_224r'  )r,  r   r(  r'  r	  	model_cfgr  r0   r0   r1   r,    s   
r,  c                 K   6   t ddddddd}td
d	| it |fi |}|S )Nro   r)  r*  rK   r+  Fr}   r   r   r   r   r-   tnt_s_patch16_224r'  )r2  r-  r.  r0   r0   r1   r2  &     
r2  c                 K   r0  )Nro   i  (   rK   
   Fr1  tnt_b_patch16_224r'  )r6  r-  r.  r0   r0   r1   r6  /  r3  r6  )r   r   )*rH   rq   typingr   r   r   r   rg   torch.nnr#   	timm.datar   r   timm.layersr	   r
   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__r   r   rJ   rm   r   r
  default_cfgsr   r(  r,  r2  r6  r0   r0   r0   r1   <module>   sJ     "\@ 
w
%
