o
    پiD                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ dgZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#dd Z$d5ddZ%d6ddZ&ee&dd e&dd e&dd e&dd e&dd!d"e&dd!d"e&dd!d"e&dd!d"d#Z'ed5d$e#fd%d&Z(ed5d$e#fd'd(Z)ed5d$e#fd)d*Z*ed5d$e#fd+d,Z+ed5d$e#fd-d.Z,ed5d$e#fd/d0Z-ed5d$e#fd1d2Z.ed5d$e#fd3d4Z/dS )7a   Pooling-based Vision Transformer (PiT) in PyTorch

A PyTorch implement of Pooling-based Vision Transformers as described in
'Rethinking Spatial Dimensions of Vision Transformers' - https://arxiv.org/abs/2103.16302

This code was adapted from the original version at https://github.com/naver-ai/pit, original copyright below.

Modifications for timm by / Copyright 2020 Ross Wightman
    N)partial)ListOptionalSequenceTupleUnion)nnIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)trunc_normal_	to_2tuple   )build_model_with_cfg)feature_take_indices)register_modelgenerate_default_cfgs)BlockPoolingVisionTransformerc                       sJ   e Zd ZdZ fddZdeejejf deejejf fddZ  Z	S )SequentialTuplezI This module exists to work around torchscript typing issues list -> listc                    s   t t| j|  d S N)superr   __init__)selfargs	__class__ C/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/pit.pyr   #   s   zSequentialTuple.__init__xreturnc                 C   s   | D ]}||}q|S r   r   )r   r   moduler   r   r   forward&   s   
zSequentialTuple.forward)
__name__
__module____qualname____doc__r   r   torchTensorr"   __classcell__r   r   r   r   r   !   s    2r   c                       sR   e Zd Z					d	 fdd	Zdeejejf deejejf fddZ  ZS )
TransformerN        c
           
         s^   t t|   | || _|	r|	nt | _tj fddt|D  | _	d S )Nc                    s2   g | ]}t d  | ttjdddqS )Tư>eps)dim	num_heads	mlp_ratioqkv_bias	proj_drop	attn_drop	drop_path
norm_layer)r   r   r   	LayerNorm).0ir4   drop_path_prob	embed_dimheadsr1   r3   r   r   
<listcomp>>   s    z(Transformer.__init__.<locals>.<listcomp>)
r   r*   r   poolr   Identitynorm
Sequentialrangeblocks)
r   base_dimdepthr=   r1   r?   r3   r4   r;   r6   r   r:   r   r   -   s   zTransformer.__init__r   r    c                 C   s   |\}}|j d }| jd ur| ||\}}|j \}}}}|ddd}tj||fdd}| |}| |}|d d d |f }|d d |d f }|dd||||}||fS )Nr      )r/   )	shaper?   flatten	transposer'   catrA   rD   reshape)r   r   
cls_tokenstoken_lengthBCHWr   r   r   r"   K   s   



zTransformer.forward)Nr+   r+   NN	r#   r$   r%   r   r   r'   r(   r"   r)   r   r   r   r   r*   ,   s    2r*   c                       s8   e Zd Zd fdd	Zdeejejf fddZ  ZS )Poolingzerosc              	      sB   t t|   tj|||d |d |||d| _t||| _d S )Nr   rG   )kernel_sizepaddingstridepadding_modegroups)r   rT   r   r   Conv2dconvLinearfc)r   
in_featureout_featurerX   rY   r   r   r   r   `   s   	zPooling.__init__r    c                 C   s   |  |}| |}||fS r   )r\   r^   )r   r   	cls_tokenr   r   r   r"   n   s   

zPooling.forward)rU   rS   r   r   r   r   rT   _   s    "rT   c                	       s@   e Zd Z				ddedededef fd	d
Zdd Z  ZS )ConvEmbedding         r   img_size
patch_sizerX   rW   c                    s   t t|   |}t|| _t|| _t| jd d|  | jd  | d | _t| jd d|  | jd  | d | _	| j| j	f| _
tj|||||dd| _d S )Nr   rG   r   T)rV   rX   rW   bias)r   rb   r   r   rf   rg   mathfloorheightwidth	grid_sizer   r[   r\   )r   in_channelsout_channelsrf   rg   rX   rW   r   r   r   r   u   s   	

,,zConvEmbedding.__init__c                 C   s   |  |}|S r   )r\   r   r   r   r   r   r"      s   
zConvEmbedding.forward)rc   rd   re   r   )r#   r$   r%   intr   r"   r)   r   r   r   r   rb   t   s    rb   c                       s  e Zd ZdZ										
								d?dededededee dee dee def fddZdd Z	e
jjdd Ze
jjd@ddZe
jjd@d d!Zd"ejfd#d$ZdAd&ed'ee fd(d)Z	%			*	dBd+e
jd,eeeee f  d-ed.ed/ed0ed"eee
j ee
jee
j f f fd1d2Z	3		dCd,eeee f d4ed5efd6d7Zd8d9 ZdDd:ed"e
jfd;d<Zd=d> Z  ZS )Er   z Pooling-based Vision Transformer

    A PyTorch implement of 'Rethinking Spatial Dimensions of Vision Transformers'
        - https://arxiv.org/abs/2103.16302
    rc   rd   re   overlap0   rt   rt   rG         rG   rw   re   rw        tokenFr+   rf   rg   rX   	stem_type	base_dimsrF   r=   r1   c                    s6  t t|   |dv sJ || _|| _|d |d  }|	| _|| _|r%dnd| _g | _t	|
||||| _
ttd|| j
j| j
j| _ttd| j|| _tj|d| _g }dd td|t||D }|}tt|D ]F}d }|| ||  }|dkrt||dd}|t|| || || |||||| d	g7 }|}|  jt||d d|  d
| dg7  _qrt| | _tj|d |d  dd| _ | | _! | _"| _#t|| _$|	dkrt%| j#|	nt& | _'d | _(|r|	dkrt%| j#| jnt& | _(d| _)t*| jdd t*| jdd | +| j, d S )N)r{   r   rG   r   )pc                 S   s   g | ]}|  qS r   )tolist)r8   r   r   r   r   r>      s    z5PoolingVisionTransformer.__init__.<locals>.<listcomp>)rX   )r?   r3   r4   r;   transformers.)num_chs	reductionr!   r,   r-   Fg{Gz?)std)-r   r   r   r}   r=   num_classesglobal_pool
num_tokensfeature_inforb   patch_embedr   	Parameterr'   randnrk   rl   	pos_embedra   Dropoutpos_droplinspacesumsplitrC   lenrT   r*   dictr   transformersr7   rA   num_featureshead_hidden_sizer<   	head_dropr]   r@   head	head_distdistilled_trainingr   apply_init_weights)r   rf   rg   rX   r|   r}   rF   r=   r1   r   in_chansr   	distilled	drop_ratepos_drop_drateproj_drop_rateattn_drop_ratedrop_path_rater<   r   dprprev_dimr9   r?   r   r   r   r      sb    "

.
 "z!PoolingVisionTransformer.__init__c                 C   s4   t |tjrtj|jd tj|jd d S d S )Nr   g      ?)
isinstancer   r7   init	constant_rh   weight)r   mr   r   r   r      s   z&PoolingVisionTransformer._init_weightsc                 C   s   ddhS )Nr   ra   r   r   r   r   r   no_weight_decay   s   z(PoolingVisionTransformer.no_weight_decayTc                 C   s
   || _ d S r   )r   r   enabler   r   r   set_distilled_training   s   
z/PoolingVisionTransformer.set_distilled_trainingc                 C   s   |rJ dd S )Nz$gradient checkpointing not supportedr   r   r   r   r   set_grad_checkpointing   s   z/PoolingVisionTransformer.set_grad_checkpointingr    c                 C   s   | j d ur| j| j fS | jS r   )r   r   r   r   r   r   get_classifier   s   
z'PoolingVisionTransformer.get_classifierNr   r   c                 C   sh   || _ |d ur
|| _|dkrt| j|nt | _| jd ur2|dkr+t| j| j nt | _d S d S )Nr   )r   r   r   r]   r<   r@   r   r   )r   r   r   r   r   r   reset_classifier   s    
&z)PoolingVisionTransformer.reset_classifierNCHWr   indicesrA   
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}| || j }| j|jd dd}
t| jd }t	j
 s:|s>| j}n	| jd|	d  }t|D ]\}}|||
f\}}
||v r`|| qK|re|S ||krn| |
}
|
|fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   zOutput shape must be NCHW.r   r   r   N)r   r   r   r   r   r   ra   expandrH   r'   jitis_scripting	enumerateappendrA   )r   r   r   rA   r   r   r   intermediatestake_indices	max_indexrM   last_idxstagesfeat_idxstager   r   r   forward_intermediates  s(   


z.PoolingVisionTransformer.forward_intermediatesr   
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r   r   r@   rA   r   )r   r   r   r   r   r   r   r   r   prune_intermediate_layers3  s   
z2PoolingVisionTransformer.prune_intermediate_layersc                 C   sP   |  |}| || j }| j|jd dd}| ||f\}}| |}|S )Nr   r   )r   r   r   ra   r   rH   r   rA   )r   r   rM   r   r   r   forward_featuresC  s   

z)PoolingVisionTransformer.forward_features
pre_logitsc                 C   s   | j d urH| jdksJ |d d df |d d df }}| |}| |}|s3| |}|  |}| jrB| jrBtj sB||fS || d S | jdkrU|d d df }| |}|sa| |}|S )Nr{   r   r   rG   )	r   r   r   r   r   trainingr'   r   r   )r   r   r   x_distr   r   r   forward_headK  s"   
"






z%PoolingVisionTransformer.forward_headc                 C   s   |  |}| |}|S r   )r   r   rp   r   r   r   r"   b  s   

z PoolingVisionTransformer.forward)rc   rd   re   rr   rs   ru   rx   rw   ry   rz   r{   Fr+   r+   r+   r+   r+   )Tr   )NFFr   F)r   FTF) r#   r$   r%   r&   rq   strr   floatr   r   r'   r   ignorer   r   r   r   Moduler   r   r   r(   r   r   boolr   r   r   r   r   r"   r)   r   r   r   r   r      s    	N
 
3
c                 C   s<   i }t d}|  D ]\}}|dd |}|||< q|S )z preprocess checkpoints zpools\.(\d)\.c                 S   s   dt | dd  dS )Nr   r   z.pool.)rq   group)expr   r   r   <lambda>q  s    z&checkpoint_filter_fn.<locals>.<lambda>)recompileitemssub)
state_dictmodelout_dictp_blockskvr   r   r   checkpoint_filter_fnh  s   

r   Fc                 K   s>   t td}|d|}tt| |fttd|dd|}|S )Nrz   out_indiceshook)feature_clsr   )pretrained_filter_fnfeature_cfg)tuplerC   popr   r   r   r   )variant
pretrainedkwargsdefault_out_indicesr   r   r   r   r   _create_pitv  s   
r   r   c                 K   s    | ddd dddt tddd|S )	Nry   )rz   rc   rc   g?bicubicTzpatch_embed.convr   )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizemeanr   
first_conv
classifierr	   )r   r   r   r   r   _cfg  s   r   ztimm/)	hf_hub_id)r   r   )r   r   )zpit_ti_224.in1kzpit_xs_224.in1kzpit_s_224.in1kzpit_b_224.in1kzpit_ti_distilled_224.in1kzpit_xs_distilled_224.in1kzpit_s_distilled_224.in1kzpit_b_distilled_224.in1kr    c                 K   >   t ddg dg dg ddd}td| fi t |fi |S )	N      @   r   r   rz   rv   rw   rw   re   rd   rw   rg   rX   r}   rF   r=   r1   	pit_b_224r   r   r   r   
model_argsr   r   r   r       r  c                 K   r   )	Nrd   re   rs   ru   rz   rv      rw   r  	pit_s_224r  r  r   r   r   r	    r  r	  c                 K   r   )	Nrd   re   rs   ru   rx   rw   r  
pit_xs_224r  r  r   r   r   r
    r  r
  c                 K   r   )	Nrd   re       r  r  ru   rx   rw   r  
pit_ti_224r  r  r   r   r   r    r  r  c              	   K   @   t ddg dg dg dddd}td	| fi t |fi |S )
Nr   r   r   r   r   rw   Trg   rX   r}   rF   r=   r1   r   pit_b_distilled_224r  r  r   r   r   r       	r  c              	   K   r  )
Nrd   re   rs   ru   r  rw   Tr  pit_s_distilled_224r  r  r   r   r   r    r  r  c              	   K   r  )
Nrd   re   rs   ru   rx   rw   Tr  pit_xs_distilled_224r  r  r   r   r   r    r  r  c              	   K   r  )
Nrd   re   r  ru   rx   rw   Tr  pit_ti_distilled_224r  r  r   r   r   r    r  r  r   )r   )0r&   ri   r   	functoolsr   typingr   r   r   r   r   r'   r   	timm.datar
   r   timm.layersr   r   _builderr   	_featuresr   	_registryr   r   vision_transformerr   __all__rB   r   r   r*   rT   rb   r   r   r   r   default_cfgsr  r	  r
  r  r  r  r  r  r   r   r   r   <module>   sz    3 Z

