o
    پi^                     @   s  d Z ddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ dgZG dd de	jZG dd de	jZG dd de	jZG dd de	jZ dd Z!edAdee"e"f de#fddZ$G dd de	jZ%dAddZ&dBd d!Z'ee'd"d#e'd"d$d%e'd"d&d$d'd(e'd"d#e'd"d$d%e'd"d&d$d'd(e'd"d#e'd"d$d%e'd"d#e'd"d#e'd"d#d)Z(edAd*e%fd+d,Z)edAd*e%fd-d.Z*edAd*e%fd/d0Z+edAd*e%fd1d2Z,edAd*e%fd3d4Z-edAd*e%fd5d6Z.edAd*e%fd7d8Z/edAd*e%fd9d:Z0edAd*e%fd;d<Z1edAd*e%fd=d>Z2edAd*e%fd?d@Z3dS )Ca   CrossViT Model

@inproceedings{
    chen2021crossvit,
    title={{CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification}},
    author={Chun-Fu (Richard) Chen and Quanfu Fan and Rameswar Panda},
    booktitle={International Conference on Computer Vision (ICCV)},
    year={2021}
}

Paper link: https://arxiv.org/abs/2103.14899
Original code: https://github.com/IBM/CrossViT/blob/main/models/crossvit.py

NOTE: model names have been renamed from originals to represent actual input res all *_224 -> *_240 and *_384 -> *_408

Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
Modified from Timm. https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
    )partial)ListOptionalTupleNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPath	to_2tupletrunc_normal__assert   )build_model_with_cfg)register_notrace_function)register_modelgenerate_default_cfgs)BlockCrossVitc                       s*   e Zd ZdZd fdd	Zd	d
 Z  ZS )
PatchEmbedz Image to Patch Embedding
                Fc                    sT  t    t|}t|}|d |d  |d |d   }|| _|| _|| _|r|d dkrbttj||d ddddtj	dd	tj|d |d
 ddddtj	dd	tj|d
 |dddd| _
d S |d dkrttj||d ddddtj	dd	tj|d |d
 dd
ddtj	dd	tj|d
 |dd
dd| _
d S d S tj||||d| _
d S )Nr   r            r   )kernel_sizestridepaddingT)inplace   r   )r   r   )super__init__r
   img_size
patch_sizenum_patchesnn
SequentialConv2dReLUproj)selfr#   r$   in_chans	embed_dim
multi_convr%   	__class__ H/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/crossvit.pyr"   +   s4   
 





	zPatchEmbed.__init__c                 C   s   |j \}}}}t|| jd kd| d| d| jd  d| jd  d	 t|| jd kd| d| d| jd  d| jd  d	 | |ddd}|S )Nr   zInput image size (*z) doesn't match model (r   z).r    )shaper   r#   r*   flatten	transpose)r+   xBCHWr1   r1   r2   forwardG   s   ((zPatchEmbed.forward)r   r   r   r   F)__name__
__module____qualname____doc__r"   r<   __classcell__r1   r1   r/   r2   r   '   s    r   c                       s.   e Zd Z				d fdd	Zdd Z  ZS )	CrossAttention   F        c                    s   t    || _|| }|d | _tj|||d| _tj|||d| _tj|||d| _t	|| _
t||| _t	|| _d S )Ng      )bias)r!   r"   	num_headsscaler&   LinearwqwkwvDropout	attn_dropr*   	proj_drop)r+   dimrF   qkv_biasrM   rN   head_dimr/   r1   r2   r"   S   s   

zCrossAttention.__init__c           	      C   s   |j \}}}| |d d dddf |d| j|| j dddd}| |||| j|| j dddd}| |||| j|| j dddd}||dd | j }|j	dd}| 
|}|| dd|d|}| |}| |}|S )	Nr   r   .r    r   rO   )r4   rI   reshaperF   permuterJ   rK   r6   rG   softmaxrM   r*   rN   )	r+   r7   r8   Nr9   qkvattnr1   r1   r2   r<   h   s   <**


zCrossAttention.forward)rC   FrD   rD   )r=   r>   r?   r"   r<   rA   r1   r1   r/   r2   rB   R   s    rB   c                       s8   e Zd Zdddddejejf fdd	Zdd Z  ZS )CrossAttentionBlock      @FrD   c
           
         sL   t    |	|| _t|||||d| _|dkrt|| _d S t | _d S )N)rF   rP   rM   rN   rD   )	r!   r"   norm1rB   r\   r	   r&   Identity	drop_path)
r+   rO   rF   	mlp_ratiorP   rN   rM   ra   	act_layer
norm_layerr/   r1   r2   r"   }   s   

$zCrossAttentionBlock.__init__c                 C   s0   |d d dddf |  | | | }|S )Nr   r   .)ra   r\   r_   )r+   r7   r1   r1   r2   r<      s   ,zCrossAttentionBlock.forward)	r=   r>   r?   r&   GELU	LayerNormr"   r<   rA   r1   r1   r/   r2   r]   {   s    r]   c                       sL   e Zd Zddddejejf fdd	Zdeej	 deej	 fddZ
  ZS )	MultiScaleBlockFrD   c                    sJ  t    t|}|| _t | _t|D ]3}g }t|| D ]}|t	|| || || ||||	| |d qt|dkrH| jtj
|  qt| jdkrSd | _t | _t|D ].}|| ||d |  krk	 ||| |
 t|| ||d |  g}| jtj
|  q\t | _t|D ]R}|d | }|| }|d dkr| jt|| ||| ||||	d |d qg }t|d D ]}|t|| ||| ||||	d |d q| jtj
|  qt | _t|D ]2}||d |  || kr	 |||d |  |
 t||d |  || g}| jtj
|  qd S )N)rO   rF   rb   rP   rN   rM   ra   rd   r   r   FrS   )r!   r"   lennum_branchesr&   
ModuleListblocksrangeappendr   r'   projsr`   rH   fusionr]   revert_projs)r+   rO   patchesdepthrF   rb   rP   rN   rM   ra   rc   rd   ri   dtmpid_nh_r/   r1   r2   r"      s   




,



zMultiScaleBlock.__init__r7   returnc                 C   s(  g }t | jD ]\}}||||  qtjttj g }t | jD ]\}}|||| d d dddf  q$g }t t	| j
| jD ]K\}\}}	tj|| ||d | j  d d dd df fdd}
||
}
|	|
d d dddf }tj||| d d dd df fdd}
||
 qF|S )Nr   r   .rT   )	enumeraterk   rm   torchjitannotater   Tensorrn   zipro   rp   catri   )r+   r7   outs_bru   blockproj_cls_tokenr*   outsro   revert_projrt   reverted_proj_cls_tokenr1   r1   r2   r<      s   &6(zMultiScaleBlock.forward)r=   r>   r?   r&   re   rf   r"   r   r{   r~   r<   rA   r1   r1   r/   r2   rg      s    	&Wrg   c                 C   s   dd t | |D S )Nc                 S   s(   g | ]\}}|d  | |d  | qS )r   r   r1   ).0ru   pr1   r1   r2   
<listcomp>
     ( z(_compute_num_patches.<locals>.<listcomp>)r   )r#   rq   r1   r1   r2   _compute_num_patches	  s   r   Fss
crop_scalec                 C   s   | j dd \}}||d ks||d kra|rV|d |krV|d |krVtt||d  d tt||d  d }}| dddd|||d  |||d  f } | S tjjj| |ddd} | S )	a~  
    Pulled out of CrossViT.forward_features to bury conditional logic in a leaf node for FX tracing.
    Args:
        x (Tensor): input image
        ss (tuple[int, int]): height and width to scale to
        crop_scale (bool): whether to crop instead of interpolate to achieve the desired scale. Defaults to False
    Returns:
        Tensor: the "scaled" image batch tensor
    rR   Nr   r          @bicubicF)sizemodealign_corners)r4   introundr{   r&   
functionalinterpolate)r7   r   r   r:   r;   cuclr1   r1   r2   scale_image  s   20r   c                       s   e Zd ZdZdddddddd	d
ddddddddeejdddf fdd	Zdd Ze	j
jdd Ze	j
jd+ddZe	j
jd,ddZe	j
jdejfddZd-ded ee fd!d"Zdee	j fd#d$Zd+d%ee	j d&ede	jfd'd(Zd)d* Z  ZS ).r   zI Vision Transformer with support for patch or hybrid CNN input stage
    r   )      ?r   )rC   r   r     )     )r   r   r   r   r   )   r   )r   r   r^   FTrD   gư>)epstokenc           !         sn  t    |dv sJ _|_t|_t|}fdd|D _|_tj|}t	|_
 _t  __t _tj
D ],}td| ttdd||   |  td| ttdd |  qGtj| D ]\}}}jt|||||
d q{tj|d_td	d |D }d
d td||D }d}t _t|D ]0\}}t|d d |d  }||||  }t  ||||	||||d
} ||7 }j|  qt fddtj
D _!t|_"t fddtj
D _#tj
D ]}t$t%d| dd t$t%d| dd q&j' d S )Nr   avgc                    s$   g | ] t  fd djD qS )c                    s   g | ]}t |  qS r1   )r   )r   sjsir1   r2   r   C      z0CrossVit.__init__.<locals>.<listcomp>.<listcomp>)tupler#   )r   r+   r   r2   r   C  s   $ z%CrossVit.__init__.<locals>.<listcomp>
pos_embed_r   
cls_token_)r#   r$   r,   r-   r.   )r   c                 S   s   g | ]
}t |d d qS )rR   N)sumr   r7   r1   r1   r2   r   \      c                 S   s   g | ]}|  qS r1   )itemr   r1   r1   r2   r   ]  s    r   rS   )rF   rb   rP   rN   rM   ra   rd   c                    s   g | ]} | qS r1   r1   r   ru   )r-   rd   r1   r2   r   r  r   c                    s,   g | ]}d krt  | nt  qS r   )r&   rH   r`   r   )r-   num_classesr1   r2   r   t  s    {Gz?std)(r!   r"   r   global_poolr
   r#   img_size_scaledr   r   rh   ri   r-   r   num_featureshead_hidden_sizer&   rj   patch_embedrl   setattr	Parameterr{   zerosr   rm   r   rL   pos_droplinspacerk   rz   maxrg   norm	head_dropheadr   getattrapply_init_weights)!r+   r#   	img_scaler$   r,   r   r-   rr   rF   rb   r.   r   rP   	drop_ratepos_drop_rateproj_drop_rateattn_drop_ratedrop_path_raterd   r   r%   ru   im_sr   rs   total_depthdprdpr_ptridx	block_cfg
curr_depthdpr_blkr/   )r-   rd   r   r+   r2   r"   &  sr   



.(	
"
zCrossVit.__init__c                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S d S )Nr   r   r   r   )	
isinstancer&   rH   r   weightrE   init	constant_rf   )r+   mr1   r1   r2   r   ~  s   zCrossVit._init_weightsc                 C   sZ   t  }t| jD ]"}|d|  t| d| d }|d ur*|jr*|d|  q|S )Nr   r   )setrl   ri   addr   requires_grad)r+   outru   per1   r1   r2   no_weight_decay  s   zCrossVit.no_weight_decayc                 C   s   t dddgdS )Nz ^cls_token|pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemrk   )dict)r+   coarser1   r1   r2   group_matcher  s   zCrossVit.group_matcherc                 C   s   |rJ dd S )Nz$gradient checkpointing not supportedr1   )r+   enabler1   r1   r2   set_grad_checkpointing  s   zCrossVit.set_grad_checkpointingry   c                 C   s   | j S N)r   r   r1   r1   r2   get_classifier  s   zCrossVit.get_classifierNr   r   c                    sF    _ |d ur|dv sJ |_t fddtjD _d S )Nr   c                    s.   g | ]} d krt j|  nt  qS r   )r&   rH   r-   r`   r   r   r+   r1   r2   r     s     z-CrossVit.reset_classifier.<locals>.<listcomp>)r   r   r&   rj   rl   ri   r   )r+   r   r   r1   r   r2   reset_classifier  s   zCrossVit.reset_classifierc           
         s   |j d }g  t| jD ]H\}}|}| j| }t||| j}||}|dkr)| jn| j}||dd}t	j
||fdd}|dkrC| jn| j}|| }| |} | qt| jD ]\}}	|	  qZ fddt| jD   S )Nr   rS   r   rT   c                       g | ]
\}}| | qS r1   r1   )r   ru   r   xsr1   r2   r     r   z-CrossVit.forward_features.<locals>.<listcomp>)r4   rz   r   r   r   r   cls_token_0cls_token_1expandr{   r   pos_embed_0pos_embed_1r   rm   rk   r   )
r+   r7   r8   ru   r   x_r   
cls_tokens	pos_embedr   r1   r   r2   forward_features  s$   



zCrossVit.forward_featuresr   
pre_logitsc                    s    j dkrdd D ndd D  fddD |s't jd tjr3tjdd D dd	S tjtjfd
dt	 jD dd	dd	S )Nr   c                 S   s(   g | ]}|d d dd f j ddqS )Nr   rT   )meanr   r1   r1   r2   r     r   z)CrossVit.forward_head.<locals>.<listcomp>c                 S   s   g | ]
}|d d df qS )Nr   r1   r   r1   r1   r2   r     r   c                    s   g | ]}  |qS r1   )r   r   r   r1   r2   r     s    r   c                 S   s   g | ]}|qS r1   r1   r   r1   r1   r2   r     s    r   rT   c                    r   r1   r1   )r   ru   r   r   r1   r2   r     r   )
r   r   r   r&   r`   r{   r   r   stackrz   )r+   r   r   r1   )r+   r   r2   forward_head  s
   &,zCrossVit.forward_headc                 C   s   |  |}| |}|S r   )r   r   )r+   r7   r   r1   r1   r2   r<     s   

zCrossVit.forwardF)Tr   )r=   r>   r?   r@   r   r&   rf   r"   r   r{   r|   ignorer   r   r   Moduler   r   r   strr   r   r~   r   boolr   r<   rA   r1   r1   r/   r2   r   "  sF    X	
	
 c                 K   s4   | dd r
tddd }tt| |fd|i|S )Nfeatures_onlyz<features_only not implemented for Vision Transformer models.c                 S   sD   i }|   D ]}d|v sd|v r|dd}n|}| | ||< q|S )Nr   	cls_token.rx   )keysreplace)
state_dictnew_state_dictkeynew_keyr1   r1   r2   pretrained_filter_fn  s   z._create_crossvit.<locals>.pretrained_filter_fnr	  )getRuntimeErrorr   r   )variant
pretrainedkwargsr	  r1   r1   r2   _create_crossvit  s   
r   c                 K   s   | ddd dt tdddd
|S )Nr   )r      r  g      ?T)zpatch_embed.0.projzpatch_embed.1.proj)zhead.0zhead.1)
urlr   
input_size	pool_sizecrop_pctr   r   fixed_input_size
first_conv
classifierr   )r  r  r1   r1   r2   _cfg  s   r  ztimm/)	hf_hub_id)zpatch_embed.0.proj.0zpatch_embed.1.proj.0)r  r  )r     r  r   )r  r  r  r  )zcrossvit_15_240.in1kzcrossvit_15_dagger_240.in1kzcrossvit_15_dagger_408.in1kzcrossvit_18_240.in1kzcrossvit_18_dagger_240.in1kzcrossvit_18_dagger_408.in1kzcrossvit_9_240.in1kzcrossvit_9_dagger_240.in1kzcrossvit_base_240.in1kzcrossvit_small_240.in1kzcrossvit_tiny_240.in1kry   c                 K   Z   t dddgddgg dg dg dgddgg dd	}tdd
| dt |fi |}|S )Nr   g?r   r   `   r   r   r   r   r   r   r   r   r   r$   r-   rr   rF   rb   crossvit_tiny_240r  r  r1   r   r  r  r  
model_argsmodelr1   r1   r2   r"       " r"  c                 K   r  )Nr  r   r   r   r   r  r   r   r!  crossvit_small_240r#  r1   r$  r%  r1   r1   r2   r)    r(  r)  c                 K   sZ   t dddgddgg dg dg dgddgg dd}tdd	| d
t |fi |}|S )Nr  r   r   r   r   r  r   r!  crossvit_base_240r#  r1   r$  r%  r1   r1   r2   r*     r(  r*  c                 K   r  )Nr  r   r         r   r   r   r   r   r   r   r!  crossvit_9_240r#  r1   r$  r%  r1   r1   r2   r/  )  r(  r/  c                 K   r  )Nr  r   r   r   r   r      r   r   r.  r!  crossvit_15_240r#  r1   r$  r%  r1   r1   r2   r2  2  r(  r2  c              	   K   sb   t ddddgddgg dg dg dgddgg dd	|}tdd
| dt |fi |}|S )Nr  r   r   r     r   r   r   r   r.  r!  crossvit_18_240r#  r1   r$  r%  r1   r1   r2   r5  ;  s   " r5  c              	   K   \   t dddgddgg dg dg dgddgg dd	d
}tdd| dt |fi |}|S )Nr  r   r   r+  r,  r-  r   r.  Tr   r$   r-   rr   rF   rb   r.   crossvit_9_dagger_240r#  r1   r$  r%  r1   r1   r2   r8  D     " r8  c              	   K   r6  )Nr  r   r   r   r   r0  r   r.  Tr7  crossvit_15_dagger_240r#  r1   r$  r%  r1   r1   r2   r:  M  r9  r:  c              	   K   r6  )Nr   g?r   r   r   r   r0  r   r.  Tr7  crossvit_15_dagger_408r#  r1   r$  r%  r1   r1   r2   r<  V  r9  r<  c              	   K   r6  )Nr  r   r   r   r3  r4  r   r.  Tr7  crossvit_18_dagger_240r#  r1   r$  r%  r1   r1   r2   r=  _  r9  r=  c              	   K   r6  )Nr;  r   r   r   r3  r4  r   r.  Tr7  crossvit_18_dagger_408r#  r1   r$  r%  r1   r1   r2   r>  h  r9  r>  r   )r  )4r@   	functoolsr   typingr   r   r   r{   torch.nnr&   	timm.datar   r   timm.layersr	   r
   r   r   _builderr   _features_fxr   	_registryr   r   vision_transformerr   __all__r   r   rB   r]   rg   r   r   r   r   r   r  r  default_cfgsr"  r)  r*  r/  r2  r5  r8  r:  r<  r=  r>  r1   r1   r1   r2   <module>   s    +)o 
-
