o
    پidP                     @   s:  d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZm Z  ddl!m"Z"m#Z# eG dd dZ$eG dd dZ%dZddZ&G dd dej'Z(G dd dej'Z)G dd dej'Z*G dd dej'Z+G dd dej'Z,G dd dej'Z-d[d!d"Z.dZd#d$Z/e e/d%d&d'e/d(d&d'e/d)d*d'e/d+d*d'e/d,d*d'e/d-d*d.d/d0e/d1d*d2d/d0e/d3d*d4d/d0e/d5d6d'e/d7d6d.d/d0e/d8d6d2d/d0e/d9d6d4d/d0e/d:d;d.d/d0e/d<d;d2d/d0e/d=d;d4d/d0d>Z0ed\d?e"fd@dAZ1ed\d?e"fdBdCZ2ed\d?e"fdDdEZ3ed\d?e"fdFdGZ4ed\d?e"fdHdIZ5ed\d?e"fdJdKZ6ed\d?e"fdLdMZ7ed\d?e"fdNdOZ8ed\d?e"fdPdQZ9ed\d?e"fdRdSZ:ed\d?e"fdTdUZ;ed\d?e"fdVdWZ<ed\d?e"fdXdYZ=dS )]ab   ViTamin

Paper: Designing Scalable Vison Models in the Vision-Language Era
A family of model weights on Huggingface: https://huggingface.co/collections/jienengchen/vitamin-family-661048126b72debdaca060bf

@inproceedings{chen2024vitamin,
  title={ViTamin: Designing Scalable Vision Models in the Vision-language Era},
  author={Chen, Jieneng and Yu, Qihang and Shen, Xiaohui and Yuille, Alan and Chen, Liang-Chieh},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year={2024}
}

Based on Apache 2.0 licensed code at https://github.com/ViTamin/ViTamin

Modifications and timm support by Jieneng Chen 2024

Reference:
https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer_hybrid.py
    N)	dataclassfield)partial)OptionalUnionTupleOPENAI_CLIP_MEANOPENAI_CLIP_STD)create_act_layerget_norm_layerget_norm_act_layercreate_conv2dmake_divisibleDropPathHybridEmbed   )build_model_with_cfg)named_applycheckpoint_seq)register_modelgenerate_default_cfgs)VisionTransformercheckpoint_filter_fnc                   @   s   e Zd ZU dZeed< dZeed< dZe	ed< dZ
e	ed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZeed< dS )
VitConvCfg      @expand_ratioTexpand_output   kernel_sizer   
group_sizeFpre_norm_actdwstride_modeavg2	pool_typedownsample_pool_typegelu	act_layer 
norm_layergh㈵>norm_epsdown_shortcutmlpN)__name__
__module____qualname__r   float__annotations__r   boolr   intr    r!   r#   strr%   r&   r(   r*   r+   r,   r   r-    r6   r6   G/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/vitamin.pyr   '   s   
 r   c                   @   s   e Zd ZU dZeeeeedf f df ed< dZeeeeedf f df ed< dZ	eed< e
edZeed	< d
Zeed< dS )VitCfg)`           .	embed_dim)   r      r>   depths@   
stem_width)default_factoryconv_cfgr)   	head_typeN)r.   r/   r0   r=   r   r   r4   r2   r@   rB   r   r   rD   rE   r5   r6   r6   r6   r7   r8   8   s   
 $$r8   r)   c                 C   sp   t | tjr4| jd | jd  | j }|| j }tj| jdt	
d|  | jd ur6tj| j d S d S d S )Nr   r          @)
isinstancennConv2dr   out_channelsgroupsinitnormal_weightmathsqrtbiaszeros_)modulenameschemefan_outr6   r6   r7   
_init_convA   s   

rW   c                       sH   e Zd Z				ddedededed	ed
ef fddZdd Z  Z	S )Stemr'   layernorm2dư>Tin_chsout_chsr(   r*   r+   rQ   c                    sb   t    tt|||d}|| _t||dd|d| _||| _t||dd|d| _t	t
|  d S )Nepsr   r>   striderQ   r   )super__init__r   r   r\   r   conv1norm1conv2r   rW   )selfr[   r\   r(   r*   r+   rQ   norm_act_layer	__class__r6   r7   rb   K   s   
	
zStem.__init__c                 C   s"   |  |}| |}| |}|S N)rc   rd   re   rf   xr6   r6   r7   forward^   s   


zStem.forward)r'   rY   rZ   T)
r.   r/   r0   r4   r5   r1   r3   rb   rm   __classcell__r6   r6   rh   r7   rX   J   s&    rX   c                	       s<   e Zd Z		ddedededef fddZd	d
 Z  ZS )Downsample2dr$   Tdimdim_outr%   rQ   c                    sL   t    tjddddd| _||krtj||d|d| _d S t | _d S )Nr   r>   r   F)r   r`   paddingcount_include_padrQ   )ra   rb   rH   	AvgPool2dpoolrI   expandIdentity)rf   rp   rq   r%   rQ   rh   r6   r7   rb   f   s
   
zDownsample2d.__init__c                 C      |  |}| |}|S rj   )rv   rw   rk   r6   r6   r7   rm   u      

zDownsample2d.forward)r$   T)	r.   r/   r0   r4   r5   r3   rb   rm   rn   r6   r6   rh   r7   ro   e   s    ro   c                       s4   e Zd ZdZ					d
 fdd	Zdd	 Z  ZS )StridedConvz downsample 2d as well
    r   r>   r   r<   c                    s>   t    ttddd}tj|||||d| _||| _d S )NrY   rZ   r]   )r   r`   rr   )ra   rb   r   r   rH   rI   projnorm)rf   r   r`   rr   in_chansr=   r*   rh   r6   r7   rb   ~   s   
zStridedConv.__init__c                 C   ry   rj   )r}   r|   rk   r6   r6   r7   rm      rz   zStridedConv.forward)r   r>   r   r   r<   )r.   r/   r0   __doc__rb   rm   rn   r6   r6   rh   r7   r{   {   s    r{   c                       sh   e Zd ZdZ							dd	ed
edededededededef fddZdddZdd Z	  Z
S )MbConvLNBlockzL Pre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand)
    r           r   rY   rZ   r'   r   r[   r\   r`   	drop_pathr   r*   r+   r(   r   c
              	      s  t t|   |||| _| _| _t||	 }
tt|||d}|dkr/t	||ddd| _
n||kr>tj||ddd| _
nt | _
||dd	| _t | _t||
dddd
| _t|dd| _t|
|
||d|
dd| _t|dd| _t|
|ddd| _|dkrt|| _d S t | _d S )Nr]   r>   avgT)r%   rQ   r   rt   F)	apply_actr_   )inplace)r`   dilationrK   rQ   r   )ra   r   rb   r`   r[   r\   r   r   r   ro   shortcutrH   rI   rx   pre_normdownr   	conv1_1x1r   act1	conv2_kxkact2	conv3_1x1r   r   )rf   r[   r\   r`   r   r   r*   r+   r(   r   mid_chsprenorm_act_layerrh   r6   r7   rb      s&   

$zMbConvLNBlock.__init__r)   c                 C   s   t tt|d|  d S )N)rU   )r   r   rW   )rf   rU   r6   r6   r7   init_weights   s   zMbConvLNBlock.init_weightsc                 C   sb   |  |}| |}| |}| |}| |}| |}| |}| |}| || }|S rj   )	r   r   r   r   r   r   r   r   r   )rf   rl   r   r6   r6   r7   rm      s   







zMbConvLNBlock.forward)r   r   r   rY   rZ   r'   r   r)   )r.   r/   r0   r   r4   r1   r5   rb   r   rm   rn   r6   r6   rh   r7   r      s<    	

#r   c                	       sL   e Zd ZdZ		ddedeeeeef f def fddZd	d
 Z	  Z
S )MbConvStagesz3 MobileConv for stage 1 and stage 2 of ViTamin
       r   cfgimg_sizer~   c                    s   t    d| _t||jd| _g }t|j| _t	|jd d D ])\} |dkr0|j|d  n|j fddt
|j| D }|tj| g7 }q!tj| | _td|jd |jd d| _d S )	NF)r[   r\   r>   r   r   c                    s2   g | ]}t |d krn  |d krdnddqS )r   r>   r   )r[   r\   r`   )r   ).0drp   stage_in_chsr6   r7   
<listcomp>   s    z)MbConvStages.__init__.<locals>.<listcomp>)r`   r~   r=   )ra   rb   grad_checkpointingrX   rB   stemlenr=   
num_stages	enumerateranger@   rH   
Sequentialstagesr{   rv   )rf   r   r   r~   r   sblocksrh   r   r7   rb      s(   
zMbConvStages.__init__c                 C   s@   |  |}| jrtj st| j|}n| |}| |}|S rj   )r   r   torchjitis_scriptingr   r   rv   rk   r6   r6   r7   rm      s   


zMbConvStages.forward)r   r   )r.   r/   r0   r   r8   r   r4   r   rb   rm   rn   r6   r6   rh   r7   r      s    #r   c                       s.   e Zd Z				d	 fdd	Zdd Z  ZS )
GeGluMlpr'   NTr   c                    sl   t    tt|p
ddd}||| _tj|||d| _t|| _	tj|||d| _
tj|||d| _d S )N	layernormrZ   r]   rt   )ra   rb   r   r   r}   rH   Linearw0r   actw1w2)rf   in_featureshidden_featuresr(   r*   rQ   droprh   r6   r7   rb     s   
	

zGeGluMlp.__init__c                 C   s2   |  |}| | || | }| |}|S rj   )r}   r   r   r   r   rk   r6   r6   r7   rm     s   

zGeGluMlp.forward)r'   NTr   )r.   r/   r0   rb   rm   rn   r6   r6   rh   r7   r      s    r   Fc                 K   sl   | dd}|d usJ t||ddd}tt|dd|d< |dd	 tt| |ftt	|d
dd|S )Nout_indicesr   r~   )r   r~   F)backboner|   embed_layer
patch_sizer   getter)r   feature_cls)pretrained_filter_fnfeature_cfg)
popr   getr   r   
setdefaultr   r   r   dict)variant
pretrained	embed_cfgkwargsr   r   r6   r6   r7   _create_vitamin  s   
r   c                 K   s    | ddd dddt tddd|S )	Ni  )r   r   r   g?bicubicTzpatch_embed.backbone.stem.conv1head)urlnum_classes
input_size	pool_sizecrop_pctinterpolationfixed_input_sizemeanstd
first_conv
classifierr   )r   r   r6   r6   r7   _cfg+  s   r   zjienengchen/ViTamin-S-LTTr;   )	hf_hub_idr   zjienengchen/ViTamin-Szjienengchen/ViTamin-B-LTTr<   zjienengchen/ViTamin-Bzjienengchen/ViTamin-L-224pxzjienengchen/ViTamin-L-256px)r      r   g      ?)r   r   r   r   zjienengchen/ViTamin-L-336px)r   P  r   zjienengchen/ViTamin-L-384px)r   r;   r;   zjienengchen/ViTamin-L2-224px   zjienengchen/ViTamin-L2-256pxzjienengchen/ViTamin-L2-336pxzjienengchen/ViTamin-L2-384pxzjienengchen/ViTamin-XL-256px  zjienengchen/ViTamin-XL-336pxzjienengchen/ViTamin-XL-384px)z%vitamin_small_224.datacomp1b_clip_lttz!vitamin_small_224.datacomp1b_clipz$vitamin_base_224.datacomp1b_clip_lttz vitamin_base_224.datacomp1b_clipz!vitamin_large_224.datacomp1b_clipz!vitamin_large_256.datacomp1b_clipz!vitamin_large_336.datacomp1b_clipz!vitamin_large_384.datacomp1b_clipz"vitamin_large2_224.datacomp1b_clipz"vitamin_large2_256.datacomp1b_clipz"vitamin_large2_336.datacomp1b_clipz"vitamin_large2_384.datacomp1b_clipz"vitamin_xlarge_256.datacomp1b_clipz"vitamin_xlarge_336.datacomp1b_clipz"vitamin_xlarge_384.datacomp1b_clipreturnc              
   K   T   t dddtddddd}td	d
dtddd|d}tdd| it|fi |}|S )N)rA      r;   r>      r   rA   rY   rZ   r*   r+   1dr=   r@   rB   rD   rE   r;         rF   Fr   r=   depth	num_heads	mlp_layer	mlp_ratioclass_tokenglobal_poolr   vitamin_small_224r   )r   r8   r   r   r   r   r   r   r   
model_argsmodelr6   r6   r7   r   b      

r   c              
   K   r   )N)r   r   r<   r   r   rY   rZ   r   r   r   r<   r      rF   Fr   r   vitamin_base_224r   )r   r   r   r6   r6   r7   r   v  s    

r   c              
   K   r   )N   i@  r   r   r   rY   rZ   r   r   r   r         rF   Fr   r   vitamin_large_224r   )r   r   r   r6   r6   r7   r     r   r   c                 K   V   t dddtddddd}td	d
ddtddd|d	}tdd| it|fi |}|S )Nr   r   r   rY   rZ   r   r   r   r   r   r   r   rF   Fr   	r   r=   r   r   r   r   r   r   r   vitamin_large_256r   )r   r   r   r6   r6   r7   r         
r   c                 K   r   )Nr   r   r   rY   rZ   r   r   r   r   r   r   r   rF   Fr   r   vitamin_large_336r   )r   r   r   r6   r6   r7   r         
r   c                 K   r   )Nr   r   r   rY   rZ   r   r   r   r;   r   r   r   rF   Fr   r   vitamin_large_384r   )r   r   r   r6   r6   r7   r     r   r   c              
   K   r   )Nr   r   r   rY   rZ   r   r   r   r   r   r   rF   Fr   r   vitamin_large2_224r   )r   r   r   r6   r6   r7   r     r   r   c                 K   r   )Nr   r   r   rY   rZ   r   r   r   r   r   r   r   rF   Fr   r   vitamin_large2_256r   )r   r   r   r6   r6   r7   r     r   r   c                 K   r   )Nr   r   r   rY   rZ   r   r   r   r   r   r   r   rF   Fr   r   vitamin_large2_336r   )r   r   r   r6   r6   r7   r     r   r   c                 K   r   )Nr   r   r   rY   rZ   r   r   r   r;   r   r   r   rF   Fr   r   vitamin_large2_384r   )r   r   r   r6   r6   r7   r     r   r   c                 K   sZ   t dddtddddd}td	d
ddtdddd|d
}t	dd| it|fi |}|S )Nr:   r;   r   r   r:   rY   rZ   r   r   r   r   r       r   rF   Fr   none
r   r=   r   r   r   r   r   r   	pos_embedr   vitamin_xlarge_256r   r  r   r   r6   r6   r7   r  %  s,   
r  c                 K   X   t dddtddddd}td	d
ddtdddd|d
}tdd| it|fi |}|S )Nr   r   r:   rY   rZ   r   r   r   r   r   r  r   rF   Fr   r  r  r  r   r  r   r   r6   r6   r7   vitamin_xlarge_3369      
r  c                 K   r  )Nr   r   r:   rY   rZ   r   r   r   r;   r   r  r   rF   Fr   r  r  vitamin_xlarge_384r   )r
  r   r   r6   r6   r7   r
  L  r	  r
  r   )FN)F)>r   rO   dataclassesr   r   	functoolsr   typingr   r   r   r   torch.nnrH   	timm.datar	   r
   timm.layersr   r   r   r   r   r   r   _builderr   _manipulater   r   	_registryr   r   vision_transformerr   r   r   r8   rW   ModulerX   ro   r{   r   r   r   r   r   default_cfgsr   r   r   r   r   r   r   r   r   r   r  r  r
  r6   r6   r6   r7   <module>   s    $
	>0

+