o
    پi+h                     @   s  d Z ddlmZ ddlmZmZmZmZmZ ddl	Z	ddl
mZ ddlmZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ dgZ G dd dej!Z"G dd dej!Z#G dd dej!Z$G dd dej!Z%G dd dej!Z&G dd dej!Z'dEddZ(dFddZ)ee)dde)dde)dde)dde)dde)dde)dd d!dd"d#e)dd d!dd"d#e)dd d!dd"d#e)dd d!dd"d#e)dd"d$e)ddd$d%Z*d&e'fd'd(Z+dGd*d+Z,edGd,e'fd-d.Z-edGd,e'fd/d0Z.edGd,e'fd1d2Z/edGd,e'fd3d4Z0edGd,e'fd5d6Z1edGd,e'fd7d8Z2edGd,e'fd9d:Z3edGd,e'fd;d<Z4edGd,e'fd=d>Z5edGd,e'fd?d@Z6edGd,e'fdAdBZ7edGd,e'fdCdDZ8dS )Ha[   FocalNet

As described in `Focal Modulation Networks` - https://arxiv.org/abs/2203.11926

Significant modifications and refactoring from the original impl at https://github.com/microsoft/FocalNet

This impl is/has:
* fully convolutional, NCHW tensor layout throughout, seemed to have minimal performance impact but more flexible
* re-ordered downsample / layer so that striding always at beginning of layer (stage)
* no input size constraints or input resolution/H/W tracking through the model
* torchscript fixed and a number of quirks cleaned up
* feature extraction support via `features_only=True`
    )partial)CallableListOptionalTupleUnionNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)MlpDropPathLayerNorm2dtrunc_normal_ClassifierHeadNormMlpClassifierHead   )build_model_with_cfg)feature_take_indices)named_apply
checkpoint)generate_default_cfgsregister_modelFocalNetc                       sT   e Zd Zdddddefdedededed	ed
ededef fddZdd Z	  Z
S )FocalModulation   TF        dimfocal_levelfocal_factorbiasuse_post_normnormalize_modulator	proj_drop
norm_layerc
                    s*  t    || _|| _|| _|| _|| _|| _||| jd g| _t	j
|d| | jd  d|d| _t	j
||d|d| _t	 | _t	j
||dd| _t	|| _t	 | _g | _t| jD ]'}
| j|
 | j }| jt	t	j
|||||d ddt	  | j| q\| jr|	|| _d S t	 | _d S )Nr   r   )kernel_sizer   )r$   F)r$   groupspaddingr   )super__init__r   focal_windowr   r   r    r!   input_splitnnConv2dfhGELUactprojDropoutr"   
ModuleListfocal_layerskernel_sizesrangeappend
SequentialIdentitynorm)selfr   r)   r   r   r   r    r!   r"   r#   kr$   	__class__ H/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/focalnet.pyr(   %   s.   
"


"zFocalModulation.__init__c           
      C   s   |  |}t|| jd\}}}d}t| jD ]\}}||}|||d d ||d f   }q| |jddd}|||d d | jd f   }| j	rS|| jd  }|| 
| }	| |	}	| |	}	| |	}	|	S )Nr   r   )r      T)keepdim)r-   torchsplitr*   	enumerater4   r0   meanr   r!   r.   r:   r1   r"   )
r;   xqctxgatesctx_alllfocal_layer
ctx_globalx_outr?   r?   r@   forwardM   s   
"


zFocalModulation.forward)__name__
__module____qualname__r   intboolfloatr   r(   rP   __classcell__r?   r?   r=   r@   r   $   s2    	
(r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LayerScale2dh㈵>Fc                    s*   t    || _t|t| | _d S N)r'   r(   inplacer+   	ParameterrC   onesgamma)r;   r   init_valuesr[   r=   r?   r@   r(   i   s   
zLayerScale2d.__init__c                 C   s*   | j dddd}| jr||S || S )Nr   )r^   viewr[   mul_)r;   rG   r^   r?   r?   r@   rP   n   s   zLayerScale2d.forward)rY   F)rQ   rR   rS   r(   rP   rW   r?   r?   r=   r@   rX   h   s    rX   c                       st   e Zd ZdZdddddddddejefded	ed
edede	de	de	dededede
de
f fddZdd Z  ZS )FocalNetBlockz% Focal Modulation Network Block.
          @r   rA   F-C6?r   r   	mlp_ratior   r)   r    use_post_norm_in_modulationr!   layerscale_valuer"   	drop_path	act_layerr#   c              	      s,  t    || _|| _|| _|| _|| _|s||nt | _	t
||| j|||	|d| _|r2||nt | _|dur@t||nt | _|
dkrMt|
nt | _|sX||nt | _t|t|| ||	dd| _|rq||nt | _|durt||nt | _|
dkrt|
| _dS t | _dS )ap  
        Args:
            dim: Number of input channels.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            focal_level: Number of focal levels.
            focal_window: Focal window size at first focal level.
            use_post_norm: Whether to use layer norm after modulation.
            use_post_norm_in_modulation: Whether to use layer norm in modulation.
            layerscale_value: Initial layerscale value.
            proj_drop: Dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
        )r)   r   r    r!   r"   r#   Nr   T)in_featureshidden_featuresrj   dropuse_conv)r'   r(   r   rf   r)   r   r    r+   r9   norm1r   
modulation
norm1_postrX   ls1r   
drop_path1norm2r   rT   mlp
norm2_postls2
drop_path2)r;   r   rf   r   r)   r    rg   r!   rh   r"   ri   rj   r#   r=   r?   r@   r(   w   s<   
	
$zFocalNetBlock.__init__c                 C   s`   |}|  |}| |}| |}|| | | }|| | | | | 	| }|S rZ   )
ro   rp   rq   rs   rr   rx   rw   rv   ru   rt   )r;   rG   shortcutr?   r?   r@   rP      s   


&zFocalNetBlock.forward)rQ   rR   rS   __doc__r+   r/   r   rT   rV   rU   r   r(   rP   rW   r?   r?   r=   r@   rc   s   sN    	
?rc   c                       s   e Zd ZdZdddddddddddefded	ed
ededededededededededededef fddZ	e
jjdddZdd Z  ZS )FocalNetStagez4 A basic Focal Transformer layer for one stage.
    rd   Tr   Fre   r   r   out_dimdepthrf   
downsampler   r)   use_overlap_downr    rg   r!   rh   r"   ri   r#   c                    sv   t    || _|| _d| _|rt|d|d| _nt | _t	 	
fddt
|D | _dS )a8  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            depth: Number of blocks.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            downsample: Downsample layer at start of the layer.
            focal_level: Number of focal levels
            focal_window: Focal window size at first focal level
            use_overlap_down: User overlapped convolution in downsample layer.
            use_post_norm: Whether to use layer norm after modulation.
            use_post_norm_in_modulation: Whether to use layer norm in modulation.
            layerscale_value: Initial layerscale value
            proj_drop: Dropout rate for projections.
            drop_path: Stochastic depth rate.
            norm_layer: Normalization layer.
        Fr   )in_chsout_chsstrideoverlapr#   c                    s<   g | ]}t 	
t tr | n d qS ))r   rf   r   r)   r    rg   r!   rh   r"   ri   r#   )rc   
isinstancelist.0iri   r   r)   rh   rf   r#   r!   r|   r"   r    rg   r?   r@   
<listcomp>   s     z*FocalNetStage.__init__.<locals>.<listcomp>N)r'   r(   r   r}   grad_checkpointing
Downsampler~   r+   r9   r3   r6   blocks)r;   r   r|   r}   rf   r~   r   r)   r   r    rg   r!   rh   r"   ri   r#   r=   r   r@   r(      s    
#

"zFocalNetStage.__init__c                 C   s
   || _ d S rZ   )r   )r;   enabler?   r?   r@   set_grad_checkpointing  s   
z$FocalNetStage.set_grad_checkpointingc                 C   s>   |  |}| jD ]}| jrtj st||}q||}q|S rZ   )r~   r   r   rC   jitis_scriptingr   )r;   rG   blkr?   r?   r@   rP     s   


zFocalNetStage.forwardT)rQ   rR   rS   rz   r   rT   rV   rU   r   r(   rC   r   ignorer   rP   rW   r?   r?   r=   r@   r{      s`    	
Dr{   c                       sF   e Zd Z			ddededededee f
 fd	d
Zdd Z  Z	S )r      FNr   r   r   r   r#   c                    s   t    || _d}|}|r%|dv sJ |dkrd\}}n|dkr%d\}}tj|||||d| _|dur;||| _dS t | _dS )	a  

        Args:
            in_chs: Number of input image channels.
            out_chs: Number of linear projection output channels.
            stride: Downsample stride.
            overlap: Use overlapping convolutions if True.
            norm_layer: Normalization layer.
        r   )r   r   r   )   r   r   )rA   r   )r$   r   r&   N)r'   r(   r   r+   r,   r1   r9   r:   )r;   r   r   r   r   r#   r&   r$   r=   r?   r@   r(     s   

$zDownsample.__init__c                 C      |  |}| |}|S rZ   )r1   r:   r;   rG   r?   r?   r@   rP   ;     

zDownsample.forward)r   FN)
rQ   rR   rS   rT   rU   r   r   r(   rP   rW   r?   r?   r=   r@   r     s     r   c                '       s  e Zd ZdZdddddddd	d
d
d
d
ddddddeeddfdededededeedf de	deedf deedf de
de
de
de
dee de	d ee	 d!e
d"e
d#e
d$ef& fd%d&Zejjd'd( ZejjdHd)d*ZejjdId,d-Zejjd.ejfd/d0ZdJdedee fd1d2Z		
	
	3	
dKd4ejd5eeeee f  d6e
d7e
d8ed9e
d.eeej eejeej f f fd:d;Z	<	
	+dLd5eeee f d=e
d>e
fd?d@ZdAdB ZdHdCe
fdDdEZdFdG Z  Z S )Mr   z," Focal Modulation Networks (FocalNets)
    rA     avg`   r   r      r   rd   )r   r   r   r   rA   rA   rA   rA   FN      ?r   g?rY   )epsin_chansnum_classesglobal_pool	embed_dimdepths.rf   focal_levelsfocal_windowsr   r    rg   r!   head_hidden_sizehead_init_scalerh   	drop_rateproj_drop_ratedrop_path_rater#   c                    s  t    t|| _ fddt| jD  || _ | _ d  | _| _g | _	t
| d |	|d| _ d }dd td|t|D }g }t| jD ]N} | }t|||| ||dk|| || |	|
|||||t|d| t|d|d   |d	}|}||g7 }|  j	t|d
d|  d| dg7  _	qKtj| | _|rt | _|| _t| j|||||d| _n|| j| _t| j|||d| _ttt|d|  dS )aa  
        Args:
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Focal Transformer layer.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            focal_levels: How many focal levels at all stages. Note that this excludes the finest-grain level.
            focal_windows: The focal window size at all stages.
            use_overlap_down: Whether to use convolutional embedding.
            use_post_norm: Whether to use layernorm after modulation (it helps stabilize training of large models)
            layerscale_value: Value for layer scale.
            drop_rate: Dropout rate.
            drop_path_rate: Stochastic depth rate.
            norm_layer: Normalization layer.
        c                    s   g | ]} d |  qS )r   r?   r   r   r?   r@   r   n  s    z%FocalNet.__init__.<locals>.<listcomp>r`   r   )r   r   r   r#   c                 S   s   g | ]}|  qS r?   )item)r   rG   r?   r?   r@   r   }  s    Nr   )r   r|   r}   rf   r~   r   r)   r   r    rg   r!   rh   r"   ri   r#   r   r   layers.)num_chs	reductionmodule)hidden_size	pool_typer   r#   )r   r   )r   )r'   r(   len
num_layersr6   r   r   num_featuresr   feature_infor   stemrC   linspacesumr{   dictr+   r8   layersr9   r:   r   headr   r   r   _init_weights)r;   r   r   r   r   r   rf   r   r   r   r    rg   r!   r   r   rh   r   r   r   r#   in_dimdprr   i_layerr|   layerr=   r   r@   r(   E  st   
&
&
*

	zFocalNet.__init__c                 C   s   dhS )N r?   r;   r?   r?   r@   no_weight_decay  s   zFocalNet.no_weight_decayc                 C   s    t d|r
ddgdS g ddS )Nz^stem)z^layers\.(\d+)Nz^norm)i ))z^layers\.(\d+).downsample)r   )z^layers\.(\d+)\.\w+\.(\d+)Nr   )r   r   )r   )r;   coarser?   r?   r@   group_matcher  s   zFocalNet.group_matcherTc                 C   s"   || _ | jD ]}|j|d qd S )N)r   )r   r   r   )r;   r   rL   r?   r?   r@   r     s   
zFocalNet.set_grad_checkpointingreturnc                 C   s   | j jS rZ   )r   fcr   r?   r?   r@   get_classifier  s   zFocalNet.get_classifierc                 C   s   || _ | jj||d d S )N)r   )r   r   reset)r;   r   r   r?   r?   r@   reset_classifier  s   zFocalNet.reset_classifierNCHWrG   indicesr:   
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}tj s |s$| j}
n	| jd|	d  }
t| jd }t|
D ]\}}||}||v rW|rP||krP| |}n|}|	| q8|r\|S ||kre| |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   zOutput shape must be NCHW.Nr   )
r   r   r   r   rC   r   r   rE   r:   r7   )r;   rG   r   r:   r   r   r   intermediatestake_indices	max_indexstageslast_idxfeat_idxstagex_interr?   r?   r@   forward_intermediates  s*   


zFocalNet.forward_intermediatesr   
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r   )r   r   r   r+   r9   r:   r   )r;   r   r   r   r   r   r?   r?   r@   prune_intermediate_layers   s   
z"FocalNet.prune_intermediate_layersc                 C   s"   |  |}| |}| |}|S rZ   )r   r   r:   r   r?   r?   r@   forward_features  s   


zFocalNet.forward_features
pre_logitsc                 C   s   |r	| j ||dS |  |S )N)r   )r   )r;   rG   r   r?   r?   r@   forward_head  s   zFocalNet.forward_headc                 C   r   rZ   )r   r   r   r?   r?   r@   rP     r   zFocalNet.forwardFr   rZ   )NFFr   F)r   FT)!rQ   rR   rS   rz   r   r   rT   strr   rV   rU   r   r   r(   rC   r   r   r   r   r   r+   Moduler   r   Tensorr   r   r   r   r   r   rP   rW   r?   r?   r=   r@   r   A  s    



	
i
 
4
r   c                 C   s   t | tjrt| jdd | jd urtj| j d S d S t | tjrLt| jdd | jd ur6tj| j |rNd|v rP| jj	
| | jj	
| d S d S d S d S )Ng{Gz?)stdhead.fc)r   r+   r,   r   weightr   initzeros_Lineardatarb   )r   namer   r?   r?   r@   r     s   

r   r   c                 K   s    | dddddt tdddd	|S )
Nr   )rA      r   )r   r   g?bicubicz	stem.projr   mit)urlr   
input_size	pool_sizecrop_pctinterpolationrF   r   
first_conv
classifierlicenser   )r   kwargsr?   r?   r@   _cfg-  s   r   ztimm/)	hf_hub_id)rA     r   )   r   iRU  )r   r   r   r   r   )r   r   )zfocalnet_tiny_srf.ms_in1kzfocalnet_small_srf.ms_in1kzfocalnet_base_srf.ms_in1kzfocalnet_tiny_lrf.ms_in1kzfocalnet_small_lrf.ms_in1kzfocalnet_base_lrf.ms_in1kzfocalnet_large_fl3.ms_in22kzfocalnet_large_fl4.ms_in22kzfocalnet_xlarge_fl3.ms_in22kzfocalnet_xlarge_fl4.ms_in22kzfocalnet_huge_fl3.ms_in22kzfocalnet_huge_fl4.ms_in22kmodelc                 C   s   |  d| } d| v r| S dd l}i }| }|  D ]W\}}|dd|}|dd}|dd	d
 |}d|v rC||vrC|dd|}|dd}|dd}||v rm||  | krm|| j|jkrm||| j}|||< q|S )Nr   zstem.proj.weightr   zgamma_([0-9])z
ls\1.gammapatch_embedr   zlayers.(\d+).downsamplec                 S   s   dt | dd  dS )Nr   r   z.downsample)rT   group)rG   r?   r?   r@   <lambda>e  s    z&checkpoint_filter_fn.<locals>.<lambda>r:   znorm([0-9])znorm\1_postzln.znorm.r   r   )	getre
state_dictitemssubreplacenumelshapereshape)r  r   r  out_dict	dest_dictr<   vr?   r?   r@   checkpoint_filter_fn[  s$   ,
r  Fc                 K   sP   t dd t|ddD }|d|}tt| |fttd|dd|}|S )	Nc                 s   s    | ]\}}|V  qd S rZ   r?   )r   r   _r?   r?   r@   	<genexpr>q  s    z#_create_focalnet.<locals>.<genexpr>r   )r   r   rA   r   out_indicesT)flatten_sequentialr  )pretrained_filter_fnfeature_cfg)tuplerE   r  popr   r   r  r   )variant
pretrainedr   default_out_indicesr  r   r?   r?   r@   _create_focalnetp  s   
r  r   c                 K   *   t dg ddd|}tdd| i|S )Nr   r   r   r   focalnet_tiny_srfr  r?   )r  r   r  r  r   model_kwargsr?   r?   r@   r  |     r  c                 K   r  )Nr   r      r   r   r  focalnet_small_srfr  r?   )r&  r   r!  r?   r?   r@   r&    r#  r&  c                 K   r  )Nr$     r  focalnet_base_srfr  r?   )r(  r   r!  r?   r?   r@   r(    r#  r(  c                 K   0   t dg ddg dd|}tdd| i|S )	Nr   r   r   r   r   r   focalnet_tiny_lrfr  r?   )r+  r   r!  r?   r?   r@   r+       r+  c                 K   r)  )	Nr$  r   r   r*  focalnet_small_lrfr  r?   )r-  r   r!  r?   r?   r@   r-    r,  r-  c                 K   r)  )	Nr$  r'  r   r*  focalnet_base_lrfr  r?   )r.  r   r!  r?   r?   r@   r.    r,  r.  c              
   K   >   t dg ddg ddgd dddd|}tdd
| i|S )Nr$     r      r   Tre   r   r   r   r   r    r   rh   focalnet_large_fl3r  r?   )r3  r   r!  r?   r?   r@   r3       r3  c              	   K   6   t d	g ddg ddddd|}td
d| i|S )Nr$  r0  r   r   r   r   Tre   r   r   r   r    r   rh   focalnet_large_fl4r  r?   )r8  r   r!  r?   r?   r@   r8       r8  c              
   K   r/  )Nr$     r   r1  r   Tre   r2  focalnet_xlarge_fl3r  r?   )r;  r   r!  r?   r?   r@   r;    r4  r;  c              	   K   r5  )Nr$  r:  r6  Tre   r7  focalnet_xlarge_fl4r  r?   )r<  r   r!  r?   r?   r@   r<    r9  r<  c                 K   s@   t dg ddg ddgd ddddd|}tdd
| i|S )Nr$  `  r   rA   r   Tre   )r   r   r   r   r    rg   r   rh   focalnet_huge_fl3r  r?   )r>  r   r!  r?   r?   r@   r>    s   r>  c              
   K   s8   t d	g ddg dddddd|}td
d| i|S )Nr$  r=  r6  Tre   )r   r   r   r    rg   r   rh   focalnet_huge_fl4r  r?   )r?  r   r!  r?   r?   r@   r?    s   r?  )Nr   )r   r   )9rz   	functoolsr   typingr   r   r   r   r   rC   torch.nnr+   	timm.datar	   r
   timm.layersr   r   r   r   r   r   _builderr   	_featuresr   _manipulater   r   	_registryr   r   __all__r   r   rX   rc   r{   r   r   r   r   default_cfgsr  r  r  r&  r(  r+  r-  r.  r3  r8  r;  r<  r>  r?  r?   r?   r?   r@   <module>   s     DRV& 
_
#
