o
    oi4                     @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	Zd dlm
Z
 d dlmZ d dlmZ ddlmZmZmZmZ dd	lmZ d"dede
jde
jfddZG dd de
jZG dd de
jZd#de
jdefddZd$defddZd$defddZd$defddZd$d d!Z dS )%    N)partial)CallableSequenceTupleUnion)nn)trunc_normal_)KORNIA_CHECK   )MemEffAttentionMlp
PatchEmbedSwiGLUFFNFused)NestedTensorBlock TFfnmodulereturnc                 C   sf   |s
|r
| ||d |  D ]\}}|rd||fn|}t| |||dd q|r1|r1| ||d |S )zApply named function to module.r   name.T)r   r   r   depth_firstinclude_root)named_childrenjoinnamed_apply)r   r   r   r   r   
child_namechild_module r   \/home/ubuntu/.local/lib/python3.10/site-packages/kornia/feature/dedode/transformer/dinov2.pyr   +   s   r   c                   @   s   e Zd Zdd ZdS )
BlockChunkc                 C   s   | D ]}||}q|S Nr   )selfxbr   r   r   forward8   s   
zBlockChunk.forwardN)__name__
__module____qualname__r%   r   r   r   r   r    7   s    r    c                       s   e Zd Zdddddddddddd	d
eejeddf fdd	Zedd Z	dd Z
dd Zd)ddZdd Zd)ddZd*ddZd*ddZ						d+dejd eeef d!ed"ed#eeejeej f  f
d$d%Zd	d&d'd(Z  ZS ),DinoVisionTransformer               g      @Tg        FNmlpr
   c                    s  t    ttjdd | _| _d| _|| _	| _	|| _
||||d| _| jj}ttdd| _ttd|| j | _|du rN|g| ndd td||D d	kratnd
ksidkrltndkrwdd }|nt 	
fddt|D }|dkrd| _g }|| }td||D ]}|t g| ||||    qtdd |D | _n	d| _t|| _| _t | _ttd| _|   |   D ]}d|_!qdS )a\  Construct dino vision transformer.

        Args:
        img_size (int, tuple): input image size
        patch_size (int, tuple): patch size
        in_chans (int): number of input channels
        embed_dim (int): embedding dimension
        depth (int): depth of transformer
        num_heads (int): number of attention heads
        mlp_ratio (int): ratio of mlp hidden dim to embedding dim
        qkv_bias (bool): enable bias for qkv if True
        proj_bias (bool): enable bias for proj in attn if True
        ffn_bias (bool): enable bias for ffn if True
        drop_path_rate (float): stochastic depth rate
        drop_path_uniform (bool): apply uniform drop rate across blocks
        weight_init (str): weight init scheme
        init_values (float): layer-scale init values
        embed_layer (nn.Module): patch embedding layer
        act_layer (nn.Module): MLP activation layer
        block_fn (nn.Module): transformer block class
        ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
        block_chunks: (int) split block sequence into block_chunks units for FSDP wrap

        ư>)epsr
   )img_size
patch_sizein_chans	embed_dimTc                 S   s   g | ]}|  qS r   )item).0r#   r   r   r   
<listcomp>~       z2DinoVisionTransformer.__init__.<locals>.<listcomp>r   r/   swiglufusedswigluidentityc                  _   s   t  S r!   )r   Identity)argskwargsr   r   r   f   s   z)DinoVisionTransformer.__init__.<locals>.fc                    s.   g | ]}	
|  d qS ))dim	num_heads	mlp_ratioqkv_bias	proj_biasffn_bias	drop_path
norm_layer	act_layer	ffn_layerinit_valuesr   )r7   irI   block_fndprr5   rF   rJ   rK   rC   rH   rB   rE   rD   r   r   r8      s     c                 S   s   g | ]}t |qS r   )r    )r7   pr   r   r   r8      r9   FN)"super__init__r   r   	LayerNormnum_featuresr5   
num_tokensn_blocksrB   r3   patch_embednum_patches	Parametertorchzeros	cls_token	pos_embedlinspacer   r   NotImplementedErrorrangechunked_blocksappendr=   
ModuleListblocksnormhead
mask_tokeninit_weights
parametersrequires_grad)r"   r2   r3   r4   r5   depthrB   rC   rD   rF   rE   drop_path_ratedrop_path_uniformrK   embed_layerrI   rN   rJ   block_chunksrX   r@   blocks_listra   	chunksizerL   param	__class__rM   r   rR   ?   sR   
- &

zDinoVisionTransformer.__init__c                 C   s   | j jS r!   )r\   devicer"   r   r   r   ru      s   zDinoVisionTransformer.devicec                 C   s.   t | jdd tjj| jdd tt|  d S )N{Gz?stdr0   )r   r]   r   initnormal_r\   r   init_weights_vit_timmrv   r   r   r   rh      s   z"DinoVisionTransformer.init_weightsc              	   C   sR  |j }|jd d }| jjd d }||kr||kr| jS | j }|d d df }|d d dd f }	|jd }
|| j }|| j }|d |d }}tjj|	dt	t
|t	t
||
dddd|t
| |t
| fdd}	tt	||	jd	 k tt	||	jd k |	dddddd|
}	tj|d|	fdd
|S )Nr
   r   g?r,      bicubic)scale_factormoderA   )dtypeshaper]   floatr3   r   
functionalinterpolatereshapeintmathsqrtpermuter	   viewrZ   cat	unsqueezeto)r"   r#   whprevious_dtypenpatchNr]   class_pos_embedpatch_pos_embedrA   w0h0r   r   r   interpolate_pos_encoding   s*   



.z.DinoVisionTransformer.interpolate_pos_encodingc                 C   s~   |j \}}}}| |}|d ur"t|d| j|jd|}tj| j	
|j d dd|fdd}|| ||| }|S )Nr}   r   r
   r   )r   rW   rZ   wherer   rg   r   r   r   r\   expandr   )r"   r#   masks_B_ncr   r   r   r   r   prepare_tokens_with_masks   s   
$$z/DinoVisionTransformer.prepare_tokens_with_masksc           	         s    fddt ||D } jD ]}||}q|}g }t ||D ]!\}} |}||d d df |d d dd f ||d q|S )Nc                    s   g | ]
\}}  ||qS r   )r   )r7   r#   r   rv   r   r   r8          z?DinoVisionTransformer.forward_features_list.<locals>.<listcomp>r   r
   x_norm_clstokenx_norm_patchtokens	x_prenormr   )ziprd   re   rb   )	r"   x_list
masks_listr#   blkall_xoutputr   x_normr   rv   r   forward_features_list   s   


z+DinoVisionTransformer.forward_features_listc                 C   sj   t |tr| ||S | ||}| jD ]}||}q| |}|d d df |d d dd f ||dS )Nr   r
   r   )
isinstancelistr   r   rd   re   )r"   r#   r   r   r   r   r   r   forward_features   s   



z&DinoVisionTransformer.forward_featuresc                 C   s   |  |}g t| j}}t|trt|| |n|}t| jD ]\}}||}||v r1|| q tt|t|kdt| dt| d |S )Nonly  /  blocks found)	r   lenrd   r   r   r`   	enumeraterb   r	   )r"   r#   nr   total_block_lenblocks_to_takerL   r   r   r   r   $_get_intermediate_layers_not_chunked  s   

,z:DinoVisionTransformer._get_intermediate_layers_not_chunkedc           	      C   s   |  |}g dt| jd }}}t|trt|| |n|}| jD ]}||d  D ]}||}||v r:|| |d7 }q+q#tt|t|kdt| dt| d |S )Nr   r}   r
   r   r   r   )r   r   rd   r   r   r`   rb   r	   )	r"   r#   r   r   rL   r   r   block_chunkr   r   r   r    _get_intermediate_layers_chunked  s   



,z6DinoVisionTransformer._get_intermediate_layers_chunkedr#   r   r   return_class_tokenr   c           	         s   j r
||}n||}|rfdd|D }dd |D }dd |D }|r>|j\ } fdd|D }|rGtt||S t|S )Nc                    s   g | ]}  |qS r   )re   r7   outrv   r   r   r8   +  s    zADinoVisionTransformer.get_intermediate_layers.<locals>.<listcomp>c                 S   s   g | ]
}|d d df qS )Nr   r   r   r   r   r   r8   ,  r   c                 S   s    g | ]}|d d dd f qS )Nr
   r   r   r   r   r   r8   -  s     c                    s8   g | ]}|  j j d dddd qS )r}   r   r,   r
   r~   )r   r3   r   
contiguousr   Br   r"   r   r   r   r8   0  s    *)ra   r   r   r   tupler   )	r"   r#   r   r   r   re   outputsclass_tokens_r   r   r   get_intermediate_layers  s   z-DinoVisionTransformer.get_intermediate_layers)is_trainingc                O   s&   | j |i |}|r|S | |d S )Nr   )r   rf   )r"   r   r>   r?   retr   r   r   r%   8  s   zDinoVisionTransformer.forwardr!   )r
   )r
   FFT)r&   r'   r(   r   r   GELUBlockrR   propertyru   rh   r   r   r   r   r   r   rZ   Tensorr   r   r   boolr   r   r%   __classcell__r   r   rs   r   r)   >   sZ    s






r)   r   c                 C   s>   t | tjrt| jdd | jdurtj| j dS dS dS )zDViT weight initialization, original timm impl (for reproducibility).rw   rx   N)r   r   Linearr   weightbiasrz   zeros_r   r   r   r   r|   @  s   
r|   r+   c                 K   (   t d| ddddtttdd|}|S )	zReturn ViT Small.i  r.         
attn_classr3   r5   rk   rB   rC   rN   Nr   r)   r   r   r   r3   r?   modelr   r   r   	vit_smallH     
	r   c                 K   s(   t d| ddddtttdd|}|S )zReturn ViT Base.r-   r.   r   r   r   Nr   r   r   r   r   r   vit_baseV  r   r   c                 K   r   )	zReturn ViT Large.i      r+   r   r   r   Nr   r   r   r   r   r   	vit_larged  r   r   c                 K   r   )	zNClose to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64.i   (   r   r   r   r   Nr   r   r   r   r   r   
vit_giant2r  r   r   )r   TF)r   )r+   )!r   	functoolsr   typingr   r   r   r   rZ   torch.utils.checkpointr   torch.nn.initr   kornia.core.checkr	   layersr   r   r   r   r   r   Moduler   rc   r    r)   strr|   r   r   r   r   r   r   r   r   <module>   s(     