o
    ߥi                     @   sn  d dl Z d dlmZ dgZdZdededefddZd	d
dddd dddedededededede	fddZ
ddde	fddZddde	fddZddde	fddZddde	fddZdd d!dd"d#ed$ed%ede	fd&d'ZG d(d) d)ejZd	d!dd*ded%ede	fd+d,Zd!dd-d%ede	fd.d/Zddde	fd0d1Zddde	fd2d3Zddde	fd4d5ZdS )6    Ntorchz%https://dl.fbaipublicfiles.com/dinov2	arch_name
patch_sizereturnc                 C   s"   |  ddd d }d| | S )N_    dinov2_)replace)r   r   compact_arch_name r   _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/anydoor/dinov2/hubconf.py_make_dinov2_model_name   s   r   	vit_largei     g      ?mlpT)r   img_sizer   init_values	ffn_layerblock_chunks
pretrainedr   r   r   r   r   c                 K   sP   ddl m} t| |}	t|||||d}
|
jdi | |j|  di |
}|S )N   )vision_transformer)r   r   r   r   r   r   )dinov2.modelsr   r   dictupdate__dict__)r   r   r   r   r   r   r   kwargsvitsr   
vit_kwargsmodelr   r   r   _make_dinov2_model   s   
r!   )r   c                 K      t dd| d|S )zP
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    	vit_smallr   r   Nr   r!   r   r   r   r   r   dinov2_vits142   
   r'   c                 K   r"   )zC
    DINOv2 ViT-B/14 model pretrained on the LVD-142M dataset.
    vit_baser$   Nr   r%   r&   r   r   r   dinov2_vitb14:   r(   r*   c                 K   r"   )zP
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    r   r$   Nr   r%   r&   r   r   r   dinov2_vitl14B   r(   r+   c                 K      t ddd| d|S )zP
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    
vit_giant2swiglufusedr   r   r   Nr   r%   r&   r   r   r   dinov2_vitg14J   s   r0   i   r   
model_name	embed_dimlayersr   r2   r3   r4   c           	      K   s   |dv sJ d| t d| | d}|r>|dkrt|nd}td|  d|  d| d	 }tjj|d
d}|j|dd |S )N)r   r   Unsupported number of layers: r   i  r   r   /_linearz	_head.pthcpu)map_locationF)strict)nnLinearstr_DINOV2_BASE_URLr   hubload_state_dict_from_urlload_state_dict)	r2   r3   r4   r   r   linear_head
layers_strurl
state_dictr   r   r   _make_dinov2_linear_headU   s   rF   c                       s<   e Zd Zdddejdejdef fddZdd	 Z  ZS )
_LinearClassifierWrapperr   )r4   backbonerB   r4   c                   s    t    || _|| _|| _d S )N)super__init__rH   rB   r4   )selfrH   rB   r4   	__class__r   r   rJ   l   s   

z!_LinearClassifierWrapper.__init__c              	   C   s   | j dkr$| j|}|d d}|d d}t||dg}nG| j dkrc| jj|ddd}t|d d d|d d d|d d d|d	 d d|d	 d ddg}nJ d| j  | |S )Nr   x_norm_clstokenr   x_norm_patchtokensr   T)nreturn_class_token      Fr5   )	r4   rH   forward_featuressqueezer   catmeanget_intermediate_layersrB   )rK   x	cls_tokenpatch_tokenslinear_inputr   r   r   forwardv   s   

0&
z _LinearClassifierWrapper.forward)	__name__
__module____qualname__r;   ModuleintrJ   r]   __classcell__r   r   rL   r   rG   j   s    
rG   r   r4   r   c           	      K   sH   t d| |d|}|j}|j}t| |}t||||d}t|||dS )Nr$   r1   )rH   rB   r4   r   )r!   r3   r   r   rF   rG   )	r   r4   r   r   rH   r3   r   r2   rB   r   r   r   _make_dinov2_linear_classifier   s"   
re   )r4   r   c                 K   s   t dd| |d|S )z
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally)
    pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    r#   rd   Nr   re   )r4   r   r   r   r   r   dinov2_vits14_lc   s
   rg   c                 K   r"   )z
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally)
    pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    r)   r$   Nr   rf   r&   r   r   r   dinov2_vitb14_lc   
   rh   c                 K   r"   )z
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally)
    pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    r   r$   Nr   rf   r&   r   r   r   dinov2_vitl14_lc   ri   rj   c                 K   r,   )z
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally)
    pretrained on the LVD-142M dataset and trained on ImageNet-1k.
    r-   r.   r/   Nr   rf   r&   r   r   r   dinov2_vitg14_lc   s   rk   )r   torch.nnr;   dependenciesr>   r=   rb   r   floatboolr!   r'   r*   r+   r0   rF   ra   rG   re   rg   rh   rj   rk   r   r   r   r   <module>   sv   

 
			