import math
import re
from collections import OrderedDict
from copy import deepcopy
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from timm.models.helpers import named_apply
from torch.nn.init import trunc_normal_
from torchvision.ops import StochasticDepth

from flash_attn.layers.patch_embed import PatchEmbed
from flash_attn.modules.block import Block
from flash_attn.modules.mha import MHA
from flash_attn.modules.mlp import FusedMLP, Mlp

try:
    from flash_attn.ops.triton.layer_norm import layer_norm_fn
except ImportError:
    layer_norm_fn = None
PatchEmbed)Block)MHA)FusedMLPMlp)layer_norm_fnFc              	   C   s   t t| |||||d}|S )N)	num_heads
cross_attnqkv_proj_biasdropoutfused_bias_fcuse_flash_attn)r   r   )r   qkv_bias	attn_dropr   r   r   	mixer_cls r   S/home/ubuntu/.local/lib/python3.10/site-packages/xformers/_flash_attn/models/vit.pycreate_mixer_cls   s   	r   c                 C   s4   t | | }|stt|| d}|S tt|d}|S )N)hidden_features
activation)r   )intr   r   r   )	embed_dim	mlp_ratio	act_layer	fused_mlp	inner_dimmlp_clsr   r   r   create_mlp_cls+   s   r$   c                 C   sP   t ||||
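

# For reference, the partials above are later called with the embedding dimension by
# Block (the values below are illustrative only, not part of the original module):
#   mixer_cls = create_mixer_cls(12, True, 0.0, False, False)
#   mha = mixer_cls(768)  # same as MHA(768, num_heads=12, qkv_proj_bias=True, ...)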
||o||d kd}t| ||	|}t| |||d|||||dd}|S )N   )r   T)norm_clsprenormresid_dropout1resid_dropout2
drop_path1
drop_path2fused_dropout_add_lnresidual_in_fp32)r   r$   r
   )r   r   r   r   	drop_rateattn_drop_rater*   r+   


class VisionTransformer(nn.Module):
    """Vision Transformer
    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929
    """

    def __init__(
        self, img_size=224, patch_size=16, in_chans=3, num_classes=1000,
        global_pool="token", embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.0,
        qkv_bias=True, init_values=None, class_token=True, no_embed_class=False,
        pre_norm=False, fc_norm=None, drop_rate=0.0, attn_drop_rate=0.0,
        drop_path_rate=0.0, weight_init="", embed_layer=PatchEmbed, norm_layer=None,
        act_layer=None, use_flash_attn=False, fused_bias_fc=False, fused_mlp=False,
        fused_dropout_add_ln=False,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
            global_pool (str): type of global pooling for final sequence (default: 'token')
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            init_values: (float): layer-scale init values
            class_token (bool): use class token
            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            weight_init (str): weight init scheme
            embed_layer (nn.Module): patch embedding layer
            norm_layer: (nn.Module): normalization layer
            act_layer: (nn.Module): MLP activation layer
        """
        super().__init__()
        assert global_pool == "token", "Only support pooling with CLS token"
        assert class_token
        assert init_values is None, "LayerScale is not supported yet"
        assert weight_init == ""
        assert fc_norm is None
        assert not pre_norm
        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU

        self.num_classes = num_classes
        self.global_pool = global_pool
        self.num_features = self.embed_dim = embed_dim  # for consistency with other models
        self.num_prefix_tokens = 1 if class_token else 0
        self.no_embed_class = no_embed_class

        patch_embed_extra_kwargs = (
            {"fused_bias_fc": fused_bias_fc} if embed_layer is PatchEmbed else {}
        )
        self.patch_embed = embed_layer(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans,
            embed_dim=embed_dim, bias=not pre_norm, **patch_embed_extra_kwargs,
        )
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)

        # Stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
        # Each block starts with dropout -> add -> LayerNorm (prenorm), so the dropout /
        # drop_path / norm belonging to the last block are applied separately below.
        self.blocks = nn.ModuleList(
            [
                create_block(
                    embed_dim, num_heads, mlp_ratio, qkv_bias, drop_rate, attn_drop_rate,
                    drop_path1=dpr[i - 1] if i > 0 else 0.0, drop_path2=dpr[i],
                    norm_layer=norm_layer, act_layer=act_layer,
                    use_flash_attn=use_flash_attn, fused_bias_fc=fused_bias_fc,
                    fused_mlp=fused_mlp, fused_dropout_add_ln=fused_dropout_add_ln,
                    layer_idx=i, n_layer=depth,
                    last_layer_subset=(global_pool == "token"),
                )
                for i in range(depth)
            ]
        )

        self.dropout = nn.Dropout(p=drop_rate)
        self.drop_path = StochasticDepth(p=dpr[-1], mode="row")
        self.norm = norm_layer(embed_dim)

        self.fused_dropout_add_ln = fused_dropout_add_ln
        if self.fused_dropout_add_ln and layer_norm_fn is None:
            raise ImportError("Triton is not installed")

        # Classifier head
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        self.init_weights(weight_init)

    def init_weights(self, mode=""):
        assert mode == ""
        trunc_normal_(self.pos_embed, std=0.02)
        if self.cls_token is not None:
            nn.init.normal_(self.cls_token, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def _init_weights(self, m):
        # Kept for compatibility with callers that expect timm's interface
        init_weights_vit_timm(m)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {"pos_embed", "cls_token"}

    def _pos_embed(self, x):
        if self.no_embed_class:
            # Position embedding does not overlap with the class token: add, then concat
            x = x + self.pos_embed
            if self.cls_token is not None:
                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        else:
            # Original timm / JAX / deit ordering: pos_embed has an entry for the class
            # token, so concat first, then add
            if self.cls_token is not None:
                x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
            x = x + self.pos_embed
        return x

    def forward_features(self, x, all_tokens=True):
        """
        If all_tokens==False and self.global_pool == 'token', we only return the features for the
        cls token.
        """
        x = self.patch_embed(x)
        hidden_states = self._pos_embed(x)
        residual = None
        if self.global_pool != "token" or all_tokens:
            for block in self.blocks:
                hidden_states, residual = block(hidden_states, residual)
        else:
            for block in self.blocks[:-1]:
                hidden_states, residual = block(hidden_states, residual)
            # For the last layer, we only want the 1st token of the output. So we do
            # cross-attention where the query is the 1st token and the key/value is the
            # whole sequence.
            hidden_states, residual = self.blocks[-1](
                hidden_states, residual, mixer_subset=slice(0, 1)
            )
        if not self.fused_dropout_add_ln:
            residual = self.drop_path(self.dropout(hidden_states)) + residual
            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
        else:
            if self.drop_path.p == 0 or not self.training:
                rowscale = None
            else:
                rowscale = self.drop_path(
                    torch.ones(
                        hidden_states.shape[:-1],
                        device=hidden_states.device,
                        dtype=hidden_states.dtype,
                    )
                )
            # Set prenorm=False here since we don't need the residual
            hidden_states = layer_norm_fn(
                hidden_states, self.norm.weight, self.norm.bias, residual=residual,
                eps=self.norm.eps, dropout_p=self.dropout.p if self.training else 0.0,
                rowscale=rowscale, prenorm=False,
            )
        return hidden_states

    def forward_head(self, x, pre_logits: bool = False):
        if self.global_pool:
            x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == "avg" else x[:, 0]
        return x if pre_logits else self.head(x)

    def forward(self, x):
        # With CLS-token pooling, only the CLS token's features are needed for the head
        x = self.forward_features(x, all_tokens=False)
        x = self.forward_head(x)
        return x
  d|v r|d	|d
  d}|d	|d
  d}|d | j	 |d	|d
  d< || j	d  |d	|d
  d< |d | j	 |d	|d
  d< || j	d  |d	|d
  d< t
 j||dS )Nzpatch_embed.proj.weight   zo c h w -> o (c h w)c                 S   s    t dd| } t dd| } | S )Nz^blocks.(\d+).attn.qkv.zblocks.\1.mixer.Wqkv.z^blocks.(\d+).attn.proj.zblocks.\1.mixer.out_proj.)resub)keyr   r   r   key_mapping_attnO  s   z;VisionTransformer.load_state_dict.<locals>.key_mapping_attnc                 3   s     | ]\}} ||fV  qd S r   r   )rH   kvr   r   r   	<genexpr>T  s    z4VisionTransformer.load_state_dict.<locals>.<genexpr>rQ   zblocks.r%   z.mixer.Wqkv.weightz.mixer.Wqkv.biasz.mixer.Wq.weightz.mixer.Wkv.weightz.mixer.Wq.biasz.mixer.Wkv.bias)strict)r   r   r   itemslenrh   mixerr   popr   rT   load_state_dict)rq   
state_dictr   patch_embed_weightr2   Wqkvbqkvr|   r   r   r   G  s$   
z!VisionTransformer.load_state_dictr>   )TF)__name__
__module____qualname____doc__r	   rU   rp   r   r`   jitignorer   r   r   boolr   r   r   __classcell__r   r   r|   r   r6   a   sN     





def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)"""
    if isinstance(module, nn.Linear):
        trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif hasattr(module, "init_weights"):
        module.init_weights()


def vit_base_patch16_224(pretrained=False, **kwargs):
    """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
    """
    assert not pretrained
    model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
    model = VisionTransformer(**model_kwargs)
    return model
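

# A minimal usage sketch (illustrative, not part of the original module): construct
# ViT-B/16 and classify a dummy batch with the default (non-flash) attention path,
# which also runs on CPU. use_flash_attn=True would additionally require a CUDA
# device and fp16/bf16 inputs.
if __name__ == "__main__":
    model = vit_base_patch16_224(num_classes=1000).eval()
    images = torch.randn(2, 3, 224, 224)  # (batch, channels, height, width)
    with torch.no_grad():
        logits = model(images)
    print(logits.shape)  # expected: torch.Size([2, 1000])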