o
    ߥi?                     @   s   d dl mZ d dlZd dlmZ d dlm  mZ ddlm	Z	m
Z
mZmZmZ ddlmZmZmZmZmZmZmZmZ G dd dejZG dd	 d	ejZG d
d deZdS )    )partialN   )AdapterLoRAPrefixPromptSideTune)	AttentionBlockDropPath
LayerScaleMlp
PatchEmbedVisionTransformercheckpoint_seqc                       s:   e Zd ZdZ								d
 fdd	Zdd	 Z  ZS )AttentionPETLa=  Extend the parameter-efficient transfer learning (PETL) method to the original Attention.

    Prefix tuning optimizes the task-specific vector in the multi-head attention layer.
    'Prefix-tuning: Optimizing continuous prompts for generation' by Li & Liang(2021)
    See https://arxiv.org/abs/2101.00190

    LoRA constructs an additional layer with low-rank decomposition matrices of the weights in the network.
    'LoRA: Low-Rank Adaptation of Large Language Models' by Hu et al.(2021)
    See https://arxiv.org/abs/2106.09685

    Attributes:
        prefix_length: An integer indicating the length of prefix tuning.
        prefix_type: A string indicating the type of prefix tuning.
        lora_length: An integer indicating the length of LoRA tuning.
        lora_type: A string indicating the type of LoRA tuning.
       F        Nc
                    s   t    || dksJ d|| _|| }
|
d | _tj||d |d| _t|| _t||| _	t|| _
|rI|dkrIt||||	d| _nd | _|r]|dkr]t||||d| _d S d | _d S )Nr   z$dim should be divisible by num_headsg         )bias)dim	num_headslora_length	lora_type)r   r   prefix_lengthprefix_type)super__init__r   scalennLinearqkvDropout	attn_dropproj	proj_dropr   lorar   prefix)selfr   r   qkv_biasr#   r%   r   r   r   r   head_dim	__class__ i/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vision_efficient_tuning/backbone.pyr   "   s2   



zAttentionPETL.__init__c           
      C   s   |j \}}}| |||d| j|| j ddddd}|d\}}}| jd ur5| ||||\}}}| jd urE| ||||\}}}||dd | j	 }	|	j
dd}	| |	}	|	| dd|||}| |}| |}|S )	Nr      r   r      r   )shaper!   reshaper   permuteunbindr&   r'   	transposer   softmaxr#   r$   r%   )
r(   xBNCr!   qkvattnr-   r-   r.   forwardK   s$   




zAttentionPETL.forward)r   Fr   r   NNNN)__name__
__module____qualname____doc__r   rB   __classcell__r-   r-   r+   r.   r      s    )r   c                       sR   e Zd ZdZddddddejejedddddddddf fdd	Zd	d
 Z	  Z
S )	BlockPETLa)  Extend the parameter-efficient transfer learning (PETL) method to the original Block.

    Visual prompt tuning (VPT) is proposed to initialize tunable prompt tokens
    and prepend to the original tokens in the first layer or multiple layers.
    'Visual Prompt Tuning' by Jia et al.(2022)
    See https://arxiv.org/abs/2203.12119

    Adapters project input tokens by an MLP layer.
    'Parameter-Efficient Transfer Learning for NLP' by Houlsby et al.(2019)
    See http://arxiv.org/abs/1902.00751

    Attributes:
        adapter_length: An integer indicating the length of adapter tuning.
        adapter_type: A string indicating the type of adapter tuning.
        prompt_length: An integer indicating the length of prompt tuning.
        prompt_type: A string indicating the type of prompt tuning.
          @Fr   Nr2   c                    s,  t    || _|
|| _||||||||||d	| _|r#t||dnt | _|dkr0t	|nt | _
|
|| _t|t|| |	|d| _|rOt||dnt | _|dkr\t	|nt | _|| _|| _|rw|dkrwt||||	d| _nd | _|| _|| _|r|dkrt||||d| _d S d | _d S )N)r   r)   r#   r%   r   r   r   r   )init_valuesr   )in_featureshidden_features	act_layerdropr   )r   adapter_lengthadapter_typerM   )r   	layer_numprompt_lengthprompt_type)r   r   rQ   norm1rA   r   r   Identityls1r   
drop_path1norm2r   intmlpls2
drop_path2rO   rP   r   adapterrR   rS   r   prompt)r(   r   r   	mlp_ratior)   rN   r#   rJ   	drop_pathrM   
norm_layer
attn_layerrQ   rR   rS   r   r   rO   rP   r   r   r+   r-   r.   r   u   s   





zBlockPETL.__init__c                 C   s   | j d ur| jr| jdkr|  |}|| | | | | }| jd ur<|| | | | 	| 
| }|S || | | 	| 
| }|S )Nr   )r^   rR   rW   rV   rA   rT   r]   r\   r[   rZ   rX   )r(   r:   r-   r-   r.   rB      s   
 
 zBlockPETL.forward)rC   rD   rE   rF   r   GELU	LayerNormr	   r   rB   rG   r-   r-   r+   r.   rH   b   s,    LrH   c                !       s|   e Zd ZdZddddddddd	d
dd
dddddddeddeddddddddddf! fdd	Zdd ZddefddZ	  Z
S )VisionTransformerPETLa]   Extend the parameter-efficient transfer learning (PETL) method to the original Vision Transformer.

    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929

    The implementation of several tuning methods (prompt, prefix, adapter, and LoRA) based on ViT.
          r   i  tokeni      rI   TNFr    c"           %         s  t    |dv sJ |s|dksJ |du r|dkn|}"p%ttjdd p*tj || _|| _ | _| _	|r;dnd| _
|| _d	| _|| _|| _|| _| _| _| _| _| _| _
| _| _||||| d
| _| jj}#|rttddnd| _|r|#n|#| j
 }$ttd|$d | _tj d| _!|rnt" | _#dd t$d||D dusǈdusǈdusǈ
durt%t&tj' 	
fddt(|D  | _)ntj' 	fddt(|D  | _)|"snt" | _*|"rnt" | _+|dkr+t,| j	|nt" | _-|dkr:| .| |!durGt/| |!| _0dS d| _0dS )a   Initialize a Parameter-efficient Transfer Learning Method based on Vision Transformer.

        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            num_classes (int): number of classes for classification head
            global_pool (str): type of global pooling for final sequence (default: 'token')
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            init_values: (float): layer-scale init values
            class_token (bool): use class token
            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
            drop_rate (float): dropout rate
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            weight_init (str): weight init scheme
            embed_layer (nn.Module): patch embedding layer
            norm_layer: (nn.Module): normalization layer
            act_layer: (nn.Module): MLP activation layer
            prompt_length: An integer indicating the length of prompt tuning.
            prompt_type: A string indicating the type of prompt tuning.
            prefix_length: An integer indicating the length of prefix tuning.
            prefix_type: A string indicating the type of prefix tuning.
            adapter_length: An integer indicating the length of adapter tuning.
            adapter_type: A string indicating the type of adapter tuning.
            lora_length: An integer indicating the length of LoRA tuning.
            lora_type: A string indicating the type of LoRA tuning.
            sidetune_length: An integer indicating the linear dimension.
            sidetune_type: A string indicating the type of side network.
        )rj   avgrh   rh   Nrk   gư>)epsr   r   F)img_size
patch_sizein_chans	embed_dimr   g{Gz?)pc                 S   s   g | ]}|  qS r-   )item).0r:   r-   r-   r.   
<listcomp>L  s    z2VisionTransformerPETL.__init__.<locals>.<listcomp>c              	      s   g | ]i}di d dddd	ddd| dd	 d
d|dt tr7| nddt trF| nddt trU| nddt 
trd
| n
dqS )r   r   r_   r)   rJ   rN   r#   r`   ra   rM   rb   rQ   rR   rS   r   r   rO   rP   r   r   r-   )
isinstancelistrs   irM   rO   rP   attn_drop_raterb   block_fndpr	drop_raterp   rJ   r   r   r_   ra   r   r   r   rR   rS   r)   r-   r.   rt   Q  sx    	
c                    s,   g | ]}	
|  d 
qS ))
r   r   r_   r)   rJ   rN   r#   r`   ra   rM   r-   rw   )rM   rz   r{   r|   r}   rp   rJ   r_   ra   r   r)   r-   r.   rt   m  s    skip)1r   r   r   r   rd   rc   num_classesglobal_poolnum_featuresrp   num_prefix_tokensno_embed_classgrad_checkpointingdepthrm   class_tokenrR   rS   r   r   rO   rP   r   r   patch_embednum_patches	Parametertorchzeros	cls_tokenrandn	pos_embedr"   pos_droprU   norm_prelinspacer   rH   
Sequentialrangeblocksnormfc_normr    headinit_weightsr   sidetune)%r(   rm   rn   ro   r   r   rp   r   r   r_   r)   rJ   r   r   pre_normr   r}   rz   drop_path_rateweight_initembed_layerra   rM   r{   rR   rS   r   r   rO   rP   r   r   sidetune_lengthsidetune_typeuse_fc_normr   	embed_lenr+   ry   r.   r      s   
E

4"





zVisionTransformerPETL.__init__c                 C   sf   t |d}| |}| |}| |}| jr#tj s#t| j	|}n| 	|}| 
|}||d< |S )a   feature forward function of VisionTransformer.

        Args:
            x (Tensor): the input data.
        Returns:
            res (Dict): the output data, contains:
                - inputs: the original input.
                - x: the intermediate feature.
        )inputsr:   )dictr   
_pos_embedr   r   r   jitis_scriptingr   r   r   )r(   r:   resr-   r-   r.   forward_features  s   






z&VisionTransformerPETL.forward_features
pre_logitsc                 C   s   |d }| j r#| j dkr|dd| jdf jddn|dddf }| jr2d|v r2| |d |}| |}|r;|S | |S )a   head forward function of VisionTransformer.

        Args:
            res (Dict): the input data, contains:
                - inputs: the original input.
                - x: the intermediate feature.
        Returns:
            x (Tensor): the output data.
        r:   rk   Nr   r3   r   r   )r   r   meanr   r   r   )r(   r   r   r:   r-   r-   r.   forward_head  s   


z"VisionTransformerPETL.forward_head)F)rC   rD   rE   rF   r   r
   r   r   boolr   rG   r-   r-   r+   r.   re      sN    	 2re   )	functoolsr   r   torch.nnr   torch.nn.functional
functionalFpetlr   r   r   r   r   timm_vision_transformerr	   r
   r   r   r   r   r   r   Moduler   rH   re   r-   r-   r-   r.   <module>   s   (Rm