o
    NiM                     @  s   d dl mZ d dlZd dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
 dZd	ZG d
d dejZd#ddZd#ddZd$ddZ	d%d&d!d"ZdS )'    )annotationsN)Any)nn)PreTrainedModel   )ArrowConfigtask_gks_c                      sl   e Zd ZdZ fddZe dd Zddd	Ze d
d Z	e dd Z
dddZdd Z  ZS )ArrowLoraLinearLayerz_
    This class represent the main logic of the arrow routing algorithm for linear layers.
    c                   sf   t    || _d| _|j| _|j| _|j| _|j	 | _|j
| _
|j| _d| _g | _|| _d| _d S )NFT)super__init__in_features_protos_readytop_krouter_temperaturetemperaturerng_seedtask_adapter_namescopygks_adapter_namesuse_gksgks_donegks_added_adapter_namescast_input_dtype_enabled)selfr   arrow_config	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/peft/tuners/lora/arrow.pyr   #   s   

zArrowLoraLinearLayer.__init__c                   sf    fdd|  D }tjt|krdS tjt|k r)fdd|D _| _d_dS )z
        Called when adapters are added/removed/renamed so Arrow can refresh its internal state before the next forward
        pass.
        c                   s>   g | ]}| v r|d kr| dr|tdd  s|qS )arrow_routerr	   N)
startswithlenisdigit).0klora_Br   r   
<listcomp>=   s
    .z:ArrowLoraLinearLayer.on_adapter_change.<locals>.<listcomp>Nc                   s   g | ]	}| j vr|qS r   )r   )r$   x)r   r   r   r(   H   s    F)keyssortedr   r"   r   r   r   )r   lora_Ar'   all_ts_adapter_namesr   )r'   r   r   on_adapter_change7   s   


z&ArrowLoraLinearLayer.on_adapter_change   :0yE>c                 C  s   | tj}| tj}|j| }d}| jdur(tj|jjd}|t	| j tj
|d|j|j|d}	|	|	 |  }	t|D ]}
|j|||	   }|| |  }	qB|	S )uw  
        Computes the top *right* singular vector of ΔW = B @ A without forming ΔW.

        Theory:
            For any matrix M, the right singular vectors are the eigenvectors of Mᵀ M. If ΔW = B @ A (with A ∈
            ℝ^{r×in}, B ∈ ℝ^{out×r}), then
                ΔWᵀ ΔW = (B @ A)ᵀ (B @ A) = Aᵀ (Bᵀ B) A ∈ ℝ^{in×in}.
            Therefore, the dominant right singular vector of ΔW is the dominant eigenvector of M := Aᵀ (Bᵀ B) A. We
            find it by *power iteration* on the linear operator
                v ↦ Aᵀ (Bᵀ B) (A v),
            which avoids materializing ΔW (out×in) or M (in×in). The result lives in the input/token space (size =
            in_features), which is exactly what Arrow needs. (Right singular vectors ≡ eigenvectors of MᵀM; power
            iteration converges to the dominant eigenvector under mild conditions.)
        =============================== Practical notes:
            - We perform all iteration in float32 for numerical stability, then cast back
            to the LoRA dtype/device before storing/using the prototype.
            - Convergence is checked with a simple fixed-iter cap (`iters`) and/or
            `allclose` tolerance (`tol`).
            - The returned vector is unique up to sign (±), as with any singular vector.
            Downstream code should be sign-invariant.
        N)devicer   )dtyper1   	generator)totorchfloat32Tr   	Generatorr1   typemanual_seedintrandnsizer2   normrange)r   ABitersepsA32B32Cgenv_wr   r   r   top_right_singular_vec_from_BAP   s   

z3ArrowLoraLinearLayer.top_right_singular_vec_from_BAc           
      C  s~   | j rdS g }| jD ] }|| j}|| j}| ||}|j|j|jd}|| q
tj	|dd}	| j
d|	dd d| _ dS )	a  
        Computes a prototype vector for each LoRA module in every layer by applying Singular Value Decomposition (SVD)
        to the `lora_A` matrix and extracting the top right singular vector.

        These prototypes are later used to calculate the cosine similarity between each input token and each expert.
        The resulting similarity scores serve as coefficients to compute a weighted average of the corresponding LoRA
        modules, effectively routing each token through its most relevant experts.

        ** This prototype computation is done is done once for all experts and is re-done on newly added adapters.**

        Args:
            lora_A : Matrices A in LoRA layer.
            lora_B (optional): Matrices B in LoRA layer. Defaults to None.
        N)r2   r1   r   dim
prototypesF)
persistentT)r   r   weightrK   r4   r2   r1   appendr5   stackregister_buffer)
r   r,   r'   protosnamer@   rA   proto32protoproto_stackr   r   r   build_prototypes}   s   



z%ArrowLoraLinearLayer.build_prototypesc                   s   | j sdS | jr| jsdS tj fdd| jD ddd}tjfdd| jD ddd}| jdu rO| jD ]} | jj	
| | jj	
| q9n| jD ]} | jj	
| | jj	
| qRd| _g | _dS )	a\  
        This function performs General Knowledge Subtraction. It takes an average of provided general_adapters, and
        subtract it from each task_adapter. This subtraction tries to purify the task adapters, based on
        "forgetting-via-negation" principle. Forgetting-via-negation is a task-arithmetic operation, explained in:
        https://huggingface.co/papers/2212.04089 The task adapters will be more focused and isolated, enhancing the
        performance on new tasks.

        Args:
            lora_A : Matrices A in LoRA layer.
            lora_B : Matrices A in LoRA layer.
        Nc                      g | ]} | j qS r   rP   r$   nr,   r   r   r(          z5ArrowLoraLinearLayer.gen_know_sub.<locals>.<listcomp>r   rL   c                   rZ   r   r[   r\   r&   r   r   r(      r_   FT)r   r   r   r5   rR   r   meanr   rP   datasub_)r   r,   r'   avg_Aavg_BrU   r   )r,   r'   r   gen_know_sub   s(   



z!ArrowLoraLinearLayer.gen_know_subr2   torch.dtypec                 C  s6   |du rdS t | dd}|r|j|kr|S |j|dS )a|  
        Whether to cast the dtype of the input of the forward method.

        Usually, we want to enable this to align the input dtype with the dtype of the weight, but by setting
        layer.cast_input_dtype=False, this can be disabled if necessary.

        Enabling or disabling can be managed via the peft.helpers.disable_lora_input_dtype_casting context manager.
        Nr   T)r2   )getattrr2   r4   )r   r)   r2   r   r   r   r   _cast_input_dtype   s   	z&ArrowLoraLinearLayer._cast_input_dtypec                   sn  |  | | jd  jj}|j^}}}|d|}	|	d| jd}
}tj	fdd| jD |	j
|	jd}t|	| jj }tj|| jdd\}}|	|
|ftd}|d|| tj|| j dd}tj fd	d| jD dd}tjfd
d| jD dd}td|	|}td||}||ddd }td||}||}|d}|j|g||R  S )u  
        Applies Arrow routing inside a LoRA layer.

        Steps:
        1. Compute cosine similarity between each token representation and all adapter prototypes.
        2. Select the top-k experts per token and normalize their scores with a softmax.
        3. Project tokens into each selected expert’s low-rank space (A weights).
        4. Map back to the output space (B weights).
        5. Aggregate expert outputs via the weighted sum of their contributions.
        6. Apply dropout, scaling, and return the reshaped delta.

        - Conceptually, this is a Mixture-of-Experts (MoE) over LoRA adapters,
        where coefficients are derived from prototype similarity.

        Returns:
            delta: LoRA output adjustment computed by Arrow routing.
        r   c                   s   g | ]} | qS r   r   r\   )scalingr   r   r(      s    z0ArrowLoraLinearLayer.forward.<locals>.<listcomp>)r1   r2   r   rL   z-infc                   rZ   r   r[   r\   r^   r   r   r(     r_   c                   rZ   r   r[   r\   r&   r   r   r(     r_   ztf, erf -> terzter, eor -> teozte, teo -> to)rh   r   rP   r2   shapeviewr=   rN   r5   tensorr1   absr7   topkr   new_fullfloatscatter_softmaxr   rR   einsum)r   r)   r,   r'   dropoutrj   rA   restF_intoktEscales_tenssimtop_vidx
full_scorecoeffA_stackB_stackzy
delta_flatdeltaout_dimr   )r,   r'   rj   r   forward   s.   
zArrowLoraLinearLayer.forward)r/   r0   )r2   rf   )__name__
__module____qualname____doc__r   r5   no_gradr.   rK   rY   re   rh   r   __classcell__r   r   r   r   r
      s    

-
#

(r
   adapter_names	list[str]c              
   C  s@  d}|D ]}t  }d}d}|  D ]1\}}t|drB||jv rB|j| j}	|j| j}
|dd }|| |	jd }|	j|
jf}q|du rN|||d}q||d krct	d| d	| d
|d  ||d krxt	d| d| d
|d  ||d krt	d| dt
| dt
|d  qt
|d }|t|d fS )z
    After loading all adapters into `model`, check they share:
      - the same LoRA rank (r)
      - identical weight shapes
      - identical sets of target_modules
    Returns (sorted list of target module names, agreed rank r).
    Nr,   .ri   r   )rshapesmodulesr   [z] rank mismatch: z != r   z] shape mismatch: r   z-] target_modules mismatch:
  this adapter -> z
  reference   -> )setnamed_moduleshasattrr,   rP   r'   splitaddrk   
ValueErrorr+   r;   )modelr   	referencerU   curr_modulescurr_rcurr_shapes	full_namemoduler@   rA   mod_nameagreed_modulesr   r   r   %check_loaded_lora_compatibility_arrow  s>   


r   c              	   C  sZ  ddl m} d}z
ddl}|jj}W n	 ty   Y nw d}zddlm} W n	 ty/   Y nw |j|jf}|dur?||f }|durH||f }g }| 	 D ]:\}}	t
|	dr|D ].}
|
t|	di v rt|	ddpnt|	dd}|duru|n|	}t||s||
|t|jf qYqN|rdg}|D ]\}
}}|d|
 d	| d
|  qtd|dS )z
    Validate that every module holding LoRA weights for any of `adapter_names` is Linear-like: nn.Linear,
    bitsandbytes.nn.Linear4bit, nn.Conv1d, or transformers.models.gpt2.modeling_gpt2.Conv1D. If not, raise.
    r   N)Conv1Dr,   
base_layeroriginal_modulezzLoRA adapters must only target Linear-like layers (nn.Linear, nn.Conv1d, HF Conv1D, or bitsandbytes.nn.Linear4bit). Found:z  - adapter 'z' on module 'z
' of type 
)torch.nnr   bitsandbytes
Linear4bitImportError&transformers.models.gpt2.modeling_gpt2r   LinearConv1dr   r   rg   
isinstancerQ   r9   r   	TypeErrorjoin)r   r   r   r   bnbHFConv1Dallowed_types	offendersr   r   rU   baselayer_to_checklinestnamer   r   r   )ensure_adapters_target_linear_layers_onlyH  sH   



r   pathstrreturntuple[str, str | None]c                 C  s   t j| rt jt j| dstd|  d| dfS | dd}t|dkrKd|dd }t|dkrGd|dd }||fS |dfS | dfS )aq  
    Resolve a user-provided adapter `path` into (model_id, subfolder).

    Supports:
      - Local path to a folder that contains `adapter_config.json`
      - Hub path with subfolder, e.g. "user/repo/ts_expert_0[/more/...]", which becomes:
            model_id="user/repo", subfolder="ts_expert_0[/more/...]"
      - Plain Hub repo id "user/repo" (no subfolder)
    zadapter_config.jsonzLocal adapter path 'z)' does not contain 'adapter_config.json'.N/   )	osr   isdirisfiler   r   stripr   r"   )r   partsmodel_id	subfolderr   r   r   _resolve_adapter_sourcey  s   
r   
base_modelr   task_specific_adapter_pathsr   r   general_adapter_pathslist[str] | Noneadapter_kwargsr   c                 K  s  |d u s
t |dkrtdddlm}m} t|d \}}t d}	t|}
|d ur3d|
vr3||
d< |j| f||	d|
}t	dt |D ]+}t | }t|| \}}t|}|d urfd|vrf||d< |j
d||d| qFdd	 t	t |D |_|jr|d u st |dkrtd
t	t |D ]+}t | }t|| \}}t|}|d urd|vr||d< |j
d||d| qdd	 t	t |D |_ng |_t||j|j d\}}t||j|j d ||||d}|jd|d |d |S )Nr   zF`task_specific_adapter_paths` should contain at least one adapter path)
LoraConfig	PeftModel0r   )r   adapter_namer   c                 S     g | ]}t  | qS r   )TASK_ADAPTER_PREFIXr$   ir   r   r   r(         z&create_arrow_model.<locals>.<listcomp>zDYou should provide general LoRA paths if you want to use GenKnowSub.c                 S  r   r   )GKS_ADAPTER_PREFIXr   r   r   r   r(     r   )r   )r   target_modulesr   r    )r   peft_configr   )r"   r   peftr   r   r   r   dictfrom_pretrainedr?   load_adapterr   r   r   r   r   r   add_adapterset_adapter)r   r   r   r   r   r   r   	model_id0sub0initial_ts_expert_namefirst_kwargsr   r   ts_expert_namemidsubmore_kwargsgen_expert_name
gks_kwargsr   r   
router_cfgr   r   r   create_arrow_model  sx   


r   )r   r   )r   r   r   r   )N)
r   r   r   r   r   r   r   r   r   r   )
__future__r   r   typingr   r5   r   transformersr   configr   r   r   Moduler
   r   r   r   r   r   r   r   r   <module>   s"     

+
1