o
    8wÖiÉ9 ã                   @  sZ  d dl mZ d dlZd dlZd dlmZmZmZ d dlZd dl	m
Z
 d dlm
  mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ d	d
lmZ G dd„ dƒZG dd„ deƒZG dd„ de
jeƒZ G dd„ de
jeƒZ!G dd„ de
jeƒZ"G dd„ de"ƒZ#G dd„ de"ƒZ$G dd„ de"ƒZ%G dd„ de
jeƒZ&d'd%d&„Z'dS )(é    )ÚannotationsN)ÚAnyÚOptionalÚUnion)Úsvd_lowrank)ÚConv1D)ÚBaseTunerLayerÚcheck_adapters_to_merge)Údequantize_module_weightÚgather_params_ctxÚget_bnb_param_typeÚskip_init_on_device)Ú	transposeé   )Ú
LoraConfigc                   @  sV   e Zd ZdZeddd	„ƒZeddd„ƒZeddd„ƒZeddd„ƒZeddd„ƒZ	dS )ÚLoraVarianta{  
    Base class for LoRA variants, e.g. DoRA.

    This class should be subclassed and the methods below should be implemented accordingly. The methods should be
    implemented as static methods, this makes it easier to combine variants.

    Note for developers: These methods are prone to change and should thus considered to be "private". Use at your own
    discretion.
    ÚmoduleÚ	LoraLayerÚadapter_nameÚstrÚreturnÚNonec                 C  ó   t ‚)zKInitialization code for the LoRA variant, it's called within `update_layer`©ÚNotImplementedError)r   r   © r   úS/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/peft/tuners/lora/layer.pyÚinit1   ó   zLoraVariant.initÚactive_adapterÚorig_weightútorch.Tensorc                 C  r   )zZSafe merging of the weights from `merge(..., safe_merge=True)`, should return a new tensorr   ©r   r   r    r   r   r   Ú
merge_safe6   r   zLoraVariant.merge_safec                 C  ó   dS )zdUnsafe merging of the weights from `merge(..., safe_merge=False)`, should modify the weight in-placeNr   r"   r   r   r   Úmerge_unsafe;   ó    zLoraVariant.merge_unsafec                 C  r$   )zFRemove the adapter weights from the original weights, then return themNr   r"   r   r   r   Úunmerge?   r&   zLoraVariant.unmergeÚxÚresultc                 C  r   )a‰  
        The forward pass of the LoRA variant, should return the overall result (not just the diff)

        Args:
            module (LoraLayer): The module on which the forward pass is called
            active_adapter (str): The name of the active adapter
            x (torch.Tensor): The input to the forward call
            result (torch.Tensor): The result from the base model
        r   )r   r   r(   r)   r   r   r   ÚforwardC   s   zLoraVariant.forwardN)r   r   r   r   r   r   )r   r   r   r   r    r!   r   r!   )r   r   r   r   r    r!   r   r   )
r   r   r   r   r(   r!   r)   r!   r   r!   )
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ústaticmethodr   r#   r%   r'   r*   r   r   r   r   r   &   s    
r   c                   @  sÎ   e Zd ZU dZded< dZded< dCdDdd„ZdEdd„Z				dFdGdd„Zdd„ Z	dd„ Z
dd„ Zd d!„ Zd"d#„ Ze ¡ d$d%„ ƒZdHd*d+„ZdId,d-„ZdJd1d2„ZdKd3d4„ZdLdMd7d8„Zd9d:„ ZdNdAdB„Zd5S )Or   )Úlora_AÚlora_BÚlora_embedding_AÚlora_embedding_Bztuple[str, ...]Úadapter_layer_names)ÚrÚ
lora_alphaÚscalingÚlora_dropoutÚother_param_namesFÚ
base_layerú	nn.ModuleÚephemeral_gpu_offloadÚboolr   r   c                 K  sæ  || _ i | _i | _i | _t i ¡| _t i ¡| _t i ¡| _t 	i ¡| _
t 	i ¡| _d| _g | _i | _i | _tj ¡ | _i | _|| _d| _i | _|| _|  ¡ }t|tjƒr^|j|j}}nt|tjƒrl|j|j}}nÿt|tjƒrz|j|j}}nñt|tj ƒrˆ|j|j}}nãt|tj!ƒr–|j"|j#}}nÕt|t$ƒr¬t%|j&dƒr¥|j&j'n|j&j(\}}n¿t|tj)ƒrÈ|j*s¾t+d| j,› dƒ‚|j-d|j- }}n£t%|dƒrÚt%|dƒrÚ|j.|j/}}n‘t%|d	ƒrìt%|d
ƒrì|j0|j1}}nt%|dƒrÿ|j,j2dkrÿ|j|j}}nlt%|dƒr|j,j2dkr|j|j}}nW|j,j2dkr#|j|j}}nHt%|dƒr8|j,j2dkr8|j|j}}n3|j,j2dkrG|j|j}}n$t%|dƒr[t%|dƒr[|j|j}}nd\}}t3 4dt5|ƒ› dt6¡ || _|| _d S )NFTÚds_shapez=Only same dim for query/key/value is supported as of now for Ú.é   Ú
infeaturesÚoutfeaturesÚ
input_sizeÚoutput_sizeÚ	codebooksÚQuantizedLinearÚw_bitÚWQLinear_GEMMÚ
EetqLinearÚW_qÚ	HQQLinearÚPatchedLinearÚin_featuresÚout_features)NNzUnsupported layer type 'z(' encountered, proceed at your own risk.)7r:   r5   r6   r7   ÚnnÚ
ModuleDictr8   r0   r1   ÚParameterDictr2   r3   Ú_disable_adaptersÚmerged_adaptersÚuse_doraÚ	lora_biasÚtorchÚlora_magnitude_vectorÚ_cachesr<   Úcast_input_dtype_enabledÚlora_variantÚkwargsÚget_base_layerÚ
isinstanceÚLinearrM   rN   ÚConv1dÚin_channelsÚout_channelsÚConv2dÚConv3dÚ	EmbeddingÚnum_embeddingsÚembedding_dimr   ÚhasattrÚweightr>   ÚshapeÚMultiheadAttentionÚ_qkv_same_embed_dimÚ
ValueErrorÚ	__class__Ú	embed_dimrA   rB   rC   rD   r+   ÚwarningsÚwarnÚtypeÚUserWarning)Úselfr:   r<   r[   rM   rN   r   r   r   Ú__init__W   sv   
ÿÿ
zLoraLayer.__init__rT   úOptional[LoraVariant]c                K  r$   )aÉ  Return a matching LoRA variant for this layer type.

        Given the init arguments of this layer, return the correct LoRA variant, if any. E.g., if `use_dora=True`, this
        method should return the DoRA variant for the given layer.

        If there is no fitting variant, return None.

        Note: If this layer type does not support the LoRA variant at all, please raise an error during __init__ as is
        convention, and not here.

        Nr   )rs   rT   r[   r   r   r   Úresolve_lora_variant¤   s   zLoraLayer.resolve_lora_varianté    Ú
use_qalorarU   Úqalora_group_sizeÚintc                 K  sè  t ƒ  ¡ }|d= |dkrtd|› ƒ‚| j|||
d}|d ur$|| j|< || j|< || j|< |dkr9tj|d}nt 	¡ }| j
 t ||i¡¡ tj| j|dd| j|< tj|| j|	d| j|< |	| j|< |rr|t |¡ | j|< n|| | j|< || j|< t|tƒr¦| d	¡r¦t|  ¡ jƒ |  ||¡ W d   ƒ n1 s w   Y  n¯t|tƒrÎ| d
¡rÎt|  ¡ jƒ |  ||¡ W d   ƒ n1 sÈw   Y  n‡t|tƒrö| ¡ dkröt|  ¡ jƒ |   |¡ W d   ƒ n1 sðw   Y  n_|dkrt|  ¡ jƒ |  !|¡ W d   ƒ n	1 sw   Y  n<|dkr)tj" #| j| j¡ n,|dkrLt|  ¡ jƒ |  $|¡ W d   ƒ n	1 sFw   Y  n	|rU|  %||¡ |  &|¡ || jv rl| j| j"| fi |¤Ž |  '| j(¡ d S )Nrs   r   ú?`r` should be a positive integer value but the value passed is )rT   rx   ry   ç        ©ÚpF©ÚbiasÚpissaÚcordaÚoloraÚloftqÚevaÚ
orthogonal))ÚlocalsÚcopyrl   rv   rZ   r5   r6   rO   ÚDropoutÚIdentityr8   ÚupdaterP   r^   rM   r0   rN   r1   rU   ÚmathÚsqrtr7   rT   r]   r   Ú
startswithr   r\   rh   Ú
pissa_initÚ
corda_initÚlowerÚ
olora_initÚ
loftq_initr   Úzeros_Úorthogonal_initÚreset_lora_parametersÚ%_move_adapter_to_device_of_base_layerÚset_adapterÚactive_adapters)rs   r   r5   r6   r8   Úinit_lora_weightsÚ
use_rslorarT   rx   rU   ry   r[   rZ   Úlora_dropout_layerr   r   r   Úupdate_layer²   sn   
ÿ




ÿ€ÿ€ÿ€
ÿ€

ÿ€
zLoraLayer.update_layerc                 C  s  |du rd S || j  ¡ v rY|du r!tjj| j | jt d¡d n| ¡ dkr9tjj	| j | jd| j
|  d ntd|›ƒ‚tj | j| j¡ | j| rYtj | j| j¡ || j ¡ v rƒtj | j| ¡ tj 	| j| ¡ | j| r…tj | j| j¡ d S d S d S )	NFTé   )ÚaÚgaussianr   )Ústdz)Unknown initialization init_lora_weights=)r0   ÚkeysrO   r   Úkaiming_uniform_rh   rŒ   r   r‘   Únormal_r5   rl   r”   r1   rU   r€   r2   r3   )rs   r   rš   r   r   r   r–   þ   s$    $

ùzLoraLayer.reset_lora_parametersc                 C  sl  |   ¡ }|j}t|ƒ}|j}|rt|ƒ}n|tjtjtjfv r"|}nt	d|› dƒ‚| j
| }| j| }| tj¡}tj |j¡\}	}
|	d d …d |…f |
d |… }}| ¡ | j| j_| ¡ | j| j_| j|| j| j | j| j 8  _|dkr”|j||j|j|j|jd |j¡}||_d S |dkr«|j||j|jd |j¡}||_d S | |¡}||j_d S )Nz.Unsupported data type for the base layer. Got r?   Ú4bit)Ú
quant_typeÚquant_storageÚcompress_statisticsr   Ú8bit)Úrequires_gradÚhas_fp16_weights)r\   rh   r   Údtyper
   rV   Úfloat32Úfloat16Úbfloat16Ú	TypeErrorr7   r5   ÚtoÚlinalgÚqrÚdataÚ
contiguousr0   r1   rm   r¦   r§   r¨   r   Údevicerª   r«   )rs   r   r:   r    Úbnb_param_typer¬   Úweight_tensorÚscale_factorr5   ÚQÚRÚQrÚRrr   r   r   r’     sN   


"&ûú
ýü

zLoraLayer.olora_initc                 C  s  |   ¡ j}|j}|tjtjtjfvrtdƒ‚t| 	tj¡| j
ƒ}|dkrWtjj|jdd\}}}|d d …d | j| …f }|d | j| … }	|	| j|  }	|d | j| … }
n2t| d¡ƒdkrt|j| j| t| d¡d ƒd\}}	}|	| j|  }	| ¡ }
ntd	|› d
ƒ‚t t |	¡¡|
 }|t t |	¡¡ }|| j| j_|| j| j_|j| j| | |  }t| 	|¡| j
ƒ}||   ¡ j_d S )NzPlease initialize PiSSA under float32, float16, or bfloat16. Subsequently, re-quantize the residual model to help minimize quantization errors.r   F)Úfull_matricesÚ_niter_é   éÿÿÿÿ)ÚniterzLinit_lora_weights should be 'pissa' or 'pissa_niter_[number of iters]', got ú	 instead.)r\   rh   r¬   rV   r­   r®   r¯   r°   r   r±   Úfan_in_fan_outr²   Úsvdr´   r5   r7   ÚlenÚsplitr   rz   Útrl   Údiagr   r0   r1   )rs   r   rš   rh   r¬   ÚVÚSÚUhÚVrÚSrÚUhrÚUrr0   r1   r   r   r   r   D  s:   
ÿÿ

ÿzLoraLayer.pissa_initc                 C  s$  |   ¡ }|j}|j}|tjtjtjfvrtdƒ‚| tj¡}|j	 
d¡}|j	 
d¡}t|dƒs3tdƒ‚|j}|j}	|j}
|j}| j| }t |
¡ ¡ sRt |
¡ ¡ rVtdƒ‚t |	¡ ¡ sdt |	¡ ¡ rhtdƒ‚t |¡ ¡ svt |¡ ¡ rztdƒ‚|	 
d¡|ksˆ|	 
d¡|kr˜td	|	 
¡ › d
|› d|› dƒ‚|
 
d¡|kr¬td|
 
¡ › d
|› dƒ‚| 
d¡|ksº| 
d¡|krÊtd| 
¡ › d
|› d|› dƒ‚|
| j|  }
| ¡  |
 ¡  dd¡¡ ¡ }|	 |
 ¡ ¡ ¡ }|| j| j_	|| j| j_	|j	| j| | |  }| |¡}||   ¡ j_	|`d S )NzPlease initialize CorDA under float32, float16, or bfloat16. Subsequently, re-quantize the residual model to help minimize quantization errors.r   r   Úeigensz’`eigens` attribute not found for layer, please run `preprocess_corda` first. More information can be found at examples/corda_finetuning/README.md.zdInvalid value found in matrix S. Please file an issue at https://github.com/huggingface/peft/issues.zdInvalid value found in matrix U. Please file an issue at https://github.com/huggingface/peft/issues.zdInvalid value found in matrix V. Please file an issue at https://github.com/huggingface/peft/issues.zMatrix U size mismatch: z vs. (z, zé). Please make sure the `lora_config` and `model` argument of `preprocess_corda` is consistent with `get_peft_model`. If you're using cache in `preprocess_corda`, please make sure the cache is built with the same model and LoRA rank.zMatrix S size mismatch: zê,). Please make sure the `lora_config` and `model` argument of `preprocess_corda` is consistent with `get_peft_model`. If you're using cache in `preprocess_corda`, please make sure the cache is built with the same model and LoRA rank.zMatrix V size mismatch: rÁ   )r\   rh   r¬   rV   r­   r®   r¯   r°   r±   r´   Úsizerg   rl   rÑ   ÚU_WCÚS_WCÚV_WCr5   ÚisnanÚanyÚisinfr7   rÈ   Úmulr   Úviewrµ   r0   r1   )rs   r   rš   Úlinearrh   r¬   Úout_dimÚin_dimrÑ   ÚUrË   rÊ   r5   r0   r1   r   r   r   r   g  sh   ÿ
ÿ
ÿÿÿÿÿÿ
zLoraLayer.corda_initc                 C  s¶   ddl m} |  ¡ j}| j dd¡| j| | j dd¡dœ}||fi |¤Ž\}}}|| j ¡ v r>|| j| j_	|| j
| j_	|| j ¡ v rS|| j| j_	|| j| j_	||  ¡ j_	d S )Nr   )r“   Ú
loftq_bitsé   Ú
loftq_iterr   )Únum_bitsÚreduced_rankÚnum_iter)Úpeft.utils.loftq_utilsr“   r\   rh   r[   Úgetr5   r0   r¢   r´   r1   r2   r3   )rs   r   r“   rh   r[   Úqweightr0   r1   r   r   r   r“   ±  s   
ýzLoraLayer.loftq_initc                 C  sò   | j | }|d dkrtd|› dƒ‚t ||¡}tj |¡\}}|dd d…d d …f }|dd d…d d …f }|  ¡ jj}t | j	|d ¡ 
|¡jd }	t |d | j¡j 
|¡d }
t |	 ¡  |¡¡| j| _t |
 ¡  |¡¡| j| _d S )NrÀ   r   zAOrthogonal initialization requires the LoRA rank to be even, got rÃ   r   g      $@)r5   rl   rV   Úrandnr²   r³   r\   rh   r¬   rM   ÚmmÚTrN   rO   Ú	Parameterrµ   r±   r0   r1   )rs   r   ÚrankÚXrº   Ú_Úq_oddÚq_evenr¬   r0   r1   r   r   r   r•   Æ  s   
 zLoraLayer.orthogonal_initÚkeyr   Úvaluer   c                 C  s   || j |< d S ©N)rX   ©rs   rñ   rò   r   r   r   Ú_cache_store×  s   zLoraLayer._cache_storec                 C  s   | j  |¡}|S ró   )rX   Úpoprô   r   r   r   Ú
_cache_popÚ  s   zLoraLayer._cache_popÚadapterÚscaleúfloat | intc                 C  s0   || j vrdS || j|  | j|  | j |< dS )zºSet the scale of the given adapter to the initial scale multiplied by the provided factor

        The initial scale is determined by the configured `r` (rank) and `lora_alpha`.
        N)r7   r6   r5   )rs   rø   rù   r   r   r   Ú	set_scaleÞ  s   
"zLoraLayer.set_scalec                 C  s>   |dkrdS | j D ]}|| j ¡ vrq	| j|  |9  < q	dS )zHMultiply the current scale of all active adapters by the provided factorr   N)r™   r0   r¢   r7   ©rs   rù   r   r   r   r   Úscale_layerè  s   
üzLoraLayer.scale_layerNúOptional[float | int]c                 C  sV   | j D ]%}|| j ¡ vrq|du r| j| | j|  | j|< q| j|  |  < qdS )zãDivide the current scale of all active adapters by the provided factor. If `scale=None` is passed, reset to
        initial scale

        The initial scale is determined by the configured `r` (rank) and `lora_alpha`.

        N)r™   r0   r¢   r6   r5   r7   rü   r   r   r   Úunscale_layeró  s   
ùzLoraLayer.unscale_layerc                 O  s’   |  dd¡}|du rdS t|ƒt|ƒkr%dt|ƒ› dt|ƒ› d}t|ƒ‚| jr.d}t|ƒ‚dd„ |D ƒ}|D ]}| j  |d	¡rFd
}t|ƒ‚q7dS )zMCheck if the arguments are compatible with the configs and state of the modelÚadapter_namesNzNLength of `adapter_names` should be the same as the number of inputs, but got z and z respectively.z`Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first.c                 S  s   h | ]}|d kr|’qS )Ú__base__r   )Ú.0Únamer   r   r   Ú	<setcomp>  s    z0LoraLayer._check_forward_args.<locals>.<setcomp>Fz1Cannot pass `adapter_names` when DoRA is enabled.)ræ   rÆ   rl   ÚmergedrT   )rs   r(   Úargsr[   r   ÚmsgÚunique_adaptersr   r   r   r   Ú_check_forward_args  s*   ÿÿÿþÿzLoraLayer._check_forward_argsr(   r!   r  r   ú	list[str]r[   c                  sì   | j |g|¢R i |¤Ž}|j}t|ƒ}g }|D ]‰ | ‡ fdd„t|ƒD ƒ¡ qt|ƒD ]G\}	}
|
dkr5q,|
| j ¡ vr=q,| j|
 }| j|
 }| j|
 }| j	|
 }|||	   
|jj¡}||||ƒƒƒ| }|||	   | 
|¡7  < q,|S )Nc                   ó   g | ]
\}}|ˆ kr|‘qS r   r   ©r  ÚindexÚitem©rø   r   r   Ú
<listcomp>)  ó    z2LoraLayer._mixed_batch_forward.<locals>.<listcomp>r  )r:   r¬   ÚsetÚappendÚ	enumerater0   r¢   r1   r8   r7   r±   rh   )rs   r(   r   r  r[   r)   Útorch_result_dtyper  Úsub_batch_indices_listÚir   r0   r1   Údropoutr7   Ú	sub_batchÚlora_outputr   r  r   Ú_mixed_batch_forward  s&   



zLoraLayer._mixed_batch_forward)F)r:   r;   r<   r=   r   r   ©rT   r=   r   ru   )FFFrw   )rT   r=   rx   r=   rU   r=   ry   rz   )rñ   r   rò   r   r   r   )rñ   r   r   r   )rø   r   rù   rú   r   r   )rù   rú   r   r   ró   )rù   rþ   r   r   ©
r(   r!   r  r   r   r
  r[   r   r   r!   )r+   r,   r-   r4   Ú__annotations__r9   rt   rv   r   r–   r’   r   r   r“   rV   Úno_gradr•   rõ   r÷   rû   rý   rÿ   r	  r  r   r   r   r   r   Q   s0   
 
MõL-#J





r   c                      st   e Zd Z									d/d0‡ fdd„Zd1dd„Zd2d3d d!„Zd4d"d#„Zd5d%d&„Zd6d+d,„Zd7‡ fd-d.„Z	‡  Z
S )8r^   r   r   r|   FTr   r   r5   rz   r6   r8   ÚfloatrÄ   r=   Úis_target_conv_1d_layerrš   úUnion[bool, str]r›   rT   rU   r   r   c              
     sN   t ƒ  ¡  tj| |fi |¤Ž || _|| _| j||||||	|
|d || _d S )N©r6   r8   rš   r›   rT   rU   )Úsuperrt   r   rÄ   Ú_active_adapterr   r!  )rs   r:   r   r5   r6   r8   rÄ   r!  rš   r›   rT   rU   r[   ©rm   r   r   rt   K  s   
ø

zLinear.__init__ru   c                K  ó   |sd S ddl m} |ƒ S )Nr   )ÚDoraLinearVariant)Úvariantsr(  )rs   rT   r[   r(  r   r   r   rv   k  ó   zLinear.resolve_lora_variantNÚ
safe_merger   úOptional[list[str]]c           	      C  sr  t | |ƒ}|s	dS |D ]«}|| j ¡ v r¶|  ¡ }|r||jj ¡ }|j}|| jvr5|  	|¡}|| 
|¡7 }n
| j|  | ||¡}t |¡ ¡ sNtd|› dƒ‚||j_| j| r{|j| j| j| j|   }t |¡ ¡ sttd|› dƒ‚| 
|¡|j_n4|| jvr|  	|¡}|j j|7  _n| j|  | ||j¡ | j| r°|j j| j| j| j|  7  _| j |¡ qdS ©a^  
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.
        Nú1NaNs detected in the merged weights. The adapter ú seems to be broken)r	   r0   r¢   r\   rh   r´   Úcloner¬   rZ   Úget_delta_weightr±   r#   rV   ÚisfiniteÚallrl   rU   r€   r1   r7   r%   rS   r  )	rs   r+  r   r   r:   r    Ú
orig_dtypeÚdelta_weightÚnew_biasr   r   r   Úmerges  sF   



ÿ

ÿ€


"€ÙzLinear.mergec                 C  óÔ   | j s
t d¡ dS t| jƒdkrh| j ¡ }|| j ¡ v r_|  ¡ j	}|| j
vr:|j}|  |¡}| j| |¡8  _n| j
|  | ||¡}||_| j| r_|  ¡ j j| j| j| j|  8  _t| jƒdksdS dS ©úW
        This method unmerges all merged adapter layers from the base weights.
        ú Already unmerged. Nothing to do.Nr   ©r  ro   rp   rÆ   rS   rö   r0   r¢   r\   rh   rZ   r¬   r1  r´   r±   r'   rU   r€   r1   r7   ©rs   r   rh   r4  r5  Úunmergedr   r   r   r'   ®  ó    





&ózLinear.unmerger!   c                 C  s¼   | j | jj}| j | jj}|jdko|tjkp|tjk}| j| j}| j | j}|r3| 	¡ }| 	¡ }t
|| | jƒ| j|  }|r\|j|d}| |¡| j| j_| |¡| j | j_|S ©úÂ
        Compute the delta weight for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
        Úcpu©r¬   )r1   rh   r¶   r¬   rq   rV   r®   r¯   r0   r   r   rÄ   r7   r±   r´   ©rs   rø   r¶   r¬   Úcast_to_fp32Úweight_AÚweight_BÚoutput_tensorr   r   r   r1  Ä  s   zLinear.get_delta_weightr(   r  r   r[   c                 O  s^  | j |g|¢R i |¤Ž | dd ¡}| jr*| jr|  ¡  | j|g|¢R i |¤Ž}|S |d ur>| j|g|¢R d|i|¤Ž}|S | jrO| j|g|¢R i |¤Ž}|S | j|g|¢R i |¤Ž}|j}| j 	¡ }| j
D ]A}||vrmqf| j| }	| j| }
| j| }| j| }|  ||	jj¡}|| jvr›||
|	||ƒƒƒ|  }qf| j| j| |||d}qf| |¡}|S ©Nr   )r   r(   r)   )r	  rö   Údisable_adaptersr  r'   r:   r  r¬   r0   r¢   r™   r1   r8   r7   Ú_cast_input_dtyperh   rZ   r*   r±   )rs   r(   r  r[   r   r)   r  Úlora_A_keysr   r0   r1   r  r7   r   r   r   r*   æ  sD   âäç







ü
zLinear.forwardc                   ó   t ƒ  ¡ }d| S ©Núlora.©r$  Ú__repr__©rs   Úrepr&  r   r   rQ    ó   
zLinear.__repr__)	r   r   r|   FFTFFF)r   r   r5   rz   r6   rz   r8   r   rÄ   r=   r!  r=   rš   r"  r›   r=   rT   r=   rU   r=   r   r   r  ©FN©r+  r=   r   r,  r   r   ©r   r   ©r   r!   ©r(   r!   r  r   r[   r   r   r!   ©r   r   )r+   r,   r-   rt   rv   r7  r'   r1  r*   rQ  Ú__classcell__r   r   r&  r   r^   I  s"    ô
 
;

"(r^   c                      sŽ   e Zd Z								d9d:‡ fdd„Zd;dd„Zdd„ Zd<d=d#d$„Zd>d%d&„Zd?d(d)„Zd@d/d0„Z	dAd3d4„Z
dBd5d6„ZdC‡ fd7d8„Z‡  ZS )Drd   r   r   r|   FTr:   r;   r   r   r5   rz   r6   r8   r   rÄ   r=   rš   r"  r›   rT   rU   r   r   c              
     s^   |
rt d|
› d| jj› dƒ‚tƒ  ¡  t | |¡ || _|| _| j|||||||	|
d d S )Nz
lora_bias=z is not supported for r?   r#  )	rl   rm   r+   r$  rt   r   rÄ   r%  r   )rs   r:   r   r5   r6   r8   rÄ   rš   r›   rT   rU   r[   r&  r   r   rt     s    

øzEmbedding.__init__ru   c                K  r'  )Nr   )ÚDoraEmbeddingVariant)r)  r\  )rs   rT   r[   r\  r   r   r   rv   7  r*  zEmbedding.resolve_lora_variantc	                 C  s\  t ƒ  ¡ }	|	d= |dkrtd|› ƒ‚| j|d}
|
d ur"|
| j|< || j|< || j|< |dkr7tj|d}nt 	¡ }|| j
|< t || jf¡}t | j|f¡}t |¡| j|< t |¡| j|< || j|< |rr|t |¡ | j|< n|| | j|< || j|< |dkrˆ|  |¡ n|r|  ||¡ |  |¡ || jv r¦| j| j| fi |	¤Ž |  | j¡ d S )Nrs   r   r{   ©rT   r|   r}   r„   )r‡   rˆ   rl   rv   rZ   r5   r6   rO   r‰   rŠ   r8   rV   rè   rM   rN   rë   r2   r3   rU   rŒ   r   r7   rT   r“   r–   r—   r   r˜   r™   )rs   r   r5   r6   r8   rš   r›   rT   rU   r[   rZ   rœ   rF  rG  r   r   r   r   ?  s<   








zEmbedding.update_layerNr+  r   r,  c                 C  sô   t | |ƒ}|s	dS |D ]l}|| j ¡ v rw|  ¡ }|jj}|rR|jj ¡ }|| jvr4||  	|¡ 
|¡7 }n
| j|  | ||¡}t |¡ ¡ sMtd|› dƒ‚||j_n|| jvrf|j j|  	|¡ 
|¡7  _n| j|  | ||j¡ | j |¡ qdS r-  )r	   r2   r¢   r\   rh   r¬   r´   r0  rZ   r1  r±   r#   rV   r2  r3  rl   r%   rS   r  )rs   r+  r   r   r:   r4  r    r   r   r   r7  p  s.   


ÿ

€èzEmbedding.mergec                 C  s¦   | j s
t d¡ dS t| jƒdkrQ| j ¡ }|  ¡ jj}|| j	 
¡ v rH|  ¡ j}|| jvr;| j|  |¡ |¡8  _n| j|  | ||¡}||_t| jƒdksdS dS r9  )r  ro   rp   rÆ   rS   rö   r\   rh   r¬   r2   r¢   rZ   r´   r1  r±   r'   )rs   r   r4  rh   r>  r   r   r   r'   œ  s   



÷zEmbedding.unmerger!   c                 C  sª   | j | j}| j| j}|jdko|tjkp|tjk}| j| }| j | }|r/| ¡ }| ¡ }t	|| dƒ| j
|  }|rS|j|d}| |¡| j|< | |¡| j |< |S )rA  rB  TrC  )r3   r¶   r2   r¬   rq   rV   r®   r¯   r   r   r7   r±   rD  r   r   r   r1  ®  s   

zEmbedding.get_delta_weightr(   r  r   r
  r[   c                  sÐ   | j |g|¢R i |¤Ž}t|ƒ}g }|D ]‰ | ‡ fdd„t|ƒD ƒ¡ qt|ƒD ]<\}}	|	dkr2q)|	| j ¡ vr:q)| j|	 j}
| j|	 j}| j|	 }|||  }|  	||
¡}|||   || | 7  < q)|S )Nc                   r  r   r   r  r  r   r   r  Ú  r  z2Embedding._mixed_batch_forward.<locals>.<listcomp>r  )
r:   r  r  r  r2   r¢   rê   r3   r7   Ú_embed)rs   r(   r   r  r[   r)   r  r  r  r   Úembedding_AÚembedding_Br7   r  Úafter_Ar   r  r   r  Ð  s"   
zEmbedding._mixed_batch_forwardÚinputrh   c              	   C  s*   |   ¡ }tj|||j|j|j|j|jdS )N)Úpadding_idxÚmax_normÚ	norm_typeÚscale_grad_by_freqÚsparse)r\   ÚFÚ	embeddingrc  rd  re  rf  rg  )rs   rb  rh   r:   r   r   r   r^  î  s   ùzEmbedding._embedc                 O  sD  | j |g|¢R i |¤Ž | dd ¡}| jr*| jr|  ¡  | j|g|¢R i |¤Ž}|S |d ur>| j|g|¢R d|i|¤Ž}|S | jrO| j|g|¢R i |¤Ž}|S | j|g|¢R i |¤Ž}|j}| jD ]9}|| j	vriqa|| j
vrŽ| j	| j}| j| j}	| j| }
|  ||¡}|||	 |
  }qa| j
| j| |||d}qa| |¡}|S rI  )r	  rö   rJ  r  r'   r:   r  r¬   r™   r2   rZ   rê   r3   r7   r^  r*   r±   )rs   r(   r  r[   r   r)   r  r   r_  r`  r7   ra  r   r   r   r*   ú  s@   æèë




ü
zEmbedding.forwardc                   rM  rN  rP  rR  r&  r   r   rQ    rT  zEmbedding.__repr__)r   r   r|   FTFFF)r:   r;   r   r   r5   rz   r6   rz   r8   r   rÄ   r=   rš   r"  r›   r=   rT   r=   rU   r=   r   r   r  rU  rV  rW  rX  r  )rb  r!   rh   r!   r   r!   rY  rZ  )r+   r,   r-   rt   rv   r   r7  r'   r1  r  r^  r*   rQ  r[  r   r   r&  r   rd     s&    õ
"1
,

"

%rd   c                      sv   e Zd Z							d-d.‡ fdd„Zdd„ Zdd„ Zd/d0d!d"„Zd1d#d$„Zd2d&d'„Zd3d)d*„Z	d4‡ fd+d,„Z
‡  ZS )5Ú_ConvNdr   r   r|   TFr:   r;   r   r   r5   rz   r6   r8   r   rš   r"  r›   r=   rT   rU   r   r   c
              
     sŠ   t ƒ  ¡  t | |¡ |jdkrt d¡ ||j dkr-td|jj› d|j› d|› dƒ‚|| _	|j
 ¡ | _| j||||||||	d d S )	Nr   zMLoRA adapter added to ConvNd layer with groups > 1. Merging is not supported.r   zTargeting a z with groups=z
 and rank zš. Currently, support is limited to conv layers where the rank is divisible by groups. Either choose a different rank or do not target this specific layer.r#  )r$  rt   r   Úgroupsro   rp   rl   rm   r+   r%  rh   ÚdimÚ_kernel_dimr   )rs   r:   r   r5   r6   r8   rš   r›   rT   rU   r[   r&  r   r   rt   &  s(   


ÿ
øz_ConvNd.__init__c	                 C  sŠ  t ƒ  ¡ }	|	d= |dkrtd|› ƒ‚| j|d}
|
d ur"|
| j|< || j|< || j|< |dkr7tj|d}nt 	¡ }|| j
|< |  ¡ }|j}|j}|j}t|ƒ}d| jd   }}|| j||||d	d
| j|< ||| j|||j|d| j|< || j|< |r‰|t |¡ | j|< n|| | j|< || j|< |dkrŸ|  |¡ n|r§|  ||¡ |  |¡ || jv r½| j| j| fi |	¤Ž |  | j ¡ d S )Nrs   r   r{   r]  r|   r}   ©r   rÀ   Fr   )rk  r€   r„   )!r‡   rˆ   rl   rv   rZ   r5   r6   rO   r‰   rŠ   r8   r\   Úkernel_sizeÚstrideÚpaddingrq   rm  rM   r0   rN   rk  r1   rU   rŒ   r   r7   rT   r“   r–   r—   r   r˜   r™   )rs   r   r5   r6   r8   rš   r›   rT   rU   r[   rZ   rœ   r:   ro  rp  rq  Ú
conv_layerÚ
out_kernelÚ
out_strider   r   r   r   N  sH   




ÿ



z_ConvNd.update_layerc                 C  s   dd| j d   S )N)rÁ   rn  r   )rm  ©rs   r   r   r   Ú_get_dora_factor_view…  s   z_ConvNd._get_dora_factor_viewNr+  r   r,  c           	      C  sŒ  t | |ƒ}|s	dS |D ]¸}|| j ¡ v rÃ|  ¡ }|jj}|jdkr%tdƒ‚|r†|jj 	¡ }|| j
vr?|  |¡}|| |¡7 }n
| j
|  | ||¡}t |¡ ¡ sXtd|› dƒ‚||j_| j| r…|j| j| j| j|   }t |¡ ¡ s~td|› dƒ‚| |¡|j_n7|| j
vrœ|  |¡}|j j| |¡7  _n| j
|  | ||j¡ | j| r½|j j| j| j| j|  7  _| j |¡ qdS )a`  
        Merge the active adapter weights inside the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.
        Nr   z<Merging is not supported for _ConvNd layers with groups > 1!r.  r/  )r	   r0   r¢   r\   rh   r¬   rk  r   r´   r0  rZ   r1  r±   r#   rV   r2  r3  rl   rU   r€   r1   r7   r%   rS   r  )	rs   r+  r   r   r:   r4  r    r5  r6  r   r   r   r7  ˆ  sJ   




ÿ

ÿ€


"€Ôz_ConvNd.mergec                 C  r8  r9  r<  r=  r   r   r   r'   È  r?  z_ConvNd.unmerger!   c                 C  s<  | j | jj}| j| jj}|jdko|tjkp|tjk}| j| j}| j | j}|r3| 	¡ }| 	¡ }|  
¡ j ¡ dd… dkr\| d¡ d¡| d¡ d¡  d¡ d¡| j|  }n$|  | dd¡|¡}|  
¡ jdkru|| j|  }n| dd¡| j|  }|rœ|j|d}| |¡| j| j_| |¡| j | j_|S )	rA  rB  rÀ   rà   )r   r   r@   r   r   rC  )r1   rh   r¶   r0   r¬   rq   rV   r®   r¯   r   r\   rÒ   ÚsqueezeÚ	unsqueezer7   Úconv_fnr   rk  r±   r´   rD  r   r   r   r1  Þ  s.   &ÿþz_ConvNd.get_delta_weightr(   c                 O  sZ  | j |g|¢R i |¤Ž | dd ¡}| jr*| jr|  ¡  | j|g|¢R i |¤Ž}|S |d ur>| j|g|¢R d|i|¤Ž}|S | jrO| j|g|¢R i |¤Ž}|S | j|g|¢R i |¤Ž}|j}| jD ]D}|| j	 
¡ vrkqa| j	| }| j| }	| j| }
| j| }|  ||jj¡}|| jvr™||	||
|ƒƒƒ|  }qa| j| j| |||d}qa| |¡}|S rI  )r	  rö   rJ  r  r'   r:   r  r¬   r™   r0   r¢   r1   r8   r7   rK  rh   rZ   r*   r±   )rs   r(   r  r[   r   r)   r  r   r0   r1   r  r7   r   r   r   r*     sB   ãåé






ü
z_ConvNd.forwardc                   rM  rN  rP  rR  r&  r   r   rQ  3  rT  z_ConvNd.__repr__)r   r   r|   TFFF)r:   r;   r   r   r5   rz   r6   rz   r8   r   rš   r"  r›   r=   rT   r=   rU   r=   r   r   rU  rV  rW  rX  )r(   r!   r   r!   rZ  )r+   r,   r-   rt   r   rv  r7  r'   r1  r*   rQ  r[  r   r   r&  r   rj  $  s     ö(7
@

.'rj  c                      ó&   e Zd Z‡ fdd„Zd	dd„Z‡  ZS )
rb   c                   ó8   t ƒ j|i |¤Ž | jdkstd| j› ƒ‚tj| _d S )Nrà   z0Conv2d layer kernel must have 4 dimensions, not )r$  rt   rm  rl   rh  Úconv2dry  ©rs   r  r[   r&  r   r   rt   :  ó   
zConv2d.__init__rT   r=   r   ru   c                K  r'  )Nr   )ÚDoraConv2dVariant)r)  r  )rs   rT   r[   r  r   r   r   rv   @  r*  zConv2d.resolve_lora_variantr  ©r+   r,   r-   rt   rv   r[  r   r   r&  r   rb   8  ó    rb   c                      rz  )
r_   c                   r{  )Nr@   z0Conv1d layer kernel must have 3 dimensions, not )r$  rt   rm  rl   rh  Úconv1dry  r}  r&  r   r   rt   K  r~  zConv1d.__init__rT   r=   r   ru   c                K  r'  )Nr   )ÚDoraConv1dVariant)r)  rƒ  )rs   rT   r[   rƒ  r   r   r   rv   Q  r*  zConv1d.resolve_lora_variantr  r€  r   r   r&  r   r_   I  r  r_   c                      rz  )
rc   c                   r{  )Nrž   z0Conv3d layer kernel must have 5 dimensions, not )r$  rt   rm  rl   rh  Úconv3dry  r}  r&  r   r   rt   \  r~  zConv3d.__init__rT   r=   r   ru   c                K  r'  )Nr   )ÚDoraConv3dVariant)r)  r…  )rs   rT   r[   r…  r   r   r   rv   b  r*  zConv3d.resolve_lora_variantr  r€  r   r   r&  r   rc   Z  r  rc   c                      s~  e Zd ZdZ						d[d\‡ fdd„Zed]dd„ƒZed^dd„ƒZed^dd„ƒZed_dd„ƒZ	ed]d d!„ƒZ
ed`d"d#„ƒZed_d$d%„ƒZed]d&d'„ƒZedad)d*„ƒZedad+d,„ƒZedbd.d/„ƒZedcd1d2„ƒZedcd3d4„ƒZddd6d7„Zed_d8d9„ƒZde‡ fd:d;„Zdfdgd@dA„ZdedBdC„ZdhdFdG„ZdidIdJ„Z‡ fdKdL„ZdjdQdR„ZedSdT„ ƒZ‡ fdUdV„Z‡ fdWdX„Zdk‡ fdYdZ„Z ‡  Z!S )lrj   a†  LoRA implemented in a multihead attention layer

    This is currently only implemented for the case of `_qkv_same_embed_dim = True`, i.e. query, key, and value having
    the same dimension.

    Note: LoRA is applied to both the in_proj (query/key/value) and out_proj. There is currently no way to specify only
    one of them. Don't try to apply LoRA to the out_proj of MultiheadAttention by targeting that layer specifically,
    since the forward method of that layer is not being used, hence the LoRA adapter would be ignored.

    This is a little bit hacky because of the way that MultiheadAttention is implemented in PyTorch: There are no
    `nn.Linear` layers which we can hook onto or, in case of output projection, `.forward` is not used. This
    implementation works around these problems by merging the weights before the forward call and unmerging them after
    the forward call.
    r   r   r|   TFr   r   r5   rz   r6   r8   r   rš   r"  r›   r=   rT   r   r   c	           
   	     s¼   t |ddƒstd| jj› dƒ‚|rt| jj› dƒ‚tƒ  ¡  tj| |fi |	¤Ž t|jt	j
ƒrEt
|j|f||||||dœ|	¤Ž| j_n
td| jj› dƒ‚|| _|  ||||||¡ d S )Nrk   Tz?Only same embed for query/key/value is supported as of now for r?   z: does not support DoRA (yet), please set use_dora to False)r5   r6   r8   rš   r›   rT   z.out_proj must be an instance of nn.Linear for )Úgetattrrl   rm   r+   r$  rt   r   r]   Úout_projrO   r^   r:   r%  r   )
rs   r:   r   r5   r6   r8   rš   r›   rT   r[   r&  r   r   rt   {  s2   ÿ
þø	÷zMultiheadAttention.__init__c                 C  ó
   |   ¡ jS ró   )r\   rn   ru  r   r   r   rn   ¨  ó   
zMultiheadAttention.embed_dimúOptional[int]c                 C  rˆ  ró   )r\   Úkdimru  r   r   r   r‹  ¬  r‰  zMultiheadAttention.kdimc                 C  rˆ  ró   )r\   Úvdimru  r   r   r   rŒ  °  r‰  zMultiheadAttention.vdimc                 C  rˆ  ró   )r\   rk   ru  r   r   r   rk   ´  r‰  z&MultiheadAttention._qkv_same_embed_dimc                 C  rˆ  ró   )r\   Ú	num_headsru  r   r   r   r  ¸  r‰  zMultiheadAttention.num_headsc                 C  rˆ  ró   )r\   r  ru  r   r   r   r  ¼  r‰  zMultiheadAttention.dropoutc                 C  rˆ  ró   )r\   Úbatch_firstru  r   r   r   rŽ  À  r‰  zMultiheadAttention.batch_firstc                 C  rˆ  ró   )r\   Úhead_dimru  r   r   r   r  Ä  r‰  zMultiheadAttention.head_dimúnn.Parameterc                 C  rˆ  ró   )r\   Úin_proj_weightru  r   r   r   r‘  È  r‰  z!MultiheadAttention.in_proj_weightc                 C  rˆ  ró   )r\   Úin_proj_biasru  r   r   r   r’  Ì  r‰  zMultiheadAttention.in_proj_biasr;   c                 C  s   |   ¡ j  ¡ S ró   )r\   r‡  ru  r   r   r   r‡  Ð  s   zMultiheadAttention.out_projúOptional[nn.Parameter]c                 C  rˆ  ró   )r\   Úbias_kru  r   r   r   r”  Ô  r‰  zMultiheadAttention.bias_kc                 C  rˆ  ró   )r\   Úbias_vru  r   r   r   r•  Ø  r‰  zMultiheadAttention.bias_vú,tuple[Optional[torch.Tensor], Optional[int]]c                 O  s   |   ¡ j|i |¤ŽS ró   )r\   Úmerge_masksr}  r   r   r   r—  Ü  s   zMultiheadAttention.merge_masksc                 C  rˆ  ró   )r\   Úadd_zero_attnru  r   r   r   r˜  ß  r‰  z MultiheadAttention.add_zero_attnc                   s*   t ƒ j|i |¤Ž | jjj|i |¤Ž d S ró   )r$  r   r:   r‡  r}  r&  r   r   r   ã  s   zMultiheadAttention.update_layerNr+  r   r,  c           
      C  s~  t | |ƒ}|s	dS |D ]±}|| j ¡ v r¼|  ¡ }|jjj}|r||jj 	¡  
¡ }||  |¡ |¡7 }t |¡ ¡ s@td|› dƒ‚|jjj 	¡  
¡ }||j |¡ |¡7 }t |¡ ¡ sctd|› dƒ‚|`||_|j ¡ `||j ¡ _|jj|gd n:|  |¡ |¡}|jj 	¡ | }	|`|	|_|j |¡ |¡}|jjj 	¡ | }	|j ¡ `|	|j ¡ _|jj|gd | j |¡ qdS )a^  
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.
        Nr.  r/  ©r   )r	   r0   r¢   r\   r‡  rh   r¬   r‘  r´   Údetachr0  r1  r±   rV   r2  r3  rl   r7  rS   r  )
rs   r+  r   r   r:   r4  Úorig_weight_inÚorig_weight_outr5  Úweight_mergedr   r   r   r7  è  sJ   


ÿ
ÿ

€ÑzMultiheadAttention.mergec                 C  sè   | j s
t d¡ dS |  ¡ }|jjjj}t| j	ƒdkrk| j	 
¡ }|| j ¡ v rd|  |¡ |¡}|jj| }|`| dtj|dd¡ |j |¡ |¡}|jjjj| }|jj`|jj dtj|dd¡ t| j	ƒdks|  ¡ j ¡  dS )r:  r;  Nr   r‘  F©rª   rh   )r  ro   rp   r\   r‡  r:   rh   r¬   rÆ   rS   rö   r0   r¢   r1  r±   r‘  r´   Úregister_parameterrO   rë   r'   )rs   r:   r4  r   r5  Ú
old_weightr   r   r   r'   /  s(   

ÿðzMultiheadAttention.unmerger7  únn.MultiheadAttentionc                 C  sv   |r	| j ||d |  ¡ }|j}|`| dtj|j|jd¡ |j ¡ }|j	}|`	| dtj|j|jd¡ ||_|S )zô
        Merging and unloading of the MultiheadAttention module

        This requires an extra step for MultiheadAttention, which is why there is this special method instead of
        relying on the normal merge_and_unload code path.
        )r+  r   r‘  rž  rh   )
r7  r\   r‘  rŸ  rO   rë   r´   rª   r‡  rh   )rs   r7  r+  r   r:   rh   Úout_proj_layerr   r   r   Ú"unload_and_optionally_merge_moduleP  s   	
z5MultiheadAttention.unload_and_optionally_merge_moduler!   c                 C  sª   | j | jj}| j | jj}|jdko|tjk}| j| j}| j | j}|r.| ¡ }| ¡ }|| | j	|  }|rS|j
|d}| 
|¡| j| j_| 
|¡| j | j_|S r@  )r1   rh   r¶   r¬   rq   rV   r®   r0   r   r7   r±   r´   rD  r   r   r   r1  l  s   z#MultiheadAttention.get_delta_weightc                   s:   d|v rt d| jj› dƒ‚tƒ j|g|¢R i |¤Ž d S )Nr   rO  z( does not support mixed adapter batches.)r°   rm   r+   r$  r	  )rs   r(   r  r[   r&  r   r   r	  Ž  s   z&MultiheadAttention._check_forward_argsÚqueryr  r   r[   c           	        s.  |j }ˆ j|g|¢R i |¤Ž ˆ jr&ˆ jrˆ  ¡  ˆ j|g|¢R i |¤Ž}nTˆ jr6ˆ j|g|¢R i |¤Ž}nDˆ  ¡ j}|jˆ jkrRˆ  ¡ j	j
}td|› d|› dƒ‚‡ fdd„ˆ jD ƒ}zˆ j|d ˆ j|g|¢R i |¤Ž}W ˆ  ¡  nˆ  ¡  w |d  |¡|d d ur|d  |¡f}|S |d f}|S )	NzThe out_proj layer of z has merged layers but zJ itself doesn't; please ensure that either both or none have merged layersc                   s   g | ]	}|ˆ j v r|‘qS r   )r0   )r  rŸ   ru  r   r   r  °  s    z.MultiheadAttention.forward.<locals>.<listcomp>r™  r   r   )r¬   r	  rJ  r  r'   r:   r\   r‡  r™   rm   r+   rl   r7  r±   )	rs   r¤  r  r[   Úprevious_dtyper)   r‡  Úcls_namer™   r   ru  r   r*   “  s.   
ÿ(
ÿzMultiheadAttention.forwardc                 C  s^   |   ¡ }|j}|`| dtj|j|jd¡ |j  ¡ }|j}|`| dtj|j|jd¡ d S )Nr‘  rž  rh   )	r\   r‘  rŸ  rO   rë   r´   rª   r‡  rh   )rs   r:   rh   r   r   r   Ú_restore_weights¿  s   

z#MultiheadAttention._restore_weightsc                   ó   |   ¡  tƒ j|i |¤ŽS ró   )r§  r$  Ú
state_dictr}  r&  r   r   r©  Ô  s   zMultiheadAttention.state_dictc                   r¨  ró   )r§  r$  Únamed_modulesr}  r&  r   r   rª  Ø  s   z MultiheadAttention.named_modulesc                   rM  rN  rP  rR  r&  r   r   rQ  Ý  rT  zMultiheadAttention.__repr__)r   r   r|   TFF)r   r   r5   rz   r6   rz   r8   r   rš   r"  r›   r=   rT   r=   r   r   )r   rz   )r   rŠ  )r   r=   )r   r   )r   r  )r   r;   )r   r“  )r   r–  rW  rU  rV  )r7  r=   r+  r=   r   r,  r   r¡  rX  )r¤  r!   r  r   r[   r   r   r!   rZ  )"r+   r,   r-   r.   rt   Úpropertyrn   r‹  rŒ  rk   r  r  rŽ  r  r‘  r’  r‡  r”  r•  r—  r˜  r   r7  r'   r£  r1  r	  r*   r   r§  r©  rª  rQ  r[  r   r   r&  r   rj   k  sd    ÷-

G
!
"
,
rj   Útargetútorch.nn.Moduler   r   Úlora_configr   r   úOptional[torch.nn.Module]c                 K  s¾  d }t | tƒr|  ¡ }n| }t |tjjƒr0| ¡ }| dd ¡ | |j	¡ t| |fi |¤Ž}|S t |tjj
ƒrH| |j	¡ t
| |fi |¤Ž}|S t |tjjƒr`| |j	¡ t| |fi |¤Ž}|S t |tjƒrw| |j	¡ t| |fi |¤Ž}|S t |tjjƒr| |j	¡ t| |fi |¤Ž}|S t |tjjƒr·|d r¦t d¡ d |d< |_| |j	¡ t| |fi |¤Ž}|S t |tƒrÝ|d sÌt d¡ d |d< |_| |j	¡ t| |fddi|¤Ž}|S )NrÄ   zjfan_in_fan_out is set to True but the target module is `torch.nn.Linear`. Setting fan_in_fan_out to False.Fzafan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True.Tr!  )r]   r   r\   rV   rO   rd   rˆ   rö   r‹   Úloftq_configrb   rc   r_   rj   r^   ro   rp   rÄ   r   )r¬  r   r®  r[   Ú
new_moduleÚtarget_base_layerÚembedding_kwargsr   r   r   Údispatch_defaultâ  sX   

âåèëîÿ

÷ÿr´  )r¬  r­  r   r   r®  r   r   r¯  )(Ú
__future__r   rŒ   ro   Útypingr   r   r   rV   Útorch.nnrO   Útorch.nn.functionalÚ
functionalrh  r   Útransformers.pytorch_utilsr   Úpeft.tuners.tuners_utilsr   r	   Úpeft.utils.integrationsr
   r   r   r   Úpeft.utils.otherr   Úconfigr   r   r   ÚModuler^   rd   rj  rb   r_   rc   rj   r´  r   r   r   r   Ú<module>   sB   +   { K      y