o
    wiX                     @   sD  d dl Z d dlmZmZ d dlmZmZ d dlZd dlm	Z	 ej
 r*e	d\ZZndZdZd dlm  mZ d dlmZ d dlmZ ed	d
\ZZd dlmZ d dlmZmZ d dlmZmZ d dlmZ G dd deZeryG dd dej Z!G dd dej Z"						d ddZ#eG dd deeZ$G dd deZ%dS )!    N)	dataclassfield)ListLiteral)safe_importbitsandbytesF)nn)safe_import_fromtransformer_enginepytorch)ModuleMatcher)"get_adapter_attributes_from_linearis_expert_linear)PEFTAdapterWrapper)loggingc                   @   s   e Zd ZdZdd ZdS )
LoRALinearag  An adapter wrapper that adds the output of the adapter to the output of the wrapped module.

    This class is designed to be used with LoRA (Low-Rank Adaptation) and similar techniques
    where the adapter's output is added to the main module's output. It extends the AdapterWrapper
    class to provide a specific implementation of the forward method.
    c                 O   sD   | j |g|R i |\}}}| | }||j}|| |fS )N)base_linear_forwardadapter
contiguousreshapeshape)selfxargskwargslinear_outputbiaslayernorm_outputadapter_output r    [/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/peft/lora.pyforward2   s   zLoRALinear.forwardN)__name__
__module____qualname____doc__r"   r    r    r    r!   r   *   s    r   c                       sZ   e Zd ZdZ						d fdd		Zeje						dd
dZ fddZ	  Z
S )TELinearAdaptera  
        TELinear + LoRA, maintains ckpts structrue (i.e. Linear's weight/bias remain at the same FQN)

        The _init_wrapper and _forward methods provide the LoRA functionality. We want to be able to
        use those inside LinearAdapter but also for monkey-patching modules, without repeating the
        same code -> therefore those are decorated with @staticmethod.

        Args:
            orig_linear (nn.Module): the linear module to augment.
            dim (int): lora's dim in_features -> dim -> out_features.
            alpha (int): lora's scaling alpha.
            dropout (float): dropout prob (default: 0.0).
            dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
            lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
            lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
            are quantized weights (e.g. 4bit) needs to be specified explicitly.
                       postxavierNc           	   	      s   |j tjksJ |jd uo|jjd dk}tt| j|j|j	||j
j|j
jd | j
j|j
j |r<| jj|jj tj| ||||||d d S )Nr   )in_featuresout_featuresr   deviceparams_dtypedimalphadropoutdropout_positionlora_A_init_method
lora_dtype)	__class__teLinearr   r   superr'   __init__r-   r.   weightr/   dtypedatacopy__init_adapter)	r   orig_linearr2   r3   r4   r5   r6   r7   has_biasr8   r    r!   r<   O   s*   


zTELinearAdapter.__init__c                 C      || _ || | _| jj}d| j_| jdurd| j_| j}| j}	|p$| jj}
t	j
||d|
|d| _t	j
||	d|
|d| _|dkrJtj	j| jjj nt	jj| jjjtdd | jjjd t	j|d| _|d	v soJ ||| _dS )
a%  Adds LoRA weights to obj. The obj is either a LinearAdapter or an nn.Module (when
            monkey-patching).

            Args:
                obj (LinearAdapter | nn.Module): input module to adapt.
                dim (int): lora's dim in_features -> dim -> out_features.
                alpha (int): lora's scaling alpha.
                dropout (float): dropout prob (default: 0.0).
                dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
                lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
                lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
                are quantized weights (e.g. 4bit) needs to be specified explicitly.
            FNr   r>   r/   r,      ar   pprer+   r2   scaler=   r/   requires_gradr   r-   r.   r>   r   r:   lora_alora_btorchinituniform_r?   kaiming_uniform_mathsqrtfill_Dropoutr4   r5   objr2   r3   r4   r5   r6   r7   r/   r-   r.   r>   r    r    r!   rA   r   $   


zTELinearAdapter._init_adapterc                    sZ   t t| |}| jdkr| |}| | |}|| j }| jdkr)| |}|| S )NrM   r+   )r;   r'   r"   r5   r4   rR   rQ   rO   )r   r   reslora_resrD   r    r!   r"      s   




zTELinearAdapter.forwardr(   r)   r*   r+   r,   Nr#   r$   r%   r&   r<   rS   no_gradstaticmethodrA   r"   __classcell__r    r    rD   r!   r'   <   s&    #.r'   c                       sV   e Zd ZdZ						d fdd		Zeje						dd
dZdd Z	  Z
S )LinearAdaptera  
    Linear + LoRA, maintains ckpts structrue (i.e. Linear's weight/bias remain at the same FQN)

    The _init_wrapper and _forward methods provide the LoRA functionality. We want to be able to
    use those inside LinearAdapter but also for monkey-patching modules, without repeating the
    same code -> therefore those are decorated with @staticmethod.

    Args:
        orig_linear (nn.Module): the linear module to augment.
        dim (int): lora's dim in_features -> dim -> out_features.
        alpha (int): lora's scaling alpha.
        dropout (float): dropout prob (default: 0.0).
        dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
        lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
        lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
        are quantized weights (e.g. 4bit) needs to be specified explicitly.
    r(   r)   r*   r+   r,   Nc              	      s   t |tjsJ tt| j|j|j|jd u|j	j
|j	jd | j	j|j	j |jd ur5| jj|jj tj| ||||||d d S )N)r-   r.   r   r/   r>   r1   )
isinstancer   r:   r;   re   r<   r-   r.   r   r=   r/   r>   r?   r@   rA   )r   rB   r2   r3   r4   r5   r6   r7   rD   r    r!   r<      s(   



zLinearAdapter.__init__c                 C   rE   )
a  Adds LoRA weights to obj. The obj is either a LinearAdapter or an nn.Module (when
        monkey-patching).

        Args:
            obj (LinearAdapter | nn.Module): input module to adapt.
            dim (int): lora's dim in_features -> dim -> out_features.
            alpha (int): lora's scaling alpha.
            dropout (float): dropout prob (default: 0.0).
            dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
            lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
            lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
            are quantized weights (e.g. 4bit) needs to be specified explicitly.
        FNrF   r,   rG   rH   r   rJ   rL   rN   r[   r    r    r!   rA      r]   zLinearAdapter._init_adapterc                 C   s   t | dd  }d ur|| jksJ ||}n	t|| j| j}| jdkr)| |}| | 	|}|| j
 }| jdkr@| |}|| S )N	super_fwdrM   r+   )getattrr"   Flinearr=   r   r5   r4   rR   rQ   rO   )r   r   fwdr^   r_   r    r    r!   r"     s   





zLinearAdapter.forwardr`   ra   r    r    rD   r!   re      s&    !.re   r(   r)   r*   r+   r,   c           	   	   C   s   t | tjs| jtjksJ t| drJ | jt | tjr5t| |||||| | j}t	dt|fi }n!| jtjkrRt
| |||||| | j}t	dt
|fi }ntdt| dddurj| jjtjjkrj| j| _|| _| S )a  Monkey-patches a nn.Linear (orig_linear param) to be a LinearAdapter, for all purposes
    think of this function as replacing a nn.Linear with a LinearAdapter defined above.

    The orig_linear might not contain valid weights, for example, the given orig_linear was
    initialized within a context-manager that uses a "meta" device. Therefore, we cannot copy
    the weight/bias from the orig_linear to the LinearAdapter, since those have not been allocated,

    To circumvent this scenario, LinearAdapter's additional functionality (_init_adapter, _forward)
    is based on static functions, so that we can use them for patching or when allocating a
    new LinearAdapter object.

    Args:
        orig_linear (nn.Linear): the module we add adapter to.
        dim (int, optional): Lora dim. Defaults to 8.
        alpha (int, optional): Lora alpha scale. Defaults to 32.
        dropout (float, optional): dropout prob. Defaults to 0.0.
        dropout_position (str, optional): location to apply dropout wrt lora.
            Defaults to 'post' (choices: 'pre', 'post').
        lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'.
        lora_dtype (_type_, optional): Lora weights' dtype. By default will use orig_linear's dtype
        but orig_linear might use non-trainable dtype (e.g., 4bit), in which case the user must
        specify the dtype manually. Defaults to None.

    Returns:
        (nn.Module): the monkey-patched (nn.Linear + LoRA) nn.Module
    rg   PatchedLinearAdapterPatchedTELinearAdapterz8Expected isinstance(orig_linear, (nn.Linear, te.Linear))quant_stateN)rf   r   r:   r8   r9   hasattrrg   re   rA   typer'   NotImplementedErrorrh   rn   r   
functional
QuantStater"   )	rB   r2   r3   r4   r5   r6   r7   clsnew_clsr    r    r!   patch_linear_module'  s$   $rv   c                   @   s   e Zd ZU dZedd dZee ed< dZ	e
ed< dZe
ed< d	Zeed
< dZed ed< dZeed< dZeed< dZeed< dZejed< dZeed< ddejfddZdS )LoRAa  
    Implements the LoRA (Low-Rank Adaptation) module for parameter-efficient fine-tuning.

    LoRA uses a low-rank projection to adapt the weights of a pre-trained model to a new downstream task.
    This class facilitates the application of LoRA to specific modules within the model architecture.

    Args:
        target_modules (List[str], optional): A list of module names to apply LoRA to.
            Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
                - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections
                                in self-attention.
                - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention.
                - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP.
                - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP.
            Target modules can also contain wildcards. For example, you can specify
                target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv
                on the first two layers.
        exclude_modules (List[str], optional): A list of module names not to apply LoRa to. It will
            match all nn.Linear & nn.Linear-adjacent modules whose name does not match any string in
            exclude_modules. If used, will require target_modules to be empty list or None.
        dim (int): Dimension of the low-rank projection space. Defaults to 32.
        alpha (int): Weighting factor for the low-rank projection. Defaults to 32.
        dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0.
        dropout_position (Literal['pre', 'post'], optional): Position for applying dropout.
            Can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'.
        a2a_experimental (bool): Enables the experimental All-to-All (A2A) communication strategy. Defaults to False.
        dropout_recompute (bool): Enables dropout recompute using Thunder JIT compilation. When True,
            applies thunder.jit() to the dropout layer for memory-efficient training by recomputing
            dropout activations during backward pass instead of storing them.
        lora_dtype (torch.dtype): Parameter data type for LoRA weights. Default None (will use model's dtype).

    Example:
    --------
        >>> from nemo.collections import llm
        >>> lora = llm.peft.LoRA(target_modules=['linear_qkv', 'linear_proj'], dim=32)
        >>> model = llm.Mistral7BModel(model_transform=lora)
        >>> # (set up trainer and data)
        >>> trainer.fit(model, data)

    References:
    -----------
        Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021).
        LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685.
        https://arxiv.org/abs/2106.09685

    )
    c                   C   s   g dS )N)
linear_qkvlinear_proj
linear_fc1
linear_fc2r    r    r    r    r!   <lambda>  s    zLoRA.<lambda>)default_factorytarget_modulesr)   r2   r3   r*   r4   rM   rL   r5   r,   r6   zerolora_B_init_methodFa2a_experimentalNr7   dropout_recomputemc                 C   sp  ddl m} | ||| }dur|\}}t|tjs!|jtjkrZ| js;t	|j
jds;t|dddur>|jjtjjkr>t}ntrI|jtjkrIt}nt}||| j| j| j| j| jdS t|\}	}
}}}td|  ||
|| jfi d|d	d
ddd| jd| jddd|	d| jd| jdt|ddd| jdt|d| j d|d| j!d|}t"||S |S )a  
        Applies LoRA to a specific module within the model architecture.

        Args:
            m (nn.Module): The module to apply LoRA to.
            name (str, optional): Name of the module (if applicable). Defaults to None.
            prefix (str, optional): Prefix for the module name (if applicable). Defaults to None.

        Returns:
            nn.Module: The modified module with LoRA applied, or the original module if not a target.
        r   )ParallelLinearAdapterN_local_tensorrn   )r2   r3   r4   r6   r7   zAdding lora to: base_linear_name
activationidentity	norm_typecolumn_init_methodrow_init_methodgather_outputFinput_is_parallelr4   r5   model_parallel_configconfigr3   	is_expertr   disable_sequence_parallel_commr   base_linear_is_parallel)#nemo.collections.llm.peft.utilsr   matchrf   r   r:   r8   r9   _add_via_setattrro   r=   r?   rh   rn   r   rr   rs   rv   HAVE_TEr'   re   r2   r3   r4   r6   r7   r   r   infor   r5   r   r   r   r   )r   r   nameprefixr   ansr   	full_namelora_clsr   r-   r.   disable_sp_commr   r   r    r    r!   	transform  s   
	

zLoRA.transformNN)r#   r$   r%   r&   r   r~   r   str__annotations__r2   intr3   r4   floatr5   r   r6   r   r   boolr7   rS   r>   r   r   Moduler   r    r    r    r!   rw   f  s   
 0rw   c                   @   s*   e Zd ZdZe ddejfddZdS )	LoRAMergea  
    Implements the LoRA weight merge for parameter-efficient fine-tuning.

    Example:
    --------
        >>> from nemo.collections.llm.peft.lora import LoRAMerge
        >>> lora_merge = LoRAMerge()
        >>> merged_model = lora_merge(trainer.strategy.megatron_parallel)
    Nr   c                 C   s   t |ts|S td|r|ndd |r|nd   |jj}|jj|jj |jj	j
|j |jjj
|j }|| }||jj_|S )a  
        Merges the LoRA adapter with the base model weights.

        Args:
            m (nn.Module): The module to apply LoRA merge to.
            name (str, optional): Name of the module to merge. Defaults to None.
            prefix (str, optional): Prefix for the module name. Defaults to None.

        Returns:
            nn.Module: The modified module with the LoRA adapter merged into the base model weights.
        zmerging  .)rf   r   r   r   to_wrapr=   r   r3   r2   
linear_outtor/   	linear_inr?   )r   r   r   r   base_weightlora_weightmerged_weightr    r    r!   r     s   
(
zLoRAMerge.transformr   )	r#   r$   r%   r&   rS   rb   r   r   r   r    r    r    r!   r     s    
r   r`   )&rW   dataclassesr   r   typingr   r   rS   nemo.utils.import_utilsr   cudais_availabler   HAVE_BNBtorch.nn.functionalr   rr   ri   r	   r9   r   (nemo.collections.llm.peft.module_matcherr   r   r   r   %nemo.lightning.pytorch.callbacks.peftr   r   
nemo.utilsr   r   r:   r'   re   rv   rw   r   r    r    r    r!   <module>   s@   
sz
? 