o
    wiu"                     @   s   d dl mZmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ G dd deZG dd deZeG dd deeZdS )    )	dataclassfield)ListLiteralOptionalN)ShardedStateDict)(gather_from_tensor_model_parallel_region)"make_sharded_tensor_for_checkpoint%make_tp_sharded_tensor_for_checkpoint)nn)ModuleMatcher)ParallelLinearAdapter"get_adapter_attributes_from_linear)PEFTAdapterWrapper)loggingc                	       sJ   e Zd ZdZdd Zdd Z	dd	ed
edee	 de
f fddZ  ZS )ParallelLinearDoRAAdapterzT
    Adapter class for DoRA to handle the additional weight_magnitude parameter
    c                 C   s   t j|dd| _dS )zl
        Initialize weight_magnitude with shape (d,), where d is the output dim of the linear layer
        T)requires_gradN)r   	Parameterweight_magnitude)selfvalue r   [/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/peft/dora.pyinit_weight_magnitude#   s   z/ParallelLinearDoRAAdapter.init_weight_magnitudec                 C   s   | j S )zG
        Public function to get the weight magnitude parameter
        )r   r   r   r   r   get_weight_magnitude)   s   z.ParallelLinearDoRAAdapter.get_weight_magnitude r   Nprefixsharded_offsetsmetadatareturnc                    sP   t  |||}| d}| jrt| j||d}n	t| j|d|d}|||< |S )z
        Sharded state dict implementation for DoRA adapter.
        Weight magnitude is TP sharded for linear_qkv and linear_fc1 only.
        r   )prepend_offsetsr   )supersharded_state_dictinput_is_parallelr	   r   r
   )r   r   r   r    r$   magnitude_keymagnitude_sharded_tensor	__class__r   r   r$   /   s   

z,ParallelLinearDoRAAdapter.sharded_state_dict)r   r   N)__name__
__module____qualname____doc__r   r   strtupler   dictr   r$   __classcell__r   r   r(   r   r      s    r   c                       s<   e Zd ZdZdejdef fddZdd Zdd	 Z	  Z
S )

DoRALinearz
    An adapter wrapper that is designed to be used with DoRA
    It extends the AdapterWrapper class to provide a specific implementation of the forward method.
    to_wrapadapterc                    s4   t  || |  |j|j | _| j|   d S )N)r#   __init__alphadimscalingr4   r   _get_weight_norm)r   r3   r4   r(   r   r   r5   N   s   zDoRALinear.__init__c                 C   st   | j jrt| j jjjj}| j jj}n| j jj}t| j jjjj}| jj| j| |  }t	j
j|dd|j S )N   )r7   )r4   r%   r   
linear_outweightT	linear_inr3   r8   torchlinalgnormtodtypedetach)r   linear_out_weightlinear_in_weightr<   r   r   r   r9   T   s   
zDoRALinear._get_weight_normc                 C   s   |  |\}}}| | }| j |   ddd}| jjdu s&| js)d}n|d |  | j|| d  }||||j	  | |fS )a*  
        Forward method for DoRA

          mag_norm_scale * (linear_output + adapter_output)
        = ||W_0 + B_0 A_0|| / ||W_0 + B A|| * (W_0 x + B A x)
        = ||W_0 + B_0 A_0|| ((W_0 + B A) / ||W_0 + B A||) x
        = m ((W_0 + B A) / ||W_0 + B A||) x
        = equation 5 in DoRA paper

        When dropout is used, equation becomes
          W_0 x + (m /||W_0 + B A|| - 1) W_0 dropout(x) + m /||W_0 + B A|| B A dropout(x)
        = ...
        = m /||W_0 + B A|| (W_0 x + B A dropout(x)) + (m /||W_0 + B A|| - 1) W_0 (dropout(x) - x)

        r:   Nr   )
base_linear_forwardr4   
contiguousr   r9   viewdropouttrainingreshapeshape)r   xlinear_outputbiaslayernorm_outputadapter_outputmag_norm_scaledropout_correctionr   r   r   forward_   s   
zDoRALinear.forward)r*   r+   r,   r-   r   Moduler   r5   r9   rV   r1   r   r   r(   r   r2   H   s
    r2   c                   @   s   e Zd ZU dZedd dZee ed< dZ	e
ed< dZe
ed	< d
Zeed< dZed ed< dZeed< dZeed< dd ZddejfddZdS )DoRAaK  
    Implements the DoRA (Weight-Decomposed LowRank Adaptation) module for parameter-efficient fine-tuning.

    DoRA decomposes pre-trained weight into magnitude and direction, and uses a low-rank projection in the
    directional component to adapt the weights of a pre-trained model to a new downstream task.
    This class facilitates the application of DoRA to specific modules within the model architecture.

    Args:
        See LoRA class for a detailed explanation of the arguments.

    Example:
    --------
        >>> from nemo.collections import llm
        >>> lora = llm.peft.DoRA(target_modules=['linear_qkv', 'linear_proj'], dim=32, alpha=64)
        >>> model = llm.Mistral7BModel(model_transform=lora)
        >>> # (set up trainer and data)
        >>> trainer.fit(model, data)

    References:
    -----------
        Shih-Yang Liu, Chien-Yi Wang, Hongxu Yin, Pavlo Molchanov, Yu-Chiang Frank Wang, Kwang-Ting Cheng,
        Min-Hung Chen (2024). DoRA: Weight-Decomposed Low-Rank Adaptation. arXiv preprint arXiv:2402.09353.
        https://arxiv.org/abs/2402.09353
    )
    c                   C   s   g dS )N)
linear_qkvlinear_proj
linear_fc1
linear_fc2r   r   r   r   r   <lambda>   s    zDoRA.<lambda>)default_factorytarget_modules    r7   @   r6   g        rK   pre)rb   postdropout_positionxavierlora_A_init_methodzerolora_B_init_methodc                 C   s   | j dks	J dd S )Nrb   z`DoRA only supports pre-adapter dropout at this time.Please set DoRA(..., dropout_position='pre'))rd   r   r   r   r   __post_init__   s   zDoRA.__post_init__Nmc                 C   s   |  ||| }durC|\}}t|\}}}	}
}td|  t||	| j|dd| j| jd|| j| j	t
|dd| j|
|d}t||S |S )a  
        Applies DoRA to a specific module within the model architecture.

        Args:
            m (nn.Module): The module to apply DoRA to.
            name (str, optional): Name of the module (if applicable). Defaults to None.
            prefix (str, optional): Prefix for the module name (if applicable). Defaults to None.

        Returns:
            nn.Module: The modified module with DoRA applied, or the original module if not a target.
        NzAdding DoRA to: identityFconfig)base_linear_name
activation	norm_typecolumn_init_methodrow_init_methodgather_outputr%   rK   rd   model_parallel_configr6   disable_sequence_parallel_commbase_linear_is_parallel)matchr   r   infor   r7   rf   rh   rK   rd   getattrr6   r2   )r   rj   namer   ansrv   	full_namer%   in_featuresout_featuresdisable_sp_commru   r4   r   r   r   	transform   s2   

zDoRA.transform)NN)r*   r+   r,   r-   r   r_   r   r.   __annotations__r7   intr6   rK   floatrd   r   rf   rh   ri   r   rW   r   r   r   r   r   rX      s   
 rX   )dataclassesr   r   typingr   r   r   r?   (megatron.core.dist_checkpointing.mappingr   megatron.core.tensor_parallelr   megatron.core.utilsr	   r
   r   (nemo.collections.llm.peft.module_matcherr   nemo.collections.llm.peft.utilsr   r   %nemo.lightning.pytorch.callbacks.peftr   r   
nemo.utilsr   r   r2   rX   r   r   r   r   <module>   s   *9