o
    }oit                     @   sp  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZmZmZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZm Z m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 e
rd dl4m5Z5 G dd de#ee*Z6G dd dej7Z8G dd dee3Z9dS )    N)ABCabstractmethod)partial)Path)TYPE_CHECKINGAnyCallableDictOptionalTuple)_PATH)_WrappingCheckpointIO)	TrainerFn)override)ADAPTER_META_FILENAMEHF_ADAPTER_CONFIG_FILENAMEHF_ADAPTER_PATH)IOMixin)ckpt_to_dirckpt_to_weights_subdir)MegatronParallel)ModelTransform)MegatronOptimizerModule)get_automodel_from_traineris_trainer_attached)logging)AsyncCompatibleCheckpointIO)ShardedStateDictc                       s   e Zd ZdZedddZdejdejfddZdejddfd	d
Z	dd Z
dejdejdeddf fddZdejddfddZ fddZdd ZdedefddZ  ZS )PEFTa  Abstract base class for Parameter-Efficient Fine-Tuning (PEFT) methods.

    This class defines the interface for PEFT methods, which are used to fine-tune
    large language models efficiently by modifying only a small subset of the model's
    parameters.

    Example:
        class MyPEFT(PEFT):
            def transform(self, module, name=None, prefix=None):
                # Implement the transform logic
                pass


        peft = MyPEFT()
        peft_model = LargeLanguageModel(model_transform=peft)
    Nc                 C   s   t d)aY  Transform a single module according to the PEFT method.

        This method is called for each module in the model during the PEFT application process.
        It should be implemented by subclasses to define how individual modules are transformed
        for the specific PEFT technique.

        Args:
            module (nn.Module): The individual module to be transformed.
            name (Optional[str]): The name of the module within the model structure. Defaults to None.
            prefix (Optional[str]): A prefix to be added to the module name, typically used for
                                    nested modules. Defaults to None.

        Returns:
            nn.Module: The transformed module. This can be the original module with modifications,
                       a new module replacing the original, or the original module if no
                       transformation is needed for this specific module.

        Note:
            This method is automatically called for each module in the model when the PEFT
            instance is applied to the model using the __call__ method.
        z9The transform method should be implemented by subclasses.)NotImplementedError)selfmodulenameprefix r$   Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/pytorch/callbacks/peft.py	transform>   s   zPEFT.transformmodelreturnc                 C   s   |  | t|trt|dkr|D ]}|| j qnt|tjjj	j
r-|j| j n|| j t|rD|jjjtjkrD|  | |S )a^  Apply the PEFT method to the entire model.

        This method freezes the model parameters and walks through the model
        structure, applying the transform method to each module.

        Args:
            model (nn.Module): The model to be fine-tuned.

        Returns:
            nn.Module: The transformed model with PEFT applied.
           )freeze_model
isinstancer   lenwalkr&   torchnnparalleldistributedDistributedDataParallelr!   r   trainerstatefnr   FITTINGr    r'   model_chunkr$   r$   r%   __call__W   s   

zPEFT.__call__c                 C   s~   t |trt|dkr|D ]}|  qt |tjjjjr#|j	  n|  t
|r;|jjjtjkr=|jdd dS dS dS )a  Apply a default freeze method to the model.

        This method freezes all the model parameters. This method can be overridden by subclasses to
        implement custom freeze strategies (e.g. freeze only parts of the model)

        Args:
            model (nn.Module): The model to be fine-tuned.

        Returns:
            nn.Module: The transformed model with PEFT applied.
        r)   T)modeN)r+   r   r,   freezer.   r/   r0   r1   r2   r!   r   r3   r4   r5   r   r6   trainr7   r$   r$   r%   r*   r   s   
zPEFT.freeze_modelc                 C   s   t t| dS )z
        This is a helper function to return a partial function that wraps the checkpoint I/O with the PEFT adapter.
        Can be overridden in each PEFT method class.
        )peft)r   WrappedAdapterIOr    r$   r$   r%   get_wrappped_io   s   zPEFT.get_wrappped_ior3   	pl_modulestagec           	         s$  ddl m} ddlm} t j ||d dt|jv | _  j	_
|  }d| _d| _| durQdd	d
}dd  j_dd  j_ j	j| _d	| _dd  j	_ng d} fddt fdd|D }|dd|i| j	_t j	ddr| j	jjn j	j| _d j	_| jrd	 j	_d j	_dS )zPTL callback setup function.r   )create_checkpoint_io)r   )rB   HFAutoModelNFhuggingfaceT)model_librarylorac                   S      dS NTr$   r$   r$   r$   r%   <lambda>       zPEFT.setup.<locals>.<lambda>c                   S   rH   rI   r$   r$   r$   r$   r%   rJ      rK   c                 S   rH   rI   r$   xr$   r$   r%   rJ      rK   )save_ckpt_format
async_savetorch_dist_multiprocassume_constant_structureparallel_saveparallel_save_within_dpparallel_loadload_directly_on_devicec                    s   i | ]	}|t  j|qS r$   )getattrstrategy).0argr3   r$   r%   
<dictcomp>   s    zPEFT.setup.<locals>.<dictcomp>c                    s   t  j| S N)hasattrrW   rL   rZ   r$   r%   rJ          wrapping_ckpt_iorO   r$   )'nemo.lightning.pytorch.strategies.utilsrC   nemo.lightning.pytorch.utilsr   supersetuptype__name___add_via_setattrrW   r3   r@   automodel_setup_optimizerstransform_already_applied_checkpoint_connectorrestore_training_staterestore_modelsetup_optimizersfilter_checkpoint_iorV   
wrapped_io_init_model_parallel_setup_optimizers)	r    r3   rA   rB   rC   r   ro   ckpt_io_kwargsckpt_io_kwarg_names	__class__rZ   r%   rc      s8   



z
PEFT.setupc                 C   sj   t dd |j D | _|j D ] \}}t|dr2| D ]\}}|dur1| j|d |  qqdS )z
        Set params to be saved for PEFT. This function is called in apply_transform.
        Can be overridden in each PEFT method class.
        c                 s   s    | ]
\}}|j r|V  qd S r\   )requires_grad)rX   r"   paramr$   r$   r%   	<genexpr>   s    

z*PEFT.set_params_to_save.<locals>.<genexpr>track_running_statsN.)setlightning_modulenamed_parametersparams_to_savenamed_modulesr]   named_buffersadd)r    r3   module_namer!   buffer_namebufferr$   r$   r%   set_params_to_save   s   

zPEFT.set_params_to_savec                    s  t  ddst |  |  jjdur+t jjjd tkr+ 	| jjj
S t  dddks8 jdurL jdurJtd  | d _dS i } jjdurntd jj   fdd	|j  D |d
< t|jdr~td |j  |jjtjkrtd |j|  jjdur|j r|jjddg|d< |r jj jj|d}|jj|dd |jjtjkr|jj|dd |dd }durt|j |D ]
\}}|j!"| q|j#ddd D ]}t$|t%r|&||j'  dS qt(|du rt)d dS dS )a  
        This function does the following:
        1. Apply PEFT model transform.
        2. Set up model parallel and optimizer, which were skipped in setup
        3. Load weights and optimizer state dict
        4. Set up `finalize_model_grads` from mcore.
        rh   FNTzSetting up optimizerszLoading adapters from c                    s    i | ]\}}  |r||qS r$   )adapter_key_filterrX   kvr?   r$   r%   r[      s
    z(PEFT.apply_transform.<locals>.<dictcomp>
state_dictinit_model_parallelzInitializing model parallel)
is_loading	optimizersharded_state_dictstrict)selective_restorelr_schedulerszmMegatronOptimizerModule not found in trainer callbacks. finalize_model_grads is not properly set up for PEFT.)*rV   rb   apply_transformr   ro   adapter_ckpt_pathr   partsr   restore_automodelparentrg   r   infor'   r   itemsr]   rW   r   r4   r5   r   r6   rl   should_restore_optimizer_statesoptimizer_sharded_state_dictload_checkpointload_model_state_dictload_optimizer_state_dictgetziplr_scheduler_configs	schedulerload_state_dict	callbacksr+   r   on_fit_startr|   r   warning)r    r3   adapter_sharded_state_dictadapter_stater   config	lrs_statecbrt   r?   r%   r      s^   	









zPEFT.apply_transformc           
         s   ddd | j |}|j }|d  D ]}||v s$J || fqddlm |jjd fdd|d 	 D id	d
 |j
 D ]\}}|||d v  qF|jjtjkr| jdusdJ d| | |j| |dd }durt|j|D ]\}}	|j|	 qdS dS dS )z5restores automodel's adapter and optimizer state dictr'   c                 S   s,   |  d}|d |ksJ d|dd S )z0helper function to remove first "model" from fqnrz   r   r)   N)splitjoin)fqnr#   r   r$   r$   r%   pop_fqn_prefix  s   
z.PEFT.restore_automodel.<locals>.pop_fqn_prefixr   r   )to_cpuc                    s   i | ]\}} ||qS r$   r$   r   r   r   r$   r%   r[   -  s    z*PEFT.restore_automodel.<locals>.<dictcomp>Fr   Nz/Expected automodel_setup_optimizers to be validr   )r'   )ro   r   r|   r   keysr`   r   rW   r   r   r}   requires_grad_r4   r5   r   r6   rg   r   r   r   r   r   r   )
r    r3   pathr   r   keyrw   r   r   r   r$   r   r%   r     s,   


zPEFT.restore_automodelr   c                 C   s0   t |tr
|d jS || jv pd|v p|dS )z
        Given a key in the state dict, return whether the key is an adapter (or base model).
        This function can be subclassed in each PEFT method class.
        r)   z	.adapter.z	.adapters)r+   tuplerv   r~   endswith)r    r   r$   r$   r%   r   @  s   

zPEFT.adapter_key_filterNN)re   
__module____qualname____doc__r   r&   r/   Moduler9   r*   r@   plTrainerLightningModulestrrc   r   r   r   boolr   __classcell__r$   r$   rt   r%   r   ,   s    "4I&r   c                	       st   e Zd ZdZdejdejf fddZdd ZdddZ				dde	de
e
eeef  dee ddfddZ  ZS )AdapterWrappera  Abstract base class for wrapping modules with adapters in Parameter-Efficient Fine-Tuning (PEFT).

    This class wraps a module and its associated adapter, providing methods for
    managing the state dictionaries of both the main module and the adapter. It does not
    implement the forward method, which must be implemented by concrete subclasses.

    Attributes:
        to_wrap (nn.Module): The main module to be wrapped.
        adapter (nn.Module): The adapter module to be applied.

    Note:
        This class is abstract and cannot be instantiated directly. Subclasses must
        implement the forward method.

    Example:
        class LoRALinear(AdapterWrapper):
            def __init__(self, to_wrap, adapter):
                super().__init__(to_wrap, adapter)

            def forward(self, x):
                return self.to_wrap(x) + self.adapter(x)

        main_module = nn.Linear(100, 100)
        adapter = nn.Linear(100, 100)
        parallel_adapter = LoRALinear(main_module, adapter)
    to_wrapadapterc                    s   t t|   || _|| _d S r\   )rb   r   __init__r   r   )r    r   r   rt   r$   r%   r   f  s   
zAdapterWrapper.__init__c                 O   s   | j |g|R i |}t|tsJ | j  d| 	 d}|}t|dkr:|\}}t|tr9t|dkr9|\}}nt|dkrE|\}}}|||fS )a_  
        Run the forward method of the linear module `to_wrap`.
        Return a tuple of three elements: linear_output, bias, layernorm_output

        x -> [layernorm/identity] -> layernorm_output -> [linear] -> linear_output, bias

        layernorm_output is different from input x only when linear layer is LayerNormColumnParallelLinear.
        z+ should return a tuple but instead returns N      )r   r+   r   r,   )r    rM   argskwargslinear_outputbiaslayernorm_outputr$   r$   r%   base_linear_forwardk  s"   	

z"AdapterWrapper.base_linear_forwardN Fc                 C   s:   |du ri }| j j|||d | jj|| d|d |S )a  Retrieve the state dictionary of the wrapped module and adapter.

        This method overrides the default state_dict behavior to include both
        the main module's state and the adapter's state under a special 'adapters' key.

        Args:
            destination (Optional[dict]): A dictionary to store the state. If None, a new
                                          dictionary is created. Defaults to None.
            prefix (str): A prefix added to parameter and buffer names. Defaults to ''.
            keep_vars (bool): If True, returns variables instead of tensor values.
                              Defaults to False.

        Returns:
            dict: The state dictionary containing both the main module and adapter states.
        N)destinationr#   	keep_varsadapter.)r   r   r   )r    r   r#   r   r$   r$   r%   r     s
   zAdapterWrapper.state_dictr$   r#   sharded_offsetsmetadatar(   r   c                 C   s:   i }| | j||| | | j| d|| |S )a  Retrieve the sharded state dictionary of the wrapped module and adapter.

        This method is used for distributed checkpointing, combining the sharded states
        of both the main module and the adapter.

        Args:
            prefix (str): A prefix added to parameter and buffer names. Defaults to ''.
            sharded_offsets (Tuple[Tuple[int, int, int]]): Offsets for sharded parameters.
                                                           Defaults to an empty tuple.
            metadata (Optional[dict]): Additional metadata for the sharded state.
                                       Defaults to None.

        Returns:
            ShardedStateDict: The combined sharded state dictionary.
        r   )updater   r   r   )r    r#   r   r   r   r$   r$   r%   r     s   z!AdapterWrapper.sharded_state_dict)Nr   F)r   r$   N)re   r   r   r   r/   r   r   r   r   r   r   intr
   dictr   r   r$   r$   rt   r%   r   J  s"    
r   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee	 ed< 	dded dee ddf fd	d
Zeddeeef dedee ddfddZdd Ze			ddedee ded eB deeef fddZ  ZS )r>   a  
    A wrapper class for checkpoint I/O operations, specifically designed for PEFT (Parameter-Efficient Fine-Tuning).

    This class handles the complexities of saving and loading checkpoints for both initial PEFT training and resuming
    PEFT training. It ensures that only the necessary adapter weights are saved and loaded, while also preserving the
    base model weights.

    **Usage:**

    1. **Initial PEFT Training:**
       - The class handles the saving of only adapter weights.
       - Metadata about the base model checkpoint is stored for future reference.

    2. **PEFT Resume:**
       - The class loads both base model and adapter weights.
       - The previously stored metadata is used to locate the correct base model checkpoint.

    **Attributes:**

    - `peft`: The PEFT instance associated with the wrapped checkpoint I/O.
    - `model_ckpt_path`: The path to the base model checkpoint.
    - `adapter_ckpt_path`: The path to the adapter checkpoint.
    Note that the paths are set by save/load functions and users do not need to set them.

    **Methods:**

    - `save_checkpoint`: Saves the adapter weights and metadata to the specified path.
    - `load_checkpoint`: Loads the base model and adapter weights based on the specified path and metadata.
    Nr=   model_ckpt_pathr   checkpoint_ioCheckpointIOr(   c                    s   || _ t | d S r\   )r=   rb   r   )r    r   r=   rt   r$   r%   r     s   zWrappedAdapterIO.__init__
checkpointr   storage_optionsc                    sX   j d usJ d }dD ]
}||v r|} nq|d usJ d||v s&J d||}tt fdd| ||< t||  } j j|||d}ddlm	}	 |	 rt
|d	d
}
ddlm} t j |r| |}|
jt }|jd	d	d |t }n|
jd	d	d dt ji}|
t }t|d}t|| W d    |S 1 sw   Y  |S )N)r   r   zCExpected checkpoint to contain `sharded_state_dict` or `state_dict`z&Expected state_key to be in checkpointc                    s    j | d S )Nr   )r=   r   )itemr?   r$   r%   rJ     s    z2WrappedAdapterIO.save_checkpoint.<locals>.<lambda>)r   r   )is_global_rank_zeroT)	is_saving)HFCheckpointIO)parentsexist_okr   w)r   popr   rm   r   listr   save_checkpointnemo.utils.get_rankr   r   nemo.lightning.io.hfr   r+   _create_lora_hf_configr   r   mkdirr   r   r   r   openjsondump)r    r   r   r   	state_keyr   r   	ckpt_keysrequestr   base_dirr   r   hf_adapter_baseadapter_meta_pathfr$   r?   r%   r     s>   




z WrappedAdapterIO.save_checkpointc                 C   sx   dd }ddl m} ddlm} ||| j j}|| j j|| j j| j jt| j |d}|	 }d|d< d	|d
< ||d< |S )z0Creates a HF lora config from a NeMo Lora configc                 S   st   t tdd |}t|dkr|S t tdd |}dtdd |}t|}| D ]}|t|| q*t |S )a  
            Extracts module names from a list of checkpoint keys that match the target modules.

            This function processes a list of target module patterns, where each pattern may or may
            not contain a wildcard (`'*'`). The function matches these patterns against the
            checkpoint keys, with the following behavior:
            - Patterns containing '*' will be expanded to match any sequence of characters
              except a dot (`.`).
            - Patterns without '*' are matched literally.

            Args:
                ckpt_keys (list of str): A list of strings representing checkpoint keys to be
                    searched.
                target_modules (list of str): A list of target module patterns. Some patterns may
                    contain wildcards (`'*'`), which match any characters except a dot.

            Returns:
                list of str: A list of module names from `target_modules` that match any of the
                `ckpt_keys`. The result is returned as a list of unique module names.

            Example:
                ckpt_keys = [
                    "model.model.layers.27.self_attn.k_proj",
                    "model.model.layers.27.self_attn.v_proj",
                    "model.model.layers.27.self_attn.mlp"
                ]
                target_modules = ["*proj"]

                extract_matched_module_names(ckpt_keys, target_modules)
                # Output: ['k_proj', 'v_proj']

            Notes:
                - This function uses regular expressions to match the target patterns in the
                  checkpoint keys.
                - Wildcards are expanded as `[^.]+` to ensure that the match doesn't cross dot
                  (`.`) boundaries.
            c                 S   s   d| v S N*r$   rL   r$   r$   r%   rJ   6      z_WrappedAdapterIO._create_lora_hf_config.<locals>.extract_matched_module_names.<locals>.<lambda>r   c                 S   s   d| vS r   r$   rL   r$   r$   r%   rJ   9  r   |c                 S   s   |  ddS )Nr   z[^.]+)replacerL   r$   r$   r%   rJ   ;  r^   )	r   rm   r,   r   mapr{   r   refindall)r   target_modulesre_target_modulesnon_re_target_modulescombined_patternansr   r$   r$   r%   extract_matched_module_names  s   &zMWrappedAdapterIO._create_lora_hf_config.<locals>.extract_matched_module_namesr   )
LoraConfig)DoRA)rr   
lora_alphalora_dropoutuse_doraLORA	peft_typeNmegatron_corer   )
r=   r  nemo.collections.llm.peftr  r   dimalphadropoutr+   to_dict)r    r   r  r  r  materialized_module_nameslora_configr$   r$   r%   r     s    2
z'WrappedAdapterIO._create_lora_hf_configmap_locationr   StrictHandlingc                 C   s  | j dusJ d}t|}t|ddr)tt|| _| j j|i d}|j }| _n>|t	  }
 rVt|d}t|}	W d   n1 sFw   Y  t|	d | _|| _n|t t 
 rd|t | _n|| _| j ||||}
|dur|d |
d  |S |
S )a  
        =====================
        Initial PEFT Training
        =====================
        Initial PEFT training requires loading the base model weights. In this case, this function is called by
        trainer.strategy.setup() -> megatron_strategy.restore_model() -> megatron_strategy.load_checkpoint().
        `path = PosixPath(<base_path>)`, and sharded_state_dict contains only base model weights

        ===========
        PEFT Resume
        ===========
        PEFT resume requires loading two set of model weights, 1) base model weights and 2) adapter weights
        Base model weights could be imported from e.g. HF, and is frozen during PEFT training.
        Adapter weights contains the training metadata that will need to be loaded.
        As such, this function will be entered twice during PEFT training resume.

        For the FIRST TIME this function is called by trainer._checkpoint_connector._restore_modules_and_callbacks.
        `path = AdapterPath(<adapter_path>, base_model_path=<base_path>)`, and sharded_state_dict contains only base
        model weights

        For the SECOND TIME this function is called by PEFT.apply_transform (above, in the same file).
        `path = PosixPath(<adapter_path>)`, and sharded_state_dict contains only adapter weights.
        Nbase_model_pathr   r  r   r   )r   r   rV   r   r   r   r   r  r   r   existsr   r   loadr   r   r   )r    r   r   r  r   adapter_ckptbaser   r   r   
model_ckptr$   r$   r%   r   U  s*    z WrappedAdapterIO.load_checkpointr   r\   )NNN)re   r   r   r   r=   r
   r   __annotations__r   r   r   r   r   r	   r   r   r   r   r   r   r   r   r   r$   r$   rt   r%   r>     s:   
 *$H

r>   ):r   r   abcr   r   	functoolsr   pathlibr   typingr   r   r   r	   r
   r   lightning.pytorchpytorchr   r.   torch.nnr/    lightning.fabric.utilities.typesr   $lightning.pytorch.plugins.io.wrapperr    lightning.pytorch.trainer.statesr   typing_extensionsr   nemo.lightning.ckpt_utilsr   r   r   nemo.lightning.io.mixinr   nemo.lightning.io.plr   r    nemo.lightning.megatron_parallelr   0nemo.lightning.pytorch.callbacks.model_transformr   %nemo.lightning.pytorch.optim.megatronr   ra   r   r   
nemo.utilsr   !nemo.utils.callbacks.dist_ckpt_ior   (megatron.core.dist_checkpointing.mappingr   r   r   r   r>   r$   r$   r$   r%   <module>   s:       u