from typing import Callable, List, Optional

import lightning.pytorch as pl

try:
    from megatron.core.distributed import finalize_model_grads
    from megatron.core.optimizer import OptimizerConfig
    from megatron.core.utils import get_model_config

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    OptimizerConfig = object
    HAVE_MEGATRON_CORE = False

from torch.optim import Optimizer

from nemo.lightning._strategy_lib import setup_megatron_optimizer
from nemo.lightning.megatron_parallel import MegatronParallel
from nemo.lightning.pytorch.optim.base import LRSchedulerModule, OptimizerModule


class MegatronOptimizerModule(OptimizerModule):
    """An OptimizerModule for the Megatron optimizers.

    Attributes:
        config (OptimizerConfig): Configuration for the optimizer.
        no_weight_decay_cond (Optional[Callable]): Condition for no weight decay.
        scale_lr_cond (Optional[Callable]): Condition for scaling learning rate.
        lr_mult (float): Learning rate multiplier.

    Example::

        config = OptimizerConfig(...)
        lr_scheduler = MyLRSchedulerModule(...)
        optimizer_module = MegatronOptimizerModule(config, lr_scheduler)

    Methods:
        on_fit_start(trainer, pl_module): Attaches finalize_model_grads to the model config.
        optimizers(model): Defines the optimizers.
    """

    def __init__(
        self,
        config: OptimizerConfig,
        lr_scheduler: Optional[LRSchedulerModule] = None,
        no_weight_decay_cond: Optional[Callable] = None,
        scale_lr_cond: Optional[Callable] = None,
        lr_mult: float = 1.0,
    ):
        """Initializes the MegatronOptimizerModule.

        Args:
            config (OptimizerConfig): Configuration for the optimizer.
            lr_scheduler (Optional[LRSchedulerModule]): The learning rate scheduler module.
            no_weight_decay_cond (Optional[Callable]): Condition for no weight decay.
            scale_lr_cond (Optional[Callable]): Condition for scaling learning rate.
            lr_mult (float): Learning rate multiplier.
        """
        super().__init__(lr_scheduler=lr_scheduler)
        self.config = config
        self.no_weight_decay_cond = no_weight_decay_cond
        self.scale_lr_cond = scale_lr_cond
        self.lr_mult = lr_mult

    def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"):
        """We will add the finalize_model_grads function to the model config.

        Args:
            trainer (pl.Trainer): The trainer running the fit.
            pl_module (pl.LightningModule): The model for which the optimizer is being set up.
        """

        def finalize_model_grads_func(*args, **kwargs):
            return self.finalize_model_grads(*args, **kwargs)

        get_model_config(pl_module).finalize_model_grads_func = finalize_model_grads_func

    def optimizers(self, model: MegatronParallel) -> List[Optimizer]:
        """Defines the optimizers.

        Args:
            model (MegatronParallel): The model for which the optimizers are being defined.

        Returns:
            List[Optimizer]: The list of optimizers.

        Raises:
            ValueError: If the model is not an instance of MegatronParallel.
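
        Example::

            # A minimal sketch; assumes ``model`` is the MegatronParallel
            # instance built by NeMo's MegatronStrategy during trainer setup.
            optimizer_module = MegatronOptimizerModule(config=OptimizerConfig(...))
            [optimizer] = optimizer_module.optimizers(model)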
        """
        if not isinstance(model, MegatronParallel):
            raise ValueError("Model must be an instance of MegatronParallel")

        optimizer = setup_megatron_optimizer(
            model,
            self.config,
            no_weight_decay_cond=self.no_weight_decay_cond,
            scale_lr_cond=self.scale_lr_cond,
            lr_mult=self.lr_mult,
        )

        return [optimizer]

    def finalize_model_grads(self, *args, **kwargs):
        """Return function to finalize the model gradients."""
        return finalize_model_grads(*args, **kwargs)
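

# Usage sketch (illustrative only, not part of the module): wiring this optimizer
# module into a NeMo 2.0 training setup. The OptimizerConfig field values below
# are assumptions for the example; adjust them to your model and hardware.
#
#     from megatron.core.optimizer import OptimizerConfig
#     from nemo.lightning.pytorch.optim import MegatronOptimizerModule
#
#     opt_config = OptimizerConfig(
#         optimizer="adam",
#         lr=3e-4,
#         weight_decay=0.1,
#         use_distributed_optimizer=True,
#     )
#     optim = MegatronOptimizerModule(config=opt_config)
#     # The module is then handed to a NeMo model/recipe (e.g. via its ``optim``
#     # argument), which calls ``optimizers(...)`` once the MegatronParallel
#     # wrapper exists.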