o
    wi(.                  "   @   sr  d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 ddl
mZ ddlm  mZ ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' e%d\Z(Z)e)r}ddl*m+Z+ e%d\Z(Z,e%d\Z(Z-e)re,re-rddl.m/Z/ erddl
mZ0 ddl1m2Z2 ddl3m4Z4 ddgZ5de	ej6ej7f de	ej6ej7f fddZ8dej9de:ddfddZ;	 	 			 	 	 	!		"			d;d#e:d$e<d%e<d&e<dB d'e<dB d(e<d)e<d*e<d+e=d,e:dB d-e=d.e>dB d/e>dB d0e>dB de?e	ej@ejAf ejBf fd1dZC	d<dejDdee: d2ed3 ddeB fd4d5ZEdd6de:d7d8fd9d:ZFdS )=z>Utility functions for loading models with modelopt layer spec.    )nullcontext)partial)TYPE_CHECKINGCallableOptionalUnionN)_WrappingCheckpointIO)StrictHandling)	lightning)llm) _setup_trainer_and_restore_model)ckpt_to_context_subdir)ckpt_to_weights_subdir)logging)safe_import)unwrap_modeltransformer_engine)get_gpt_modelopt_spec	mamba_ssmcausal_conv1d)get_mamba_stack_modelopt_spec)CheckpointIO)MegatronParallel#set_modelopt_spec_if_exists_in_ckpt2setup_trainer_and_restore_model_with_modelopt_spec	model_cfgreturnc                 C   sX   t d t| tjrttdd| _| S t| tjr#tt	dd| _
| S tdt|  )a  
    Set the model layer spec to a modelopt spec variant. This function updates the model
    config with the appropriate modelopt layer specification based on the model type.

    Args:
        model_cfg (Union[llm.GPTConfig, llm.SSMConfig]): The model config.

    Returns:
        Union[llm.GPTConfig, llm.SSMConfig]: The model config updated for the modelopt layer specification.
    z<Setting model layer specification to the modelopt layer specT)remap_te_layernormz1No modelopt layer spec supported for config type )r   info
isinstancer   	GPTConfigr   r   transformer_layer_spec	SSMConfigr   mamba_stack_spec
ValueErrortype)r    r&   f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/modelopt/model_utils.py_set_gpt_mamba_modelopt_spec9   s   
r(   modelpathc                 C   st   t |d}t|ddd }| rt| drdS t| tjtjfr.t	| j
 d| j
_dS tt|  d dS )zdSet model.config.transformer_layer_spec to modelopt spec if modelopt_state exists in the checkpoint.znemo://F	is_savingmodelopt_statemoduleNzI is neither a GPTModel nor MambaModel. Modelopt state will not be loaded.)strremoveprefixr   existshasattrr   r   GPTModel
MambaModelr(   configgradient_accumulation_fusionr   warningr%   )r)   r*   modelopt_state_pathr&   r&   r'   r   Q   s   
   TF
model_pathtensor_model_parallel_sizepipeline_model_parallel_size"num_layers_in_first_pipeline_stage!num_layers_in_last_pipeline_stageexpert_model_parallel_sizedevices	num_nodesinference_onlytokenizer_pathlegacy_ckptstrategy_kwargstrainer_kwargsmodel_config_overridesc                 C   s  |du ri }|du ri }|du ri }t d|  d |r6tjd|||tjdddd|
r.tjndd	|}ntjd|||tj|
rCtjndd|}tjd||d|tj	d	tjdd
dd|}tj
jt| dd}t|j | D ]\}}t d| d|  t|j|| qs|r|`|r||j_|r||j_d}|	rddlm} ||	}t| ||| d|j_t d| d ||fS )a  Loads a GPT model from a NeMo 2.0 checkpoint using modelopt layer spec.

    Args:
        model_path (str): Path to the NeMo checkpoint.
        tensor_model_parallel_size (int): Size of the tensor model parallelism.
        pipeline_model_parallel_size (int): Size of the pipeline model parallelism.
        num_layers_in_first_pipeline_stage (int): Number of layers in the first pipeline stage.
        num_layers_in_last_pipeline_stage (int): Number of layers in the last pipeline stage.
        devices (int): Number of devices on each node.
        num_nodes (int): Number of nodes being used.
        inference_only (bool): If True, loads the model for inference only w/o initializing the optimizer.
        tokenizer_path (Optional[str]): Path to the tokenizer if not using model's tokenizer.
        legacy_ckpt (bool): If True, allow loading ckpt saved with older version of TE.
        strategy_kwargs (Optional[dict]): Additional keyword arguments for nl.MegatronStrategy.
        trainer_kwargs (Optional[dict]): Additional keyword arguments for nl.Trainer.
        model_config_overrides (Optional[dict]): keyword arguments to override model config.

    Returns:
        Union[llm.GPTModel, llm.MambaModel]: The loaded model with the specified configuration.
    NzLoading model from z with modelopt layer spec...Fpytorch)	r;   r<   r?   pipeline_dtypeckpt_load_optimizerckpt_parallel_save_optimsetup_optimizersddpckpt_load_strictness)r;   r<   r?   rI   rN   gpuz
bf16-mixedT)	precisionparams_dtypeautocast_enabledgrad_reduce_in_fp32)r@   rA   acceleratorstrategypluginsr)   )r*   subpathzOverriding model.config.z to r   )get_tokenizerzLoaded model: 
r&   )r   r   nlMegatronStrategytorchbfloat16r	   LOG_ALLTrainerMegatronMixedPrecisionioload_contextr   r(   r5   itemssetattroptimr=   r>   3nemo.collections.nlp.modules.common.tokenizer_utilsrX   r   rU   restore_config)r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rU   trainerr)   kv	tokenizerrX   r&   r&   r'   r   a   sx   $
	

rh   z
pl.Trainerc                 C   s   |s|du rdS t |jjd|j}|sdS t| }tj|r&t	d dS t
|dr/| nt  tjj|gt|dddd W d   n1 sKw   Y  tj|r[t	d	 t
|d
rc|jS dS )z
    Restore ModelOpt state from checkpoint.

    Args:
        model (nn.Module): The model to restore the state to.
        path (str): The path to the checkpoint.
        trainer (pl.Trainer): The trainer object, in case path not provided.
    Nr*   zAModel Optimizer state already restored from checkpoint. Skipping.hide_teacher_modelFr+   zmodule.)prefixz/Restored Model Optimizer state from checkpoint.hide_loss_modules)getattrrU   rg   	ckpt_pathr   mtoModeloptStateManageris_convertedr   r   r2   rl   r   rV   restore_sharded_modelopt_stater   rn   )r)   r*   rh   
core_modelr&   r&   r'   restore_modelopt_state   s,   



rv   r   checkpoint_ior   c                 C   s   t | }tj|sdS t|tr|jn|}t|ddr't	d |j
dd t|dr0| nt  tjj|gt|dd|jd	 W d   n1 sMw   Y  td
 dS )a  
    Save ModelOpt state to checkpoint.

    Args:
        model (nn.Module): The MegatronParallel model to save the state from.
        path (str): The path to the checkpoint.
        checkpoint_io (CheckpointIO): The checkpoint IO object from MegatronStrategy.
    N
async_saveFzCModel-Optimizer library in use. Async checkpoint saving is blocked.T)blockingrl   r+   )sharded_strategyz,Saved Model-Optimizer state into checkpoint.)r   rq   rr   rs   r   r   rw   ro   r   r7   maybe_finalize_save_checkpointr2   rl   r   rV   save_sharded_modelopt_stater   save_sharded_strategyr   )r)   r*   rw   ru   ckpt_ior&   r&   r'   save_modelopt_state   s   


r   )r9   r9   NNr9   r9   r9   TNFNNN)NN)G__doc__
contextlibr   	functoolsr   typingr   r   r   r   lightning.pytorchrH   Lmodelopt.torch.optr\   optrq   torch.nnnn$lightning.pytorch.plugins.io.wrapperr   +megatron.core.dist_checkpointing.validationr	   nemor
   rZ   nemo.collectionsr   #nemo.collections.llm.inference.baser   nemo.lightning.ckpt_utilsr   nemo.lightning.io.plr   
nemo.utilsr   nemo.utils.import_utilsr   nemo.utils.model_utilsr   _HAVE_TE4megatron.core.post_training.modelopt.gpt.model_specsr   HAVE_MAMBA_SSMHAVE_CAUSAL_CONV1D6megatron.core.post_training.modelopt.mamba.model_specsr   pllightning.fabric.pluginsr    nemo.lightning.megatron_parallelr   __all__r    r"   r(   LightningModuler/   r   intbooldicttupler3   r4   r_   r   Modulerv   r   r&   r&   r&   r'   <module>   s   
	

m
(