o
    }oim                 	   @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dlm
Z
 d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZ zd d	lmZmZ d d
lmZ dZ W n e!e"fyu   dZ Y nw d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZBmCZC dgZDe+jEddd dd G dd de$e2ZFdS )     )annotationsN)abstractmethodpath)Path)AnyCallableDictIteratorListOptionalTupleUnion)NeuralModule)import_multistorageclientis_multistorageclient_url)OptimizerConfigget_megatron_optimizer)get_model_configTF)LightningModuleTrainer)model_summaryrank_zero_only)
DictConfig	OmegaConf	open_dict)package_info)optim)Model)SaveRestoreConnector)McoreDistributedOptimizerprepare_lr_scheduler)loggingmodel_utils)AppState)register_debug_hooks)NeMoBaseException)get_rankis_global_rank_zeroModelPTmultiplyc                 C  s   | | S N )xyr,   r,   M/home/ubuntu/.local/lib/python3.10/site-packages/nemo/core/classes/modelPT.py<lambda>=   s    r0   )replacec                      s  e Zd ZdZddʇ fddZdddZdˇ fddZ	ddddZdddZdddZ	dddZ
dd!d"Z	#ddd'd(Zdd*d+Ze				,			ddd8d9Zedddd:dՇ fd?d@ZeddCdDZeddFdGZddIdJZddKdLZddMdNZddQdRZ		dddVdWZdXdY ZdZd[ Zd\d] Zddd_d`Zdadb Zdcdd Zdedf ZdddidjZ ddkdlZ!	mdddrdsZ"	mdddtduZ#dddvdwZ$dddxdyZ%ddzd{Z&e'ddd~dZ(d fddZ)e	,	ddddZ*dddZ+dddZ,dddZ-ddddZ.dddZ/e0dd Z1e0dd Z2e3j4j5dd Z4e2j6dd Z2e0 fddZ7e0dd Z8e8j6dd Z8e0dd Z9e9j6dd Z9e:dddZ;e:ddddZ<dd Z=edd Z>dd Z?dd Z@dd ZAddddZBddddZCdd ZDdd ZEddÄ ZFddń ZGdddȄZH  ZIS )r)   z;
    Interface for Pytorch-lightning based NeMo models
    Ncfgr   trainerr   c                   s  |durt |tstdt| dt   	 t }t|}t	|}d|v r.tdd|vrIt
|d d| jj| jj|_t
|d	 d
|vret| tj|_W d   n1 s`w   Y  || _t | _| d d| _d| _d| _d| _d| _d| _| | t  | _!| "  t#j$% rt#j$& durt#j$& |_'| jdur| ( sd| jv r| jj)dur| jj)*dds| +| jj) d| jv r| jj,dur| jj,*dds| j-|j,d d| jv r| jj.dur| jj.*dds| j/|j.d nNd| jv r| jj)durt01dt
2| jj)  d| jv r.| jj,dur.t01dt
2| jj,  d| jv rH| jj.durHt01dt
2| jj.  d| _3d| _4t5| j6| _6| 7  d| _8d| _9d| _:d| _;| <  d| _=dS )a.  
        Base class from which all NeMo models should inherit

        Args:
            cfg (DictConfig):  configuration object.
                The cfg object should have (optionally) the following sub-configs:

                * train_ds - to instantiate training dataset
                * validation_ds - to instantiate validation dataset
                * test_ds - to instantiate testing dataset
                * optim - to instantiate optimizer with learning rate scheduler

            trainer (Optional): Pytorch Lightning Trainer instance
        NzWtrainer constructor argument must be either None or lightning.pytorch.Trainer. But got z	 instead.modelz^Creating model config node is forbidden due to collision problem when loading from checkpoint.targetFz{0}.{1}Tnemo_versionr2   train_dsdefer_setupvalidation_dsval_data_configtest_dstest_data_configzIf you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
Train config : 
zIf you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
Validation config : 
zPlease call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).
Test config : 
)>
isinstancer   
ValueErrortypesuper__init__r$   r#   #convert_model_config_to_dict_configmaybe_update_config_versionr   
set_structformat	__class__
__module____name__r5   r   r   __version__r6   _cfgdict$_nemo_submodule_name_to_config_fieldsave_hyperparameters	_train_dl_validation_dl_test_dl_optimizer_param_groups
_optimizer
_schedulerset_trainerr   _save_restore_connector_set_model_guidtorchcudais_availablecurrent_device	device_id_is_model_being_restoredr7   getsetup_training_datar9   setup_multiple_validation_datar<   setup_multiple_test_datar"   warningto_yaml_validation_step_outputs_test_step_outputswrap_training_steptraining_step_setup_profiling_nsys_profile_started_nsys_profile_complete_memory_profile_started_memory_profile_complete_setup_chakra_profiling_chakra_profile_in_progress)selfr2   r3   	app_staterH   r,   r/   rC   E   s   










zModelPT.__init__returnNonec                 C  s   t  | _d S r+   )r   rW   )clsr,   r,   r/   __init_subclass__   s   zModelPT.__init_subclass__c                   s6   | j ddrt| j| j| j| j dd t  S )z'
        Register debug hooks.
        dump_debug_infoFdump_debug_info_to_file)r2   r_   r%   r4   r3   logrB   on_fit_startrp   rr   r,   r/   rz      s   
zModelPT.on_fit_startTconfig_pathstrsrcverify_src_existsboolc                 C  s|   |du s|dkr
|S t |jdkrtdt| dsi | _| jdu r%i | _|| j v r5td| d | j	| |||S )a+  Register model artifacts with this function. These artifacts (files) will be included inside .nemo file
        when model.save_to("mymodel.nemo") is called.

        How it works:

        1. It always returns existing absolute path which can be used during Model constructor call
            EXCEPTION: src is None or "" in which case nothing will be done and src will be returned
        2. It will add (config_path, model_utils.ArtifactItem()) pair to self.artifacts

            .. code-block::

                If "src" is local existing path:
                    then it will be returned in absolute path form.
                elif "src" starts with "nemo_file:unique_artifact_name":
                    .nemo will be untarred to a temporary folder location and an actual existing path will be returned
                else:
                    an error will be raised.

        WARNING: use .register_artifact calls in your models' constructors.
        The returned path is not guaranteed to exist after you have exited your model's constructor.

        Args:
            config_path (str): Artifact key. Usually corresponds to the model config.
            src (str): Path to artifact.
            verify_src_exists (bool): If set to False, then the artifact is optional and register_artifact will return
                                      None even if src is not found. Defaults to True.

        Returns:
            str: If src is not None or empty it always returns absolute path which is guaranteed to exist during model
                 instance life
        N z.nemoz|Registering .nemo files as artifacts not supported. If you are trying to make a nested model, use `register_nemo_submodule`.	artifactsz3You tried to register an artifact under config key=z4 but an artifact for it has already been registered.)
r   suffixr&   hasattrr   keysr"   rc   rW   register_artifact)rp   r|   r~   r   r,   r,   r/   r      s   &


zModelPT.register_artifactc                 C  s"   t | do| jduot| jdkS )z.Returns True if model has artifacts registeredr   Nr   )r   r   lenr{   r,   r,   r/   has_artifacts*  s   "zModelPT.has_artifactsc                 C  sD   |   D ]}t|trt|dr|jdurt|jdkr dS qdS )zHReturns True if it has artifacts or any of the submodules have artifactsr   Nr   TF)modulesr?   r)   r   r   r   )rp   moduler,   r,   r/   "has_native_or_submodules_artifacts.  s   
z*ModelPT.has_native_or_submodules_artifactsc                 C  s   t | jdkS )z5Returns True if it has any registered NeMo submodulesr   )r   rN   r{   r,   r,   r/   has_nemo_submodules:  s   zModelPT.has_nemo_submodulesnameconfig_fieldr4   	'ModelPT'c                 C  sJ   t |tstdt|j t| dstdt| || || j|< dS )aX  
        Adds a NeMo model as a submodule. Submodule can be accessed via the `name` attribute on the parent NeMo model
        this submodule was registered on (`self`).
        In the saving process, the whole parent model (self) is held as a solid model with artifacts
        from the child submodule, the submodule config will be saved to the `config_field` of the parent model.
        This method is necessary to create a nested model, e.g.

        .. code-block:: python

            class ParentModel(ModelPT):
                def __init__(self, cfg, trainer=None):
                    super().__init__(cfg=cfg, trainer=trainer)

                    # annotate type for autocompletion and type checking (optional)
                    self.child_model: Optional[ChildModel] = None
                    if cfg.get("child_model") is not None:
                        self.register_nemo_submodule(
                            name="child_model",
                            config_field="child_model",
                            model=ChildModel(self.cfg.child_model, trainer=trainer),
                        )
                    # ... other code

        Args:
            name: name of the attribute for the submodule
            config_field: field in config, where submodule config should be saved
            model: NeMo model, instance of ModelPT
        zBModel is not and instance of ModelPT, so can't be registered. Got rN   zYou are trying to register a submodule before the model is initialized. This is not allowed. Did you forget to call `super().__init__`?N)r?   r)   r&   rA   rJ   r   setattrrN   )rp   r   r   r4   r,   r,   r/   register_nemo_submodule>  s   

zModelPT.register_nemo_submoduler   prefix_nameprefix_config$Iterator[Tuple[str, str, 'ModelPT']]c                 c  s    t | ds
td||| fV  | j D ]2\}}|r"| d| n|}|r-| d| n|}t| |}|j||dD ]\}}	}
||	|
fV  q;qdS )a  
        Returns an iterator over all NeMo submodules recursively, yielding
        tuples of (attribute path, path in config, submodule), starting from the core module

        Args:
            prefix_name: prefix for the name path
            prefix_config: prefix for the path in config

        Returns:
            Iterator over (attribute path, path in config, submodule), starting from (prefix, self)
        rN   zModel is not fully initialized. Calling `named_nemo_modules` before __init__ not allowed. Did you forget to call `super().__init__`?.)r   r   N)r   r&   rN   itemsgetattrnamed_nemo_modules)rp   r   r   r   r   attribute_pathr|   r   submodule_namesubconfig_path	submoduler,   r,   r/   r   k  s    

zModelPT.named_nemo_modules	save_pathc                 C  s   ddd}t |st|  }t }|jdurE|jdkr)t| jtkr)t	dt
 r0|| tj r:tj  | j| t| dS t
 rW|| | j| t| dS dS )	a  
        Saves model instance (weights and configuration) into .nemo file
         You can use "restore_from" method to fully restore instance from .nemo file.

        .nemo file is an archive (tar.gz) with the following:
            model_config.yaml - model configuration in .yaml format. You can deserialize this into cfg argument for
                                model's constructor
            model_wights.ckpt - model checkpoint

        Args:
            save_path: Path to .nemo file where model instance should be saved
        r   Union[str, 'pathlib.Path']c                 S  s,   t | s| j s| jjdd d S d S d S )NT)parents)r   parentexistsmkdirr   r,   r,   r/   maybe_make_save_dir  s
   
z,ModelPT.save_to.<locals>.maybe_make_save_dirN   zDefault NeMo SaveRestoreConnector will not work in model parallel mode. You should use a connector which supports model parallel mode, such as NLPSaveRestoreConnector in NLP. You can also use a custom one.)r   r   )r   r   
expanduserresolver$   model_parallel_sizerA   rW   r   r@   r(   rY   distributedis_initializedbarriersave_tor}   )rp   r   r   rq   r,   r,   r/   r     s&   




zModelPT.save_toFrestore_pathoverride_config_pathOptional[Union[OmegaConf, str]]map_locationOptional[torch.device]strictreturn_configsave_restore_connectorr   Optional[Trainer]validate_access_integrityc	              
   C  s   |du rt  }t|rt }	|	jj|std| n'|jdu r-tjtj	|}ntjtj	|j}t|sDtd| t
 }
||
_| | | j| |||||||}t|trd||_|S )a  
        Restores model instance (weights and configuration) from .nemo file.

        Args:
            restore_path: path to .nemo file from which model should be instantiated
            override_config_path: path to a yaml config that will override the internal
                config file or an OmegaConf / DictConfig object representing the model config.
            map_location: Optional torch.device() to map the instantiated model to a device.
                By default (None), it will select a GPU if available, falling back to CPU otherwise.
            strict: Passed to load_state_dict. By default True.
            return_config: If set to true, will return just the underlying config of the restored
                model as an OmegaConf DictConfig object without instantiating the model.
            trainer: Optional, a pytorch lightning Trainer object that will be forwarded to the
                instantiated model's constructor.
            save_restore_connector (SaveRestoreConnector): Can be overridden to add custom save and restore logic.

            Example:
                ```
                model = nemo.collections.asr.models.EncDecCTCModel.restore_from('asr.nemo')
                assert isinstance(model, nemo.collections.asr.models.EncDecCTCModel)
                ```

        Returns:
            An instance of type cls or its underlying config (if return_config is set).
        NCan't find )r   r   r   osr   r   FileNotFoundErrormodel_extracted_dirabspathr   r$   model_restore_pathupdate_save_restore_connectorrW   restore_fromr?   r)   )ru   r   r   r   r   r   r   r3   r   mscrq   instancer,   r,   r/   r     s8   &




zModelPT.restore_from)r   hparams_filer   checkpoint_pathAOptional[Union[Dict[str, str], str, torch.device, int, Callable]]r   Optional[str]c             	     sN   d}z| j dd t j|||||d|}W | j dd |S | j dd w )z
        Loads ModelPT from checkpoint, with some maintenance of restoration.
        For documentation, please refer to LightningModule.load_from_checkpoint() documentation.
        NT)is_being_restored)r   r   r   r   F)_set_model_restore_staterB   load_from_checkpoint)ru   r   r   r   r   argskwargs
checkpointrr   r,   r/   r     s   
zModelPT.load_from_checkpointtrain_data_configUnion[DictConfig, Dict]c                 C     dS )z
        Setups data loader to be used in training

        Args:
            train_data_layer_config: training data layer parameters.
        Returns:

        Nr,   )rp   r   r,   r,   r/   r`        
zModelPT.setup_training_datar;   c                 C  r   )z
        Setups data loader to be used in validation
        Args:

            val_data_layer_config: validation data layer parameters.
        Returns:

        Nr,   rp   r;   r,   r,   r/   setup_validation_data&  r   zModelPT.setup_validation_datar>   c                 C  s   t  )z
        (Optionally) Setups data loader to be used in test

        Args:
            test_data_layer_config: test data layer parameters.
        Returns:

        )NotImplementedErrorrp   r>   r,   r,   r/   setup_test_data2     	zModelPT.setup_test_datac                 C     d| _ d| _d| _| jd|d zd| _tj| d W d| _nd| _w | jdu rE| jdurGt| jtt	fv rIdd	 t
t| jD | _dS dS dS dS )
z
        (Optionally) Setups data loader to be used in validation, with support for multiple data loaders.

        Args:
            val_data_layer_config: validation data layer parameters.
        r   N
validationdataset_nameconfigTr4   Fc                 S     g | ]}d  |qS )zval_{}_rG   .0idxr,   r,   r/   
<listcomp>T      z:ModelPT.setup_multiple_validation_data.<locals>.<listcomp>)_val_dl_idx_validation_namesrQ   _update_dataset_config_multi_dataset_moder#   resolve_validation_dataloadersrA   listtupleranger   r   r,   r,   r/   ra   =     
z&ModelPT.setup_multiple_validation_datac                 C  r   )
z
        (Optionally) Setups data loader to be used in test, with support for multiple data loaders.

        Args:
            test_data_layer_config: test data layer parameters.
        r   Ntestr   Tr   Fc                 S  r   )ztest_{}_r   r   r,   r,   r/   r   m  r   z4ModelPT.setup_multiple_test_data.<locals>.<listcomp>)_test_dl_idx_test_namesrR   r   r   r#   resolve_test_dataloadersrA   r   r   r   r   r   r,   r,   r/   rb   V  r   z ModelPT.setup_multiple_test_dataoptim_config!Union[Dict[str, Any], DictConfig]c                 C  sj   t | jd }t|j|j|j|d |d |d d |d d |dtj| jj	| j
| jjddd	}|S )
z
        Setup mcore optimizer config.

        Args:
            optim_config: Nemo optim args used to set up Mcore optimizer options.
        r   lrweight_decaybetasr   eps(overlap_param_gather_with_optimizer_stepF)fp16bf16params_dtyper   r   
adam_beta1
adam_beta2adam_eps	clip_graduse_distributed_optimizerr   )r   r4   r   r   r   r   r_   r   r3   gradient_clip_valuse_mcore_dist_optimr2   r   )rp   r   r   megatron_optim_configr,   r,   r/   setup_megatron_optimizationo  s"   

z#ModelPT.setup_megatron_optimization!Optional[Union[DictConfig, Dict]]optim_kwargsOptional[Dict[str, Any]]c              
   C  s  |    | |}|du rtd dS | jdur#t| jdr#|| j_|dur.tj|dd}| j	du r8t
d d|v r| j	durt| j	jtsLtd| jjd	k r| j	j|d d
< | j	j|d d< | j	j|d d< t }|jdurz|j|d d< n*|jdu r| j	j| j	j |d d< n| j	j| j	j |j |d d< n| j	j|d d< t|}tj|dd}d|v r|d}nd}|dd}|du r|dd}nt|r|j }n	|dd  }|dd}d|v r|d}t ||}nt!"|}|dd |dd |dd |dur|#| |dur!||d< |durt|rA|| j$fi |}	tdt%|	 |	| _&nzz1td|i}|durSd|i}
ni }
|
#| t'j(j)|| j$fi |
}tdt%| || _&W nH t*y } zt+d,|t%|
 |d}~ww |dkr| -|}t.|| j/}t0|}	nt1|}	|	| j$fi |}	tdt%|	 |	| _&t2| j&|| j3d| _4| j&| j4fS )aa  Prepares an optimizer from a string name and its optional config parameters.

        Args:
            optim_config: A dictionary containing the following keys:

                * "lr": mandatory key for learning rate. Will raise ValueError if not provided.
                * "optimizer": string name pointing to one of the available optimizers in the registry.                 If not provided, defaults to "adam".
                * "opt_args": Optional list of strings, in the format "arg_name=arg_value".                 The list of "arg_value" will be parsed and a dictionary of optimizer kwargs                 will be built and supplied to instantiate the optimizer.

            optim_kwargs: A dictionary with additional kwargs for the
                optimizer. Used for non-primitive types that are not
                compatible with OmegaConf.

        Nz@No optimizer config provided, therefore no optimizer was createdr   T)r   zSTrainer wasn't specified in model constructor. Make sure that you really wanted it.schedzIWe do not currently support gradient acculumation that is not an integer.r   t_max_epochst_accumulate_grad_batchest_limit_train_batchest_num_workers	max_steps_target_r   adamr   r   r   ru   zOptimizer config = %sz4Could not instantiate class path - {} with kwargs {}mcore_distributed_optim)	optimizerscheduler_configtrain_dataloader)5setup_optimizer_param_groups_optim_config_copyr"   inforL   r   r   r   to_container_trainerrc   r?   accumulate_grad_batchesintr@   r3   r	  
max_epochslimit_train_batchesr$   data_parallel_sizer   num_devices	num_nodescreatepopr_   inspectisclassrJ   lowersplitparse_optimizer_argscopydeepcopyupdaterS   r}   rT   hydrautilsinstantiate	ExceptionerrorrG   r   r   r4   r    get_optimizerr!   rP   rU   )rp   r   r  rq   r  optimizer_clsoptimizer_namer   optimizer_argsr  optimizer_configoptimizer_instanceer   _megatron_optimizerr,   r,   r/   setup_optimization  s   


















	



zModelPT.setup_optimizationc                 C  s,  t | ds
d| _dS g }g }d| jv r| jj}| D ]@\}}t| |d}|du r0t| dt |drU|| dt|	 i}| D ]\}}	|	||< qF|| qt| dg }
| 
 D ]\}}d}|D ]	}||rsd}qj|r{|
| qbt|
rd|
ig| }n	dt| 	 ig}|| _dS )	a  
        Used to create param groups for the optimizer.
        As an example, this can be used to specify per-layer learning rates:

        optim.SGD([
                    {'params': model.base.parameters()},
                    {'params': model.classifier.parameters(), 'lr': 1e-3}
                    ], lr=1e-2, momentum=0.9)

        See https://pytorch.org/docs/stable/optim.html for more information.
        By default, ModelPT will use self.parameters().
        Override this method to add custom param groups.
        In the config file, add 'optim_param_groups' to support different LRs
        for different components (unspecified params will use the default LR):

        model:
            optim_param_groups:
                encoder:
                    lr: 1e-4
                    momentum: 0.8
                decoder:
                    lr: 1e-3
            optim:
                lr: 3e-3
                momentum: 0.9
        
parametersNoptim_param_groupsz not found in model.paramsz does not have parameters.TF)r   rS   r2   r6  r   r   r@   appendr   r5  named_parameters
startswithr   )rp   known_groupsparam_groupsparam_groups_cfggroup	group_cfgr   	new_groupkvother_paramsnp
is_unknownr,   r,   r/   r  9  sB   







z$ModelPT.setup_optimizer_param_groupsc                 C  s(   |    | jdu r| jS | jg| jgfS )z8
        Configure the optimizer and scheduler.
        N)r4  rU   rT   r{   r,   r,   r/   configure_optimizersy  s   
zModelPT.configure_optimizersc                   s8   d fdd   D ]\}}}j|_ | qdS )zK
        Propagates the model GUID to all submodules, recursively.
        r   'NeuralModule'c                   s"   j | _ |  D ]} | qd S r+   )
model_guidchildren)r   childrecursively_propagate_guidrp   r,   r/   rM    s   
z@ModelPT.propagate_model_guid.<locals>.recursively_propagate_guidN)r   rH  )r   rI  )rp   _r   r,   rL  r/   propagate_model_guid  s
   
zModelPT.propagate_model_guidstagec                 C  sf  |    |dkr;d| jv o| jjduo| jjdd}|  du p/t|  to/t|  dk}|r;|r;| | jj |dv rsd| jv oQ| jj	duoQ| jj	dd}| 
 du pft| 
 toft| 
 dk}|rs|rs| j| jj	d	 |d
krd| jv o| jjduo| jjdd}|  du pt|  tot|  dk}|r|r| j| jjd dS dS dS dS )zCalled at the beginning of fit, validate, test, or predict.
        This is called on every process when using DDP.

        Args:
            stage: fit, validate, test or predict
        fitr7   Nr8   Fr   )rQ  validater9   r:   r   r<   r=   )rO  rL   r7   r_   r  r?   r   r   r`   r9   val_dataloaderra   r<   test_dataloaderrb   )rp   rP  train_deferred_setupno_train_dataloaderval_deferred_setupno_val_dataloadertest_deferred_setupno_test_dataloaderr,   r,   r/   setup  sH   





	zModelPT.setupc                 C  s   | j dur| j S dS )z.
        Get the training dataloader.
        N)rP   r{   r,   r,   r/   r    s   
zModelPT.train_dataloaderc                 C     | j du rg | _ | j S )z0
        Get the validation dataloader.
        N)rQ   r{   r,   r,   r/   rS       
zModelPT.val_dataloaderc                 C  r\  )z*
        Get the test dataloader.
        N)rR   r{   r,   r,   r/   rT    r]  zModelPT.test_dataloadersync_metrics,Optional[Dict[str, Dict[str, torch.Tensor]]]c                 C  s  | j durt| j dkri S t| j d tr8| j| j dd}|dur1d|v r1| j|dd|d | j   |S di i}t| j D ]x\}}| 	|}| j||d}|pTi }d|v rhd|vrh|| j
krh|d |d< | D ]E\}}|dkri }	| D ] \}
}|
|d vr|| j
kr|
}||	||
 < n||
 }||	|< qz|d }||	 ||d< ql|| }|||< ql| j |   qAd|v r| j|dd|d |S )a  
        Default DataLoader for Validation set which automatically supports multiple data loaders
        via `multi_validation_epoch_end`.

        If multi dataset support is not required, override this method entirely in base class.
        In such a case, there is no need to implement `multi_validation_epoch_end` either.

        .. note::
            If more than one data loader exists, and they all provide `val_loss`,
            only the `val_loss` of the first data loader will be used by default.
            This default can be changed by passing the special key `val_dl_idx: int`
            inside the `validation_ds` config.

        Args:
            outputs: Single or nested list of tensor outputs from one or more data loaders.

        Returns:
            A dictionary containing the union of all items from individual data_loaders,
            along with merged logs from all data loaders.
        Nr   dataloader_idxry   T)on_epoch	sync_distval_loss)validation_step_outputsr   r?   rM   multi_validation_epoch_endlog_dictr  clear	enumerate get_validation_dataloader_prefixr   r   r&  )rp   r^  output_dictra  val_outputsdataloader_prefixdataloader_logsrA  rB  rg  k_logv_log	new_k_logoutput_logsnew_kr,   r,   r/   on_validation_epoch_end  sD   





zModelPT.on_validation_epoch_endc                 C  s  | j durt| j dkri S t| j d tr7| j| j dd}|dur0d|v r0| j|ddd | j   |S di i}t| j D ]z\}}| 	|}| j||d}|pSi }d|v rgd|vrg|| j
krg|d |d< | D ]G\}}|dkri }| D ] \}	}
|	|d vr|| j
kr|	}|
|||	 < n||	 }|
||< qy|di }|| ||d< qk|| }|||< qk| j |   q@d|v r| j|ddd |S )a  
        Default DataLoader for Test set which automatically supports multiple data loaders
        via `multi_test_epoch_end`.

        If multi dataset support is not required, override this method entirely in base class.
        In such a case, there is no need to implement `multi_test_epoch_end` either.

        .. note::
            If more than one data loader exists, and they all provide `test_loss`,
            only the `test_loss` of the first data loader will be used by default.
            This default can be changed by passing the special key `test_dl_idx: int`
            inside the `test_ds` config.

        Args:
            outputs: Single or nested list of tensor outputs from one or more data loaders.

        Returns:
            A dictionary containing the union of all items from individual data_loaders,
            along with merged logs from all data loaders.
        Nr   r`  ry   T)rb  	test_loss)test_step_outputsr   r?   rM   multi_test_epoch_endrg  r  rh  ri  get_test_dataloader_prefixr   r   r_   r&  )rp   rk  ra  test_outputsrm  rn  rA  rB  rg  ro  rp  rq  rr  rs  r,   r,   r/   on_test_epoch_end9  sD   





zModelPT.on_test_epoch_endr   outputsList[Dict[str, torch.Tensor]]ra  r  c                 C     t d dS )a8  
        Adds support for multiple validation datasets. Should be overriden by subclass,
        so as to obtain appropriate logs for each of the dataloaders.

        Args:
            outputs: Same as that provided by LightningModule.on_validation_epoch_end()
                for a single dataloader.
            dataloader_idx: int representing the index of the dataloader.

        Returns:
            A dictionary of values, optionally containing a sub-dict `log`,
            such that the values in the log will be pre-pended by the dataloader prefix.
        aE  Multi data loader support has been enabled, but `multi_validation_epoch_end(outputs, dataloader_idx) has not been implemented.
If you require multi data loader support for validation sets, please override this method.
If you do not require multi data loader support, please instead override `on_validation_epoch_end(outputs).Nr"   rc   rp   r{  ra  r,   r,   r/   rf       z"ModelPT.multi_validation_epoch_endc                 C  r}  )a2  
        Adds support for multiple test datasets. Should be overriden by subclass,
        so as to obtain appropriate logs for each of the dataloaders.

        Args:
            outputs: Same as that provided by LightningModule.on_validation_epoch_end()
                for a single dataloader.
            dataloader_idx: int representing the index of the dataloader.

        Returns:
            A dictionary of values, optionally containing a sub-dict `log`,
            such that the values in the log will be pre-pended by the dataloader prefix.
        a9  Multi data loader support has been enabled, but `multi_test_epoch_end(outputs, dataloader_idx) has not been implemented.
If you require multi data loader support for validation sets, please override this method.
If you do not require multi data loader support, please instead override `on_test_epoch_end(outputs).Nr~  r  r,   r,   r/   rw    r  zModelPT.multi_test_epoch_endc                 C  
   | j | S z
        Get the name of one or more data loaders, which will be prepended to all logs.

        Args:
            dataloader_idx: Index of the data loader.

        Returns:
            str name of the data loader at index provided.
        )r   rp   ra  r,   r,   r/   rj       

z(ModelPT.get_validation_dataloader_prefixc                 C  r  r  )r   r  r,   r,   r/   rx    r  z"ModelPT.get_test_dataloader_prefixc                 C  s   g }i }|  D ]+\}}d}	|D ]
}
|
|v rd}	 nq|D ]}||v r,|| d}	 nq|	r3|||< q| j|dd |duratd|  t|dkr_td| d|  td	 dS dS t|dkrqtd
|  dS dS )z?
        Load a part of the state dict into the model.
        FTr   Nz)Model checkpoint partially restored from r   z9The following parameters were excluded when loading from z : z'Make sure that this is what you wanted!zAThe following parameters were excluded when loading checkpoint : )r   r8  load_state_dictr"   r  r   )rp   
state_dictincludeexcludeload_from_stringexcluded_param_namesdict_to_loadrA  rB  
should_addrE  r2  r,   r,   r/   load_part_of_state_dict  sH   
zModelPT.load_part_of_state_dictcpur   c              
     s  g d fdd D }t |dkrdS t |dkr+td fddt|D  d	v rjdurts tjtrbj}| j||d
dd}| j	|
 dd td| d ~nBtjttfrj}| D ]-}|j}| j||d
dd}|ddg}|dg }	| |
 ||	d| d ~qqntdW d   n1 sw   Y  dv rijdurit tjtrd}
t| dr| jdur| j}t|dr|jjdurtd 	 W d   dS | j|
|d
dd}| j	|
 dd td|
 d ~nDtjttfrUj}| D ].}|j}
| j|
|d
dd}|ddg}|dg }	| |
 ||	d|
 d ~q%ntdW d   n	1 sdw   Y  dv rjdurtw tjtrd}tj||d}| j	|d dd td | d ~n?tjttfrۈj}| D ])}|j}tj||d}|ddg}|dg }	| |d ||	d| d ~qntd!W d   dS W d   dS 1 sw   Y  dS dS dS )"a  
        Initializes a given model with the parameters obtained via specific config arguments.
        The state dict of the provided model will be updated with `strict=False` setting so as to prevent
        requirement of exact model parameters matching.

        Initializations:
            init_from_nemo_model: Str path to a .nemo model in order to load state_dict from single nemo file;
            if loading from multiple files, pass in a dict where the values have the following fields:

                path: Str path to .nemo model

                include: Optional list of strings, at least one of which needs to be contained in parameter name
                to be loaded from this .nemo file. Default: everything is included.

                exclude: Optional list of strings, which can be used to exclude any parameter containing one of
                these strings from being loaded from this .nemo file. Default: nothing is excluded.

                hydra usage example:

                init_from_nemo_model:
                    model0:
                        path:<path/to/model1>
                        include:["encoder"]
                    model1:
                        path:<path/to/model2>
                        include:["decoder"]
                        exclude:["embed"]

            init_from_pretrained_model: Str name of a pretrained model checkpoint (obtained via cloud).
                The model will be downloaded (or a cached copy will be used), instantiated and then
                its state dict will be extracted. If loading from multiple models, you can pass in a dict
                with the same format as for init_from_nemo_model, except with "name" instead of "path"

            init_from_ptl_ckpt: Str name of a Pytorch Lightning checkpoint file. It will be loaded and
                the state dict will extracted. If loading from multiple files, you can pass in a dict
                with the same format as for init_from_nemo_model.

        Args:
            cfg: The config used to instantiate the model. It need only contain one of the above keys.
            map_location: str or torch.device() which represents where the intermediate state dict
                (from the pretrained model or checkpoint) will be loaded.

        )init_from_nemo_modelinit_from_pretrained_modelinit_from_ptl_ckptc                   s$   g | ]}| v r|d urdndqS )Nr   r   r,   )r   arg)r2   r,   r/   r   7  s   $ zAModelPT.maybe_init_from_pretrained_checkpoint.<locals>.<listcomp>r   Nr   zLCannot pass more than one model initialization arguments to config!
Found : c                   s   g | ]
\}}|r | qS r,   r,   )r   r   arg_present)r   r,   r/   r   @  s    r  init_strictT)r   r   Fr  z6Model checkpoint restored from nemo file with path : ``r  r   r  znemo file with path `z=Invalid type: init_from_nemo_model is not a string or a dict!r  r3   resume_from_checkpointzxModel training is being resumed via Pytorch Lightning.
Initialization from pretrained model (via cloud) will be skipped.zBModel checkpoint restored from pretrained checkpoint with name : `z!pretrained checkpoint with name `zCInvalid type: init_from_pretrained_model is not a string or a dict!r  )r   r  zIModel checkpoint restored from pytorch lightning checkpoint with path : `z;Invalid type: init_from_ptl_ckpt is not a string or a dict!)sumr@   ri  r  r   r?   r}   r   r_   r  r  r"   r  r   rM   valuesr   r  r  	TypeErrorr  r   r3   _checkpoint_connectorresume_checkpoint_pathfrom_pretrainedr   r  rY   load)rp   r2   r   arg_matches
model_pathrestored_modelmodel_load_dictmodel_load_cfgr  r  
model_namer3   	ckpt_pathckptr,   )r   r2   r/   %maybe_init_from_pretrained_checkpoint  s   -



3


$z-ModelPT.maybe_init_from_pretrained_checkpointc                   s.   |dkrdt jv rt jd t | dS )zm
        Called at the end of fit and test.

        Args:
            stage: either 'fit' or 'test'
        rQ  PL_TRAINER_GPUSN)r   environr  rB   teardown)rp   rP  rr   r,   r/   r    s   
zModelPT.teardownsave_dirsplit_by_modulec                 C  sD   |du rt  }t|std| | | | j|||}|S )a  
        Extract the state dict(s) from a provided .nemo tarfile and save it to a directory.

        Args:
            restore_path: path to .nemo file from which state dict(s) should be extracted
            save_dir: directory in which the saved state dict(s) should be stored
            split_by_module: bool flag, which determins whether the output checkpoint should
                be for the entire Model, or the individual module's that comprise the Model
            save_restore_connector (SaveRestoreConnector): Can be overrided to add custom save and restore logic.

        Example:
            To convert the .nemo tarfile into a single Model level PyTorch checkpoint
            ::
            state_dict = nemo.collections.asr.models.EncDecCTCModel.extract_state_dict_from('asr.nemo', './asr_ckpts')


            To restore a model from a Model level checkpoint
            ::
            model = nemo.collections.asr.models.EncDecCTCModel(cfg)  # or any other method of restoration
            model.load_state_dict(torch.load("./asr_ckpts/model_weights.ckpt"))


            To convert the .nemo tarfile into multiple Module level PyTorch checkpoints
            ::
            state_dict = nemo.collections.asr.models.EncDecCTCModel.extract_state_dict_from(
                            'asr.nemo',
                            './asr_ckpts',
                            split_by_module=True
                        )


            To restore a module from a Module level checkpoint
            ::
            model = nemo.collections.asr.models.EncDecCTCModel(cfg)  # or any other method of restoration

            # load the individual components
            model.preprocessor.load_state_dict(torch.load("./asr_ckpts/preprocessor.ckpt"))
            model.encoder.load_state_dict(torch.load("./asr_ckpts/encoder.ckpt"))
            model.decoder.load_state_dict(torch.load("./asr_ckpts/decoder.ckpt"))


        Returns:
            The state dict that was loaded from the original .nemo checkpoint
        Nr   )r   r   r   FileExistsErrorr   rW   extract_state_dict_from)ru   r   r  r  r   r  r,   r,   r/   r    s   4

zModelPT.extract_state_dict_from	'Trainer'c                 C  sL   t | jdstd dS d}|dur|jdkrt| dS | | dS )a  
        Helper method to check whether the model can safely be tested
        on a dataset after training (or loading a checkpoint).

        ::

            trainer = Trainer()
            if model.prepare_test(trainer):
                trainer.test(model)

        Returns:
            bool which declares the model safe to test. Provides warnings if it has to
            return False to guide the user.
        r<   z.No `test_ds` config found within the manifest.Fav  

During testing, it is currently advisable to construct a new Trainer "
                    "with single GPU and no DDP to obtain accurate results.
                    "Following pattern should be used: "
                    "trainer = Trainer(devices=1, accelerator='gpu')"
                    "if model.prepare_test(trainer):"
                    "  trainer.test(model)

Nr   T)r   rL   r"   r  r  rc   rV   )rp   r3   DDP_WARNr,   r,   r/   prepare_test  s   



zModelPT.prepare_testc                 C  s   || _ || _| | dS )zz
        Set an instance of Trainer object.

        Args:
            trainer: PyTorch Lightning Trainer object.
        N)r3   r  set_world_size)rp   r3   r,   r,   r/   rV   +  s   zModelPT.set_trainerc                 C  sP   d| _ |durt|tr|jr|jr|j|j | _ ntd t }| j |_ dS )z
        Determines the world size from the PyTorch Lightning Trainer.
        And then updates AppState.

        Args:
            trainer (Trainer): PyTorch Lightning Trainer object
        r   Nz8World size can only be set by PyTorch Lightning Trainer.)
world_sizer?   r   r  r  r"   rc   r$   )rp   r3   rq   r,   r,   r/   r  6  s   	

zModelPT.set_world_sizer   	max_depthmodel_summary.ModelSummaryc                 C  s   t j| |dS )a  Summarize this LightningModule.

        Args:
            max_depth: The maximum depth of layer nesting that the summary will include. A value of 0 turns the
                layer summary off. Default: 1.

        Return:
            The model summary object
        )r  )r   	summarize)rp   r  r,   r,   r/   r  J  s   
zModelPT.summarizer   r   c                 C  s   t | dr| jdu rdS |dur?t|tst|}|dv r;t| jd |d }|| j|< t| jd | j| _dS t	ddS )a  
        Update the config (if not None) of the dataset by given name.
        Preserves said config after updating.

        Args:
            dataset_name: str name of the dataset whose config is being updated.
                Can be one of `train`, `validation` and `test`.
            config: Optional DictConfig or dict. If None is passed, this method simply returns.
                If dict is passed, it is cast into a DictConfig.
                The internal config is updated with the passed config.
        r   TN)trainr   r   F_dszL`dataset_name` when updating config must be one of [train, validation, test])
r   r   r?   r   r   r  rF   r2   rL   r@   )rp   r   r   key_namer,   r,   r/   r   V  s   


zModelPT._update_dataset_configc                 C  s(   d}|   D ]}|jr|| 7 }q|S )z\
        Utility property that returns the total number of parameters of the Model.
        r   )r5  requires_gradnumel)rp   numrE  r,   r,   r/   num_weightsv  s   zModelPT.num_weightsc                 C     | j S )  
        Property that holds the finalized internal config of the model.

        Note:
            Changes to this config are not reflected in the state of the model.
            Please create a new model using an updated config to properly update the model.
        )rL   r{   r,   r,   r/   r2     r   zModelPT.cfgc                 C  r  )z)
        Get the trainer object.
        )r  r{   r,   r,   r/   r3     s   zModelPT.trainerc                 C  sN   || _ | td| j i t| dr#d| jv r%t| j | jd< dS dS dS )r  r2   _hparams_initialN)rL   _set_hparamsr   r  r   r  	to_object)rp   r2   r,   r,   r/   r2     s
   	c                   sX   |  td| ji t| dr(d| jv r(t| jd tr(t| jd | jd< t	 j
S )a)  
        Overwrite default hparams property to return the lastest model config.
        Without this change, the hparams property would return the old config if there was a direct change to
        self._cfg (e.g., in self.setup_optimization()) that was not done via `self.cfg = new_cfg`.
        r2   r  )r  r   r  rL   r   r  r?   r   r  rB   hparamsr{   rr   r,   r/   r    s   
zModelPT.hparamsc                 C  d   | j dur| j S g | _ | jdur/t| jttfr/t| jdkr/tt| jD ]}| j g  q&| j S )z
        Cached outputs of validation_step. It can be a list of items (for single data loader) or a list of lists
        (for multiple data loaders).

        Returns:
            List of outputs of validation_step.
        Nr   )re   rQ   r?   r   r   r   r   r8  rp   rN  r,   r,   r/   re    s   
	
zModelPT.validation_step_outputsc                 C  
   || _ d S r+   )re   rp   valuer,   r,   r/   re       
c                 C  r  )z
        Cached outputs of test_step. It can be a list of items (for single data loader) or a list of
        lists (for multiple data loaders).

        Returns:
            List of outputs of test_step.
        Nr   )rf   rR   r?   r   r   r   r   r8  r  r,   r,   r/   rv    s   
	(zModelPT.test_step_outputsc                 C  r  r+   )rf   r  r,   r,   r/   rv    r  c                  C  s   t  } | jS r+   )r$   is_model_being_restored)rq   r,   r,   r/   r^     s   z ModelPT._is_model_being_restoredr   folderc                 C  s   t  }| |_||_d S r+   )r$   r  nemo_file_folder)r   r  rq   r,   r,   r/   r     s   
z ModelPT._set_model_restore_statec                 C  sJ   t | ds#t }tt | _|  r|j}nd }|j| j|d d S d S )NrI  )restoration_path)	r   r$   r}   uuiduuid4rI  r^   r   register_model_guid)rp   appstater   r,   r,   r/   rX     s   
zModelPT._set_model_guidc                 C  s$   t | dr
|| _dS t| d| dS )zB
        Update the save_restore_connector for the model.
        rW   N)r   rW   r   )ru   r   r,   r,   r/   r     s   

z%ModelPT.update_save_restore_connectorc                 C  s  | j dddur| j jddrddlm} ddlm} d| _| j jd	d| _| j jd
d| _	| j jdd}|du sDt
j|sLtd| dt|}| j jdd}| j jdd}|dd}|| d | _|| d | _| jjddd | jjddd t| jtrtd| j  n
tdt| j t| j	trtd| j	  n
tdt| j	 | j	| jkrntd| j jddrtd| | _tjjtjjjtjjjgtjj d||d| jd| _!dS dS dS )a  Enables chakra profiling
        To use, add the following options to the model config:
        ## Chakra profiling options
        chakra_profile:
            enabled: False
            start_step: 2  # Global batch to start profiling
            end_step: 2 # Global batch to end profiling
            warmup_steps: 0  # Global batch to start profiling
            active_steps: 1  # Global batch to start profiling
            trace_dir: None # Path to store the profile output file
        chakra_profileNenabledFr   )ExecutionTraceObserver)
get_envintT
start_stepend_step	trace_dirzchakra profile output path () is not set or does not exist.warmup_stepsactive_stepsr   SLURM_JOB_ID_chakra_kineto)r   exist_okz(chakra profiling setup with start_step: z.chakra start_step must be of type int. Found: z&chakra profiling setup with end_step: z,chakra end_step must be of type int. Found: zBchakra end_step must be greater than or equal to chakra start_stepzZProfiler conflict: Chakra profiling and Nsys profiling cannot be enabled at the same time.)waitwarmupactive)
activitiesscheduleexecution_trace_observer)"r2   r_   r  torch.profilerr  nemo.utils.env_var_parsingr  _chakra_profile_enabled_chakra_profile_start_step_chakra_profile_end_stepr   r   isdirr@   r   _chakra_trace_dir_kineto_trace_dirr   r?   r  r"   r  rA   nsys_profiler*  _etrY   profilerprofileProfilerActivityCPUCUDAr  _prof)rp   r  r  r  r  r  job_idr,   r,   r/   rn     sZ   
zModelPT._setup_chakra_profilingc                 C  s$  | j dddurz| j jddrzd| _| j jdd| _| j jdd| _| j jd	dg| _| j jd
d| _t| jt	krJt
d| j  n
tdt| j t| jt	kret
d| j  n
tdt| j | j| jkrvntd| j dddur| j jddrd| _| j jdd| _| j jdd| _| j jdd| _| j jdd| _t| jt	krt
d| j  n
tdt| j t| jt	krt
d| j  n
tdt| j | j| jkrntd| jdu stj| jstd| j ddS dS dS )a  Enables nsys profiling
        To use, add the following optoins to the model config:
        ## Nsys profiling options
        nsys_profile: False
            start_step: 10  # Global batch to start profiling
            end_step: 10 # Global batch to end profiling
            ranks: [0] # Global rank IDs to profile
            gen_shape: False # Generate model and kernel details including input shapes
        And then wrap the model training script with:
        nsys profile -s none -o <profile filepath>  -t cuda,nvtx --force-overwrite true
            --capture-range=cudaProfilerApi --capture-range-end=stop python ./examples/...
        See more options at: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-profiling

        Enables CUDA memory profiling
        To use, add the following options to the model config:
        ## CUDA memory profiling options
        memory_profile:
            enabled: True
            start_step: 10  # Global batch to start profiling
            end_step: 10 # Global batch to end profiling
            rank: 0 # Global rank ID to profile
            output_path: None # Path to store the profile output file
        r  Nr  FTr  r   r  ranks	gen_shapez&Nsys profiling setup with start_step: z,Nsys start_step must be of type int. Found: z$Nsys profiling setup with end_step: z*Nsys end_step must be of type int. Found: z>Nsys end_step must be greater than or equal to nsys start_stepmemory_profilerankoutput_pathz3CUDA memory start_step must be of type int. Found: z+CUDA memory profiling setup with end_step: z1CUDA memory end_step must be of type int. Found: zGCUDA memory end_step must be greater than or equal to memory start_stepzMemory profile output path (r  )r2   r_   r  _nsys_profile_enabled_nsys_profile_start_step_nsys_profile_end_step_nsys_profile_ranks_nsys_profile_gen_shaperA   r  r"   r  r@   r  _memory_profile_enabled_memory_profile_start_step_memory_profile_end_step_memory_profile_rank_memory_profile_output_pathr   r   r  r{   r,   r,   r/   ri   Y  sZ   zModelPT._setup_profilingc                 C  s~   t | ds=t | jdr5| jjdur5| jjddr5t| dt| jj dd | jd  D | jd	< dS t| dd dS dS )
zPyTorch Lightning hook:
        https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-start
        We use it here to copy the relevant config for dynamic freezing.
        _freeze_cfgfreeze_updatesNr  Fc                 S  s   i | ]}|d qS Fr,   )r   rA  r,   r,   r/   
<dictcomp>  s    z*ModelPT.on_train_start.<locals>.<dictcomp>r   	is_frozen)	r   r2   r  r_   r   r   r  r  r   r{   r,   r,   r/   on_train_start  s   

"zModelPT.on_train_startbatchr   	batch_idxunusedOptional[int]c                 C  sp  | j jdkrt| drB| jrB| jsB| jj| jkrB| jj| jkrBt	
d| jj d | jt| jdt  d  | j  d| _t| drt| jrt| jst|| jkrtt | jv rtt	
d	 tj   | jrqtjjjdd
  d| _t| dr| jr| j s|| j!krt | j"krt	
d tjj#j$dd d| _ t| dr,| j%dur.| j&r0t| dr2| jdur4| jjd }| j%d ' D ]r\}}t(|t)rt*|dksJ d||d ko||d kp|d dk}n||kp|dk}|r| j%d | st+| |,  t+| |-  d| j%d |< q|s+| j%d | r+t+| |.  d| j%d |< qdS dS dS dS dS dS )zPyTorch Lightning hook:
        https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-start
        We use it here to enable profiling and dynamic freezing.
        rZ   r  z/====== Start chakra profiling from global_step  ======rank-.jsonTr  z"====== Start nsys profiling ======)record_shapesr  z)====== Start CUDA memory profiling ======i )max_entriesr  Nr3   r   r      z>freeze_updates modules list cannot have more than two elementsr   r  r
  F)/devicerA   r   r  ro   r3   global_stepr  r  r"   r  r  register_callbackr}   r  r'   r  startr  rj   r  r  rY   rZ   cudartcudaProfilerStartr   autogradr  	emit_nvtx	__enter__r  rl   r  r  memory_record_memory_historyr  trainingr   r?   r   r   r   freezer  unfreeze)rp   r  r  r  num_updatesmlm_stepsshould_freezer,   r,   r/   on_train_batch_start  sX   
 




 
&zModelPT.on_train_batch_startc                 C  sr  | j jdkrt| drQ| jrQ| jrQ| jjd | jkrCt	d| jj d | j
  | j
t| jdt  d  | j  d| _n| jjd | jkrQ| j
  t| d	rv| jrv| jsv|| jkrvt | jv rvt	d
 tj   d| _t| dr| jr| js|| jkrt | jkrt	d tjj !| j" d| j d tjj j#dd d| _dS dS dS dS dS dS dS )zPyTorch Lightning hook:
        https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-batch-end
        We use it here to enable nsys profiling.
        rZ   r  r   z+====== End chakra profiling at global_step r  r  r  Fr  z ====== End nsys profiling ======Tr  z'====== End CUDA memory profiling ======z/memory_profile_rankz.pickleN)r  )$r  rA   r   r  ro   r3   r  r  r"   r  r  stopexport_chrome_tracer}   r  r'   r  unregister_callbackr  stepr  rk   r  r  rY   rZ   r  cudaProfilerStopr  rm   r  r  r  _dump_snapshotr  r   )rp   r{  r  r  r  r,   r,   r/   on_train_batch_end  s<   

 






zModelPT.on_train_batch_endc                 C  s$   t | dr
t| d d| _d| _dS )zX
        Utility function to clean up the module state at the end of execution.
        r  N)r   delattrre   rf   r{   r,   r,   r/   _cleanup_on_execution_end  s   


z!ModelPT._cleanup_on_execution_endc                 C     |    dS )zPyTorch Lightning hook:
        https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end
        We use it here to cleanup the dynamic freezing config.
        Nr1  r{   r,   r,   r/   on_train_end*  s   zModelPT.on_train_endc                 C  r2  zPyTorch Lightning hook:
        https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-test-end
        Nr3  r{   r,   r,   r/   on_test_end2     zModelPT.on_test_endc                 C  r2  r5  r3  r{   r,   r,   r/   on_predict_end9  r7  zModelPT.on_predict_endOptional[DictConfig]c                 C  sP   |du r| j durt| j dr| j j}|du rdS t|tr#t|S t|S )zy
        Return a copy of `optim_config` if provided (and otherwise of the internal optim config, if available).
        Nr   )	rL   r   r   r?   r   r$  r%  r   r  )rp   r   r,   r,   r/   r  @  s   


zModelPT._optim_config_copyr+   )r2   r   r3   r   )rs   rt   )T)r|   r}   r~   r}   r   r   )rs   r   )r   r}   r   r}   r4   r   rs   rt   )r   r   )r   r}   r   r}   rs   r   )r   r}   )NNTFNNT)r   r}   r   r   r   r   r   r   r   r   r   r   r3   r   r   r   )r   r}   r   r   r   r   r   r   )r   r   )r;   r   )r>   r   )r   r   )NN)r   r  r  r  )rP  r   r  )r^  r   rs   r_  )rs   r_  )r   )r{  r|  ra  r  rs   r_  )ra  r  rs   r}   )r  )r2   r   r   r}   )rP  r}   )FN)r   r}   r  r}   r  r   r   r   )r3   r  rs   r   )r3   r   )r   )r  r  rs   r  )r   r}   r   r  )r   r   r  r}   )r  r   r  r  r  r  rs   r  )r  r   r  r  r  r  rs   rt   )r   r  rs   r9  )JrJ   rI   __qualname____doc__rC   rv   rz   r   r   r   r   r   r   r   classmethodr   r   r   r`   r   r   ra   rb   r   r4  r  rG  rO  r[  r  rS  rT  rt  rz  rf  rw  rj  rx  r  r   r  r  r  r  rV   r  r  r   propertyr  r2   r   r3   gettersetterr  re  rv  staticmethodr^   r   rX   r   rn   ri   r  r(  r/  r1  r4  r6  r8  r  __classcell__r,   r,   rr   r/   r)   @   s     

=


.
 (G



 0@,


`^
' 2
=
$

 










	IT9$)G
__future__r   r$  r  r   pathlibr  abcr   r   r   typingr   r   r	   r
   r   r   r   r   r'  rY   nemo.core.classes.moduler   nemo.utils.msc_utilsr   r   megatron.core.optimizerr   r   megatron.core.utilsr   HAVE_MEGATRON_COREImportErrorModuleNotFoundErrorlightning.pytorchr   r   lightning.pytorch.utilitiesr   r   	omegaconfr   r   r   nemor   	nemo.corer   nemo.core.classes.commonr   +nemo.core.connectors.save_restore_connectorr   nemo.core.optimr    r!   
nemo.utilsr"   r#   nemo.utils.app_stater$   nemo.utils.debug_hookr%   nemo.utils.exceptionsr&   nemo.utils.get_rankr'   r(   __all__register_new_resolverr)   r,   r,   r,   r/   <module>   sJ   (