o
    wi
                 $   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d d	l m!Z! d d
l"m#Z#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z? d dl@mAZAmBZB d dlCmDZD d dlEmFZFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZO d dlPmQZQ d dlRmSZSmTZTmUZUmVZVmWZW d dlXmYZY d dlZm[Z[ d d l\m]Z]m^Z^ eOd!d"\Z_Z`z
d d#lambZb d$ZcW n edeefy+   d%ZcY nw z
d d&lamfZf d$ZgW n edeefyD   d%ZgY nw G d'd( d(eKZhG d)d* d*eKZiG d+d, d,eKZje
G d-d. d.Zke
G d/d0 d0Zle
G d1d2 d2Zme
G d3d4 d4Zne
G d5d6 d6Zoe
G d7d8 d8Zpe
G d9d: d:Zqe
G d;d< d<ZrG d=d> d>e#ZsG d?d@ d@e#ZtddAdBdCeee6ef  dDee fdEdFZuddAdBdCeee6ef  fdGdHZvdIeeeewf  dDeeeewf  fdJdKZx	%	%	%		ddAdBdLewdMeydNeydOeydPewdQewfdRdSZzdAdBdTeeewf dUewdVewdWewdDeeewewewf fdXdYZ{					$	%ddAdBdUewdVewdWewdTewdZeydMeydDeeewewewf fd[d\Z|d]d^ Z}d_d` Z~dAdBdUeewgdLeewgdVewdWewdaedbeydceddeydeedfeydgedheydiedjeydkedleydmef$dndoZG dpdq dqe4ZdAdBdLedVewdreydsdtdueyfdvdwZdxdy ZG dzd{ d{e)ZdAejjdDdfd|d}ZG d~d de0Zddeewef deydeyfddZdS )    N)defaultdict)	dataclassfield)	timedelta)Path)copymove)Any
CollectionDictListOptionalTupleUnion)HydraConfig)get_original_cwd)CallbackModelCheckpoint)EarlyStopping)IntervalTimer)MLFlowLoggerNeptuneLoggerTensorBoardLoggerWandbLogger)_TrainingEpochLoop)DDPStrategy)_CheckpointConnector)
DictConfig	OmegaConf	open_dict)EMA)IPLEpochStopper)NEMO_ENV_VARNAME_TESTINGNEMO_ENV_VARNAME_VERSION)loggingtimers)AppState)NeMoModelCheckpointPreemptionCallback)get_envbool)NeMoBaseException)is_global_rank_zero)safe_import_from)add_filehandlers_to_pl_logger)ClearMLLoggerClearMLParamsDLLoggerDLLoggerParamsMLFlowParams)add_handlers_to_mcore_logger)uninject_model_parallel_rank)import_multistorageclientis_multistorageclient_urlz)megatron.core.num_microbatches_calculatorget_current_global_batch_size)StragglerDetectionCallbackTF)FaultToleranceCallbackc                   @      e Zd ZdZdS )NotFoundErrorz)Raised when a file or folder is not foundN__name__
__module____qualname____doc__ rB   rB   S/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/utils/exp_manager.pyr<   O       r<   c                       s    e Zd ZdZ fddZ  ZS )LoggerMisconfigurationErrorzDRaised when a mismatch between trainer.logger and exp_manager occursc                    s   |d }t  | d S )Nzf You can disable lighning's trainer from creating a logger by passing logger=False to its constructor.super__init__)selfmessage	__class__rB   rC   rH   V   s
   z$LoggerMisconfigurationError.__init__)r>   r?   r@   rA   rH   __classcell__rB   rB   rK   rC   rE   S   s    rE   c                   @   r;   )CheckpointMisconfigurationErrorzGRaised when a mismatch between trainer.callbacks and exp_manager occursNr=   rB   rB   rB   rC   rN   ^   rD   rN   c                   @   s   e Zd ZU dZdZeed< dZeed< dZe	ed< dZ
eed	< d
Zeed< d
Zeed< d
Zeed< dZee	 ed< dZee	 ed< dZee ed< dZeed< dS )EarlyStoppingParamszEarlyStoppingParams PODval_lossmonitorminmodegMbP?	min_delta
   patienceTverbosestrictcheck_finiteNstopping_thresholddivergence_thresholdcheck_on_train_epoch_endFlog_rank_zero_only)r>   r?   r@   rA   rQ   str__annotations__rS   rT   floatrV   intrW   boolrX   rY   rZ   r   r[   r\   r]   rB   rB   rB   rC   rO   b   s   
 rO   c                   @   s*   e Zd ZU dZdZeed< dZeed< dS )IPLEpochStopperParamsu  
    Parameters for the IPLEpochStopper callback used in iterative pseudo-label training.

    This is part of the TopIPL pipeline, a semi-supervised training method for ASR
    that uses iterative pseudo-labeling (IPL) — periodically stopping training to generate
    pseudo-labels for unlabeled data and fine-tuning the model on them.

    For more details, see:
    🔗 Top-IPL: Top-N Pseudo-Label Averaging for Iterative ASR Training
    https://arxiv.org/abs/2506.07659

    Attributes:
        enable_stop (bool): If True, enables the stopping behavior in the callback.
        stop_every_n_epochs (int): Specifies how many epochs to train before stopping.
    Tenable_stop   stop_every_n_epochsN)	r>   r?   r@   rA   rd   rb   r_   rf   ra   rB   rB   rB   rC   rc   v   s   
 rc   c                   @   sb  e Zd ZU dZdZee ed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed	< dZee ed
< dZee ed< dZee ed< dZee ed< dZeed< dZee ed< dZee ed< dZee ed< dZee ed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed < dS )!CallbackParamszCallbackParams PODNfilepathdirpathfilenamerP   rQ   TrW   	save_last   
save_top_kFsave_weights_onlyrR   rS   auto_insert_metric_namere   every_n_epochsevery_n_train_stepstrain_time_intervalprefixz.nemopostfixsave_best_modelalways_save_nemosave_nemo_on_train_endmodel_parallel_sizesave_on_train_epoch_end
async_savesave_last_n_optim_states) r>   r?   r@   rA   rh   r   r^   r_   ri   rj   rQ   rW   rb   rk   rm   ra   rn   rS   ro   rp   rq   rr   r	   rs   rt   ru   rv   rw   rx   ry   rz   r|   rB   rB   rB   rC   rg      s0   
 rg   c                   @   sB   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee ed< dS )	StepTimingParamszStepTimingParams PODmean	reductionF	sync_cudare   buffer_sizeN)r>   r?   r@   rA   r   r   r^   r_   r   rb   r   ra   rB   rB   rB   rC   r}      s
   
 r}   c                   @   s^   e Zd ZU dZdZee ed< dZee	 ed< dZ
ee ed< dZee ed< dZeed	< d
S )	EMAParamszEMAParams PODFenableg+?decaycpu_offloadvalidate_original_weightsre   every_n_stepsN)r>   r?   r@   rA   r   r   rb   r_   r   r`   r   r   r   ra   rB   rB   rB   rC   r      s   
 r   c                   @   sf   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< d	Zeed< dZeed< dS )StragglerDetectionParamszStragglerDetectionParams PODi,  report_time_intervalTcalc_relative_gpu_perfcalc_individual_gpu_perf   num_gpu_perf_scores_to_loggffffff?gpu_relative_perf_thresholdgpu_individual_perf_thresholdFstop_if_detectedN)r>   r?   r@   rA   r   r`   r_   r   rb   r   r   ra   r   r   r   rB   rB   rB   rC   r      s   
 r   c                   @   s   e Zd ZU dZdZeed< dZee ed< dZ	ee ed< dZ
eed	< dZeed
< ejdkr1ejnejZejed< dZeed< dZeed< dZeed< dZeed< dZee ed< dS )FaultToleranceParamszFaultToleranceParams PODg      @workload_check_intervalg      @initial_rank_heartbeat_timeoutg     @rank_heartbeat_timeoutTcalculate_timeoutssafety_factorntrank_termination_signalINFO	log_levelr   max_rank_restartsmax_subsequent_job_failures additional_ft_launcher_argsNsimulated_fault)r>   r?   r@   rA   r   r`   r_   r   r   r   r   rb   r   osnamesignalSIGKILLSIGTERMr   Signalsr   r^   r   ra   r   r   r   r	   rB   rB   rB   rC   r      s   
  r   c                   @   s6  e Zd ZU dZdZee ed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< d	Zee ed
< d	Zee ed< d	Zee ed< dZee ed< dZee ed< dZeeeef  ed< d	Zee ed< dZeeeef  ed< d	Zee ed< edd dZee ed< d	Zee ed< edd dZee ed< d	Zee ed< edd dZee  ed< d	Z!ee ed< dZ"eeeef  ed< dZ#ee ed< ed d dZ$ee% ed!< d	Z&ee ed"< d	Z'ee ed#< ed$d dZ(ee) ed%< ed&d dZ*ee+ ed'< dZ,ee ed(< dZ-ee.e  ed)< dZ/ee ed*< d	Z0ee ed+< ed,d dZ1ee2 ed-< d	Z3ee ed.< d	Z4ee ed/< dZ5ee ed0< ed1d dZ6ee7 ed2< dZ8ee ed3< d4Z9e:ed5< d	Z;ee ed6< ee<dZ=ee< ed7< d	Z>ee ed8< ee?dZ@ee? ed9< dZAee ed:< dS );ExpManagerConfigz=Experiment Manager config for validation of passed arguments.Nexplicit_log_direxp_dirr   versionTuse_datetime_versionFresume_if_existsresume_past_endresume_ignore_no_checkpointresume_from_checkpointcreate_tensorboard_loggersummary_writer_kwargscreate_wandb_loggerwandb_logger_kwargscreate_mlflow_loggerc                   C      t  S N)r3   rB   rB   rB   rC   <lambda>       zExpManagerConfig.<lambda>)default_factorymlflow_logger_kwargscreate_dllogger_loggerc                   C   r   r   )r2   rB   rB   rB   rC   r      r   dllogger_logger_kwargscreate_clearml_loggerc                   C   r   r   )r0   rB   rB   rB   rC   r      r   clearml_logger_kwargscreate_neptune_loggerneptune_logger_kwargscreate_checkpoint_callbackc                   C   r   r   )rg   rB   rB   rB   rC   r     r   checkpoint_callback_paramscreate_early_stopping_callback!create_ipl_epoch_stopper_callbackc                   C   r   r   )rO   rB   rB   rB   rC   r     r   early_stopping_callback_paramsc                   C   r   r   )rc   rB   rB   rB   rC   r     r   !ipl_epoch_stopper_callback_paramscreate_preemption_callbackfiles_to_copylog_step_timinglog_delta_step_timingc                   C   r   r   )r}   rB   rB   rB   rC   r     r   step_timing_kwargslog_local_rank_0_onlylog_global_rank_0_onlydisable_validation_on_resumec                   C   r   r   )r   rB   rB   rB   rC   r     r   emamax_time_per_runr   seconds_to_sleep#create_straggler_detection_callbackstraggler_detection_paramscreate_fault_tolerance_callbackfault_tolerancelog_tflops_per_sec_per_gpu)Br>   r?   r@   rA   r   r   r^   r_   r   r   r   r   rb   r   r   r   r   r   r   r   r	   r   r   r   r   r   r3   r   r   r2   r   r   r0   r   r   r   r   rg   r   r   r   rO   r   rc   r   r   r   r   r   r   r}   r   r   r   r   r   r   r   r`   r   r   r   r   r   r   r   rB   rB   rB   rC   r      sb   
 r   c                   @   s|   e Zd ZdZdi fdefddZdd Zdd	 Zd
d Zdd Z	dddZ
dddZdddZdddZdd Zdd ZdS )TimingCallbackz5
    Logs execution time of train/val/test steps
    Flog_tokens_per_secc                 C   s   || _ tjdi || _dS )zinit for TimitCallback

        Args:
            log_tokens_per_sec (bool, optional): _description_. Defaults to False.
            timer_kwargs (dict, optional): _description_. Defaults to {}.
        NrB   )r   r&   
NamedTimertimer)rI   r   timer_kwargsrB   rB   rC   rH   .  s   zTimingCallback.__init__c                 C   sR   | j jdkr| j | | j |r!td| d | j | | j | dS )zPSetup the timer

        Args:
            name (_type_): name of timer
        r   zTimer `zZ` was not correctly stopped, suggesting a possible issue. The timer will be reset for now.N)r   r   reset	is_activer%   warningstart)rI   r   rB   rB   rC   _on_batch_start8  s   
zTimingCallback._on_batch_startc                 C   s:   | j | |j|d t| j | ddd|dkd dS )zend of the callback log

        Args:
            name (_type_): _description_
            pl_module (_type_): _description_
        z in sTFre   train_step_timingon_stepon_epoch
batch_sizeprog_barN)r   stoplogtorch	as_tensor)rI   r   	pl_modulerB   rB   rC   _on_batch_endK  s   
zTimingCallback._on_batch_endc                 C      |  d dS )zwrapper

        Args:
            trainer (_type_): _description_
            pl_module (_type_): _description_
            batch (_type_): _description_
            batch_idx (_type_): _description_
        r   Nr   )rI   trainerr   batch	batch_idxrB   rB   rC   on_train_batch_start]  s   	z#TimingCallback.on_train_batch_startc                 C   sx   |  d| | jr:d|v r|d |d< t |j |d jd  tj  }|jd|t	| j
d  ddddd d	S d	S )
zwrapper

        Args:
            trainer (_type_): _description_
            pl_module (_type_): _description_
            outputs (_type_): _description_
            batch (_type_): _description_
            batch_idx (_type_): _description_
        r   texttokensre   tokens_per_sec_per_gpuTFr   N)r   r   r8   accumulate_grad_batchesshaper   distributedget_world_sizer   r   r   )rI   r   r   outputsr   r   tokens_per_gpurB   rB   rC   on_train_batch_endh  s&   


z!TimingCallback.on_train_batch_endr   c                 C   r   )on_validation_batch_startvalidation_step_timingNr   rI   r   r   r   r   dataloader_idxrB   rB   rC   r        z(TimingCallback.on_validation_batch_startc                 C      |  d| dS )on_validation_batch_endr   Nr   rI   r   r   r   r   r   r   rB   rB   rC   r       z&TimingCallback.on_validation_batch_endc                 C   r   )on_test_batch_starttest_step_timingNr   r   rB   rB   rC   r    r   z"TimingCallback.on_test_batch_startc                 C   r  )on_test_batch_endr  Nr  r  rB   rB   rC   r    r  z TimingCallback.on_test_batch_endc                 C   r   )on_before_backwardtrain_backward_timingNr   )rI   r   r   lossrB   rB   rC   r	    r   z!TimingCallback.on_before_backwardc                 C   r  )on_after_backwardr
  Nr  rI   r   r   rB   rB   rC   r    r  z TimingCallback.on_after_backwardNr   )r>   r?   r@   rA   rb   rH   r   r   r   r   r   r  r  r  r	  r  rB   rB   rB   rC   r   )  s    




r   c                   @   sL   e Zd ZdZi fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )DeltaTimingCallbacka  
    Logs execution time of train/val/test steps using nemo logger. Calculates
    time from previous batch end to current batch end. This ensures accuracy.

    Note: step time will only be printed in stdout. If you have initialized
    loggers like TensorBoard, WandB, etc, step time will not be recorded there.
    Use this callback instead of 'TimingCallback' to avoid logging overhead with
    lightning logger used in the latter.
    c                 C   s   | dd| _tt| _dS )zfinit

        Args:
            timer_kwargs (dict, optional): _description_. Defaults to {}.
        r   FN)get
_sync_cudar   r&   )rI   r   rB   rB   rC   rH     s   zDeltaTimingCallback.__init__c                 C   s>   | j rtj rtj  d| j| d< t | j| d< dS )_on_epoch_startr   stepr   N)r  r   cudais_initializedsynchronizer&   time)rI   r   r   r   rB   rB   rC   r    s   
z#DeltaTimingCallback._on_epoch_startc                 C   s   | j rtj rtj  t }|| j| d  }td| j| d  d| d|  | j| d  d7  < || j| d< dS )r  r   zStep r  z: z in s=re   N)	r  r   r  r  r  r  r&   r%   info)rI   r   r   r   enddtrB   rB   rC   r     s   
&z!DeltaTimingCallback._on_batch_endc                 C      |  d|| dS )on_train_epoch_starttrain_step_timing in sNr  r  rB   rB   rC   r       z(DeltaTimingCallback.on_train_epoch_startc                 C   r  )on_validation_epoch_startvalidation_step_timing in sNr  r  rB   rB   rC   r     r  z-DeltaTimingCallback.on_validation_epoch_startc                 C   r  )r   r  Nr  rI   r   r   r   r   r   rB   rB   rC   r     r  z&DeltaTimingCallback.on_train_batch_endc                 C   r  )r  r!  Nr  r"  rB   rB   rC   r    r  z+DeltaTimingCallback.on_validation_batch_endN)r>   r?   r@   rA   rH   r  r   r  r   r   r  rB   rB   rB   rC   r    s    
		r  r   zlightning.pytorch.Trainercfgreturnc                 C   s  t tjdd}| j| j | }|t_|du rtd dS | j	r)t
d dS tt}t r;t
d t
| t|trFt|}nt|tsUtdt| dttj|d	d
}t||}t| | t| |j|j|j|j|j|jd\}}}}t| ||j|j |j!|j"j#|j$ |}	|	du s|	dkr|jpd}	|j%r|j&dds|j|j&_'t(d|j&j' ||_||_t) }
||
_*||
_||
_||
_|	|
_+|j,|
_,|j"|
_"tj-|d	d t
d|  || _.|j/d	u r|j0d	u rtdt1t2d}|d| d| d }|j/d	u r|s|dkrt3| n|j0d	u r-|s-|dkr,t3| nt3| |j4sJ|j5sJ|j%sJ|j6sJ|j7sJ|j8rnt9| |||j|j|j"|j4|j:|j5|j;|j%|j&|j6|j<|j7|j=|j8|j> |j?rt@|jApxi d}| jBCd| n|jDrtE|jApi d}| jBCd| |jFjGrtH|jFjI|jFjJ|jFjK|jFjLd}| jBM| |jNrtOd+i |jP}| jBM| |jQrtRd+i |jS}| jBM| |j,rtT| ||	|j|j"|jU |jVrtW|  |jXdur(d}tY| jBD ]\}}t|tZrt(d t[|jX| jB|< d	} nq|s(|jX| _\| jBMt[|jX |j]rKt^rGt
d t|j_}t`d+i |}| jBM| ntd|jarwtbrst
d |jc}|jddk}tetf|jg||jh|jid}| jBM| ntd|jjrt
d t r|jkr|jkD ]
}tltf|| qtm|d  d!d"d#}|nd$otpjq W d   n	1 sw   Y  tr \}}|rtm|d% d&d"d#}|nd'|  |nts  W d   n	1 sw   Y  tt|d(  tu|d) |d(  n| jv| j d*kr	twx|jy tz  |S ),a  
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch
    lightning paradigm of exp_dir/model_or_experiment_name/version. If the lightning trainer
    has a logger, exp_manager will get exp_dir, name, and version from the logger.
    Otherwise it will use the exp_dir and name arguments to create the logging
    directory. exp_manager also allows for explicit folder creation via explicit_log_dir.

    The version can be a datetime string or an integer. Datestime version can be disabled if
    use_datetime_version is set to False. It optionally creates TensorBoardLogger, WandBLogger,
    DLLogger, MLFlowLogger, ClearMLLogger, ModelCheckpoint objects from pytorch lightning.
    It copies sys.argv, and git information if available to the logging directory. It creates a
    log file for each process to log their output into.

    exp_manager additionally has a resume feature (resume_if_exists) which can be used to
    continuing training from the constructed log_dir. When you need to continue the training
    repeatedly (like on a cluster which you need multiple consecutive jobs), you need to avoid
    creating the version folders. Therefore from v1.0.0, when resume_if_exists is set to True,
    creating the version folders is ignored.

    Args:
        trainer (lightning.pytorch.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:

            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder
                creation.
                Defaults to None, which will use exp_dir, name, and version to construct the
                logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory.
                Defaults to None, which logs to ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None which turns into "default"
                via name = name or "default".
            - version (str): The version of the experiment. Defaults to None which uses either a
                datetime string or lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version.
                Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run.
                If True, it sets trainer._checkpoint_connector._ckpt_path so that the trainer
                should auto-resume. exp_manager will move files under log_dir to log_dir/run_{int}.
                Defaults to False.
                From v1.0.0, when resume_if_exists is True, we would not create version folders to
                make it easier to find the log folder for next runs.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True
                and a checkpoint matching ``*end.ckpt`` indicating a previous training run
                fully completed. This behaviour can be disabled, in which case the ``*end.ckpt``
                will be loaded by setting resume_past_end to True. Defaults to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True
                and no checkpoint could be found. This behaviour can be disabled, in which case exp_manager
                will print a message and continue without restoring, by setting resume_ignore_no_checkpoint
                to True. Defaults to False.
            - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint
                file to load from. This will override any checkpoint found when resume_if_exists
                is True. Defaults to None.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it
                to the pytorch lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's
                TensorboardLogger class. Note that log_dir is passed by exp_manager and cannot exist
                in this dict. Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Baises logger and attach it
                to the pytorch lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's
                WandBLogger class. Note that name and project are required parameters if
                create_wandb_logger is True. Defaults to None.
            - create_mlflow_logger (bool): Whether to create an MLFlow logger and attach it to the
                pytorch lightning training. Defaults to False
            - mlflow_logger_kwargs (dict): optional parameters for the MLFlow logger
            - create_dllogger_logger (bool): Whether to create an DLLogger logger and attach it to the
                pytorch lightning training. Defaults to False
            - dllogger_logger_kwargs (dict): optional parameters for the DLLogger logger
            - create_clearml_logger (bool): Whether to create an ClearML logger and attach it to the
                pytorch lightning training. Defaults to False
            - clearml_logger_kwargs (dict): optional parameters for the ClearML logger
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and
                attach it to the pytorch lightning trainer. The ModelCheckpoint saves the top 3 models
                with the best "val_loss", the most recent checkpoint under ``*last.ckpt``, and the
                final checkpoint after training completes under ``*end.ckpt``.
                Defaults to True.
            - create_early_stopping_callback (bool): Flag to decide if early stopping should be used
                to stop training. Default is False. See EarlyStoppingParams dataclass above.
            - create_preemption_callback (bool): Flag to decide whether to enable preemption callback
                to save checkpoints and exit training immediately upon preemption. Default is True.
            - create_straggler_detection_callback (bool): Use straggler detection callback.
                Default is False.
            - create_fault_tolerance_callback (bool): Use fault tolerance callback. Default is False.
            - files_to_copy (list): A list of files to copy to the experiment logging directory.
                Defaults to None which copies no files.
            - log_local_rank_0_only (bool): Whether to only create log files for local rank 0.
                Defaults to False.
                Set this to True if you are using DDP with many GPUs and do not want many log files
                in your exp dir.
            - log_global_rank_0_only (bool): Whether to only create log files for global rank 0.
                Defaults to False.
                Set this to True if you are using DDP with many GPUs and do not want many log files
                in your exp dir.
            - max_time (str): The maximum wall clock time *per run*. This is intended to be used on
                clusters where you want a checkpoint to be saved after this specified time and be
                able to resume from that checkpoint. Defaults to None.
            - seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give
                enough time for rank 0 to initialize
            - train_time_interval (timedelta): pass an object of timedelta to save the model every
                timedelta. Defaults to None. (use _target_ with hydra to achieve this)

    returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    
LOCAL_RANKr   Nz@exp_manager did not receive a cfg argument. It will be disabled.zXTrainer was called with fast_dev_run. exp_manager will return without any functionality.zExpManager schemazcfg was type: z(. Expected either a dict or a DictConfigT)resolve)r   r   r   r   r   r   r   r   defaultexperiment_namezUmlflow logger specified but no experiment name set. Using the same as Tensorboard: %sexist_okzExperiments will be logged at zjCannot set both log_local_rank_0_only and log_global_rank_0_only to True.Please set either one or neither.Fznemo_log_globalrank-z_localrank-z.txt)r   )r   r   r   r   zFound a PTL Timer callback, replacing with a StatelessTimer callback. This will happen if you set trainer.max_time as well as exp_manager.max_time_per_run.zEnabling straggler detection...z``create_straggler_detection_callback` is True, but there is no Straggler Det. package installed.zEnabling fault tolerance...)r   
autoresumer   simulated_fault_paramszvFaultToleranceCallback was enabled with create_fault_tolerance_callback, but fault_tolerance package is not installed.zhTFLOPs per sec per GPU will be calculated, conditioned on supported models. Defaults to -1 upon failure.zcmd-args.logwutf-8)encoding zgit-info.logazcommit hash: znemo_error_log.txtzlightning_logs.txtre   rB   ){ra   r   environr  	node_ranknum_devicesr%   rankerrorfast_dev_runr  r   
structuredr   r,   
isinstancedictcreater   
ValueErrortypeto_containermergeerror_checksget_log_dirr   r   r   r   r   r   check_resumer   r   r   ri   r   r   r   r(  r   r'   log_dircheckpoint_namer   makedirs_default_root_dirr   r   r*   r#   add_file_handlerr   r   r   r   r   configure_loggersr   r   r   r   r   r   r  r   	callbacksinsertr   r   r   r   r!   r   r   r   r   appendr   r   r   r   r"   r   configure_checkpointingr   r   -configure_no_restart_validation_training_loopr   	enumerater   StatelessTimermax_timer   HAVE_STRAGGLER_DETr   r9   r   HAVE_FTr   r   r:   r   parentr   r   r   r   r   openwritejoinsysargvget_git_hashget_git_diffadd_err_file_handlerr.   	num_nodesr  sleepr   r4   )r   r#  
local_rankglobal_rankschemarC  r   r   r   rD  	app_statenemo_testinglog_filetiming_callbackema_callbackearly_stop_callbackipl_epoch_stopper_callbackfound_ptl_timeridxcallbackstraggler_det_args_dictstraggler_det_callback	ft_paramsft_use_autoresumefault_tol_callback_filegit_repogit_hashrB   rB   rC   exp_manager  s  l


















	





rs  c              
   C   s   t  rt t krtd| jdur2|js|js|j	r2t
d|j d|j d|j	 d|j	 d	| jdkr@t| s@td	 | jdkrRt| jtsTtd
 dS dS dS )a  
    Checks that the passed trainer is compliant with NeMo and exp_manager's passed configuration.
    Checks that:
        - Throws error when hydra has changed the working directory.
          This causes issues with lightning's DDP
        - Throws error when trainer has loggers defined but create_tensorboard_logger
            or create_wandB_logger or create_mlflow_logger or create_dllogger_logger is True
        - Prints error messages when 1) run on multi-node and not Slurm, and
            2) run on multi-gpu without DDP
    zHydra changed the working directory. This interferes with ExpManger's functionality. Please pass hydra.run.dir=. to your python script.NzwThe pytorch lightning trainer that was passed to exp_manager contained a logger, and either create_tensorboard_logger: z or create_wandb_logger: z or create_mlflow_logger: zor create_dllogger_logger: zS was set to True. These can only be used if trainer does not already have a logger.re   zYou are running multi-node training without SLURM handling the processes. Please note that this is not tested in NeMo and could result in errors.zmYou are running multi-gpu without ddp.Please note that this is not tested in NeMo and could result in errors.)r   initializedr   r   getcwdr<  loggerr   r   r   rE   r\  check_slurmr%   r6  r4  r9  strategyr   )r   r#  rB   rB   rC   r@  J  s>   
r@  checkpoint_pathsc                 C   s:   g }| D ]}t |rtd| d q|| q|S )"_filter_out_unfinished_checkpointszCheckpoint zH has the unfinished marker set - skipped while looking for the last one.)r(   is_checkpoint_unfinishedr%   r   rK  )ry  res
chkpt_pathrB   rB   rC   rz  q  s   

rz  rC  r   r   r   ri   r   c              
      s  |s
t d| dddlm} z|dur||rddlm} W n ty= }	 zdd|	jd	fW  Y d}	~	S d}	~	ww d}
|rD|}
|r	 t	 sT||rTt
|sd}||r|}|j|d
d}|r||j|ddd}dd |D }dd |D  ng }g  nt
|rt }|}|| d}|rd
nd}|rtdd |D d
d}tdd |D d
d n|g }g  nw|rt|ntt|d }| }dd t|dD }dd |D }dd |D }|r|nt|d}t|}t|}t|}|dkr|dkrt d|r|nt|d t }t  t }|dkr-|dkr-t d|r>t|dkslt dksl|rdd| d}|
du rQ|d7 }n|
|kr^|d | d!7 }t| nbtd| d"t|dkr|rt|d#krd$t|d v r|d }
n;t d%| d&n2t d'|d  d(t d#krt fd)dd*D r d }
t|
}
nt d%  d+ d }
|
durt|
| _td,| j  t	 r>g }t| rt| D ]}| r|j !d-s|"| qt|dkr@t|d.}d}|D ]}|# r|d#7 }qtt|d/|  }|$  |D ]}t%t|t| q1dS dS dS )0aQ  Checks that resume=True was used correctly with the arguments pass to exp_manager. Sets
    trainer._checkpoint_connector._ckpt_path as necessary.

    Returns:
        log_dir (Path): The log_dir
        exp_dir (str): The base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raises:
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints
        could not be found.
        ValueError: If resume is True, and there were more than 1 checkpoint could found.
    zResuming requires the log_dir z to be passed to exp_managerr   )	is_s3_urlN)S3UtilsFz<Detected S3 dirpath while missing required dependencies.
{}
r.  T)match_directory)suffixreturn_key_onlyc                 S      g | ]	}| d r|qS zend.ckptendswith.0krB   rB   rC   
<listcomp>      z check_resume.<locals>.<listcomp>c                 S   r  z	last.ckptr  r  rB   rB   rC   r    r  z	**/*.ckptc                 S   r  r  r  r  rB   rB   rC   r    r  )reversec                 S   r  r  r  r  rB   rB   rC   r    r  checkpointsc                 S   s   g | ]}|  r|qS rB   )is_dirr  drB   rB   rC   r    s    *c                 S   r  )z*endmatchr  rB   rB   rC   r    r  c                 S   r  )z*lastr  r  rB   rB   rC   r    r  z	*end.ckptzEnd checkpoint is unfinished and cannot be used to resume the training. Please remove the checkpoint manually to avoid unexpected cosequences, such as restarting from scratch.z
*last.ckptaL  Last checkpoint is unfinished and cannot be used to resume the training. Please remove the checkpoint manually to avoid unexpected cosequences,  such as restarting from scratch. Hint: Iteration number can be added  to the checkpoint name pattern to maximize chance that there is at least one finished last checkpoint to resume from.z]There were no checkpoints found in checkpoint_dir or no checkpoint folder at checkpoint_dir :z. zTraining from scratch.zTraining from .z. Cannot resume.re   mp_rankzMultiple checkpoints z that matches *end.ckpt.zFound z= indicating that the last training run has already completed.c                    s    g | ]}|t  d  v r|qS r  )r^   )r  slast_checkpointsrB   rC   r    s     )r  tp_rank
fsdp_shardz that matches *last.ckpt.z#Resuming training from checkpoint: zevents.out.tfeventszrun_*run_)&r<  nemo.utils.s3_dirpath_utilsr~  nemo.utils.s3_utilsr  ImportErrorformatoutputdecoder,   r7   s3_path_existsfind_files_with_suffixr6   globsortedr   existslistrgloblenrz  r%   r   r<   r^   anyr5   	ckpt_pathr  iterdiris_filer   
startswithrK  r  mkdirr   )r   rC  r   r   r   ri   r   r~  r  err
checkpointcheckpoint_dir_existscheckpoint_dirall_keysend_checkpointsmscdist_checkpointsend_dist_checkpointslast_dist_checkpointsend_chkpt_cntfinished_end_chkpt_cntlast_chkpt_cntfinished_last_chkpt_cntwarnfiles_to_movechildother_run_dirs	run_countfoldnew_run_dirrp  rB   r  rC   rB  ~  s   
	"	








	rB  r   r   r   r   c              	   C   sx   | j durtd| d|s|r td| d| d| d t r2t| r2td| d	 t|t|d
d
fS )aR  Checks that the passed arguments are compatible with explicit_log_dir.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raise:
        LoggerMisconfigurationError
    NzfThe pytorch lightning trainer that was passed to exp_manager contained a logger and explicit_log_dir: zN was pass to exp_manager. Please remove the logger from the lightning trainer.z'exp_manager received explicit_log_dir: z and at least one of exp_dir: z, or version: z>. Please note that exp_dir, name, and version will be ignored.zExp_manager is logging to z, but it already exists.r   )	rv  rE   r%   r6  r,   r   r  r   r^   )r   r   r   r   r   rB   rB   rC   check_explicit_log_dir2  s"   
r  r   c           
      C   sB  |r
t | ||||S |}|du rtt d }| jdurE| jjr/|r+td| d| jj}|r9td| d| jj}d| jj }n>|pHd}|pQt	j
td}|s|r^td	 d}n%t r|ritd
}ntt|||d}d|j }|du r~dn|t	j
t< t|tt| t|du rdnt| }	|	t|||fS )a  
    Obtains the log_dir used for exp_manager.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment
        explicit_log_dir (str): The explicit path to the log folder. Defaults to False.
        use_datetime_version (bool): Uses date and time as the version of the log folder.
            Defaults to True.
        resume_if_exists (bool): if resume_if_exists of the exp_manager's config is enabled or not.
            When enabled, the version folders would not get created.

    Raise:
        LoggerMisconfigurationError: If trainer is incompatible with arguments
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints
            could not be found.
        ValueError: If resume is True, and there were more than 1 checkpoint could found.
    Nnemo_experimentszThe pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's save_dir was not None, and exp_dir (z) was not None. If trainer.logger.save_dir exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir must be None.z[The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: z was also passed to exp_manager. If the trainer contains a logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None.version_r'  zZNo version folders would be created under the log folder as 'resume_if_exists' is enabled.z%Y-%m-%d_%H-%M-%Ssave_dirr   r   r   )r  r^   r   cwdrv  r  rE   r   r   r   r2  r  r$   r%   r   r,   r  strftimer   )
r   r   r   r   r   r   r   _exp_dirtensorboard_loggerrC  rB   rB   rC   rA  T  sL   
	,rA  c               
   C   sX   zdt jg dt jd fW S  t jtfy+ }  zdd| fW  Y d} ~ S d} ~ ww )z
    Helper function that tries to get the commit hash if running inside a git folder

    returns:
        Bool: Whether the git subprocess ran without error
        str: git subprocess output or error message
    T)gitz	rev-parseHEADstderrF{}
N)
subprocesscheck_outputSTDOUTr  CalledProcessErrorFileNotFoundErrorr  r  rB   rB   rC   rY    s   rY  c               
   C   sT   zt jddgt jd W S  t jy) }  zd| jdW  Y d} ~ S d} ~ ww )z
    Helper function that tries to get the git diff if running inside a git folder

    returns:
        Bool: Whether the git subprocess ran without error
        str: git subprocess output or error message
    r  diffr  r  r.  N)r  r  r  r  r  r  r  r  rB   rB   rC   rZ    s   rZ  r   r   r   r   wandb_kwargsr   mlflow_kwargsr   dllogger_kwargsr   clearml_kwargsr   neptune_kwargsc                 C   s  g }|r(|du ri }nd|v rt dtd|||d|}|| td |rd|	du r0i }	d|	vr<d|	vr<t d|	d	ddu rH||	d	< tj|	d	 d
d tdd|i|	}|| td |
ryt	dd|i|}|| td |rt
di |}|| td |rt||||jd}|| td |r|du ri }d|vrd|vrt dd|vrtddst dtdi |}|| td | j| dS )z
    Creates TensorboardLogger and/or WandBLogger / MLFlowLogger / DLlogger / ClearMLLogger
    and attach them to trainer.
    Raises ValueError if summary_writer_kwargs or wandb_kwargs are misconfigured.
    NrC  z{You cannot pass `log_dir` as part of `summary_writer_kwargs`. `log_dir` is handled by lightning's TensorBoardLogger logger.r  z!TensorboardLogger has been set upr   projectz.name and project are required for wandb_loggerr  Tr)  r   zWandBLogger has been set uprun_namezMLFlowLogger has been set upzDLLogger has been set up)clearml_cfgrC  rs   ru   zClearMLLogger has been set upz0name and project are required for neptune_loggerapi_keyNEPTUNE_API_TOKENz|either api_key should be set in neptune_kwargs or NEPTUNE_API_TOKEN should be set in environment variable for neptune_loggerzNeptuneLogger has been set uprB   )r<  r   rK  r%   r  r  r   rE  r   r   r1   r/   ru   getenvr   _logger_connectorconfigure_logger)r   r   rC  r   r   r   r   r   r   r  r   r  r   r  r   r  r   r  logger_listr  wandb_loggermlflow_loggerdllogger_loggerclearml_loggerneptune_loggerrB   rB   rC   rH    sh   











rH  c                       s$   e Zd ZdZdd fddZ  ZS )NeMoCheckpointConnectorz
    Wrapper around Lightning's _CheckpointConnector to use broadcasted checkpoint path in
    distributed training settings to pre-load checkpoint.
    Nr$  c                    sv   | j j}|durtd| dtj   t }t	 
| |dur9tdt | ddtj   dS dS )resume_startNzResuming from checkpoint z, rank z2Time elapsed loading checkpoint/optimizer states: z.2fz seconds, rank )r   r  r%   r  r   r   get_rankr  perf_counterrG   r  )rI   checkpoint_path
start_timerK   rB   rC   r  .  s   z$NeMoCheckpointConnector.resume_startr   )r$  N)r>   r?   r@   rA   r  rM   rB   rB   rK   rC   r  (  s    r  resumeparamsr   r   c           
   	   C   s  | j D ]}t|trtdqd|v rL|jdur5td |jdu r)t|jj	|_|j
du r5t|jj|_
t| |d= W d   n1 sGw   Y  |jdu rXt|d |_|j
du rg| d|j d|_
|jdu ro||_|jrt }|jdur|jdks|jdur|jdks|jdur|jdkrtd	|j d
|j d|j d|j
d t_t|j t|j
 t|j d|jv r| jdur| jdkr| j| jk rtd| j d| j d|j d n| jdur| jdkrtd| j d| j d tdd|i|}| jp
d|_d|jv sd|jv rt|j|_| j | |r@t j!" r9t#|}	| j |	 dS t$d dS dS )zAdds ModelCheckpoint to trainer. Raises CheckpointMisconfigurationError if trainer
    already has a ModelCheckpoint callback
    zThe pytorch lightning trainer that was passed to exp_manager contained a ModelCheckpoint and create_checkpoint_callback was set to True. Please either set create_checkpoint_callback to False, or remove ModelCheckpoint from the lightning trainerrh   NzEfilepath is deprecated. Please switch to dirpath and filename insteadr  z--{z:.4f}-{epoch}re   zkalways_save_nemo is set to True, please ensure that model parallel is not used.tensor_model_parallel_size: z,pipeline_model_parallel_size: z,context_parallel_size: ,z-lastvalr{   zVThe checkpoint callback was told to monitor a validation value but trainer.max_epochs(z0) was less than trainer.check_val_every_n_epoch(zF). It is very likely this run will fail with ModelCheckpoint(monitor='ze') not found in the returned metrics. Please ensure that validation is run within trainer.max_epochs.zbThe checkpoint callback was told to monitor a validation value and trainer's max_steps was set to z5. Please ensure that max_steps will run for at least z8 epochs to ensure that checkpointing will not error out.n_resumer   r  r  z:Preemption is supported only on GPUs, disabling preemptionrB   )%rI  r9  r   rN   rh   r%   r   ri   r   rS  rj   r   r    rQ   rs   rv   r'   tensor_model_parallel_sizepipeline_model_parallel_sizecontext_parallel_sizerE   r(   CHECKPOINT_NAME_LASTdebug
max_epochscheck_val_every_n_epochr6  	max_stepsr  last_model_pathr5   rK  r   r  is_availabler)   r  )
r   rC  r   r  r  r   rj  ra  checkpoint_callbackpreemption_callbackrB   rB   rC   rL  =  s   

	










	rL  c                 C   s    z| j jW S  ty   Y dS w )rw  F)accelerator_connectoris_slurm_managing_tasksAttributeError)r   rB   rB   rC   rw    s
   
rw  c                	       s   e Zd ZdZdejdfdedededdf fdd	Z	de
eef fd
dZde
eef ddfddZdejjddf fddZ  ZS )rO  z&Extension of PTL timers to be per run.NTdurationintervalrW   r$  c                    s   t  ||| dS )a  stateless timer

        Args:
            duration (timedelta, optional): _description_. Defaults to None.
            interval (str, optional): _description_. Defaults to Interval.step.
            verbose (bool, optional): _description_. Defaults to True.
        NrF   )rI   r  r	  rW   rK   rB   rC   rH     s   zStatelessTimer.__init__c                 C   s   i S )
state_dictrB   )rI   rB   rB   rC   r
       zStatelessTimer.state_dictr
  c                 C   s   dS )load_state_dictNrB   )rI   r
  rB   rB   rC   r    r  zStatelessTimer.load_state_dictr   c                    sH   t  | |jr"|j}|r||}||| ddlm} | dS )_check_time_remainingr   )_TunerExitExceptionN)rG   r  should_stopr  _monitor_candidates_save_last_checkpoint&lightning.pytorch.utilities.exceptionsr  )rI   r   r  monitor_candidatesr  rK   rB   rC   r    s   
z$StatelessTimer._check_time_remaining)r>   r?   r@   rA   r   r  r   r^   rb   rH   r   r	   r
  r  	lightningpytorchTrainerr  rM   rB   rB   rK   rC   rO    s"    "rO  c                 C   s<   t | jjtkrtdt dS t| | j| j	}|| j_dS )rM  zFDetected custom epoch loop. Skipping no validation on restart support.N)
r=  fit_loop
epoch_loopr   warningsr  UserWarning SkipResumeTrainingValidationLoop	min_stepsr   )r   looprB   rB   rC   rM    s
   rM  c                       s&   e Zd ZdZdef fddZ  ZS )r  z
    Extend the PTL Epoch loop to skip validating when resuming.
    This happens when resuming a checkpoint that has already run validation, but loading restores
    the training state before validation has run.
    r$  c                    s   | j rdS t |S )_should_check_val_fxF)
restartingrG   r  )rI   data_fetcherrK   rB   rC   r    s   z5SkipResumeTrainingValidationLoop._should_check_val_fx)r>   r?   r@   rA   rb   r  rM   rB   rB   rK   rC   r    s    r  exp_log_dirremove_ckptremove_nemoc                 C   s   t | } |r(td ttj| dd}|D ]}t| td|  q|rLtd ttj| dd}|D ]}t| td|  q<dS dS )a  
    Helper method that removes Pytorch Lightning .ckpt files or NeMo .nemo files from the
    checkpoint directory

    Args:
        exp_log_dir: str path to the root directory of the current experiment.
        remove_ckpt: bool, whether to remove all *.ckpt files in the checkpoints directory.
        remove_nemo: bool, whether to remove all *.nemo files in the checkpoints directory.
    zDeleting *.ckpt files ...r  z*.ckptzDeleted file : zDeleting *.nemo files ...z*.nemoN)r^   r%   r  r  r   pathrV  remove)r!  r"  r#  
ckpt_filesrh   
nemo_filesrB   rB   rC   clean_exp_ckpt  s   




r(  r   )FFFNN)NNNNTF)TF)r  r   r   r  rW  r  r  collectionsr   dataclassesr   r   datetimer   pathlibr   shutilr   r   typingr	   r
   r   r   r   r   r   lightning.pytorchr  r   hydra.core.hydra_configr   hydra.utilsr   lightning.pytorch.callbacksr   r   *lightning.pytorch.callbacks.early_stoppingr   !lightning.pytorch.callbacks.timerr   r   lightning.pytorch.loggersr   r   r   r   lightning.pytorch.loopsr    lightning.pytorch.strategies.ddpr   9lightning.pytorch.trainer.connectors.checkpoint_connectorr   	omegaconfr   r   r    !nemo.collections.common.callbacksr!   3nemo.collections.common.callbacks.ipl_epoch_stopperr"   nemo.constantsr#   r$   
nemo.utilsr%   r&   nemo.utils.app_stater'   nemo.utils.callbacksr(   r)   nemo.utils.env_var_parsingr*   nemo.utils.exceptionsr+   nemo.utils.get_rankr,   nemo.utils.import_utilsr-   !nemo.utils.lightning_logger_patchr.   nemo.utils.loggersr/   r0   r1   r2   r3   nemo.utils.mcore_loggerr4   nemo.utils.model_utilsr5   nemo.utils.msc_utilsr6   r7   r8   HAVE_MCORE_MBATCH_CALCULATORptl_resiliencyr9   rQ  r  ModuleNotFoundErrorr:   rR  r<   rE   rN   rO   rc   rg   r}   r   r   r   r   r   r  rs  r@  r^   rz  rb   rB  r  rA  rY  rZ  r:  rH  r  rL  rw  rO  r  r  rM  r  r(  rB   rB   rB   rC   <module>   s  $"

At(:   u*'
 5

$
U	

`
a*
$