o
    }oiD7                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z! eG dd deZ"dS )    N)	dataclassfield)Path)ListOptionalUnion)ModelCheckpoint)LoggerTensorBoardLoggerWandbLogger)LocalCheckpointCallback)IOMixin)logging)AppStatec                   @   s*  e Zd ZU dZdZeed< dZee ed< dZ	ee ed< dZ
ee ed< dZeed	< d
Zeed< d
Zeed< dZeee  ed< dZeed< dZee ed< dZee ed< dZee ed< eedZee ed< dd Zd$deeje j!f defddZ"dd Z#d%ddZ$dd Z%d d! Z&d"d# Z'dS )&
NeMoLoggera  Logger for NeMo runs.

    Args:
        name (str): Name of the experiment.
        log_dir (Optional[str]): Directory to save logs.
        explicit_log_dir (Optional[str]): Explicit log directory.
        version (Optional[str]): Version of the experiment.
        use_datetime_version (bool): Whether to use datetime as version.
        log_local_rank_0_only (bool): Log only on local rank 0.
        log_global_rank_0_only (bool): Log only on global rank 0.
        files_to_copy (Optional[List[str]]): List of files to copy to log directory.
        update_logger_directory (bool): Whether to update logger directory to write to `exp_dir`.
            If True, the `save_dir` passed to the logger will be reconfigured to write to `exp_dir / save_dir`.
            This ensures that all output from an experiment is written to a common directory.
            If False, the logger's save_dir will not be overwritten.
            This argument applies only to TensorBoardLogger and WandbLogger instances.
        ckpt (Optional[ModelCheckpoint]): Model checkpoint callback.
        tensorboard: (Optional[TensorBoardLogger]): A PyTorch Lightning TensorBoardLogger instance
            to add to the trainer.
        wandb (Optional[WandbLogger]): A PyTorch Lightning WandBLogger instance
            to add to the trainer.
        extra_loggers(Optional[List[Logger]]): Any additional loggers to add to the trainer.
    defaultnameNlog_direxplicit_log_dirversionTuse_datetime_versionFlog_local_rank_0_onlylog_global_rank_0_onlyfiles_to_copyupdate_logger_directoryckpttensorboardwandb)default_factoryextra_loggersc                 C   s$   | j du r| jdu rtdd S d S )NTzkCannot set both log_local_rank_0_only and log_global_rank_0_only to True. Please set either one or neither.)r   r   
ValueError)self r"   N/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/nemo_logger.py__post_init__I   s
   zNeMoLogger.__post_init__trainerresume_if_existsc           
      C   s(  ddl m} ddlm} |j| _|j| _| jt_| jrjt	|t
jrj|jdur2| js2td| j | js8| jrEtd| j| j| j | rXt| j rXtd| j t| jt| jddf\}}| _}nX| j}| jdu rztt d	 }| jsd
| _| jptj|d}|s|rtd d}n| r| jrtd}|r| r|tj|< t|tt| j t|du rdnt| }t }	||	_||	_| j|	_||	_t j!|	_"tj#|dd t$d| |r| r| %|| t	|t
jr| &||| | j'||| j(d | )||	 | *| |	S )aI  Setup the logger for the experiment.

        Args:
            trainer (Union[pl.Trainer, fl.Fabric]): Trainer or Fabric instance.
            resume_if_exists (bool): Whether to resume if log directory exists.

        Returns:
            AppState: The application state with updated log directory and other settings.
        r   )NEMO_ENV_VARNAME_VERSION)is_global_rank_zeroNznemo logger received explicit_log_dir: {} and the pytorch lightning trainer that was passed to nemo_logger container a logger, but update_logger_directory is False. This means that the trainer's logger directory may not match with the explicit_log_dir.znemo logger received explicit_log_dir: {} and at least one of dir: {}or version: {}. Please note that dir, name, and version will be ignored.z3NeMoLogger is logging to {}, but it already exists. nemo_experimentsr   zZNo version folders would be created under the log folder as 'resume_if_exists' is enabled.z%Y-%m-%d_%H-%M-%ST)exist_okz Experiments will be logged at {})r   r   )+nemo.constantsr'   nemo.utils.get_rankr(   
local_rankglobal_rankr   rankr   
isinstanceplTrainerloggerr   warningformatr   r   errorr   existsstrr   cwdosenvirongetr   timestrftimer   exp_dirsysargvcmd_argsmakedirsinfo_handle_task_config_setup_trainer_loggers_setup_trainer_model_checkpointr   _setup_files_to_move_setup_file_logging)
r!   r%   r&   task_configr'   r(   r   _dirr   	app_stater"   r"   r#   setupP   sx   


$


.

zNeMoLogger.setupc                 C   s   | j | jg| j}dd |D }|r&|jd ur | j s |jg| }|j| | jrt|jD ]I}t|t	rN|p6d|_
t|tj|j |_td|j q,t|trs|pVd|_t||j |_t||j |jd< td|j q,d S d S )Nc                 S   s   g | ]}|d ur|qS Nr"   ).0r4   r"   r"   r#   
<listcomp>   s    z5NeMoLogger._setup_trainer_loggers.<locals>.<listcomp>r)   zR"update_logger_directory" is True. Overwriting tensorboard logger "save_dir" to {}dirzL"update_logger_directory" is True. Overwriting wandb logger "save_dir" to {})r   r   r   r4   _logger_connectorconfigure_loggerr   loggersr1   r
   _versionr   r;   pathrelpathsave_dir	_root_dirr   r5   r6   r   _id	_save_dir_wandb_init)r!   r%   rR   r   rU   r4   r"   r"   r#   rG      s6   




z!NeMoLogger._setup_trainer_loggersc                 C   sp  |rld }t |jD ]\}}t|tr t|ts td |} nq	|d ur+||j|< n|j| |jrld|jv rl|j	d urW|j	dkrW|j	|j
k rWtd|j	|j
|j n|jd url|jdkrltd|j|j
 ddlm} |jD ]@}t|trt|ts|jd u rt|d |_|jd u rt|j|r| j d	|j d
|_n| j d	|j d|_|jd t_qud S )NzRThe Trainer already contains a ModelCheckpoint callback. This will be overwritten.vala6  The checkpoint callback was told to monitor a validation value but trainer.max_epochs({}) was less than trainer.check_val_every_n_epoch({}).It is very likely this run will fail with ModelCheckpoint(monitor='{}') not found in the returned metrics. Please ensure that validation is run within trainer.max_epochs.zThe checkpoint callback was told to monitor a validation value and trainer's max_steps was set to {}. Please ensure that max_steps will run for at least {} epochs to ensure that checkpointing will not error out.r   )MegatronStrategycheckpointsz--{z :.4f}-{epoch}-{consumed_samples}z:.4f}-{epoch}-{step}z-last)	enumerate	callbacksr1   PTLModelCheckpointr   r   r5   appendmonitor
max_epochscheck_val_every_n_epochr7   r6   	max_stepsnemo.lightningr`   dirpathr   filenamestrategyr   r   CHECKPOINT_NAME_LAST)r!   r%   r   r   _overwrite_iicallbackr`   r"   r"   r#   rH      sZ   




z*NeMoLogger._setup_trainer_model_checkpointc              
   C   s   z3ddl m} ||d  ||}t|d d}|| W d    W d S 1 s,w   Y  W d S  tyN } ztd	| W Y d }~d S d }~ww )Nr   )serializationztask.pngz	task.jsonwz.Saving task config failed: {}. Skipping saving)
fiddle._src.experimentalrr   save_config_img	dump_jsonopenwrite	Exceptionr   r5   r6   )r!   rK   r   rr   	task_jsonfer"   r"   r#   rF     s   
&zNeMoLogger._handle_task_configc                 C   s   ddl m} ddlm} ddlm} ||d}|d| j d| j d }| jr4|s4| jdkr4t	
| n| jrD|sD| jdkrDt	
| n| jsO| jsOt	
| |  d	S )
z+Set up file logging based on rank settings.r   )NEMO_ENV_VARNAME_TESTING)get_envbool)add_handlers_to_mcore_loggerFznemo_log_globalrank-z_localrank-z.txtN)r,   r}   nemo.utils.env_var_parsingr~   nemo.utils.mcore_loggerr   r/   r.   r   r   add_file_handlerr   )r!   r   r}   r~   r   nemo_testinglog_filer"   r"   r#   rJ     s   


zNeMoLogger._setup_file_loggingc                 C   sF   g }t | rt | D ]}| r|| q||_| j|_d S rO   )r   r8   iterdiris_filere   files_to_mover   )r!   r   rM   r   childr"   r"   r#   rI   $  s   
zNeMoLogger._setup_files_to_move)FNrO   )(__name__
__module____qualname____doc__r   r9   __annotations__r   r   r   r   r   boolr   r   r   r   r   r   r   r   r
   r   r   r   listr   r	   r$   r   r2   r3   flFabricrN   rG   rH   rF   rJ   rI   r"   r"   r"   r#   r   !   s,   
  ^
<r   )#r;   rA   r>   dataclassesr   r   pathlibr   typingr   r   r   lightning.fabricfabricr   lightning.pytorchpytorchr2   ,lightning.pytorch.callbacks.model_checkpointr   rd   lightning.pytorch.loggersr	   r
   r   >nvidia_resiliency_ext.ptl_resiliency.local_checkpoint_callbackr   nemo.lightning.io.mixinr    nemo.lightning.pytorch.callbacks
nemo.utilsr   nemo.utils.app_stater   r   r"   r"   r"   r#   <module>   s"   