o
    wi0                  
   @   s  d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ dgZdeeef fddZdedededeeef fddZdedeeef fddZdededeeef fddZdefddZG dd deeZdS )z
OneLogger callback for NeMo training.

This module provides a callback that integrates OneLogger telemetry with NeMo training.
    N)AnyDict)Trainer)ModelCheckpoint)OneLoggerConfig)on_app_start)TrainingTelemetryConfig)TrainingTelemetryProvider)TimeEventCallback)BaseCallbackOneLoggerNeMoCallbackreturnc                  C   sL   dt jv rt jd} nt jdd} tt jdd}d| t |d}|S )a  Generate minimal configuration for OneLogger initialization.

    This function provides the absolute minimal configuration needed for OneLogger initialization.
    It only includes the required fields and uses defaults for everything else to avoid
    dependencies on exp_manager during early import.

    Returns:
        Dictionary containing minimal initialization configuration
    EXP_NAMESLURM_JOB_NAMEnemo-run
WORLD_SIZE   nemo)application_namesession_tag_or_fnenable_for_current_rankworld_size_or_fn)osenvirongetint_should_enable_for_current_rank)session_tag
world_sizeinit_config r    _/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/lightning/one_logger_callback.pyget_one_logger_init_config%   s   

	r"   trainerglobal_batch_size
seq_lengthc                 C   sV  dt jv rt jd}nt jdd}tt jdd}t| dd}t| dd}|| }t jd	d
}| d| d| d| d| 	}	|| }
d}d}d}dd | jD }t|dk}t| dd}|dk}t| dr| jdurt	| jt
r| jddr~d}nt| jdr| jjrd}|D ]}t|dr|jrd} nq|	|||||
||||d
}|S )aH  Generate base configuration for OneLogger training telemetry.

    This function provides the common configuration needed for both NeMo v1 and v2.
    It extracts basic training information from trainer object and uses provided
    batch size and sequence length values.

    Args:
        trainer: PyTorch Lightning trainer instance
        global_batch_size: Global batch size (calculated by version-specific function)
        seq_length: Sequence length (calculated by version-specific function)

    Returns:
        Dictionary containing base training callback configuration
    r   r   r   r   r   	max_stepslog_every_n_steps
   PERF_VERSION_TAGz0.0.0__bf_se_wsFsyncc                 S   s   g | ]	}t |tr|qS r    )
isinstancer   ).0cbr    r    r!   
<listcomp>o   s    z-_get_base_callback_config.<locals>.<listcomp>r   val_check_intervalstrategyN
async_saveasync)
perf_tag_or_fnglobal_batch_size_or_fnmicro_batch_size_or_fnseq_length_or_fntrain_iterations_target_or_fntrain_samples_target_or_fnlog_every_n_train_iterations&is_validation_iterations_enabled_or_fn is_save_checkpoint_enabled_or_fnsave_checkpoint_strategy)r   r   r   r   getattr	callbackslenhasattrr5   r/   dictr6   )r#   r$   r%   job_namer   r&   r'   micro_batch_sizeperf_version_tagperf_tagtrain_samples_targetis_save_checkpoint_enabled is_validation_iterations_enabledrA   checkpoint_callbacksr3   callbackbase_configr    r    r!   _get_base_callback_configC   sR   
 rQ   c           
      C   s  d}d}t | drz| jdurzt | jdrz| jj}t |drk|j}t|dd}|dur8t|ttjdd }n3t |drk|j	}t |d	rkt
|d
krkt |drTt|n|}t|t
| }t|ttjdd }t |drzt |jdrz|jj}t| ||d}	|	S )ak  Generate NeMo v1 specific configuration for OneLogger training callback.

    This function provides NeMo v1 specific configuration by extracting values from
    the exp_manager_config object and trainer object.

    Args:
        trainer: PyTorch Lightning trainer instance

    Returns:
        Dictionary containing NeMo v1 training callback configuration
    r   lightning_moduleNcfgtrain_ds
batch_sizer   bucket_batch_size__len__r   __iter__encoderd_modelr#   r$   r%   )rE   rR   rS   rT   rB   r   r   r   r   rV   rD   listsumrY   rZ   rQ   )
r#   r$   r%   	model_cfgrT   rH   bucket_batch_sizesbucket_listavg_batch_sizeconfigr    r    r!   get_nemo_v1_callback_config   s8   



rc   datac                 C   s   d}d}|dur8|j }t|drt|ddurtt|d}nt|dd}|dur8ttjdd}t|| }t| ||d}|S )a  Generate NeMo v2 specific configuration for the OneLogger training callback.

    This function extracts the global batch size and sequence length from the provided NeMo v2 data module,
    and uses them to construct the configuration dictionary for the OneLogger training callback.

    Args:
        trainer: PyTorch Lightning trainer instance.
        data: NeMo v2 data module (required).

    Returns:
        Dictionary containing the NeMo v2 training callback configuration.
    r   Nr$   rH   r   r[   )r%   rE   rB   r   r   r   r   rQ   )r#   rd   r$   r%   rH   r   rb   r    r    r!   get_nemo_v2_callback_config   s    re   c                  C   s   t tjdd} | dkS )a  Determine if OneLogger should be enabled for the current rank.

    Uses environment variables instead of torch.distributed to avoid circular imports.
    In distributed training, typically only rank 0 (or the last rank) should
    enable OneLogger to avoid duplicate telemetry data.

    Returns:
        True if OneLogger should be enabled for the current rank, False otherwise
    RANKr4   r   )r   r   r   r   )rankr    r    r!   r      s   
r   c                       sH   e Zd ZdZdZ fddZd fddZded	eddfd
dZ	  Z
S )r   zAdapter extending OneLogger's PTL callback with init + config update.

    __init__ configures the provider from meta info, then calls super().__init__.
    update_config computes TrainingTelemetryConfig and applies it.
    Nc                    s   | j d u rt | | _ | j S )N)	_instancesuper__new__)clsargskwargs	__class__r    r!   rj     s   
zOneLoggerNeMoCallback.__new__r   c                    sX   t | ddrd S t }tdi |}t |   t j	t dd t
  d S )N_initializedF)call_on_app_startr    )rB   r"   r   r	   instancewith_base_configwith_export_configconfigure_providerri   __init__r   )selfr   one_logger_configrn   r    r!   rv     s   

zOneLoggerNeMoCallback.__init__nemo_versionr#   c                 K   st   t  jjd ur
d S |dkrt|d}n|dkr%|dd }t||d}nt|d}tdi |}t  | d S )Nv1)r#   v2rd   )r#   rd   r    )	r	   rr   rb   telemetry_configrc   r   re   r   set_training_telemetry_config)rw   ry   r#   rm   rb   rd   training_telemetry_configr    r    r!   update_config!  s   
z#OneLoggerNeMoCallback.update_config)r   N)__name__
__module____qualname____doc__rh   rj   rv   strr   r   __classcell__r    r    rn   r!   r     s    ) r   r   typingr   r   lightning.pytorchr   ,lightning.pytorch.callbacks.model_checkpointr   nv_one_logger.api.configr   .nv_one_logger.training_telemetry.api.callbacksr   +nv_one_logger.training_telemetry.api.configr   @nv_one_logger.training_telemetry.api.training_telemetry_providerr	   >nv_one_logger.training_telemetry.integration.pytorch_lightningr
   OneLoggerPTLCallbacknemo.lightning.base_callbackr   __all__r   r"   r   rQ   rc   re   boolr   r   r    r    r    r!   <module>   s@   

U5

*