o
    }oi<                     @   s   d Z ddlmZ ddlmZmZ ddlmZ ed\ZZ	ej
dd						dd
ee dee dee dee deej f
ddZdS )zCommon callback utilities for LLM training recipes.

This module provides factory functions for creating various callbacks used in LLM training,
including straggler detection and performance monitoring.
    )Optional)Configcli)safe_importz$nvidia_resiliency_ext.ptl_resiliencyT)is_target_default,  ffffff?straggler_report_time_intervalstop_if_detected_stragglergpu_relative_perf_thresholdgpu_individual_perf_thresholdreturnc                 C   s   t tj| ddd|||dd	S )a  Creates a callback for detecting slower ranks in PyTorch distributed workloads.

    This callback from nvidia-resiliency-ext monitors rank performance using two metrics:
    1. Relative performance: Compared to the best-performing rank
    2. Individual performance: Compared to the rank's best historical performance

    Performance scores range from 0.0 (worst) to 1.0 (best). A rank is considered
    a straggler if its score falls below the configured threshold (default 0.7).
    The callback reports scores every 5 minutes by default.

    Args:
        straggler_report_time_interval: Performance score reporting frequency in seconds.
            Defaults to 300 seconds.
        stop_if_detected_straggler: Whether to stop training if a straggler is detected.
            Defaults to True.
        gpu_relative_perf_threshold: Relative performance threshold for straggler detection.
            Defaults to 0.7.
        gpu_individual_perf_threshold: Individual performance threshold for detection.
            Defaults to 0.7.
    T   )report_time_intervalcalc_relative_gpu_perfcalc_individual_gpu_perfnum_gpu_perf_scores_to_printr   r   stop_if_detectedenable_ptl_logging)r   
res_moduleStragglerDetectionCallback)r	   r
   r   r    r   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/callbacks/common.pystraggler_det_callback   s   r   N)r   Tr   r   )__doc__typingr   nemo_runr   r   nemo.utils.import_utilsr   r   HAVE_RESfactoryintboolfloatr   r   r   r   r   r   <module>   s*   
