o
    }oi                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZmZ G d	d
 d
eZdd Zdd ZedkrMe  dS dS )z>
Test fault tolerance with LLaMA3 recipe and a smaller model.
    N)Callback)llm)straggler_det_callback)FaultTolerancePlugin)TimingCallback)small_llama_cfg
train_datac                   @   s   e Zd ZdddZdd ZdS )CrashCallback   c                 C   s    || _ d| _td| j   d S )Nr   z%Setup to simulate a crash if step == )
crash_stepcurrent_stepprint)selfr    r   Y/home/ubuntu/.local/lib/python3.10/site-packages/tests/collections/llm/test_fault_nvrx.py__init__"   s   zCrashCallback.__init__c                 C   s8   | j d | _ | jr| j | jkrtd| j dd S d S )N   zSimulating a crash at step !)r   r   	Exception)r   trainer	pl_moduleoutputsbatch	batch_idxr   r   r   on_train_batch_end'   s   z CrashCallback.on_train_batch_endN)r
   )__name__
__module____qualname__r   r   r   r   r   r   r	   !   s    
r	   c                  C   s   t jddd} | jdtddd | jdtdd	 | jd
tddd | jdtddd | jdtd dd | jdtd dd | jdtdd	 |  S )N )progdescriptionz	--devicesTz%Number of devices to use for training)typerequiredhelpz--crash-stepz%Step when a crash should be simulated)r!   r#   z--check-reportFz6Check if StragglerDetection reports performance scores)r!   defaultr#   z--experiment-dirz-directory to write results and checkpoints toz--data-pathz4Path to data file. If not specified, uses mock data.z--tokenizer-pathzOPath to a sentencepiece tokenizer model file. If not specified, uses mock data.z--index-mapping-dirz$directory to write index mappings to)argparseArgumentParseradd_argumentintboolstr
parse_args)parserr   r   r   get_args-   s0   r-   c                  C   s  t  } d}tjj| j|| jd}ttjt	d|_
| jr,| jr,t| j| j| jdd|_d|j_d|j_d|jj_d |jj_d|jj_d	|j_d
|j_tj| jdd}tdddg}tttddg|j_| j rv|jj!tjt"| j d tj|||d d }t#t$j%&| jd}|' }W d    n1 sw   Y  | j(rd|v sJ d|v sJ d|v sJ | j rd| j  d|v sJ d|v sJ d|v sJ d S d S )N-L2_llama3_small_pretrain_fault_tolerance_test)dirnamenum_gpus_per_nodei   )	data_pathtokenizer_pathindex_mapping_dir
seq_length   r   
   F      ft)ntasks_per_nodelauncherr   )num_in_job_restartsnum_job_retries_on_failureg      ?)straggler_report_time_interval)r   )pluginsexecutorzrun.logzGPU relative performancezGPU individual performancez Straggler report processing timez&Exception: Simulating a crash at step r   z'Restored all states from the checkpointz-`Trainer.fit` stopped: `max_steps=20` reached))r-   r   	llama3_8bpretrain_recipeexperiment_dirdevicesrunConfig
LlamaModelr   modelr2   r3   r   r4   datar   	max_stepslog_every_n_stepslogckptevery_n_train_stepstrain_time_intervalstrategyckpt_async_saveval_check_intervallimit_val_batchesLocalExecutorr   r   r   	callbacksr   appendr	   openospathjoinreadcheck_report)argsexp_namerC   rA   run_pluginslog_contentfr   r   r   mainI   sT   




rc   __main__)__doc__r%   rY   nemo_runrF   lightning.pytorch.callbacksr   nemo.collectionsr   -nemo.collections.llm.recipes.callbacks.commonr   nemo.lightning.run.pluginsr   nemo.utils.exp_managerr   tests.collections.llm.commonr   r   r	   r-   rc   r   r   r   r   r   <module>   s    <
