o
    wiU                  M   @   s  d dl mZ d dlmZ d dlmZmZ d dlmZ	 d dl
Zd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 dZ8dd Z9de:de;de<dej=e	j> fddZ?	 	!			!	"	!	 	#	#	$	%	&		"	'	'	"	'	(dnd)e<d*e<d+eej@ d,ee< d-e<d.e;d/e<d0e<d1e<d2e<d3e<d4e<d5e<d6eeAej=e   d7e;d8e;d9e;d:e;d;e;d<ed= dej=ejB f*d>d?ZCdod@ee:B dB fdAdBZDdd d!d!d d!d d!d!dCdDdEd"d&d#dd'd'dd d"d#d!dd gd"d"d'd'd'd(d&d"ddddFef&dGe:dHe<dIe<d/e<d0e<dJe<dKe<dLe<dMe<de<dNe<de:dOe;dPe<d2e<dQe:dRe;d9e;dSe;dTe<dUe;d1e<dVe<dWe<dXeAe< d:e;d8e;d7e;d;e;dYe;dZed[ d4e<d\e;d]e:d^e:d_e:d`e:dejEfLdadbZFejGjHe8dcdej=e fdddeZIejGjHe8dcde;de<dej=e	j> fdfdgZJejGjHee8dhedidejEfdjdkZKejGjHee8dhedid]e:dB dejEfdldmZLdS )p    )asdict)Path)LiteralOptionalN)Callback)DistributedDataParallelConfig)	lightning)llm)TokenizerSpec)finetunepretrain)PreTrainingDataModule)Evo2Datasetparse_dataset_config)MockDataModule)default_logtensorboard_loggerwandb_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixedbf16_with_fp8_mixed)/userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192)get_nmt_tokenizer)	callbacks)FLOPsMeasurementCallback)MegatronCommOverlapCallback)TimingCallback
hyena_testc                   C   s   t jtddS )a  
    Creates and returns a configuration for initializing a tokenizer.

    The configuration is set up to use the `get_nmt_tokenizer` function with
    the specified library type as 'byte-level'.

    Returns:
        run.Config: A configuration object for the tokenizer setup.
    z
byte-level)library)runConfigr    r!   r!   d/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/recipes/hyena_base.pytokenizer_recipe1   s   
r#   
model_sizetp_comm_overlap
seq_lengthreturnc                 C   s   | dkrt j}n?| dkrt j}n7| dkrt j}n/| dkr t j}n'| dkr(t j}n| dkr0t j}n| dkr8t j}n| dkr@t j}nt	d	|  t
jt jt
j|||d
t dS )aX  
    Factory function to create a striped hyena model

    Returns:
        run.Config[pl.LightningModule]: Configuration for the striped hyena model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=hyena_test ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    test1b7bznv-7b40bznv-40b7b_arc_longcontext40b_arc_longcontextzUnsupported model size: )r&   r%   )config	tokenizer)r	   HyenaTestConfigHyena1bConfigHyena7bConfigHyenaNV7bConfigHyena40bConfigHyenaNV40bConfigHyena7bARCLongContextConfigHyena40bARCLongContextConfigNotImplementedErrorr   r    
HyenaModelr/   )r$   r%   r&   cfg_clsr!   r!   r"   model_recipeA   s4   r;         Fd   2       
   T
torch_disttensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_stepsval_check_intervallimit_test_batcheslimit_val_batcheslog_every_n_stepsr   fp8grad_reduce_in_fp32align_param_gatherno_aligned_megatron_ddpckpt_async_savesave_ckpt_format)rB   zarrc                 C   s   |rt jtd||d}nt jtd|ddd|dd}t jtj| |||||d||d|d}|r4t }d|_nt }t jtjdd||||||d|	|
||d	}|S )
a:
  
    Configure the NeMo Lightning Trainer for Striped Hyena model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Number of tensor replicas for vector parallelism.
        pipeline_parallelism (int): Number of pipeline segments for model pipeline parallelism.
        pipeline_parallelism_type ([type]): Type of pipeline parallelism to apply. Support 'interleaved','split'.
        virtual_pipelien_parallelism (int): Number of virtual pipeline stages for interleaving smaller sub-microbatches
            to reduce the computational graph bubbles caused by pipeline parallel (if pipeline parallel>1 is in use)
        context_parallelism (int): Number of context parallel blocks for processing sub-attention matrices in a block
            parallel fassion.
        sequence_parallelism (bool): Whether to use the sequence_parallelism improvement
            on the base tensor parallelism.
            This will allow for more layers to be parallelized when using tensor parallelism.
        num_nodes (int): Number of nodes for the distributed setting.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps before training terminates.
        val_check_interval (int): Interval between val check runs.
        limit_test_batches (int): Maximum number of batches over which to run test check runs on the dev set data.
        limit_val_batches (int): Maximum number of batches over which to run val check runs on the dev set data.
        log_every_n_steps (int): Log to the endpoint every n steps.
        callbacks (list[run.Config[Callback]]): A list of nemo/lightning callbacks to execute during training.
        fp8 (bool): Whether to use fp8 precision for computations.
        grad_reduce_in_fp32 (bool): Boolean indicating whether to reduce the gradient weight in the FP32 format rather
            than the default bf16.
        align_param_gather (bool): Optimization for faster train step timing potentially through aligning parameter
            gather operations.
        no_aligned_megatron_ddp (bool): Skip the aligned megatron DDP optimizations.
        ckpt_async_save (bool): Bool indicating whether to use asynchronous checkpoint saving.
        save_ckpt_format (Literal['torch_dist', 'zarr']): The checkpoint save method. The default torch_dist may result
            in larger checkpoints currently, but is the preferred option in the long run.
    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    T)check_for_nan_in_gradrQ   rR   )rW   rQ   overlap_grad_reduceoverlap_param_gatheraverage_in_collectiverR   use_distributed_optimizer)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewrT   rU   ckpt_parallel_loadddp   gpur=   F)acceleratoraccumulate_grad_batchesr   devicesrK   rI   pluginsstrategyuse_distributed_samplerrL   rM   rN   rO   )	r   r    r   nlMegatronStrategyr   fp8_amax_history_lenr   Trainer)rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   r   rP   rQ   rR   rS   rT   rU   rd   rk   mixed_precision_cfgtrainerr!   r!   r"   trainer_recipen   sh   ;rs   config_pathc                 C   s   t jt| dS )a  
    Creates a configuration for a blended dataset by utilizing the `run.Config` function.

    Args:
        config_path (Path | str | None, optional): The path to the dataset configuration file.
            Can be a `Path` object, a string, or `None`. Defaults to `None`.

    Returns:
        run.Config: A configuration object initialized with the dataset parsing function
        and the provided dataset configuration path.
    )dataset_config_path)r   r    r   )rt   r!   r!   r"   blended_dataset_config_recipe   s   rv   i    i  r*   defaultdataset_configglobal_batch_sizemicro_batch_sizegrad_acc_batchestensor_parallel_sizer]   r`   seed#use_megatron_comm_overlap_llama3_8kworkersdirenable_preemptiontflops_callbackgc_intervalnsys_profilingnsys_start_stepnsys_end_step
nsys_ranksra   ckpt_format)rV   rB   restore_optimizer_from_ckptresume_pathwandb_project
wandb_namenamec&           .      K   sB  t ||	|d}'| stjt|	||t d}(ntjtt| t|	|||
|t d	}(i })|"durY|#du rQd| d| d| d| d	| d
| d| d|  d| d| }#t|"|#d|)d< |!rftjt	j
|!d| d}*nd}*tjt	jddd||*d}+ttg},|r|,tjt|tdd|d |dkr|,tjtj||d |r|du r|}|,tjtj|||dd |r|,ttj |rtttt|'|(d}-|,|- tj|%|'td3i d|d|d|d|d|d |d!|d"|d#|d$|d%|,d&|d'|d(|d)|d*|d+||(td3||$t|$d,d-|)td.d/d0d1|+d2S )4a	  
    Create a pre-training recipe for a striped hyena model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dataset_config (str): a string specifying the path to the dataset config file (json schema)
        global_batch_size (int | None): global training batch size. If left None this will be inferred from cluster and
            model parallelism configurations.
        micro_batch_size (int): micro batch size per active device.
        num_nodes (int): number of nodes to use
        num_gpus_per_node (int): number of num_gpus_per_node per node to use.
        grad_acc_batches (int): number of training batches the gradients will be accumulated before performing each
            optimizer update.
        tensor_parallel_size (int): Number of tensor parallel splits. Typically between 1 and 8 to keep parallelism
            within a single node.
        pipeline_model_parallel_size (int): Pipeline model parallel size, splits the model by layer.
        context_parallel_size (int): Context model parallel size, how many splits to make across the sequence dimension
            to be processed in parallel similar to the strategy described in the ring attention paper.
        seq_length (int): The desired sequence length to train this model on.
        seed (int): Random seed to use for initialization
        model_size (str): model size to load
        use_megatron_comm_overlap_llama3_8k (bool): If using TP, this controls advanced overlap communications
            which can improve performance during pretraining.
        workers (int): Number of workers to use for per-device batch creation.
        val_check_interval (int): How often the model evaluates during training.
        dir (str): Directory to save logs and checkpoints
        enable_preemption (bool): Enable preemption when training on slurm, captures timeout signals and attempts to
            save a final checkpoint.
        align_param_gather (bool): Optimization for faster train step timing potentially through aligning parameter
            gather operations.
        tflops_callback (bool): Enable tflops callbacks for reporting training speed and device utilization.
        gc_interval (int): How often to run GC operations throughout training (default is auto)
        nsys_profiling (bool): Enable nsys profiling from  NeMo repo.
        max_steps (int): Maximum number of steps the training model should take.
        nsys_start_step (int): Step for when NSYS will start collecting logs
        nsys_end_step (int): Step for when NSYS will stop collecting logs.
        nsys_ranks (list[int]): Ranks for processing nsys logs. Defaults to [0] if not specified.
        no_aligned_megatron_ddp (bool): Disables aligned megatron ddp optimizations.
        fp8 (bool): Whether to use fp8 precision for computations.
        grad_reduce_in_fp32 (bool): Boolean indicating whether to reduce the gradient weight in the FP32 format rather
            than the default bf16.
        ckpt_async_save (bool): Bool indicating whether to use asynchronous checkpoint saving.
        save_ckpt_format (Literal['torch_dist', 'zarr']): The checkpoint save method. The default torch_dist may result
            in larger checkpoints currently, but is the preferred option in the long run.
        resume_path (str): If specified starting weights will be loaded from this checkpoint rather than being
            randomly initialized.
        restore_optimizer_from_ckpt (bool): when loading checkpoint, try to load the optimizer.
        wandb_project (str): if set, logging to wandb will happen
        wandb_name (str): override default name for the wandb log.
    Returns:
        run.Partial: Partial configuration for pre-training.

    )r$   r&   r%   )r&   ry   rz   r/   )pathsdataset_clsr&   rz   ry   r}   num_workersr/   Nzheyna-size-z-TPz-PPz-CPz-GBSz-MBSz-GRFP32z-ALIGNz-NODESz-FP8)projectr   r   T)pathload_model_stateload_optim_stateF)resume_if_existsresume_ignore_no_checkpointresume_past_endresume_from_directoryrestore_config   )r%   tp_comm_overlap_cfgwgrad_deferral_limit(overlap_param_gather_with_optimizer_steprR   r   )gc_interval_traingc_interval_val)
start_stepend_stepranks	gen_shapehyenarK   rI   rC   rD   rG   rH   rJ   rL   rM   rN   r   rP   rQ   rR   rS   rT   rU   r   )r   r   r   ga2U0*3?giUMu>i	  )max_lrmin_lrwarmup_steps)modelrr   datalogoptimresumer!   )r;   r   r    r   r#   r   rv   r   r   rm   RestoreConfig
AutoResumer   appendr   r   nl_callbacksGarbageCollectionCallbackNsysCallbackPreemptionCallbackr   r   Partialrs   r   r   r   ).rx   ry   rz   rI   rJ   r{   r|   r]   r`   r&   r}   r$   r~   r   rL   r   r   rR   r   r   r   rK   r   r   r   rS   rQ   rP   rT   ra   r   rN   r   r   r   r   r   fnkwargsmodel_run_cfgdata_run_cfgextra_loggersrestore_cfgnemo_resumer   flop_meas_callbackr!   r!   r"   pretrain_recipe_creater   s0  `
	

	
r   r   c                   C   s   t  S )z
    Creates and returns a tokenizer configuration.

    Returns:
        run.Config[TokenizerSpec]: A configuration object for the tokenizer.
    )r#   r!   r!   r!   r"   r/     s   r/   c                 C   s   t d| |S )z
    Factory function to create a Hyena model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for a Hyena model.

    r(   )r;   )r%   r&   r!   r!   r"   r     s   	r   )targetr   )r   c                 O   s   t |i |S )aP  
    Create a pre-training recipe for a Hyena model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    r   )r   argsr   r!   r!   r"   pretrain_recipe  s   r   c                O   s$   | dusJ dt || |d|S ) Nz&resume_path None, invalid for finetune)r   r   r   )r   r   r   r   r!   r!   r"   finetune_recipe  s   r   )r<   r=   NNr=   Fr=   r<   r>   r>   r?   r@   rA   NFTTFTrB   )N)Mdataclassesr   pathlibr   typingr   r   lightning.pytorchpytorchplnemo_runr   torch$lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rm   nemo.collectionsr	   1nemo.collections.common.tokenizers.tokenizer_specr
   nemo.collections.llm.apir   r   nemo.collections.llm.gpt.datar   ,nemo.collections.llm.gpt.data.megatron.hyenar   r   "nemo.collections.llm.gpt.data.mockr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   r   ;nemo.collections.llm.recipes.tp_overlap_configs.userbuffersr   3nemo.collections.nlp.modules.common.tokenizer_utilsr   nemo.lightning.pytorchr   r   /nemo.lightning.pytorch.callbacks.flops_callbackr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   nemo.utils.exp_managerr   NAMEr#   strboolintr    LightningModuler;   dtypelistrp   rs   rv   r   r   clifactoryr/   r   r   r   r!   r!   r!   r"   <module>   s  ".	


v	
 !"#$%(
 i
 