from typing import Callable, Optional

import lightning.pytorch as pl
import nemo_run as run
import torch
from lightning.pytorch.callbacks.callback import Callback
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig

from nemo import lightning as nl
from nemo.collections.diffusion.data.diffusion_mock_datamodule import MockDataModule
from nemo.collections.diffusion.models.flux.model import FluxModelParams, MegatronFluxModel
from nemo.collections.llm.api import pretrain
from nemo.collections.llm.recipes.log.default import default_log, tensorboard_logger
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.utils.exp_manager import TimingCallback

NAME = "flux_12b"


@run.cli.factory
@run.autoconvert
def flux_mock_datamodule() -> pl.LightningDataModule:
    """Mock Datamodule Initialization"""
    # Mock data with precached image and text features; the 1024x1024 resolution and
    # the batch sizes of 1 are assumed values.
    data_module = MockDataModule(
        image_h=1024,
        image_w=1024,
        micro_batch_size=1,
        global_batch_size=1,
        image_precached=True,
        text_precached=True,
    )
    return data_module


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
    """
    Factory function to create a FLUX 12B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the FLUX 12B model.
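
    Example:
        >>> model_cfg = model()  # illustrative; a run.Config wrapping MegatronFluxModel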

    """
    return run.Config(MegatronFluxModel, flux_params=run.Config(FluxModelParams))


# NOTE: the numeric defaults for num_gpus_per_node and max_steps are assumed values;
# override them from the CLI or the Python API for your cluster.
def trainer(
    tensor_parallelism: int = 1,
    pipeline_parallelism: int = 1,
    pipeline_parallelism_type: Optional[torch.dtype] = None,
    virtual_pipeline_parallelism: Optional[int] = None,  # not applied by the strategy below
    context_parallelism: int = 1,
    sequence_parallelism: bool = False,
    num_nodes: int = 1,
    num_gpus_per_node: int = 8,
    max_steps: int = 10000,
    callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
    """
    Configure the NeMo Lightning Trainer for FLUX 12B model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=flux_12b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
            >>> print(trainer_config)
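            >>> # Illustrative variation: request 2-way tensor parallelism while the
            >>> # other arguments keep their defaults.
            >>> tp_trainer = trainer(tensor_parallelism=2)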

    Note:
        For more information on distributed training strategies, refer to the
        NeMo documentation on multi-GPU and multi-node training.
    """
    strategy = run.Config(
        nl.MegatronStrategy,
        tensor_model_parallel_size=tensor_parallelism,
        pipeline_model_parallel_size=pipeline_parallelism,
        context_parallel_size=context_parallelism,
        sequence_parallel=sequence_parallelism,
        pipeline_dtype=pipeline_parallelism_type,
        gradient_accumulation_fusion=True,
        ddp=run.Config(
            DistributedDataParallelConfig,
            check_for_nan_in_grad=True,
            grad_reduce_in_fp32=True,
        ),
    )

    # NOTE: the validation/test limits, logging interval, and val_check_interval below
    # are assumed values; tune them for your run.
    trainer = run.Config(
        nl.Trainer,
        accelerator="gpu",
        accumulate_grad_batches=1,
        callbacks=callbacks,
        devices=num_gpus_per_node,
        limit_test_batches=50,
        limit_val_batches=32,
        log_every_n_steps=10,
        max_steps=max_steps,
        num_nodes=num_nodes,
        plugins=bf16_mixed(),
        strategy=strategy,
        use_distributed_sampler=False,
        val_check_interval=1000,
    )

    return trainer


@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
    dir: Optional[str] = None,
    name: str = "default",
    num_nodes: int = 1,
    num_gpus_per_node: int = 8,
    fn: Callable = pretrain,
) -> run.Partial:
    """
    Create a pre-training recipe for FLUX 12B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory flux_12b

        Python API usage:
            >>> recipe = pretrain_recipe(name="flux_12b_pretrain", num_nodes=1)
            >>> print(recipe)

    Note:
        For more details on pre-training LLMs with NeMo, see the pre-training
        guide in the `examples/llm/pretrain/` directory.
    """
    recipe = run.Partial(
        fn,
        model=model(),
        trainer=trainer(
            num_nodes=num_nodes,
            num_gpus_per_node=num_gpus_per_node,
            callbacks=[run.Config(TimingCallback)],
        ),
        data=flux_mock_datamodule(),
        log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
        optim=run.Config(
            nl.MegatronOptimizerModule,
            config=run.Config(
                OptimizerConfig,
                lr=1e-4,
                bf16=True,
                use_distributed_optimizer=True,
                weight_decay=0,
            ),
        ),
        resume=run.Config(
            nl.AutoResume,
            # Assumed resume flags; adjust if strict checkpoint resumption is required.
            resume_if_exists=False,
            resume_ignore_no_checkpoint=True,
            resume_past_end=True,
        ),
    )

    return recipe
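

# A minimal, illustrative local launch of this recipe with nemo_run. The executor
# settings (single node, 8 GPUs, torchrun launcher) are assumptions, not part of the recipe.
if __name__ == "__main__":
    recipe = pretrain_recipe(name="flux_12b_pretrain", num_nodes=1, num_gpus_per_node=8)
    run.run(recipe, executor=run.LocalExecutor(ntasks_per_node=8, launcher="torchrun"))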