o
    }oi[                     @   s   d dl mZ d dlmZ d dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ dZejjedd	ejej fd
dZ		ddeded	ejfddZejje
ed				ddee dededed	ejf
ddZdS )    )OptionalN)finetunepretrain)MockDataModule)mixtral_8x7b)TimingCallbackmixtral_8x7b_64k)namereturnc                  C   s   t  } d| j_| S )a  
    Factory function to create a Mixtral 8x7B model configuration with 64k sequence length.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 64k sequence length.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=mixtral_8x7b_64k ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
       )r   modelconfig
seq_length)model_config r   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/mixtral_8x7b_64k.pyr      s   r         	num_nodesnum_gpus_per_nodec                 C   s(   t jddtjdddd| |ttgd
S )ao  
    Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 64k sequence length.

    This function sets up the distributed training strategy optimized for very long sequences.

    Args:
        num_nodes (int, optional): Number of compute nodes to use. Defaults to 8.
        num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8.

    Returns:
        run.Config: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=mixtral_8x7b_64k ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=16, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        This configuration uses significantly increased parallelism to handle the long sequence length efficiently.
        It requires a substantial amount of computational resources.
    r      NT   )
tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelismexpert_parallelismr   r   	callbacks)r   trainertorchbfloat16runConfigr   r   r   r   r   r   r    3   s   
r    )targetr	   defaultdirr	   c                 C   s@   t j|| ||d}t |_t||d|_tjtdddd|_|S )a  
    Create a pre-training recipe for Mixtral 8x7B model with 64k sequence length.

    This function sets up a complete configuration for pre-training, including
    model, trainer, and data settings optimized for 64k sequence length.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int, optional): Number of compute nodes to use. Defaults to 16.
        num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory mixtral_8x7b_64k
            $ nemo llm pretrain --factory "mixtral_8x7b_64k(num_nodes=16, name='my_64k_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="mixtral_8x7b_64k_pretrain", num_nodes=16)
            >>> print(recipe)

    Note:
        This recipe is optimized for handling long sequences (64k) compared to the standard version.
        It requires extensive computational resources due to the model size and extended sequence length.
    )r	   r(   r   r   r%   r   i   r   )r   global_batch_sizemicro_batch_size)r   pretrain_reciper   r    r#   r$   r   data)r(   r	   r   r   reciper   r   r   r+   ]   s
   #r+   )r   r   )Nr'   r   r   )typingr   lightning.pytorchpytorchplnemo_runr#   r!   nemo.collections.llm.apir   r   "nemo.collections.llm.gpt.data.mockr   nemo.collections.llm.recipesr   nemo.utils.exp_managerr   NAMEclifactoryr$   LightningModuler   intr    strPartialr+   r   r   r   r   <module>   sH   
*