from typing import Optional

import lightning.pytorch as pl
import nemo_run as run
import torch

from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.recipes import mixtral_8x7b
from nemo.utils.exp_manager import TimingCallback

NAME = "mixtral_8x7b_16k"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
    """
    Factory function to create a Mixtral 8x7B model configuration with 16k sequence length.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Mixtral 8x7B model with 16k sequence length.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=mixtral_8x7b_16k ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    """
    model_config = mixtral_8x7b.model()
    # 16k context window: 16384 tokens
    model_config.config.seq_length = 16384
    model_config.config.max_position_embeddings = 16384
    return model_config
seq_lengthmax_position_embeddings)model_config r   a/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/recipes/mixtral_8x7b_16k.pyr      s   r         	num_nodesnum_gpus_per_nodec                 C   s(   t jddtjdddd| |ttgd
S )a  
    Configure the NeMo Lightning Trainer for Mixtral 8x7B model with 16k sequence length.

    This function sets up the distributed training strategy optimized for longer sequences.

    Args:
        num_nodes (int, optional): Number of compute nodes to use. Defaults to 4.
        num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8.

    Returns:
        run.Config: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=mixtral_8x7b_16k ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        This configuration uses increased parallelism to handle the longer sequence length efficiently.
    """
    # NOTE: the integer parallelism sizes below are illustrative assumptions for a
    # 16k-sequence Mixtral 8x7B run; verify them against the released mixtral_8x7b_16k recipe.
    return mixtral_8x7b.trainer(
        tensor_parallelism=2,
        pipeline_parallelism=2,
        pipeline_parallelism_type=torch.bfloat16,
        virtual_pipeline_parallelism=None,
        context_parallelism=4,
        sequence_parallelism=True,
        expert_parallelism=1,
        num_nodes=num_nodes,
        num_gpus_per_node=num_gpus_per_node,
        callbacks=[run.Config(TimingCallback)],
    )
r"   )targetr
   defaultdirr
   c                 C   s@   t j|| ||d}t |_t||d|_tjtdddd|_|S )a  
    Create a pre-training recipe for Mixtral 8x7B model with 16k sequence length.

    This function sets up a complete configuration for pre-training, including
    model, trainer, and data settings optimized for 16k sequence length.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int, optional): Number of compute nodes to use. Defaults to 4.
        num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory mixtral_8x7b_16k
            $ nemo llm pretrain --factory "mixtral_8x7b_16k(num_nodes=4, name='my_16k_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="mixtral_8x7b_16k_pretrain", num_nodes=4)
            >>> print(recipe)
    """
    recipe = mixtral_8x7b.pretrain_recipe(
        name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node
    )
    recipe.model = model()
    recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)
    # Mock data module with the 16k sequence length; the batch sizes are assumed values.
    recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1)
    return recipe