from typing import Optional

import lightning.pytorch as pl
import nemo_run as run
import torch

from nemo.collections.llm.api import pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.recipes import llama3_8b

NAME = "llama3_8b_64k"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
    """
    Factory function to create a Llama3 8B model configuration with 64k sequence length.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Llama3 8B model with 64k sequence length.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=llama3_8b_64k ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    """
    model_config = llama3_8b.model()
    model_config.config.seq_length = 65536  # 64k tokens
    return model_config


def trainer(
    num_nodes: int = 4,
    num_gpus_per_node: int = 8,
) -> run.Config:
    """
    Configure the NeMo Lightning Trainer for Llama3 8B model with 64k sequence length.

    This function sets up the distributed training strategy optimized for long sequences.

    Args:
        num_nodes (int, optional): Number of compute nodes to use. Defaults to 4.
        num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8.

    Returns:
        run.Config: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=llama3_8b_64k ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=4, num_gpus_per_node=8)
            >>> print(trainer_config)

    Note:
        This configuration uses significantly increased parallelism to handle the long sequence length efficiently.
    """
    # 64k sequences are spread across tensor-, pipeline-, and context-parallel ranks;
    # with the sizes below a full model-parallel group spans 32 GPUs (4 nodes x 8 GPUs).
    return llama3_8b.trainer(
        tensor_parallelism=2,
        pipeline_parallelism=4,
        pipeline_parallelism_type=torch.bfloat16,
        virtual_pipeline_parallelism=5,
        context_parallelism=4,
        sequence_parallelism=True,
        num_nodes=num_nodes,
        num_gpus_per_node=num_gpus_per_node,
    )


@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
    dir: Optional[str] = None,
    name: str = "default",
    num_nodes: int = 4,
    num_gpus_per_node: int = 8,
) -> run.Partial:
    """
    Create a pre-training recipe for Llama3 8B model with 64k sequence length.

    This function sets up a complete configuration for pre-training, including
    model, trainer, and data settings optimized for 64k sequence length.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int, optional): Number of compute nodes to use. Defaults to 4.
        num_gpus_per_node (int, optional): Number of GPUs per node. Defaults to 8.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory llama3_8b_64k
            $ nemo llm pretrain --factory "llama3_8b_64k(num_nodes=4, name='my_64k_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="llama3_8b_64k_pretrain", num_nodes=4)
            >>> print(recipe)

    Note:
        This recipe is optimized for handling long sequences (64k) compared to the standard 8k version.
        It requires significant computational resources due to the extended sequence length.
    """
    recipe = llama3_8b.pretrain_recipe(
        name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node
    )
    recipe.model = model()
    recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)
    # Mock data module emitting 64k-token sequences for exercising the recipe.
    recipe.data = run.Config(
        MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1
    )
    return recipe