o
    wi33                     @   s8  d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* dZ+ej,j-e+ddej.ej/ fddZ0										d:de1de1deej2 dee1 d e1d!e3d"e1d#e1d$e1d%ee4ej.e
   dej.ej5 fd&d'Z6ej,j-ee+d(dd)ddd*efd+ee7 d,e7d"e1d#e1d-e3d.edej8fd/d0Z9d1ej8dej8fd2d3Z:ej,j-ej;e+d(		)	4			5d;d+ee7 d,e7d6ee7 d"e1d#e1d7ee7 dej8fd8d9Z<dS )<    )CallableOptionalN)Callback)DistributedDataParallelConfig)	lightning)llmvlm)AutoTokenizer)pretrain)nemo_resume)default_logdefault_resumetensorboard_logger),distributed_fused_adam_with_cosine_annealing)
bf16_mixed)Gemma3VLMockDataModule)GarbageCollectionCallback)MegatronCommOverlapCallback)MegatronTokenDropCallback)TimingCallbackgemma3vl_4bnamereturnc                   C   s>   t jtjt jtjt tjt tjt jtjdddddS )af  
    Factory function to create a Gemma3 VL 4B model configuration.

    Returns:
        run.Config[pl.LightningModule]: Configuration for the Gemma3 VL 4B model.

    Examples:
        CLI usage:
            $ nemo llm pretrain model=gemma3vl_4b ...

        Python API usage:
            >>> model_config = model()
            >>> print(model_config)
    i  i 
  )
input_sizehidden_size)language_transformer_configvision_transformer_configvision_projection_config)config)	runConfigr   Gemma3VLModelGemma3VLConfig4Br   Gemma3Config4BGemma3VLVisionConfig!Gemma3VLMultimodalProjectorConfig r'   r'   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/recipes/gemma3vl_4b.pymodel)   s   

r)         T   { tensor_parallelismpipeline_parallelismpipeline_parallelism_typevirtual_pipeline_parallelismcontext_parallelismsequence_parallelism	num_nodesnum_gpus_per_node	max_steps	callbacksc
                 C   sf   t jtj| |||||dddt jtddddddddd}
t jtjdd|	|dd	d
||t |
ddd}|S )a  
    Configure the NeMo Lightning Trainer for Gemma3 VL 4B model.

    This function sets up the distributed training strategy and other training parameters.

    Args:
        tensor_parallelism (int): Degree of tensor model parallelism.
        pipeline_parallelism (int): Degree of pipeline model parallelism.
        pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism.
        virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism.
        context_parallelism (int): Degree of context parallelism.
        sequence_parallelism (bool): Whether to use sequence parallelism.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        max_steps (int): Maximum number of training steps.
        callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations.

    Returns:
        run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer.

    Examples:
        CLI usage:
            $ nemo llm pretrain trainer=gemma3vl_4b ...

        Python API usage:
            >>> trainer_config = trainer(num_nodes=1, num_gpus_per_node=8)
            >>> print(trainer_config)
    Toptim_grads_params)check_for_nan_in_gradgrad_reduce_in_fp32overlap_grad_reduceoverlap_param_gatheraverage_in_collectivedata_parallel_sharding_strategyN)tensor_model_parallel_sizepipeline_model_parallel_sizepipeline_dtype$virtual_pipeline_model_parallel_sizecontext_parallel_sizesequence_parallelgradient_as_bucket_viewckpt_async_saveckpt_parallel_loadddpfsdpgpur+   2       
   Fi  )acceleratoraccumulate_grad_batchesr7   deviceslimit_test_batcheslimit_val_batcheslog_every_n_stepsr6   r4   pluginsstrategyuse_distributed_samplerval_check_interval)r    r!   nlMegatronStrategyr   Trainerr   )r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   rU   trainerr'   r'   r(   r[   H   sN   (	r[   )targetr   defaultFdirr   performance_modefnc                 C   sb   t j|t t||t tgdt jtddddt| |t|ddt	ddt
 d	}|r/t|}|S )
a  
    Create a pre-training recipe for Gemma3 VL 4B model.

    This function sets up a complete configuration for pre-training, including
    model, trainer, data, logging, optimization, and resumption settings.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        name (str): Name of the pre-training run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        performance_mode (bool): If true, enables optimizations for maximum performance.
        fn (Callable): The pre-training function to use.

    Returns:
        run.Partial: Partial configuration for pre-training.

    Examples:
        CLI usage:
            $ nemo llm pretrain --factory gemma3vl_4b ...
            $ nemo llm pretrain --factory "gemma3vl_4b(num_nodes=2, name='my_pretrain')"

        Python API usage:
            >>> recipe = pretrain_recipe(name="gemma3vl_4b_pretrain", num_nodes=2)
            >>> print(recipe)
    )r4   r5   r7      r+   )
seq_lengthglobal_batch_sizemicro_batch_sizer   r^   r   r   ga2U0*3?)max_lrr)   r[   datalogoptimresume)r    Partialr)   r[   r!   r   r   r   r   r   r   "pretrain_performance_optimizations)r^   r   r4   r5   r_   r`   reciper'   r'   r(   pretrain_recipe   s    #
ro   rn   c                 C   sZ   | j jsg | j _tjtddd}tjtdd}tt}| j j|||g d| j j_	| S )a  
    Create a performance-optimized pre-training recipe for Gemma3 VL 4B model.

    This method enables performance optimizations that may not be suitable for all use cases.
    It builds upon the standard pre-training recipe and adds additional performance enhancements.

    Args:
        recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

    Returns:
        run.Partial: Partial configuration for performance-optimized pre-training.

    Note:
        Use this method with caution and only when you need maximum performance.
        It may not be suitable for all hardware configurations or use cases.
    d   )gc_interval_traingc_interval_valT)tp_comm_overlapF)
r[   r7   r    r!   r   r   r   extendrT   r:   )rn   garbage_collection_callbackmcomm_overlap_callbacktoken_drop_callbackr'   r'   r(   rm      s,   
rm   google/gemma-3-4b-itnoneresume_pathpeft_schemec           	      C   s6  t jtjfddtjdddddddddddt jtddddddd}t jtjd	d|d
dd|t |dt t	gd}t j
tjt |t jtdddt jtddddtj| |t|ddtddddt|d}|du so| dkr{d|jj_d|jj_|S | dkrt jtjdg dd|_d |jj_|S td!| )"a  
    Create a fine-tuning recipe for Gemma3 VL 4B model.

    This function sets up a complete configuration for fine-tuning, including
    model, trainer, data, logging, optimization, and resumption settings.
    The recipe uses LoRA (Low-Rank Adaptation) for efficient fine-tuning, unless peft_scheme is set to None.

    Args:
        dir (Optional[str]): Directory for saving logs and checkpoints.
        resume_path (str): Path to the NeMo checkpoint
        name (str): Name of the fine-tuning run.
        num_nodes (int): Number of compute nodes to use.
        num_gpus_per_node (int): Number of GPUs per node.
        peft_scheme (Optional[str]): Parameter efficient fine-tuning scheme to use.

    Returns:
        run.Partial: Partial configuration for fine-tuning.

    Examples:
        CLI usage:
            $ nemo llm finetune --factory gemma3vl_4b

        Python API usage:
            >>> recipe = finetune_recipe(name="gemma3vl_4b_finetune", num_nodes=1)
            >>> print(recipe)
    r*   r+   Nr   Tlog_all)r9   r:   r;   r<   r=   )r?   r@   rA   rB   "encoder_tensor_model_parallel_size$encoder_pipeline_model_parallel_sizerC   rD   rF   ckpt_parallel_saverG   ckpt_parallel_save_optimckpt_load_strictnessrE   rH   rJ   rM   iF  i  )rN   rO   rP   rR   rS   r6   r4   rT   rU   rW   r7   ra   rL   rx   )pretrained_model_name   )rb   rc   rd   	tokenizernum_workersr   re   gh㈵>gH׊>   )rf   min_lrwarmup_stepsrg   ry   loraF)z*.language_model.*.linear_qkvz*.language_model.*.linear_qz*.language_model.*.linear_kvz*.language_model.*.linear_projz*.language_model.*.linear_fc1z*.language_model.*.linear_fc2)freeze_vision_modeltarget_modulesg-C6?zUnrecognized peft scheme: )r    r!   rX   rY   torchbfloat16r   rZ   r   r   rl   r   finetuner)   r   r	   r   r   r   r   lowerr[   rU   r?   rj   r   lrr   LoRApeft
ValueError)	r^   r   rz   r4   r5   r{   rU   r[   rn   r'   r'   r(   finetune_recipe   s   $


r   )
r*   r+   NNr+   Tr,   r,   r-   N)Nr]   rx   r,   r,   ry   )=typingr   r   lightning.pytorchpytorchplnemo_runr    r   $lightning.pytorch.callbacks.callbackr   megatron.core.distributedr   nemor   rX   nemo.collectionsr   r   "nemo.collections.common.tokenizersr	   nemo.collections.llm.apir
   -nemo.collections.llm.recipes.finetune_defaultr   (nemo.collections.llm.recipes.log.defaultr   r   r   'nemo.collections.llm.recipes.optim.adamr   6nemo.collections.llm.recipes.precision.mixed_precisionr   'nemo.collections.vlm.gemma3vl.data.mockr   3nemo.lightning.pytorch.callbacks.garbage_collectionr   6nemo.lightning.pytorch.callbacks.megatron_comm_overlapr   /nemo.lightning.pytorch.callbacks.moe_token_dropr   nemo.utils.exp_managerr   NAMEclifactoryr!   LightningModuler)   intdtypeboollistrZ   r[   strrl   ro   rm   r   r   r'   r'   r'   r(   <module>   s   	


S6-