o
    }oii2                     @   s   d dl mZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ dd	gZejejejejejejejejeejejejejd
ZG dd de
Z G dd	 d	e Z!dS )    )ListTupleN)Callback)	GPTConfig)PEFT)flops_formulaslogging)hyenaFLOPsMeasurementCallbackMM_FLOPsMeasurementCallback)gpt3llama2llama3llama4	nemotron3	nemotron4mixtralbertr	   
deepseekv3transformerqwen3	nemotronhc                   @   sr   e Zd ZdZdZdedejdefddZ	dd	 Z
d
efddZdeeB eB defddZdeeef fddZdS )r
   a  
    Calculate and log FLOPs per second after every ``trainer.log_every_n_steps`` steps.

    Args:
        model_config (GPTConfig): Model parameters.
        data_config (pl.LightningDataModule): Data module being used in the experiment.
        model_name (str): Name of the model being run. The following models are supported:
            gpt3, llama2, llama3, nemotron, mixtral, bert, hyena.


    Tmodel_configdata_config
model_namec                 C   s  || _ || _|| _| jj}| j j}| j j}| j j}| j j}| j j}	| j j	}
t
| j dd }t| jdr7| jjjnd }| j j}|d u rC|	}||||||	|
|||d
}ddlm} t| j |rz| j j|d< | j j|d< | j j|d< | j j|d	< | j j|d
< | j j|d< | j j|d< | j j|d< | j j|d< | j jrd|d< | j j|d< | j j|d< | j j|d< | j j|d< | j j |d< t!j"di || _#| jd ur| j$ n| j| _d| _%d S )Nhybrid_override_pattern	tokenizer)
gbsenc_seq_lenhslayersffn_hsattention_headsmoe_router_topkquery_groups
vocab_sizemodel_patternr   )MLATransformerConfigqk_head_dimqk_pos_emb_head_dim
v_head_dimq_lora_rankkv_lora_rankmoe_layer_freq#moe_shared_expert_intermediate_sizemoe_ffn_hidden_sizemtp_num_layersTis_hybrid_modelmamba_state_dimmamba_head_dimmamba_num_groupsmamba_num_heads )&	model_cfgdata_cfgmodelglobal_batch_size
seq_lengthhidden_size
num_layersffn_hidden_sizenum_attention_headsr#   getattrhasattrr   r%   num_query_groups,megatron.core.transformer.transformer_configr'   
isinstancer(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r   r2   r3   r4   r5   r   FLOPSConfigflops_configloweravg_train_step_time)selfr   r   r   r   r   r   r    r!   r"   r#   r&   r%   r$   config_kwargsr'   r6   r6   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/pytorch/callbacks/flops_callback.py__init__>   s^   
z!FLOPsMeasurementCallback.__init__c                 C   s"   |j D ]}t|trtdqdS )z
        PyTorch Lightning callback hook. Ensures that user is not using PEFT
        as FLOPS callback does not support it.
        z3FLOPs measurement not supported for finetuning jobsN)	callbacksrD   r   NotImplementedError)rI   trainer	pl_modulecallbackr6   r6   rK   on_train_start   s
   

z'FLOPsMeasurementCallback.on_train_start	batch_idxc                 C   s   z|  j |jd 7  _ W n ty   td Y nw |jj}||j dkrU| j dkr,dS | j |j }| |\}}	d| _ |jd|ddddd	 |	d
|  }
|d|
 dS dS )zd
        PyTorch Lightning callback hook to calculate TFLOPs per sec per GPU after training
        ztrain_step_timing in szb'train_step_timing in s' not found. Make sure to use TimingCallback with FLOPsMeasurementCallback.r   NTFLOPS_per_GPUTF   )on_stepon_epoch
batch_sizeprog_bar   mBTFLOPS)	rH   progress_bar_metricsKeyErrorprintstrategycurrent_epoch_steplog_every_n_stepseval_tflops_per_sec_per_gpulog)rI   rO   rP   outputsbatchrS   ntrain_step_timetflops_per_gpuflopstflopsr6   r6   rK   on_train_batch_end   s4   
	z+FLOPsMeasurementCallback.on_train_batch_endrg   returnc                 C   sT   |   \}}t|ts|g}t|}t|t|d d }|d|  }||fS )ae  
        Args:
            train_step_time (Any[List, float, int]): Train step time (in seconds).
            Step time will be less stable for initial steps (~10 steps)- less
            accurate measurement
            Use average step time over several steps for higher accuracy
        Returns:
            (float): Model TFLOPs per sec per gpu
           NrZ   )eval_model_flopsrD   listnparraymeanlen)rI   rg   total_flopsflops_per_gpustep_time_arrflops_per_sec_per_gpur6   r6   rK   rb      s   


z4FLOPsMeasurementCallback.eval_tflops_per_sec_per_gpuc                    s    j dur fddtD }t|dkr|d n j  _  j tvr5tdtt   td j  t j   j}t	j
 rGt	j
 nd}|| }||fS )z9
        Calculate model FLOPs for a given model
        Nc                    s   g | ]	}| j v r|qS r6   )r9   ).0r9   rI   r6   rK   
<listcomp>   s    z=FLOPsMeasurementCallback.eval_model_flops.<locals>.<listcomp>r    FLOPs measurement supported for JFailed to extract valid model name from or missing FLOPs calculations for rU   )r9   _model_flops_maprs   r   inforo   keysr]   rF   torchdistributedis_initializedget_world_size)rI   model_matchesrt   num_devicesru   r6   ry   rK   rn      s   

z)FLOPsMeasurementCallback.eval_model_flopsN)__name__
__module____qualname____doc__higher_is_betterr   plLightningDataModulestrrL   rR   intrk   r   floatrb   r   rn   r6   r6   r6   rK   r
   /   s    
C	c                   @   s0   e Zd ZdZdZdedejfddZdd Z	d	S )
r   a  
    Calculate and log FLOPs per second after every ``trainer.log_every_n_steps`` steps for multi-modal models.
    The following models are supported:
            hf_clip_vit_l, neva_projection, gpt3, llama2, llama3, nemotron, mixtral, bert, hyena

    Args:
        model_name_config_dict (dict):
            Dictionary containing all the individual model configs that make up the multi-modal model.
        data_config (pl.LightningDataModule): Data module being used in the experiment.
    Tmodel_name_config_dictr   c                 C   s  || _ t | _| D ]\}}t }| j j|d< |j|d< |dv rD|j|d< |j|d< |j|d< |j|d< |j	|d< |j
|d	< d
|d< nZ|dv r_|j|d< |j|d< |j|d< |d j|d< n?|dv r|j|jg|d< |j|d< |j|d< |j|d< |j|d	< |j|d< n|j|d< |j|d< |j|d< |j|d< |j|d< z|j}|d u r|j}||d< W n   Y tjdi || j|< qd| _d S )Nr   r   )hf_clip_vit_lr    img_seq_lenimg_himg_w	patch_dimin_channelsrU   class_token_len)neva_projectionprojector_typer!   inp_sr   )fluxmodel_channels
vec_in_dimr   r"   r#   r$   r   r6   )r8   dictflops_config_dictitemsr:   r<   num_hidden_layersnum_image_embeddings_per_tile
image_size
patch_sizenum_channelsr   r>   
input_sizenum_joint_layersnum_single_layersr   context_dimr   r   r;   r=   r?   r#   rB   r   rE   rH   )rI   r   r   r   r7   kwargsr$   r6   r6   rK   rL      sP   




















z$MM_FLOPsMeasurementCallback.__init__c                 C   s   i t tjtjtjd}d }}| j D ]#\}}||vr1tdt	|
   td| ||| |7 }qtj rDtj nd}|| }||fS )zh
        Calculate model FLOPs for a given model recursively when model has multiple sub-models
        )r   r   r   r   r{   r|   rU   )r}   r   
clip_vit_lr   r   r   r   r   r~   ro   r   r]   r   r   r   r   )rI   mm_model_flops_maprt   ru   r   	flops_cfgr   r6   r6   rK   rn     s$   z,MM_FLOPsMeasurementCallback.eval_model_flopsN)
r   r   r   r   r   r   r   r   rL   rn   r6   r6   r6   rK   r      s    
5)"typingr   r   lightning.pytorchpytorchr   numpyrp   r   lightning.pytorch.callbacksr   #nemo.collections.llm.gpt.model.baser    nemo.lightning.pytorch.callbacksr   
nemo.utilsr   r   nemo.utils.hyena_flops_formulasr	   __all__r   r   r   nemotronr   r   r   r   r   r   r}   r
   r   r6   r6   r6   rK   <module>   s6    #