o
    }oiD                     @   s0  d dl mZ d dlmZmZmZ d dlmZ eG dd dZdefddZ	defd	d
Z
defddZdefddZdefddZdefddZdefddZdefddZdefddZdefddZdefddZdefddZdefdd Zdefd!d"Zdefd#d$Zdefd%d&Zdefd'd(Zd)S )*    )	dataclass)ListOptionalUnion)LLM_VOCAB_SIZE_MAPc                   @   sN  e Zd ZU dZeed< dZee ed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dZee ed	< dZee ed
< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZee ed< dZ e!ee"e f ed< dZ#ee ed< dZ$ee ed< dZ%ee ed< dZ&ee' ed < d!Z(e'ed"< dZ)ee ed#< dZ*ee ed$< dZ+ee ed%< dZ,ee ed&< dZ-ee ed'< dS )(FLOPSConfigz8Contains the model hparams needed for FLOPS computationsgbsNenc_seq_lenhslayersffn_hsattention_headsmoe_router_topkquery_groupsimg_seq_lenimg_himg_win_channels	patch_dimclass_token_lenprojector_typeinp_smodel_pattern
vocab_sizemodel_channels
vec_in_dimq_lora_rankkv_lora_rankqk_head_dimqk_pos_emb_head_dim
v_head_dimmoe_layer_freq#moe_shared_expert_intermediate_sizemoe_ffn_hidden_sizemtp_num_layerscausal_self_attnFis_hybrid_modelhybrid_override_patternmamba_state_dimmamba_head_dimmamba_num_groupsmamba_num_heads).__name__
__module____qualname____doc__int__annotations__r	   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   strr   r   r   r   r   r   r   r   r   r    r!   r   r   r"   r#   r$   r%   boolr&   r'   r(   r)   r*   r+    r4   r4   M/home/ubuntu/.local/lib/python3.10/site-packages/nemo/utils/flops_formulas.pyr      sL   
 r   configc                 C   sd   t d }d| j | j | j | j d| j | j | j | j  d| j  d| j | j | j |  S )zModel FLOPs for GPT3 familygpt3            )r   r   r	   r
   r   r6   r   r4   r4   r5   r7   ?   s   r7   c                 C   n   t d }| j| j | j | j | j dd| j | j  d| j | j  d| j | j  d| | j| j    S )zModel FLOPs for llama2 familyllama2      r;   r   r   r	   r   r
   r   r   r   r<   r4   r4   r5   r>   J   *   r>   c                 C   r=   )zModel FLOPs for llama3 familyllama3r?   r@   r;   rA   r<   r4   r4   r5   rC   ^   rB   rC   c                 C   sn   t d }| j| j | j | j | j dd| j | j  d| j | j  d| j | j  d| | j| j    S )zModel FLOPs for nemotron familynemotronr?   r;   rA   r<   r4   r4   r5   rD   r   rB   rD   c                 C   st   t d }| j| j | j | j | j dd| j | j  d| j | j | j  d| j | j  d| | j| j    S )zModel FLOPs for mixtral familymixtralr?   r@   r;   )	r   r   r	   r   r
   r   r   r   r   r<   r4   r4   r5   rE      s*   rE   c                 C   s   d}| j }| j}d}d| j | j | | | | j| j d d || d |r)dnd  d  }d| j | j | | d|  | j| j  }d| j | | | j }|| | S )zModel FLOPs for Qwen3 familyT   r;            ?)	r	   r
   r   r   r   r   r#   r   r   )r6   r%   seq_lenhidden_sizegated_linear_multiplierattention_flops	mlp_flopsvocab_flopsr4   r4   r5   qwen3   sL   
rO   c                 C   sR   t d }d| j | j | j | j | j d| jd| j   |d| j | j    S )zModel FLOPs for BERT familybertH   rG   r;   r?   )r   r   r   r	   r
   r<   r4   r4   r5   rP      s   &rP   c                 C   sJ  | j }| j}| j}| j}| j}| j}| j}|du rtd| jdur%| jn|}| j	dur/| j	nd}	| j
dur9| j
nd}
|| }|| }|| }|
dkrR|}d}d}n
|d }|| }|
}d}d}|	sod||  ||  | }nd||  || d  | }|| | | | | ||| || |  | ||   |d| |    }|S )zlCalculate FLOPs for a standard Transformer model.
    Note: This does not cover encoder-decoder models.
    Nz8vocab_size is required for transformer FLOPs calculationFr   rF   rG   r?   )r   r
   r	   r   r   r   r   
ValueErrorr   r%   r   )r6   
batch_sizerJ   
seq_length
num_layersnum_attention_headsffn_hidden_sizer   r   r%   r   kv_channelsquery_projection_size%query_projection_to_hidden_size_rationum_dense_layersnum_moe_layersnum_experts_routed_torK   expansion_factorattention_componenttotal_flopsr4   r4   r5   transformer   s   
	"ra   c                 C   s~   | j du r| j| j | j| j  | j | _ | j| j | j | j | j  dd| j  | j   d| j | j | j | j | j  S )zModel FLOPs for CLIP ViTNr8   r9   rF   )	r   r   r   r   r   r   r   r
   r   r6   r4   r4   r5   
clip_vit_l8  s   


 rc   c                 C   sb   d| j v rd| j | j | j | j| j  S | j dkr)d| j | j | j | j S td| j  )zModel FLOPs for NeVA Projectionmlpr;   affinezhNeVA Projections FLOPs calculator only supports 'mlp', 'mcore_mlp' or 'affine' projector_type but found )r   r   r   r   r   r
   rR   rb   r4   r4   r5   neva_projectionD  s   
"
rf   c                 C   s  | j }| j| j }d| j }|| jd  d| | d| | j| j  d|d    d| j| j  |   }|| jd  | | dd|  d| d|   d	|  d  }|| j| j | | j| | j  | j|  ||  d| j| ||    | j| j | | j   }|| | S )
zModel FLOPs for FLUXr;   r   
   rF   rG      r9   r:      )r
   r   r   r   r   r   r   )r6   r
   rI   base_factorjoint_layer_flopssingle_layer_flopsother_flopsr4   r4   r5   fluxR  sZ   

	rn   c                    sL  d j  j   j  jd  }d j  j  jd  }d||   j } jdur6|d||   j 7 } j j  j j  j  j   }| j j 7 }| j j	  j	 j  j  j   7 }| j j  j 7 }| j } jdur|| j 7 } j j
 d } j j d } j j d }d}	t jtr fddt jD }
n j}
|
D ]}|dkr|	|7 }	q|	|| j  7 }	q jdurt jD ]}|	|| j  7 }	q||	 }d|  j }d j  j  j } jdurt jD ]}|d j  j  j 7 }|d j d  j  j 7 }q|| |  j S )	zModel FLOPs for DeepSeek V3rH   rF   r;   Nr:   r   c                    s"   g | ]}| j  d krdnd qS )r   rG   )r!   ).0irb   r4   r5   
<listcomp>  s   " zdeepseekv3.<locals>.<listcomp>)r   r   r   r	   r    r   r$   r
   r   r   r   r"   r#   
isinstancer!   r0   ranger   r   r   )r6   
bmm1_flops
bmm2_flopsper_input_attention_flopsper_layer_mla_params
mla_paramsdense_layer_ffn_paramsper_shared_expert_paramsper_selected_expert_params
ffn_paramsmoe_layer_patternrp   per_input_paramsper_input_linear_flopsper_input_vocab_flopsr4   rb   r5   
deepseekv3  sN   




 r   c                 C   s    d| j  | j | j | j d S )z4Model FLOPs for MLP layer. Assume gated linear unit.r;   r:   )r   r	   r
   r   rb   r4   r4   r5   _nemotronh_mlp_layer_flops  s    r   c                 C   sF   d| j  | j | j | j| j| j | j d  | jd d  | j  S )zModel FLOPs for attention layerr;   rF   )r   r	   r
   r   r   rb   r4   r4   r5   _non_mla_attn_layer_flops  s    r   c                 C   s   | j dusJ | jdusJ | jr| j}nd| j | j }|| j }d| j | j | j d| d| j | j   |  d| j | j | | j   d| j | j | | j  S )z{Model FLOPs for Mamba layer. We ignore part of the flops of scan because the
    chunk size is not known from model config.NrF   r;   )r(   r)   r+   r
   r   r	   r*   )r6   nheadsd_inr4   r4   r5   _mamba_layer_flops  s(   
r   c                 C   s   | j dksJ | jdusJ d\}}}| jD ]}|dkr!|d7 }q|dkr*|d7 }q|dkr2|d7 }q|t|  |t|   |t|   d| j | j | j | j  S )	zModel FLOPs for hybrid modelTN)r   r   r   MrG   -*r;   )	r&   r'   r   r   r   r   r	   r
   r   )r6   num_attn_layersnum_mamba_layersnum_mlp_layerscr4   r4   r5   _hybrid_model_flops  s&   






r   c                 C   s   t | S )zModel FLOPs for NemotronH)r   rb   r4   r4   r5   	nemotronh  s   r   N)dataclassesr   typingr   r   r   0nemo.collections.common.parts.perf_metrics_utilsr   r   r7   r>   rC   rD   rE   rO   rP   ra   rc   rf   rn   r   r   r   r   r   r   r4   r4   r4   r5   <module>   s,   ))f/7