o
    پi6                     @   sf   d Z ddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 eeZdZdZdZd	ZG d
d deZdS )zNemotronH model configuration    N)PretrainedConfig)logging)Mamba2CacheParamsMamba2StateShapemamba2_state_dtypeM*-Ec                3       s   e Zd ZdZdZdgZdddddd	d
dddddddddddddddddddddddddddddedfdddddddddddddddf3 fd d!	Zed"d# Z	ed$d% Z
ed&efd'd(Z  ZS ))NemotronHConfigaz  
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has an output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    
nemotron_hpast_key_valuesi   Fi   i T  4   z4M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-          relu2g{Gz?gh㈵>T   r      Ng        @      silugMbP?g?infg-C6?   i  g      ?c4           5         sz  || _ || _|| _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _t| j| jks0J dtd| js;J d|	d u rA|}	|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _|!| _ |"| _!|#| _"|$| _#|%| _$|&| _%|'| _&|(| _'|)| _(|*| _)|+| _*|,| _+|-| _,|.| _-|/| _.|0| _/|1| _0|2| _1|3| _2t3 j4d||||d|4 d S )NzBhybrid_override_pattern must have same length as num_hidden_layersz
^[*\-ME]+$zIhybrid_override_pattern must only contain characters 'M', '*', '-' or 'E')pad_token_idbos_token_ideos_token_idtie_word_embeddings )5
vocab_sizer   hidden_sizeintermediate_sizenum_hidden_layershybrid_override_patternnum_attention_headshead_dimsliding_windowmax_position_embeddingsattention_dropouthidden_dropoutlenrematchnum_key_value_headsmlp_hidden_actattention_biasmlp_biasuse_biasinitializer_rangelayer_norm_epsilonresidual_in_fp32	use_cachenum_logits_to_keepuse_mamba_kernelsmamba_n_groupsmamba_head_dimssm_state_sizemamba_num_headsconv_kernelexpandmamba_hidden_acttime_step_mintime_step_maxtime_step_limittime_step_flooruse_conv_biasmamba_proj_biasmamba_chunk_sizerescale_prenorm_residualn_routed_expertsn_shared_expertsmoe_intermediate_size#moe_shared_expert_intermediate_sizemoe_latent_sizenum_experts_per_tokrouted_scaling_factorn_group
topk_groupnorm_topk_probsuper__init__)5selfr   r   r    r!   r"   r#   r$   r%   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r   r   r   r&   r'   r(   r)   r7   r:   r;   r8   r9   mamba_d_convmamba_expandr>   mamba_dt_minmamba_dt_maxmamba_dt_limitmamba_dt_init_floormamba_conv_biasrD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   kwargs	__class__r   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/configs/nemotron_h.pyrR      s   7
zNemotronHConfig.__init__c                        fddt  jD S )Nc                       g | ]} j | tkr|qS r   )r#   MAMBA.0irS   r   r^   
<listcomp>  
    z3NemotronHConfig.mamba_layer_ids.<locals>.<listcomp>ranger"   re   r   re   r^   mamba_layer_ids     
zNemotronHConfig.mamba_layer_idsc                    r_   )Nc                    r`   r   )r#   	ATTENTIONrb   re   r   r^   rf   $  rg   z<NemotronHConfig.full_attention_layer_ids.<locals>.<listcomp>rh   re   r   re   r^   full_attention_layer_ids"  rk   z(NemotronHConfig.full_attention_layer_idsreturnc              	   C   sL   ddl m} tj| | j| j | j| j| j| j| jd}t	|| j
t| dS )Nr   )get_attention_tp_size)tp_world_sizer!   n_groups	num_headsr%   
state_sizer<   )shapelayersdtype)sglang.srt.layers.dp_attentionro   r   creater;   r9   rq   r:   r<   r   rj   r   )rS   ro   rt   r   r   r^   mamba2_cache_params*  s   

z#NemotronHConfig.mamba2_cache_params)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencefloatrR   propertyrj   rm   r   ry   __classcell__r   r   r\   r^   r   $   s~    r
 

r   )r}   regexr+    transformers.configuration_utilsr   transformers.utilsr   sglang.srt.configs.mamba_utilsr   r   r   
get_loggerrz   loggerra   rl   MLPMOEr   r   r   r   r^   <module>   s   
