o
    
Û¾ix3  ã                   @   sB   d Z ddlZddlmZ ddlmZ e e¡Z	G dd„ deƒZ
dS )zNemotronH model configurationé    N)ÚPretrainedConfig)Úloggingc                3       s¦   e Zd ZdZdZdgZdddddd	d
dddddddddddddddddddddddddddddedƒfdddddddddddddddf3‡ fd d!„	Zed"d#„ ƒZ	‡  Z
S )$ÚNemotronHConfigaz  
    This is the configuration class to store the configuration of a
    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
    to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to
    that of the NemotronH-v0.1 model.
    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the NemotronH model. Defines the number of
            different tokens that can be represented by the `inputs_ids`
            passed when calling [`NemotronHModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be
            tied. Note that this is only relevant if the model has an output
            word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 21504):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 52):
            Number of hidden layers in the Transformer encoder.
        hybrid_override_pattern (`str`, *optional*, defaults to
            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
            The pattern of the hybrid model. The pattern is a string of
            characters where each character represents
            M: Mamba2, *: Attention, -: MLP
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the
            Transformer encoder.
        attention_head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to
            implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use
            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
            will use Multi Query Attention (MQA) otherwise GQA is used.
        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
            The non-linear activation function in the MLP layers.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in attention layers.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in MLP layers.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for
            initializing all weight matrices.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
            Whether or not residuals should be in `float32`. If set to `False`
            residuals will keep the same `dtype` as the rest of the model.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models). Only relevant if
            `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`,
            all logits will be calculated. If an integer value, only last
            `num_logits_to_keep` logits will be calculated.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*, defaults to None):
            Sliding window attention window size.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used
            with.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the hidden states.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels.
            These are available only if `mamba-ssm` and `causal-conv1d`
            are installed, and the mamba modules are running on a CUDA device.
        ssm_state_size (`int`, *optional*, defaults to 128):
            The dimension of the mamba state space latents.
        mamba_num_heads (`int`, *optional*, defaults to 128):
            Number of heads in Mamba layers.
        mamba_n_groups (`int`, *optional*, defaults to 8):
            Number of groups in Mamba layers.
        mamba_head_dim (`int`, *optional*, defaults to 64):
            Dimension of each Mamba head.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor used to determine the mamba intermediate size.
        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
            The non-linear activation function in the Mamba layers.
        mamba_dt_min (`float`, *optional*, defaults to 0.001):
            Minimum value for the time step in Mamba.
        mamba_dt_max (`float`, *optional*, defaults to 0.1):
            Maximum value for the time step in Mamba.
        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
            Limits for the time step in Mamba.
        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
            Floor value for time step initialization in Mamba.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the convolution layer of the mamba mixer
            block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the input and output projections of the
            mamba mixer block.
        mamba_chunk_size (`int`, *optional*, defaults to 256):
            Size of chunks for Mamba processing.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether to rescale the pre-normalization residual connections.
    Ú
nemotron_hÚpast_key_valuesi   Fi   i T  é4   z4M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-é    é€   é   Úrelu2g{®Gáz”?gñhãˆµøä>Té   r   é   Ng        é@   é   Úsilugü©ñÒMbP?gš™™™™™¹?Úinfg-Cëâ6?é   i  g      ð?c4           5         sz  || _ || _|| _|| _|| _|| _|| _|| _|| _|| _	|| _
|| _t| jƒ| jks0J dƒ‚t d| j¡s;J dƒ‚|	d u rA|}	|	| _|
| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _|!| _ |"| _!|#| _"|$| _#|%| _$|&| _%|'| _&|(| _'|)| _(|*| _)|+| _*|,| _+|-| _,|.| _-|/| _.|0| _/|1| _0|2| _1|3| _2t3ƒ j4d||||dœ|4¤Ž d S )NzBhybrid_override_pattern must have same length as num_hidden_layersz^[*-M]+$zEhybrid_override_pattern must only contain characters 'M', '*', or '-')Úpad_token_idÚbos_token_idÚeos_token_idÚtie_word_embeddings© )5Ú
vocab_sizer   Úhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚhybrid_override_patternÚnum_attention_headsÚhead_dimÚsliding_windowÚmax_position_embeddingsÚattention_dropoutÚhidden_dropoutÚlenÚreÚmatchÚnum_key_value_headsÚmlp_hidden_actÚattention_biasÚmlp_biasÚuse_biasÚinitializer_rangeÚlayer_norm_epsilonÚresidual_in_fp32Ú	use_cacheÚnum_logits_to_keepÚuse_mamba_kernelsÚn_groupsÚmamba_head_dimÚssm_state_sizeÚmamba_num_headsÚconv_kernelÚexpandÚmamba_hidden_actÚtime_step_minÚtime_step_maxÚtime_step_limitÚtime_step_floorÚuse_conv_biasÚmamba_proj_biasÚ
chunk_sizeÚrescale_prenorm_residualÚn_routed_expertsÚn_shared_expertsÚmoe_intermediate_sizeÚ#moe_shared_expert_intermediate_sizeÚmoe_latent_sizeÚnum_experts_per_tokÚrouted_scaling_factorÚn_groupÚ
topk_groupÚnorm_topk_probÚsuperÚ__init__)5Úselfr   r   r   r   r   r   r   r   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r   r   r   r   r    r!   r"   r0   r3   r4   Úmamba_n_groupsr2   Úmamba_d_convÚmamba_expandr7   Úmamba_dt_minÚmamba_dt_maxÚmamba_dt_limitÚmamba_dt_init_floorÚmamba_conv_biasr=   Úmamba_chunk_sizer?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   Úkwargs©Ú	__class__r   ú^/home/ubuntu/.local/lib/python3.10/site-packages/vllm/transformers_utils/configs/nemotron_h.pyrK   ‘   s€   7ÿÿü
ûzNemotronHConfig.__init__c                    s   ‡ fdd„t ˆ jƒD ƒS )Nc                    sF   g | ]}ˆ j | d krdnˆ j | dkrdn
ˆ j | dkrdnd‘qS )ÚMÚmambaÚ*Ú	attentionú-ÚmlpÚmoe)r   )Ú.0Úi©rL   r   rY   Ú
<listcomp>  s    úÿÿÿùz5NemotronHConfig.layers_block_type.<locals>.<listcomp>)Úranger   rc   r   rc   rY   Úlayers_block_type  s   
øz!NemotronHConfig.layers_block_type)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typeÚkeys_to_ignore_at_inferenceÚfloatrK   Úpropertyrf   Ú__classcell__r   r   rW   rY   r      sv    r
Ì r   )rj   Úregexr$   Ú transformers.configuration_utilsr   Útransformers.utilsr   Ú
get_loggerrg   Úloggerr   r   r   r   rY   Ú<module>   s   
