o
    ¾e¦iU*  ã                   @   sP   d Z ddlmZmZ ddlmZ ddlmZ e e	¡Z
G dd„ deƒZdgZdS )zAFMoE model configurationé   )ÚPreTrainedConfigÚlayer_type_validation)ÚRopeParameters)Úloggingc                <       s‚  e Zd ZdZdZdgZdgdgfddgdgfdgdgfdœZ			
																											d?d edB d!edB d"edB d#edB d$edB d%edB d&edB d'edB d(edB d)edB d*edB d+e	dB d,e	dB d-e
dB d.e
dB d/e	dB d0eeeef B dB d1edB d2edB d3edB d4e	dB d5edB d6edB d7edB d8e	dB d9e
dB d:e
dB d;e
dB d<e
dB f:‡ fd=d>„Z‡  ZS )@ÚAfmoeConfiga®  
    This is the configuration class to store the configuration of a [`AfmoeModel`]. It is used to instantiate an
    AFMoE model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of [arcee-ai/Trinity-Mini](https://huggingface.co/arcee-ai/Trinity-Mini).

    AFMoE is an Adaptive Feedforward MoE (Mixture of Experts) model with token-choice routing, shared experts, and a
    hybrid attention mechanism combining sliding window and full attention patterns.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 200192):
            Vocabulary size of the AFMoE model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`AfmoeModel`].
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimension of the dense MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1408):
            Intermediate size of the routed expert MLPs.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_dense_layers (`int`, *optional*, defaults to 1):
            Number of initial dense layers before MoE layers begin. Layers with index < num_dense_layers will use
            standard dense MLPs instead of MoE.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 128):
            The dimension of each attention head.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the MLP blocks.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        num_experts (`int`, *optional*, defaults to 64):
            Number of routed experts in MoE layers.
        num_experts_per_tok (`int`, *optional*, defaults to 6):
            Number of experts to route each token to. This is the top-k value for the token-choice routing.
        num_shared_experts (`int`, *optional*, defaults to 2):
            Number of shared experts that are always activated for all tokens.
        route_scale (`float`, *optional*, defaults to 1.0):
            Scaling factor applied to routing weights.
        global_attn_every_n_layers (`int`, *optional*, defaults to 4):
            The frequency of full attention layers. Every Nth layer will use full attention, while others use sliding
            window attention.
        sliding_window (`int`, *optional*, defaults to 1024):
            Sliding window size for local attention layers.
        layer_types (`list[str]`, *optional*):
            A list that explicitly maps each layer index with its attention type. Each element should be either
            "sliding_attention" or "full_attention". If not provided, it will be automatically generated based on
            `global_attn_every_n_layers`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mup_enabled (`bool`, *optional*, defaults to `False`):
            Whether to enable muP (Maximal Update Parametrization) input scaling. When enabled, input embeddings
            are scaled by `sqrt(hidden_size)`.
        eos_token_id (`int`, *optional*):
            End of stream token id.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.

    Example:
    ```python
    >>> from transformers import AfmoeModel, AfmoeConfig

    >>> # Initializing an AFMoE configuration
    >>> configuration = AfmoeConfig()

    >>> # Initializing a model from the afmoe-small-sft-v1 style configuration
    >>> model = AfmoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    ÚafmoeÚpast_key_valuesÚ	input_idsÚinputs_embedsÚhidden_statesÚattention_mask)Úembed_tokensÚlayersÚnormé  é   é   é€  é    é   é   Né€   Úsilué @  ç{®Gáz”?çñhãˆµøä>TFç     ˆÃ@é@   é   é   ç      ð?é   é   ç        Ú
vocab_sizeÚhidden_sizeÚintermediate_sizeÚmoe_intermediate_sizeÚnum_hidden_layersÚnum_dense_layersÚnum_attention_headsÚnum_key_value_headsÚhead_dimÚ
hidden_actÚmax_position_embeddingsÚinitializer_rangeÚrms_norm_epsÚ	use_cacheÚtie_word_embeddingsÚ
rope_thetaÚrope_parametersÚnum_expertsÚnum_experts_per_tokÚnum_shared_expertsÚroute_scaleÚglobal_attn_every_n_layersÚsliding_windowÚlayer_typesÚattention_dropoutÚmup_enabledÚeos_token_idÚpad_token_idÚbos_token_idc                    s  || _ || _|| _|| _|| _|| _|| _|	| _|
| _|| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _d| _|| _ˆ | _|| _|| _|| _| jd u r]‡ fdd„t| jƒD ƒ| _t| jƒ |d u rh|}|| _|| _|| _|| _|| _t ƒ j!di |¤Ž d S )NFc                    s$   g | ]}t |d  ˆ  ƒrdnd‘qS )r   Úsliding_attentionÚfull_attention)Úbool)Ú.0Úi©r9   © úk/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/afmoe/configuration_afmoe.pyÚ
<listcomp>Ç   s    ÿÿz(AfmoeConfig.__init__.<locals>.<listcomp>rG   )"r$   r.   r%   r&   r(   r)   r*   r,   r-   r/   r0   r1   r3   r4   r'   r6   r5   r7   r8   Úattention_biasr<   r9   r:   r=   r;   Úranger   r+   r>   r?   r@   r2   ÚsuperÚ__init__)Úselfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   Úkwargs©Ú	__class__rF   rH   rM   ˆ   sL   !

þ
zAfmoeConfig.__init__)r   r   r   r   r   r   r   Nr   r   r   r   r   TFr   Nr   r   r   r    r!   r"   Nr#   FNNN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_typeÚkeys_to_ignore_at_inferenceÚbase_model_pp_planÚintÚstrÚfloatrC   r   ÚdictÚlistrM   Ú__classcell__rG   rG   rP   rH   r      sÀ    e

ýâþýüûúùø	÷
öõôóòñðïîíìëêéèçæåäãâr   N)rU   Úconfiguration_utilsr   r   Úmodeling_rope_utilsr   Úutilsr   Ú
get_loggerrR   Úloggerr   Ú__all__rG   rG   rG   rH   Ú<module>   s   
 
B