o
    ei-&                     @   sL   d Z ddlmZ ddlmZ ddlmZ eeZ	G dd deZ
dgZdS )zMinistral model configuration   )PreTrainedConfig)RopeParameters)loggingc                (       s2  e Zd ZdZdZdgZddddddddZdgdgfd	d
gd	gfd	gd	gfdZ																			d3dedB dedB d edB d!edB d"edB d#edB d$edB d%e	dB d&edB d'e
dB d(e
dB d)edB d*edB d+edB d,edB d-edB d.eee	ef B dB d/edB d0e
dB f& fd1d2Z  ZS )4Ministral3Configap  
    This is the configuration class to store the configuration of a [`Ministral3Model`]. It is used to instantiate an
    Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the mistralai/Ministral-3-8B-Base-2512, mistralai/Ministral-3-8B-Instruct-2512 or mistralai/Ministral-3-8B-Reasoning-2512.

    [mistralai/Ministral-3-8B-Base-2512](https://huggingface.co/mistralai/Ministral-3-8B-Base-2512)
    [mistralai/Ministral-3-8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512)
    [mistralai/Ministral-3-8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`Optional`, *optional*, defaults to 131072):
            Vocabulary size of the Ministral3 model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Ministral3Model`].
        hidden_size (`Optional`, *optional*, defaults to 4096):
            Dimensionality of the embeddings and hidden states.
        intermediate_size (`Optional`, *optional*, defaults to 14336):
            Dimensionality of the intermediate (feed-forward) layer.
        num_hidden_layers (`Optional`, *optional*, defaults to 34):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`Optional`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`Optional`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used.
        head_dim (`Optional`, *optional*, defaults to 128):
            The attention head dimension. If not specified, will default to `hidden_size // num_attention_heads`.
        hidden_act (`Optional`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`Optional`, *optional*, defaults to 262144):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`Optional`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`Optional`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`Optional`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`Optional`, *optional*, defaults to 11):
            The id of the padding token.
        bos_token_id (`Optional`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`Optional`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`Optional`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`Union`, *optional*, defaults to `{'type': 'yarn', 'rope_theta': 1000000.0, 'factor': 16.0, 'original_max_position_embeddings': 16384, 'beta_fast': 32.0, 'beta_slow': 1.0, 'mscale_all_dim': 1.0, 'mscale': 1.0, 'llama_4_scaling_beta': 0.1}`):
            Dictionary containing the configuration parameters for the RoPE embeddings, including optional Yarn scaling
            settings such as `factor`, `original_max_position_embeddings`, `mscale`, and `llama_4_scaling_beta`.
        sliding_window (`Optional`, *optional*):
            Sliding window attention window size. If `None`, full attention is used.
        attention_dropout (`Optional`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import Ministral3Config, Ministral3ForCausalLM, Mistral3Config, Mistral3ForConditionalGeneration, PixtralVisionConfig

    >>> # Initializing a Pixtral-vision config
    >>> vision_config = PixtralVisionConfig()

    >>> # Initializing a Ministral3 config
    >>> text_config = Ministral3Config()

    >>> # Initializing a Mistral3 configuration
    >>> configuration = Mistral3Config(vision_config, text_config)

    >>> # Initializing a model from the Ministral3 configuration
    >>> text_model = Ministral3ForCausalLM(text_config)

    >>> # Initializing a model from the Mistral3 configuration
    >>> model = Mistral3ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
ministral3past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnorm       8  "             silu   {Gz?h㈵>T         FN        
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parameterssliding_windowattention_dropoutc                    s   |d u rdddd|	dddddd
}|| _ |	| _|| _|| _|| _|| _|| _|d ur,|n|| | _|d u r7|}|| _|| _	|
| _
|| _|| _|| _d	|v rRtd
 || _|| _|| _|| _|| _t jddddhi| d S )Nyarng    .Ag      0@i @  g      @@g      ?g?)
type
rope_thetafactor original_max_position_embeddingsr(   	beta_fast	beta_slowmscale_all_dimmscalellama_4_scaling_betalayer_typeszDetected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility.ignore_keys_at_rope_validationr<   r(    )r    r(   r!   r"   r#   r$   r1   r&   r%   r'   r)   r*   r+   r2   loggerwarning_oncer0   r,   r-   r.   r/   super__init__)selfr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   kwargs	__class__r?   u/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ministral3/configuration_ministral3.pyrC   |   sT   

zMinistral3Config.__init__)r   r   r   r   r   r   r   r   r   r   r   Tr   r   r   FNNr   )__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planintstrfloatboolr   dictrC   __classcell__r?   r?   rF   rH   r      s    Q


	
r   N)rL   configuration_utilsr   modeling_rope_utilsr   utilsr   
get_loggerrI   r@   r   __all__r?   r?   r?   rH   <module>   s   
 
/