o
    eid0                     @   s  d dl Z d dl mZ ddlmZmZ ddlmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$ G dd deeZ%G dd de Z&G dd deZ'G dd de#Z(G dd deZ)G dd de"Z*G dd de$Z+G dd de!Z,G d d! d!eZ-G d"d# d#eZ.G d$d% d%eZ/G d&d' d'eZ0g d(Z1dS ))    N)nn   )CacheDynamicCache)PreTrainedConfig)create_causal_mask!create_sliding_window_causal_mask)BaseModelOutputWithPast)RopeParameters)Unpack)TransformersKwargsauto_docstring)merge_with_config_defaults)capture_outputs   )MistralConfig)Qwen2AttentionQwen2DecoderLayerQwen2ForCausalLMQwen2ForQuestionAnsweringQwen2ForSequenceClassificationQwen2ForTokenClassificationQwen2MLP
Qwen2ModelQwen2PreTrainedModelQwen2RMSNormQwen2RotaryEmbeddingc                *   @   s   e Zd ZdZdZ										
											d(dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB d edB d!edB d"e	dB d#edB d$edB d%e
e dB f(d&d'ZdS ))MinistralConfiga  
    This is the configuration class to store the configuration of a [`MinistralModel`]. It is used to instantiate an
    Ministral model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Ministral-8B-Instruct-2410.

    [mistralai/Ministral-8B-Instruct-2410](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410)
    [mistralai/Ministral-8B-Instruct-2410](https://huggingface.co/mistralai/Ministral-8B-Instruct-2410)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Ministral model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MinistralModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Ministral's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.

    ```python
    >>> from transformers import MinistralModel, MinistralConfig

    >>> # Initializing a Ministral 8B style configuration
    >>> configuration = MinistralConfig()

    >>> # Initializing a model from the Ministral 8B style configuration
    >>> model = MinistralModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	ministral }      8         Nsilu   {Gz?ư>T   r   F        
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parameterssliding_windowattention_dropoutlayer_typesc                 K   s   || _ || _|| _|| _|| _|	| _|| _|| _|| _|| _	|| _
|| _|d u r*|}|| _|| _|
| _|| _|| _|| _|| _| jd u rQ| j
d urKdndg| | _|| _tj| fi | d S )Nsliding_attentionfull_attention)r6   r7   r8   r9   r*   r2   r+   r,   r-   r.   r;   r0   r/   r1   r3   r4   r5   r<   r=   r:   r   __init__)selfr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   kwargs rC   m/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ministral/modular_ministral.pyr@   m   s8   
zMinistralConfig.__init__)r   r    r!   r"   r"   r#   Nr$   r%   r&   r'   TNr(   r   FNr    r)   N)__name__
__module____qualname____doc__
model_typeintstrfloatboolr
   listr@   rC   rC   rC   rD   r      s    M	

r   c                   @      e Zd ZdS )MinistralMLPNrE   rF   rG   rC   rC   rC   rD   rP          rP   c                       s"   e Zd Zdef fddZ  ZS )MinistralAttention	layer_idxc                    sf   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
d S )NF)bias)superr@   r   Linearr+   r.   r0   q_projr/   k_projv_proj)rA   configrT   	__class__rC   rD   r@      s    zMinistralAttention.__init__)rE   rF   rG   rJ   r@   __classcell__rC   rC   r\   rD   rS      s    rS   c                   @   rO   )MinistralRMSNormNrQ   rC   rC   rC   rD   r_      rR   r_   c                   @   rO   )MinistralDecoderLayerNrQ   rC   rC   rC   rD   r`      rR   r`   c                   @   rO   )MinistralPreTrainedModelNrQ   rC   rC   rC   rD   ra      rR   ra   c                   @   rO   )MinistralRotaryEmbeddingNrQ   rC   rC   rC   rD   rb      rR   rb   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )MinistralModelr[   c                    s   t  | | `d S )N)rV   r@   has_sliding_layers)rA   r[   r\   rC   rD   r@      s   zMinistralModel.__init__N	input_idsattention_maskposition_idspast_key_valuesinputs_embedsr5   cache_positionrB   returnc              
   K   s:  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
tsf| j|||||d}td
i |td
i |d}
|}| ||}| jd | jj D ]}||f|
|j |||||d|}qw| |}t||r|d	S d d	S )Nz:You must specify exactly one of input_ids or inputs_embeds)r[   r   r(   )device)r[   ri   rf   rj   rh   rg   )r?   r>   )rf   rg   rh   r5   rj   position_embeddings)last_hidden_staterh   rC   )
ValueErrorembed_tokensr   r[   get_seq_lengthtorcharangeshaperl   	unsqueeze
isinstancedictr   r   
rotary_emblayersr-   attention_typenormr	   )rA   re   rf   rg   rh   ri   r5   rj   rB   past_seen_tokenscausal_mask_mappingmask_kwargshidden_statesrm   decoder_layerrC   rC   rD   forward   s\   



zMinistralModel.forward)NNNNNNN)rE   rF   rG   r   r@   r   r   r   rr   
LongTensorTensorr   FloatTensorrM   r   r   r	   r   r^   rC   rC   r\   rD   rc      s>    	
rc   c                   @   rO   )MinistralForCausalLMNrQ   rC   rC   rC   rD   r     rR   r   c                   @   rO   )"MinistralForSequenceClassificationNrQ   rC   rC   rC   rD   r     rR   r   c                   @   rO   )MinistralForTokenClassificationNrQ   rC   rC   rC   rD   r     rR   r   c                   @   rO   )MinistralForQuestionAnsweringNrQ   rC   rC   rC   rD   r     rR   r   )r   ra   rc   r   r   r   r   )2rr   r   cache_utilsr   r   configuration_utilsr   masking_utilsr   r   modeling_outputsr	   modeling_rope_utilsr
   processing_utilsr   utilsr   r   utils.genericr   utils.output_capturingr   mistral.configuration_mistralr   qwen2.modeling_qwen2r   r   r   r   r   r   r   r   r   r   r   r   rP   rS   r_   r`   ra   rb   rc   r   r   r   r   __all__rC   rC   rC   rD   <module>   s6    4 	L