o
    wi*s                     @   s  d Z ddlmZ ddlZddlm  mZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' e(e)Z*G dd deZ+G dd de'Z,G dd deZ-G dd dej.Z/G dd deZ0G dd de eZ1G dd de&Z2G d d! d!e%Z3G d"d# d#e!Z4G d$d% d%e#Z5G d&d' d'e$Z6G d(d) d)e"Z7g d*Z8dS )+zPyTorch MiniMax model.    )OptionalN)nn   )ACT2FN)CacheDynamicCache)layer_type_validation)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)MoeModelOutputWithPast)Unpack)logging   )MixtralConfig)	MixtralAttentionMixtralDecoderLayerMixtralForCausalLMMixtralForQuestionAnswering MixtralForSequenceClassificationMixtralForTokenClassificationMixtralModelMixtralPreTrainedModelMixtralRMSNormc                       s2   e Zd ZdZ								d fdd	Z  ZS )MiniMaxConfiga  
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabeling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```N      c	           
         sp   t  jdi |	 || _|| _|| _|| _|| _|| _|| _|| _	| jd u r1dd t
| jD | _t| j d S )Nc                 S   s$   g | ]}t |d  d rdndqS )r   r   full_attentionlinear_attention)bool).0i r#   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/minimax/modular_minimax.py
<listcomp>   s    z*MiniMaxConfig.__init__.<locals>.<listcomp>r#   )super__init__layer_types
block_sizefull_attn_alpha_factorfull_attn_beta_factorlinear_attn_alpha_factorlinear_attn_beta_factormlp_alpha_factormlp_beta_factorrangenum_hidden_layersr   )
selfr(   r)   r*   r+   r,   r-   r.   r/   super_kwargs	__class__r#   r$   r'      s   
zMiniMaxConfig.__init__)Nr   r   r   r   r   r   r   )__name__
__module____qualname____doc__r'   __classcell__r#   r#   r4   r$   r   2   s    gr   c                   @      e Zd ZdS )MiniMaxRMSNormNr6   r7   r8   r#   r#   r#   r$   r<          r<   c                       s   e Zd Z fddZdd ZdefddZ fdd	Zdef fd
dZdd Z	defddZ
dejfddZdefddZ  ZS )MiniMaxCachec                    s   t    g | _d S N)r&   r'   linear_cacher2   r4   r#   r$   r'      s   

zMiniMaxCache.__init__c                 C   s4   t t| j|d D ]}| jg  q
|| j|< d S )Nr   )r0   lenrA   append)r2   	layer_idxrA   _r#   r#   r$   set_linear_cache   s   zMiniMaxCache.set_linear_cacherE   c                 C   s   |t | k r| j| S d S r@   )rC   rA   r2   rE   r#   r#   r$   get_linear_cache   s   
zMiniMaxCache.get_linear_cachec                    s   t t  t| jS r@   )maxr&   __len__rC   rA   rB   r4   r#   r$   rK      s   zMiniMaxCache.__len__c                    s4   |t | jk r| j| g kr| j| fS t |S r@   )rC   rA   r&   __getitem__rH   r4   r#   r$   rL      s   zMiniMaxCache.__getitem__c                 c   s"    t t| D ]}| | V  qd S r@   )r0   rC   rH   r#   r#   r$   __iter__   s   zMiniMaxCache.__iter__repeatsc                 C   st   t t| D ]1}| j| g kr| j| j|dd| j|< q| j| j|dd| j|< | j| j|dd| j|< qd S )Nr   dim)r0   rC   rA   repeat_interleave	key_cachevalue_cache)r2   rN   rE   r#   r#   r$   batch_repeat_interleave   s   z$MiniMaxCache.batch_repeat_interleaveindicesc                 C   sn   t t| D ].}| j| g kr| j| |df | j|< q| j| |df | j|< | j| |df | j|< qd S )N.)r0   rC   rA   rR   rS   )r2   rU   rE   r#   r#   r$   batch_select_indices   s   z!MiniMaxCache.batch_select_indices
max_lengthc                 C   s   t d)Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)r2   rW   r#   r#   r$   crop   s   zMiniMaxCache.crop)r6   r7   r8   r'   rG   intrI   rK   rL   rM   rT   torchTensorrV   rY   r:   r#   r#   r4   r$   r?      s    r?   c                       s   e Zd Zdedef fddZdd Zdd Z				dd
ej	de
ej	ej	f deej	 dee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )MiniMaxLightningAttentionconfigrE   c                    s  t    || _t|dd p|j|j | _|j| _|j| _|j| _t	|j
 | _t| j| j | _tj|j| j| j d dd| _tj| j| j |jdd| _tj|j| j| j dd| _|  }| |\}}}| d| | d| | d| | d| d S )	Nhead_dimr   F)bias
slope_ratequery_decay	key_decaydiagonal_decay)r&   r'   rE   getattrhidden_sizenum_attention_headsr_   r1   r)   r   
hidden_actact_fnr<   normr   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)r2   r^   rE   ra   rb   rc   rd   r4   r#   r$   r'      s"   
 z"MiniMaxLightningAttention.__init__c                 C   sd   ddd| j    }t| j d }d| j| jd d   d }|| }|| }|d d d d f }|S )Nr   r      gh㈵>)rg   r[   arangerE   r1   )r2   baseexponentfactorrater#   r#   r$   ro      s   z(MiniMaxLightningAttention.get_slope_ratec                 C   s   t | jd }t | |d d d f  }t | | j|d d d f   }|d d d f |d d d f  }|d d d d d d f }|| }t |dk| td}t |}|||fS )Nr   r   z-inf)r[   rs   r)   expwherefloat)r2   ra   block_size_rangerb   rc   rd   r#   r#   r$   rp     s   " 

z'MiniMaxLightningAttention.decay_factorsNhidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc           #      K   sl  |j \}}}	|| j d | j }
| | |}|||| jd| j }tj|| jdd\}}}|	dd}|	dd}|	dd}d }|d urN|
| j}|d u r!t|| j| j| j|}|d ury|jtjd}||dd d}g }t|
D ]}|| j }t|| j |}|| }|d d d d ||f }|d d d d ||f }|d d d d ||f }| jd d d |f }| jd d | d f }| jd d d d d |d |f }t| j | }t||	dd}t|| |}t|| |}|| }|| t|| 	dd|} || |  }qnYt| j }!g }t|D ]K}|d d d d ||d f }|d d d d ||d f }|d d d d ||d f }t|	dd|}"|!| |" }t||}|| q.tj|dd}|	dd}|||| j| j }| |}t| || }| |}|d ur| | j| ||fS )	Nr   r   rO   r   )dtyper   )!shaper)   ri   rl   reshaperg   r_   r[   split	transposerI   rE   zerostor    masked_fill	unsqueezer0   minrb   rc   rd   rx   ra   matmulrD   catrj   Fsigmoidrn   rm   rG   )#r2   r|   r}   r~   r   r   r   
batch_sizeseq_lenrf   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputr"   	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_interr#   r#   r$   forward  sv   	

"
 


z!MiniMaxLightningAttention.forward)NN)r6   r7   r8   r   rZ   r'   ro   rp   r[   r\   tupler   r   
LongTensorr   r   r   r:   r#   r#   r4   r$   r]      s*    r]   c                   @   r;   )MiniMaxAttentionNr=   r#   r#   r#   r$   r   |  r>   r   c                       s   e Zd Zdedef fddZ							ddejdeejejf d	e	ej d
e	ej
 de	eej  de	e de	e de	e de	ej
 dee deeje	eejejf  f fddZ  ZS )MiniMaxDecoderLayerr^   rE   c                    sz   t  || || _|j| | _|j| _|j| _| jdkr-t||| _|j	| _
|j| _d S t||| _|j| _
|j| _d S )Nr   )r&   r'   rE   r(   
layer_typer.   r/   r]   	self_attnr,   attn_alpha_factorr-   attn_beta_factorr   r*   r+   )r2   r^   rE   r4   r#   r$   r'     s   
zMiniMaxDecoderLayer.__init__NFr|   r}   r~   position_idsr   output_attentionsoutput_router_logits	use_cacher   r   r   c
                 K   s   |  |}|}| jd||||||||	d|
\}}|| j || j  }| |}|}| |\}}|| j || j  }|f}|rF||f7 }|rM||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r|   r}   r~   r   r   r   r   r   Nr#   )input_layernormr   r   r   post_attention_layernormblock_sparse_moer.   r/   )r2   r|   r}   r~   r   r   r   r   r   r   r   residualself_attn_weightsrouter_logitsoutputsr#   r#   r$   r     s2   
&	



zMiniMaxDecoderLayer.forward)NNNFFFN)r6   r7   r8   r   rZ   r'   r[   r\   r   r   r   r    r   r   FloatTensorr   r:   r#   r#   r4   r$   r     s@    	
r   c                   @   s   e Zd ZdZdZdZdS )MiniMaxPreTrainedModelTFN)r6   r7   r8   _supports_cache_class_supports_static_cache_supports_quantized_cacher#   r#   r#   r$   r     s    r   c                   @   s   e Zd Z										ddejdeej deej deeej  deej dee	 dee	 d	ee	 d
ee	 deej de
e defddZdS )MiniMaxModelN	input_idsr~   r   past_key_valuesinputs_embedsr   r   output_hidden_statesr   r   flash_attn_kwargsr   c                 K   s  |d ur|n| j j}|	d ur|	n| j j}	|d ur|n| j j}|d ur$|n| j j}|d u |d uA r4td| jrC| jrC|rCt	d d}|rM|d u rMt
 }n|r^t|t
s^tdt| d|d u rg| |}|
d u r|d urs| nd}tj|||jd  |jd}
|d u r|
d}| j jd u rtnt}|| j |||
||d	}|}| ||}|rd
nd }|rd
nd }|	rd
nd }| jD ]:}|r||f7 }|jdkr|}n|}||f||||||	||
d|}|d }|r||d f7 }|	r||d f7 }q| |}|r||f7 }t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   r   )device)r^   input_embedsr~   r   r   r   r#   r   )r}   r~   r   r   r   r   r   r   r   )last_hidden_stater   r|   
attentionsr   )r^   r   r   r   r   
ValueErrorgradient_checkpointingtrainingloggerwarning_oncer?   
isinstancetypeembed_tokensget_seq_lengthr[   rs   r   r   r   sliding_windowr	   r
   
rotary_emblayersr   rj   r   )r2   r   r~   r   r   r   r   r   r   r   r   r   past_seen_tokensmask_functioncausal_maskr|   r}   all_hidden_statesall_self_attnsall_router_logitsdecoder_layerinput_attention_masklayer_outputsr#   r#   r$   r     s   

	





zMiniMaxModel.forward)
NNNNNNNNNN)r6   r7   r8   r[   r   r   r\   listr   r    r   r   r   r   r#   r#   r#   r$   r     sH    	
r   c                       s   e Zd Z fddZ  ZS )MiniMaxForCausalLMc                    s   t  jdi |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```Nr#   )r&   r   )r2   r3   r4   r#   r$   r   W  s   zMiniMaxForCausalLM.forward)r6   r7   r8   r   r:   r#   r#   r4   r$   r   V  s    r   c                   @   r;   ) MiniMaxForSequenceClassificationNr=   r#   r#   r#   r$   r   q  r>   r   c                   @   r;   )MiniMaxForTokenClassificationNr=   r#   r#   r#   r$   r   u  r>   r   c                   @   r;   )MiniMaxForQuestionAnsweringNr=   r#   r#   r#   r$   r   y  r>   r   )r   r   r   r   r   r   r   )9r9   typingr   r[   torch.nn.functionalr   
functionalr   activationsr   cache_utilsr   r   configuration_utilsr   masking_utilsr	   r
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   processing_utilsr   utilsr   mixtral.configuration_mixtralr   mixtral.modeling_mixtralr   r   r   r   r   r   r   r   r   
get_loggerr6   r   r   r<   r?   Moduler]   r   r   r   r   r   r   r   r   __all__r#   r#   r#   r$   <module>   s@   ,
 0 Zv