o
    	۷iQl                     @   s  d Z ddlmZ ddlZddlm  mZ ddlmZ ddlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. e/e0Z1G dd de#Z2G dd de-Z3G dd deZ4G dd dej5Z6G dd de%Z7G dd de.Z8G d d! d!e&eZ9G d"d# d#e,Z:G d$d% d%e+Z;G d&d' d'e'Z<G d(d) d)e)Z=G d*d+ d+e*Z>G d,d- d-e(Z?g d.Z@dS )/zPyTorch MiniMax model.    )OptionalN)nn   )ACT2FN)CacheDynamicCache)layer_type_validation)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)MoeModelOutputWithPast)Unpack)TransformersKwargslogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )MixtralConfig)
MixtralAttentionMixtralDecoderLayerMixtralForCausalLMMixtralForQuestionAnswering MixtralForSequenceClassificationMixtralForTokenClassificationMixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralSparseMoeBlockc                       s2   e Zd ZdZ								d fdd	Z  ZS )MiniMaxConfiga  
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```N      c	           
         st   t  jdi |	 || _|| _|| _|| _|| _|| _|| _|| _	| jd u r1dd t
| jD | _t| j| j d S )Nc                 S   s$   g | ]}t |d  d rdndqS )r"   r   full_attentionlinear_attention)bool).0i r(   a/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/minimax/modular_minimax.py
<listcomp>   s    z*MiniMaxConfig.__init__.<locals>.<listcomp>r(   )super__init__layer_types
block_sizefull_attn_alpha_factorfull_attn_beta_factorlinear_attn_alpha_factorlinear_attn_beta_factormlp_alpha_factormlp_beta_factorrangenum_hidden_layersr   )
selfr-   r.   r/   r0   r1   r2   r3   r4   super_kwargs	__class__r(   r)   r,      s   
zMiniMaxConfig.__init__)Nr!   r"   r"   r"   r"   r"   r"   )__name__
__module____qualname____doc__r,   __classcell__r(   r(   r9   r)   r    5   s    gr    c                   @      e Zd ZdS )MiniMaxRMSNormNr;   r<   r=   r(   r(   r(   r)   rA          rA   c                       s   e Zd Z fddZdd ZdefddZ fdd	Zdef fd
dZdd Z	defddZ
dejfddZdefddZ  ZS )MiniMaxCachec                    s   t    g | _d S N)r+   r,   linear_cacher7   r9   r(   r)   r,      s   

zMiniMaxCache.__init__c                 C   s4   t t| j|d D ]}| jg  q
|| j|< d S )Nr"   )r5   lenrF   append)r7   	layer_idxrF   _r(   r(   r)   set_linear_cache   s   zMiniMaxCache.set_linear_cacherJ   c                 C   s   |t | k r| j| S d S rE   )rH   rF   r7   rJ   r(   r(   r)   get_linear_cache   s   
zMiniMaxCache.get_linear_cachec                    s   t t  t| jS rE   )maxr+   __len__rH   rF   rG   r9   r(   r)   rP      s   zMiniMaxCache.__len__c                    s4   |t | jk r| j| g kr| j| fS t |S rE   )rH   rF   r+   __getitem__rM   r9   r(   r)   rQ      s   zMiniMaxCache.__getitem__c                 c   s"    t t| D ]}| | V  qd S rE   )r5   rH   rM   r(   r(   r)   __iter__   s   zMiniMaxCache.__iter__repeatsc                 C   sP   t t| D ]}| j| g kr| j| j|dd| j|< q| j| | qd S )Nr   dim)r5   rH   rF   repeat_interleavelayersbatch_repeat_interleave)r7   rS   rJ   r(   r(   r)   rX      s
   z$MiniMaxCache.batch_repeat_interleaveindicesc                 C   sN   t t| D ]}| j| g kr| j| |df | j|< q| j| | qd S )N.)r5   rH   rF   rW   batch_select_indices)r7   rY   rJ   r(   r(   r)   rZ      s
   z!MiniMaxCache.batch_select_indices
max_lengthc                 C   s   t d)Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)r7   r[   r(   r(   r)   crop   s   zMiniMaxCache.crop)r;   r<   r=   r,   rL   intrN   rP   rQ   rR   rX   torchTensorrZ   r]   r?   r(   r(   r9   r)   rD      s    rD   c                       s   e Zd Zdedef fddZdd Zdd Zed	d
dd		dde	j
dee	j
e	j
f dee	j
 d
ee dee	j dee dee	j
ee	j
 eee	j
  f fddZ  ZS )MiniMaxLightningAttentionconfigrJ   c                    s  t    || _t|dd p|j|j | _|j| _|j| _|j| _t	|j
 | _t| j| j | _tj|j| j| j d dd| _tj| j| j |jdd| _tj|j| j| j dd| _|  }| |\}}}| d| | d| | d| | d| d S )	Nhead_dimr   F)bias
slope_ratequery_decay	key_decaydiagonal_decay)r+   r,   rJ   getattrhidden_sizenum_attention_headsrc   r6   r.   r   
hidden_actact_fnrA   normr   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)r7   rb   rJ   re   rf   rg   rh   r9   r(   r)   r,      s"   
 z"MiniMaxLightningAttention.__init__c                 C   sd   ddd| j    }t| j d }d| j| jd d   d }|| }|| }|d d d d f }|S )Nr"   r      gh㈵>)rk   r_   arangerJ   r6   )r7   baseexponentfactorrater(   r(   r)   rs     s   z(MiniMaxLightningAttention.get_slope_ratec                 C   s   t | jd }t | |d d d f  }t | | j|d d d f   }|d d d f |d d d f  }|d d d d d d f }|| }t |dk| td}t |}|||fS )Nr"   r   z-inf)r_   rw   r.   expwherefloat)r7   re   block_size_rangerf   rg   rh   r(   r(   r)   rt     s   " 

z'MiniMaxLightningAttention.decay_factorspast_key_valuepast_key_values4.58new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc           #      K   sl  |j \}}}	|| j d | j }
| | |}|||| jd| j }tj|| jdd\}}}|	dd}|	dd}|	dd}d }|d urN|
| j}|d u r!t|| j| j| j|}|d ury|jtjd}||dd d}g }t|
D ]}|| j }t|| j |}|| }|d d d d ||f }|d d d d ||f }|d d d d ||f }| jd d d |f }| jd d | d f }| jd d d d d |d |f }t| j | }t||	dd}t|| |}t|| |}|| }|| t|| 	dd|} || |  }qnYt| j }!g }t|D ]K}|d d d d ||d f }|d d d d ||d f }|d d d d ||d f }t|	dd|}"|!| |" }t||}|| q.tj|dd}|	dd}|||| j| j }| |}t| || }| |}|d ur| | j| ||fS )	Nr"   r   rT   r   )dtyper   )!shaper.   rm   rp   reshaperk   rc   r_   split	transposerN   rJ   zerostor%   masked_fill	unsqueezer5   minrf   rg   rh   r|   re   matmulrI   catrn   Fsigmoidrr   rq   rL   )#r7   r   r   r   r   r   r   
batch_sizeseq_lenrj   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputr'   	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_interr(   r(   r)   forward  sv   


"
 


z!MiniMaxLightningAttention.forward)NN)r;   r<   r=   r    r^   r,   rs   rt   r   r_   r`   tupler   r   
LongTensorr   r   r   r?   r(   r(   r9   r)   ra      s,    ra   c                   @   r@   )MiniMaxAttentionNrB   r(   r(   r(   r)   r   ~  rC   r   c                   @   r@   )MiniMaxSparseMoeBlockNrB   r(   r(   r(   r)   r     rC   r   c                       s   e Zd Zdedef fddZedddd							
	
	
		ddejde	ejejf de
ej de
ej de
e de
e de
e de
e de
ej dee de	eje
e	ejejf  f fddZ  ZS )MiniMaxDecoderLayerrb   rJ   c                    sz   t  || || _|j| | _|j| _|j| _| jdkr-t||| _|j	| _
|j| _d S t||| _|j| _
|j| _d S )Nr$   )r+   r,   rJ   r-   
layer_typer3   r4   ra   	self_attnr1   attn_alpha_factorr2   attn_beta_factorr   r/   r0   )r7   rb   rJ   r9   r(   r)   r,     s   
zMiniMaxDecoderLayer.__init__r   r   r   r   NFr   r   r   position_idsoutput_attentionsoutput_router_logits	use_cacher   r   r   c
                 K   s|   |  |}|}| jd||||||||	d|
\}}|| j || j  }| |}|}| |\}}|| j || j  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`Cache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
                into the model
        )r   r   r   r   r   r   r   r   Nr(   )input_layernormr   r   r   post_attention_layernormblock_sparse_moer3   r4   )r7   r   r   r   r   r   r   r   r   r   r   residualrK   r(   r(   r)   r     s(   
'	

zMiniMaxDecoderLayer.forward)NNNFFFN)r;   r<   r=   r    r^   r,   r   r_   r`   r   r   r   r   r%   r   r   FloatTensorr   r?   r(   r(   r9   r)   r     sB    	
r   c                   @   s(   e Zd ZdZeeddeeegdZ	dS )MiniMaxPreTrainedModelFr"   )index)router_logitsr   
attentionsN)
r;   r<   r=   _can_compile_fullgraphr   r   r   r   ra   _can_record_outputsr(   r(   r(   r)   r     s    

r   c                   @   s   e Zd Ze 								ddeej deej deej dee deej	 dee
 dee
 d	eej d
ee defddZdS )MiniMaxModelN	input_idsr   r   r   inputs_embedsr   r   r   r   r   c	              
   K   s8  |d u |d uA rt d|r|d u rt }n|r't|ts't dt| d|d u r0| |}|d u rL|d ur<| nd}
tj|
|
|jd  |j	d}|d u rU|
d}| jjd u r]tnt}|| j|||||d}|}| ||}| jD ]}|jdkr|}n|}||f||||||d	|	}qu| |}t||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   r"   )device)rb   input_embedsr   r   r   r   r#   )r   r   r   r   r   r   )last_hidden_stater   )
ValueErrorrD   
isinstancetypeembed_tokensget_seq_lengthr_   rw   r   r   r   rb   sliding_windowr	   r
   
rotary_embrW   r   rn   r   )r7   r   r   r   r   r   r   r   r   r   past_seen_tokensmask_functioncausal_maskr   r   decoder_layerinput_attention_maskr(   r(   r)   r     sb   

	


zMiniMaxModel.forward)NNNNNNNN)r;   r<   r=   r   r   r_   r   r`   rD   r   r%   r   r   r   r   r(   r(   r(   r)   r     s>    	
r   c                       s   e Zd Z fddZ  ZS )MiniMaxForCausalLMc                    s   t  jdi |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```Nr(   )r+   r   )r7   r8   r9   r(   r)   r   /  s   zMiniMaxForCausalLM.forward)r;   r<   r=   r   r?   r(   r(   r9   r)   r   .  s    r   c                   @   r@   ) MiniMaxForSequenceClassificationNrB   r(   r(   r(   r)   r   I  rC   r   c                   @   r@   )MiniMaxForTokenClassificationNrB   r(   r(   r(   r)   r   M  rC   r   c                   @   r@   )MiniMaxForQuestionAnsweringNrB   r(   r(   r(   r)   r   Q  rC   r   )r    r   r   r   r   r   r   )Ar>   typingr   r_   torch.nn.functionalr   
functionalr   activationsr   cache_utilsr   r   configuration_utilsr   masking_utilsr	   r
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   processing_utilsr   utilsr   r   utils.deprecationr   utils.genericr   r   mixtral.configuration_mixtralr   mixtral.modeling_mixtralr   r   r   r   r   r   r   r   r   r   
get_loggerr;   loggerr    rA   rD   Modulera   r   r   r   r   r   r   r   r   r   __all__r(   r(   r(   r)   <module>   sF   0
 . S	L