o
    eis                     @   s$  d Z ddlZddlm  mZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3 e 4e5Z6G dd deZ7G dd de1Z8G dd deZ9G dd dej:Z;G dd de'Z<G dd  d e)Z=G d!d" d"e3Z>G d#d$ d$e2Z?G d%d& d&e*eZ@G d'd( d(e0ZAG d)d* d*e/ZBG d+d, d,e+ZCG d-d. d.e-ZDG d/d0 d0e.ZEG d1d2 d2e,ZFg d3ZGdS )4zPyTorch MiniMax model.    N)nn   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfiglayer_type_validation)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)MoeModelOutputWithPast)RopeParameters)Unpack)TransformersKwargslogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2RotaryEmbedding)MixtralAttentionMixtralDecoderLayerMixtralForCausalLMMixtralForQuestionAnswering MixtralForSequenceClassificationMixtralForTokenClassificationMixtralModelMixtralPreTrainedModelMixtralRMSNormMixtralSparseMoeBlockMixtralTopKRouterc                B       s  e Zd ZdZdZdgZdZdddddddd	Zd
gdgfddgdgfdgdgfdZddiZ																							 				!						dCd"e
dB d#e
dB d$e
dB d%e
dB d&e
dB d'e
dB d(e
dB d)edB d*e
dB d+edB d,e
dB d-edB d.e
dB d/e
dB d0e
dB d1edB d2e
dB d3edB d4e
dB de
dB d5edB d6edB d7edB d8eeeef B dB d9ee dB d:e
dB d;e
dB d<e
dB d=e
dB d>e
dB d?e
dB d@e
dB f@ fdAdBZ  ZS )DMiniMaxConfiga  
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an
    MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the MiniMax.

    [MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniMaxModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
            The attention head dimension.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block, determining how queries, keys, and values
            are grouped and processed for intra- and inter-block attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after normal attention.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after normal attention.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP.

    ```python
    >>> from transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```minimaxpast_key_valuesg    .Acolwiserowwisecolwise_gather_outputpacked_colwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gatez!layers.*.mlp.experts.gate_up_projzlayers.*.mlp.experts.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormnum_expertsnum_local_experts }      8         Nsilu   {Gz?h㈵>T   r   F        MbP?   
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dim
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_windowattention_dropoutnum_experts_per_tokoutput_router_logitsrouter_aux_loss_coefrouter_jitter_noiserope_parameterslayer_types
block_sizefull_attn_alpha_factorfull_attn_beta_factorlinear_attn_alpha_factorlinear_attn_beta_factormlp_alpha_factormlp_beta_factorc!           "         s  || _ |	| _|| _|| _|| _|| _|| _|d u r|}|| _|| _|
| _	|| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _| | _| jd u rsdd t| jD | _t | j| j || _!t" j#di |! d S )Nc                 S   s$   g | ]}t |d  d rdndqS )r<   r   full_attentionlinear_attention)bool).0i rd   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/minimax/modular_minimax.py
<listcomp>   s    z*MiniMaxConfig.__init__.<locals>.<listcomp>rd   )$r@   rH   rA   rB   rC   rD   rP   rE   rG   rI   rJ   rK   rQ   rF   rR   r2   rS   rT   rU   rO   rL   rM   rN   rW   rX   rY   rZ   r[   r\   r]   r^   ranger	   rV   super__init__)"selfr@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   r2   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   kwargs	__class__rd   re   ri      sP   $
zMiniMaxConfig.__init__) r3   r4   r5   r6   r6   r7   Nr8   r9   r:   r;   TNr<   r   FNr=   r   r7   Fr>   r=   NNr?   r<   r<   r<   r<   r<   r<   )__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencedefault_thetabase_model_tp_planbase_model_pp_planattribute_mapintstrfloatra   r   dictlistri   __classcell__rd   rd   rl   re   r#   5   s    g


	

 !r#   c                   @      e Zd ZdS )MiniMaxRMSNormNrn   ro   rp   rd   rd   rd   re   r         r   c                       sj   e Zd Z fddZdd ZdefddZ fdd	Zd
efddZde	j
fddZdefddZ  ZS )MiniMaxCachec                    s   t    g | _d S N)rh   ri   linear_cacherj   rl   rd   re   ri     s   

zMiniMaxCache.__init__c                 C   s4   t t| j|d D ]}| jg  q
|| j|< d S )Nr<   )rg   lenr   append)rj   	layer_idxr   _rd   rd   re   set_linear_cache  s   zMiniMaxCache.set_linear_cacher   c                 C   s   |t | k r| j| S d S r   )r   r   )rj   r   rd   rd   re   get_linear_cache  s   
zMiniMaxCache.get_linear_cachec                    s   t t  t| jS r   )maxrh   __len__r   r   r   rl   rd   re   r     s   zMiniMaxCache.__len__repeatsc                 C   sP   t t| D ]}| j| g kr| j| j|dd| j|< q| j| | qd S )Nr   dim)rg   r   r   repeat_interleaver/   batch_repeat_interleave)rj   r   r   rd   rd   re   r     s
   z$MiniMaxCache.batch_repeat_interleaveindicesc                 C   sN   t t| D ]}| j| g kr| j| |df | j|< q| j| | qd S )N.)rg   r   r   r/   batch_select_indices)rj   r   r   rd   rd   re   r   %  s
   z!MiniMaxCache.batch_select_indices
max_lengthc                 C   s   t d)Nz*MiniMaxCache doesnot support `crop` method)RuntimeError)rj   r   rd   rd   re   crop,  s   zMiniMaxCache.crop)rn   ro   rp   ri   r   rx   r   r   r   torchTensorr   r   r}   rd   rd   rl   re   r     s    r   c                       s   e Zd Zdedef fddZdd Zdd Z				dd
ej	de
ej	ej	f dej	d	B ded	B dejd	B dee de
ej	ej	d	B e
ej	 d	B f fddZ  ZS )MiniMaxLightningAttentionconfigr   c                    s  t    || _t|dd p|j|j | _|j| _|j| _|j| _t	|j
 | _t| j| j | _tj|j| j| j d dd| _tj| j| j |jdd| _tj|j| j| j dd| _|  }| |\}}}| d| | d| | d| | d| d S )	NrF   r   F)bias
slope_ratequery_decay	key_decaydiagonal_decay)rh   ri   r   getattrrA   rD   rF   rC   rX   r   rG   act_fnr   r0   r   Linearqkv_projout_projoutput_gateget_slope_ratedecay_factorsregister_buffer)rj   r   r   r   r   r   r   rl   rd   re   ri   1  s"   
 z"MiniMaxLightningAttention.__init__c                 C   sd   ddd| j    }t| j d }d| j| jd d   d }|| }|| }|d d d d f }|S )Nr<   r   r7   r;   )rD   r   aranger   rC   )rj   baseexponentfactorraterd   rd   re   r   G  s   z(MiniMaxLightningAttention.get_slope_ratec                 C   s   t | jd }t | |d d d f  }t | | j|d d d f   }|d d d f |d d d f  }|d d d d d d f }|| }t |dk| td}t |}|||fS )Nr<   r   z-inf)r   r   rX   expwhererz   )rj   r   block_size_ranger   r   r   rd   rd   re   r   R  s   " 

z'MiniMaxLightningAttention.decay_factorsNr,   position_embeddingsr-   r%   cache_positionrk   returnc           #      K   sl  |j \}}}	|| j d | j }
| | |}|||| jd| j }tj|| jdd\}}}|	dd}|	dd}|	dd}d }|d urN|
| j}|d u r!t|| j| j| j|}|d ury|jtjd}||dd d}g }t|
D ]}|| j }t|| j |}|| }|d d d d ||f }|d d d d ||f }|d d d d ||f }| jd d d |f }| jd d | d f }| jd d d d d |d |f }t| j | }t||	dd}t|| |}t|| |}|| }|| t|| 	dd|} || |  }qnYt| j }!g }t|D ]K}|d d d d ||d f }|d d d d ||d f }|d d d d ||d f }t|	dd|}"|!| |" }t||}|| q.tj|dd}|	dd}|||| j| j }| |}t| || }| |}|d ur| | j| ||fS )	Nr<   r   r   r   )dtyper   )!shaperX   r   r   reshaperD   rF   r   split	transposer   r   zerostora   masked_fill	unsqueezerg   minr   r   r   r   r   matmulr   catr0   Fsigmoidr   r   r   )#rj   r,   r   r-   r%   r   rk   
batch_sizeseq_lenrA   
num_blocks
qkv_statesquery_states
key_statesvalue_statesattn_weights_interattn_outputrc   	start_idxend_idxcurrent_block_sizecurrent_query_statescurrent_key_statescurrent_value_statescurrent_query_decaycurrent_key_decaycurrent_diagonal_decayblock_decayattn_weights_intraattn_output_intraattn_output_intercurrent_attn_outputnext_attn_weights_interratiocurrent_attn_weights_interrd   rd   re   forward`  sv   	

"
 


z!MiniMaxLightningAttention.forward)NN)rn   ro   rp   r#   rx   ri   r   r   r   r   tupler   
LongTensorr   r   r   r}   rd   rd   rl   re   r   0  s*    r   c                   @   r~   )MiniMaxRotaryEmbeddingNr   rd   rd   rd   re   r     r   r   c                   @   r~   )MiniMaxAttentionNr   rd   rd   rd   re   r     r   r   c                   @   r~   )MiniMaxTopKRouterNr   rd   rd   rd   re   r     r   r   c                   @   r~   )MiniMaxSparseMoeBlockNr   rd   rd   rd   re   r     r   r   c                       s   e Zd Zdedef fddZ						ddejdeejejf dB d	ejdB d
ej	dB de
dB dedB dej	dB dee deejeejejf dB f fddZ  ZS )MiniMaxDecoderLayerr   r   c                    s   t  || || _t|dr|j| nd | _|j| _|j| _| `t	|| _| jdkr;t
||| _|j| _|j| _d S t||| _|j| _|j| _d S )NrW   r`   )rh   ri   r   hasattrrW   
layer_typer]   r^   mlpr   r   	self_attnr[   attn_alpha_factorr\   attn_beta_factorr   rY   rZ   )rj   r   r   rl   rd   re   ri     s   

zMiniMaxDecoderLayer.__init__NFr,   r   r-   position_idsr%   rK   r   rk   r   c              
   K   sv   |  |}|}	| jd|||||||d|\}}
|	| j || j  }| |}|}	| |}|	| j || j  }|S )N)r,   r   r-   r   r%   rK   r   rd   )input_layernormr   r   r   post_attention_layernormr   r]   r^   )rj   r,   r   r-   r   r%   rK   r   rk   residualr   rd   rd   re   r     s&   




zMiniMaxDecoderLayer.forward)NNNNFN)rn   ro   rp   r#   rx   ri   r   r   r   r   r   ra   r   r   FloatTensorr   r}   rd   rd   rl   re   r     s6    	
r   c                       s:   e Zd ZdZeedddeeegdZ	 fddZ
  ZS )MiniMaxPreTrainedModelFzmlp.gater   )
layer_nameindex)router_logitsr,   
attentionsc                    sn   t  | t|tr5| }||\}}}t|j| t|j	| t|j
| t|j| d S d S r   )rh   _init_weights
isinstancer   r   r   initcopy_r   r   r   r   )rj   moduler   r   r   r   rl   rd   re   r     s   
z$MiniMaxPreTrainedModel._init_weights)rn   ro   rp   _can_compile_fullgraphr   r   r   r   r   _can_record_outputsr   r}   rd   rd   rl   re   r     s    r   c                   @   s   e Zd Zee							ddejdB dejdB dejdB dedB dej	dB de
dB dejdB d	ee d
eeB fddZdS )MiniMaxModelNr*   r-   r   r%   r+   rK   r   rk   r   c              
   K   s8  |d u |d uA rt d|r|d u rt }n|r't|ts't dt| d|d u r0| |}|d u rL|d ur<| nd}	tj|	|	|jd  |j	d}|d u rU|
d}| jjd u r]tnt}
|
| j|||||d}|}| ||}| jD ]}|jdkr|}n|}||f||||||d	|}qu| |}t||d
S )Nz:You must specify exactly one of input_ids or inputs_embedszSMiniMax uses cache of its own and is not compatible with `past_key_values` of type .r   r<   )device)r   r+   r-   r   r%   r   r_   )r-   r   r   r%   rK   r   )last_hidden_stater%   )
ValueErrorr   r   typer.   get_seq_lengthr   r   r   r   r   r   rP   r
   r   
rotary_embr/   r   r0   r   )rj   r*   r-   r   r%   r+   rK   r   rk   past_seen_tokensmask_functioncausal_maskr,   r   decoder_layerinput_attention_maskrd   rd   re   r     sb   

	


zMiniMaxModel.forward)NNNNNNN)rn   ro   rp   r   r   r   r   r   r   r   ra   r   r   r   r   r   rd   rd   rd   re   r     s:    	
r   c                       s   e Zd Z fddZ  ZS )MiniMaxForCausalLMc                    s   t  jdi |S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MiniMaxForCausalLM

        >>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```Nrd   )rh   r   )rj   super_kwargsrl   rd   re   r   d  s   zMiniMaxForCausalLM.forward)rn   ro   rp   r   r}   rd   rd   rl   re   r  c  s    r  c                   @   r~   ) MiniMaxForSequenceClassificationNr   rd   rd   rd   re   r  ~  r   r  c                   @   r~   )MiniMaxForTokenClassificationNr   rd   rd   rd   re   r    r   r  c                   @   r~   )MiniMaxForQuestionAnsweringNr   rd   rd   rd   re   r    r   r  )r#   r   r   r  r  r  r  )Hrq   r   torch.nn.functionalr   
functionalr    r   r   activationsr   cache_utilsr   r   configuration_utilsr   r	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_rope_utilsr   processing_utilsr   utilsr   r   utils.genericr   utils.output_capturingr   r   gemma2.modeling_gemma2r   mixtral.modeling_mixtralr   r   r   r   r   r   r   r   r    r!   r"   
get_loggerrn   loggerr#   r   r   Moduler   r   r   r   r   r   r   r   r  r  r  r  __all__rd   rd   rd   re   <module>   sL   4
 S% 3J