o
    
۾iQ                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 G dd dej7Z8G dd dej7Z9G dd dej7Z:e
G dd dej7Z;G d d! d!ej7e.e/Z<d"ed#e=d$e>dB fd%d&Z?dS )'zInference-only MiniMaxM2 model.    )Iterable)AnyN)nn)PretrainedConfig)support_torch_compile)CacheConfigModelConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)	Attention)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)MiniMaxText01RMSNormTP)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sh   e Zd Z		ddededB def fddZedej	d	e
jd
dfddZde
jd
e
jfddZ  ZS )MiniMaxM2MoEN configquant_configprefixc                    s   t    t | _| j|jkrtd| j d|j dt|dd| _| jr9t	t
j|jt
jd| _tj| j_nd | _t|j|j|j| j|j|jdd|| dt
jd	| _t|j|jdt
jd | d
d| _d S )NzTensor parallel size z' is greater than the number of experts .use_routing_biasF)dtypeTz.experts)num_expertstop_kscoring_funce_score_correction_biashidden_sizeintermediate_sizereduce_resultsrenormalizer(   r)   router_logits_dtypez.gate)biasparams_dtyper(   r)   )super__init__r   tp_sizenum_local_experts
ValueErrorgetattrr+   r   	Parametertorchemptyfloat32r0   r%   ebias_weight_loaderweight_loaderr   num_experts_per_tokr/   r1   r2   expertsr   gate)selfr'   r(   r)   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/minimax_m2.pyr9   J   sJ   

zMiniMaxM2MoE.__init__paramloaded_weightreturnc                 C   s,   |   |  ks
J | j|tj d S N)sizedatacopy_tor?   rA   )rL   rM   rJ   rJ   rK   rB   z   s   z MiniMaxM2MoE.ebias_weight_loaderhidden_statesc                 C   s\   |j \}}|d|}| |tj\}}| j||d}|}| jdkr(t|}|||S )N)rT   router_logitsr   )	shapeviewrF   rS   r?   rA   rE   r:   r   )rG   rT   
num_tokens
hidden_dimrV   _final_hidden_statesrJ   rJ   rK   forward   s   

zMiniMaxM2MoE.forward)Nr&   )__name__
__module____qualname__r   r   strr9   staticmethodr   r>   r?   TensorrB   r]   __classcell__rJ   rJ   rH   rK   r%   I   s    0r%   c                       s   e Zd Z									ddededed	ed
eeef dB dedB dededB dedede	dB de
dB deddf fddZdejdejdejfddZ  ZS )MiniMaxM2AttentionN    ư>Fr&   r1   	num_headsnum_kv_heads
rotary_dimrope_parametersattn_window_sizemax_position_embeddingshead_dimrms_norm_epsqkv_biascache_configr(   r)   rN   c              
      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|pG|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _t|| j	| j| j|
|| dd| _t| j| j	 |d|| dd| _|d urd|vr|| j	 |d< t| j	||d	| _t| j| j	| j| j|||| d
d| _t| j	| j |	d| _t| j	| j |	d| _d S )Nr   r   g      z	.qkv_proj)r6   r(   r)   Fz.o_projpartial_rotary_factor)max_positionrk   z.attn)ri   per_layer_sliding_windowrq   r(   r)   eps)r8   r9   r1   r   total_num_headsrh   total_num_kv_headsmaxri   rn   q_sizekv_sizescalingrm   r   qkv_projr   o_projr   
rotary_embr   attnr   q_normk_norm)rG   r1   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   r(   r)   r:   rH   rJ   rK   r9      sp   



	zMiniMaxM2Attention.__init__	positionsrT   c           
      C   s   |  |\}}|j| j| j| jgdd\}}}t| j| j| | \}}| 	|||\}}| 
|||}| |\}	}|	S )NrU   )dim)r}   splitrz   r{   r   
forward_qkr   r   
contiguousr   r   r~   )
rG   r   rT   qkvr[   qkvattn_outputoutputrJ   rJ   rK   r]      s    zMiniMaxM2Attention.forward)	NNrf   Nrg   FNNr&   )r^   r_   r`   intdictra   r   floatboolr   r   r9   r?   rc   r]   rd   rJ   rJ   rH   rK   re      s\    	
Tre   c                       sj   e Zd Z		ddededededB dedB ddf fdd	Zd
e	j
de	j
de	j
dB de	j
fddZ  ZS )MiniMaxM2DecoderLayerNr'   r)   model_configrq   r(   rN   c                    s   t    |j| _t|dd}t|dr!t|jtr!t|j	|j}t|j
ddd }|| _t| j|j|j|j|j||jt|ddt|d	d ||| d
d| _t||| dd| _t|j|jd| _t|j|jd| _d S )Nrm   rf   max_model_lenr*   )seprU   attention_biasFrn   z
.self_attn)r1   rh   ri   rj   rk   rm   ro   rp   rn   rq   r(   r)   z.mlp)r'   r(   r)   ru   )r8   r9   r1   r=   hasattr
isinstancer   r   ry   rm   r   	layer_idxre   num_attention_headsnum_key_value_headsrj   rk   ro   	self_attnr%   block_sparse_moer   input_layernormpost_attention_layernorm)rG   r'   r)   r   rq   r(   rm   r   rH   rJ   rK   r9      s@   


zMiniMaxM2DecoderLayer.__init__r   rT   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   rT   )r   r   r   r   )rG   r   rT   r   rJ   rJ   rK   r]   "  s   
zMiniMaxM2DecoderLayer.forwardNN)r^   r_   r`   r   ra   r   r   r   r9   r?   rc   r]   rd   rJ   rJ   rH   rK   r      s2    -r   c                       s   e Zd ZdZdddedef fddZdejd	ejfd
dZ		ddejdB dejde
dB dejdB d	eje
B f
ddZd	eeeeeef  fddZdeeeejf  d	ee fddZ  ZS )MiniMaxM2ModelFr&   r)   vllm_configr)   c                   s   t    |jj|j|j |j| _j| _t j	r,t
jjd | dd| _nt | _tj fdd| dd\| _| _| _t jrVtjjd| _nt | _tdd	gj| _d S )
Nz.embed_tokens)r(   r)   c                    s   t |  dS )N)r   rq   r(   )r   r   rq   r'   r   r(   rJ   rK   <lambda>V  s    z)MiniMaxM2Model.__init__.<locals>.<lambda>z.layersr   ru   rT   r   )r8   r9   r   	hf_configrq   r(   r'   
vocab_sizer
   is_first_rankr   r1   embed_tokensr    r#   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   ro   normr"   make_empty_intermediate_tensors)rG   r   r)   rH   r   rK   r9   ?  s4   



zMiniMaxM2Model.__init__	input_idsrN   c                 C   s
   |  |S rO   )r   rG   r   rJ   rJ   rK   embed_input_idsh     
zMiniMaxM2Model.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }| j| j| j D ]
}||||\}}q*t  js@t||dS | ||\}}|S )NrT   r   )rT   r   )	r
   r   r   r   r   r   r   r   r   )	rG   r   r   r   r   rT   r   layerr[   rJ   rJ   rK   r]   k  s    
zMiniMaxM2Model.forwardc                 C   s   t j| ddd| jjdS )Nw1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer-   )r   make_expert_params_mappingr'   r;   rG   rJ   rJ   rK   get_expert_mapping  s   z!MiniMaxM2Model.get_expert_mappingweightsc              	   C   s~  g d}|   }t|  }t }|D ]\}}d|v rqt| j|}|d ur'q|D ]7\}	}
}|
|vr3q)d|v r<||vr<q)||
|	}|drL||vrLq)t|| rRq)|| }|j	}||||  nV|D ]*}|\}	}
}}|
|vrpqc||
|	}t|| r|qc|| }|j	}||||||d  n)|dr||vrqt
||}|d u rqt|| rq|| }t|dt}||| || q|S )N))r}   q_projr   )r}   k_projr   )r}   v_projr   zrotary_emb.inv_freqzmlp.experts.z.bias)shard_id	expert_idrC   )r   r   named_parametersset#get_spec_layer_idx_from_weight_namer'   replaceendswithr!   rC   r   r=   r   add)rG   r   stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsnamerM   
spec_layer
param_nameweight_namer   rL   rC   mappingr   rJ   rJ   rK   load_weights  sn   	




zMiniMaxM2Model.load_weightsrO   )r^   r_   r`   fall_back_to_pt_during_loadr	   ra   r9   r?   rc   r   r   r]   listtupler   r   r   r   r   rd   rJ   rJ   rH   rK   r   ;  s$    )
,	r   c                       s   e Zd Zdg diZdddedef fddZd	ejd
ejfddZ			dd	ejdB dejde
dB dejdB d
eje
B f
ddZdejd
ejdB fddZdeeeejf  d
ee fddZd
eeeeeef  fddZ  ZS )MiniMaxM2ForCausalLMr}   )r   r   r   r&   r   r   r)   c                   s   t    |jj}|j}|| _|| _t|jdr|jj| j_t|t	|dd| _
t jr7t|j|jd d| _nt | _t|j| _| j
j| _d S )Nr   model)r   r)   )r(   )r8   r9   r   r   r(   r'   r   r   r   r$   r   r
   r   r   r   r1   lm_headr    r   logits_processorr   )rG   r   r)   r'   r(   rH   rJ   rK   r9     s$   



zMiniMaxM2ForCausalLM.__init__r   rN   c                 C   s   | j |S rO   )r   r   r   rJ   rJ   rK   r     s   z$MiniMaxM2ForCausalLM.embed_input_idsNr   r   r   c                 K   s   |  ||||}|S rO   )r   )rG   r   r   r   r   kwargsrT   rJ   rJ   rK   r]   
  s   zMiniMaxM2ForCausalLM.forwardrT   c                 C   s   |  | j|}|S rO   )r   r   )rG   rT   logitsrJ   rJ   rK   compute_logits  s   z#MiniMaxM2ForCausalLM.compute_logitsr   c                 C   s   t | }||S rO   )r   r   )rG   r   loaderrJ   rJ   rK   r     s   
z!MiniMaxM2ForCausalLM.load_weightsc                 C   s
   | j  S rO   )r   r   r   rJ   rJ   rK   r   "  r   z'MiniMaxM2ForCausalLM.get_expert_mappingr   )r^   r_   r`   packed_modules_mappingr	   ra   r9   r?   rc   r   r   r]   r   r   r   r   r   r   r   r   rd   rJ   rJ   rH   rK   r     s2    

$&r   r'   r   rN   c                 C   sP   t | dr&| jdkr&| j}t| jD ]}|d||  dr%||   S qd S )Nnum_mtp_modulesr   zmodel.layers.r*   )r   r   r   range
startswith)r'   r   r   irJ   rJ   rK   r   &  s   r   )@__doc__collections.abcr   typingr   r?   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   r	   vllm.distributedr
   r   r   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   ,vllm.model_executor.layers.mamba.linear_attnr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r    r!   r"   r#   r$   Moduler%   re   r   r   r   ra   r   r   rJ   rJ   rJ   rK   <module>   sH    
FeG ->