o
    
۾ij                  
   @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC G dd dejDZEG dd dejDZFG dd dejDZGG dd dejDZHG d d! d!ejDZIeG d"d# d#ejDZJG d$d% d%ejDe6e9e:e7e;e8	ZKdS )&    )Iterable)isliceN)support_torch_compile)CacheConfigModelConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)	Attention)FusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)	ShortConv)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors)Lfm2MoeConfig   )HasInnerStateIsHybridMixtureOfExpertsSupportsLoRA
SupportsPPSupportsQuant)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                	       sN   e Zd Z		ddedededB def fddZd	ejd
ejfddZ	  Z
S )
Lfm2MoeMlpN dimff_dimquant_configprefixc                    sP   t    t||gd d|| dd| _t||d|| dd| _t | _d S )N   F.w1)
input_sizeoutput_sizesbiasr3   r4   z.w2r7   output_sizer9   r3   r4   )super__init__r   w1r   w2r   act_fn)selfr1   r2   r3   r4   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/lfm2_moe.pyr=   @   s    
zLfm2MoeMlp.__init__xreturnc                 C   s*   |  |\}}| |}| |\}}|S N)r>   r@   r?   )rA   rF   gate_up_rD   rD   rE   forwardX   s   
zLfm2MoeMlp.forward)Nr0   )__name__
__module____qualname__intr   strr=   torchTensorrK   __classcell__rD   rD   rB   rE   r/   ?   s    r/   c                	       sP   e Zd Z			ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )Lfm2MoeSparseMoeBlockNr0   Fconfigr3   r4   enable_eplbc                    sR  t    t | _|j| _t j| _t j| _	| j
 | _|j| _| j| jkr4td| j d| j dt }|jj}|| _| j| _|j| _| j| j | _| j| j | _| j	| j | _| j| j | _t|j|jd|| dd| _|jrtt j!| jt j"d| j_#nd | j_#t$| j|j%|j|j&d|j'|dd	d	| d
| j| jd| jj#d| _(d S )NzTensor parallel size z' is greater than the number of experts .Fz.gate)r9   r3   r4   )dtypeTr!   z.expertssigmoid)num_expertstop_khidden_sizeintermediate_sizereduce_resultsrenormalizer3   use_grouped_topknum_expert_group
topk_groupr4   rV   num_redundant_expertsscoring_funce_score_correction_bias))r<   r=   r   tp_sizerouted_scaling_factorr	   device_groupep_grouprank_in_groupep_ranksizeep_sizerZ   n_routed_experts
ValueErrorr   parallel_configeplb_configrV   n_logical_expertsrc   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr   r\   gateuse_expert_biasnn	ParameterrQ   emptyfloat32re   r   num_experts_per_tokmoe_intermediate_sizenorm_topk_probexperts)rA   rU   r3   r4   rV   vllm_configrq   rB   rD   rE   r=   `   sh   





zLfm2MoeSparseMoeBlock.__init__hidden_statesrG   c                 C   s^   |j }|j d }|d|}| |\}}| j||d| j }| jdkr*| j|}||S )N)r   router_logitsr!   )shapeviewrx   r   rg   rf   &maybe_all_reduce_tensor_model_parallel)rA   r   
orig_shape
hidden_dimr   rJ   final_hidden_statesrD   rD   rE   rK      s   


zLfm2MoeSparseMoeBlock.forward)Nr0   F)rL   rM   rN   r    r   rP   boolr=   rQ   rR   rK   rS   rD   rD   rB   rE   rT   _   s    ErT   c                       st   e Zd Z				ddededededed	ed
edB dedB deddf fddZde	j
de	j
de	j
fddZ  ZS )Lfm2MoeAttention    Nr0   rU   	layer_idxr\   	num_headsnum_kv_headsmax_position_embeddingscache_configr3   r4   rG   c
              	      sx  t    || _|| _|| _t }
|| _| j|
 dksJ | j|
 | _|| _| j|
kr5| j|
 dks4J n	|
| j dks>J t	d| j|
 | _| j| j | _
| j| j
 | _| j| j
 | _| j
d | _|| _t| j| j
| j| jd||	 dd| _t| j| j
 | jd||	 dd| _t| j
| j|jd	d
| _t| j| j
| j| j||	 dd| _t| j
|jd| _t| j
|jd| _d S )Nr   r!   g      F	.qkv_proj)r\   	head_sizetotal_num_headstotal_num_kv_headsr9   r3   r4   z	.out_projr:   T)max_positionrope_parametersis_neox_stylez.attn)r   r   r4   eps)r<   r=   r   r\   r   r   r   r   r   maxhead_dimq_sizekv_sizescalingr   r   qkv_projr   out_projr   r   
rotary_embr   attnr   norm_epsq_layernormk_layernorm)rA   rU   r   r\   r   r   r   r   r3   r4   rf   rB   rD   rE   r=      sd   

	
zLfm2MoeAttention.__init__	positionsr   c                 C   s   |j \}}| |\}}|j| j| j| jgdd\}}}||| j| j }||| j	| j }| 
|}| |}| |||\}}||| j| j }||| j	| j }| |||}	| |	\}
}|
S )Nr   )r1   )r   r   splitr   r   r   r   r   
contiguousr   r   r   r   r   r   )rA   r   r   n_tokensrJ   qkvqkvattn_outputoutputrD   rD   rE   rK      s   
 

zLfm2MoeAttention.forward)r   NNr0   )rL   rM   rN   r    rO   r   r   rP   r=   rQ   rR   rK   rS   rD   rD   rB   rE   r      sB    	
Fr   c                       s   e Zd Z					ddedededB dedB dedB d	ed
e	ddf fddZ
dejdejdejdB deejejf fddZ  ZS )Lfm2MoeAttentionDecoderLayerNr0   FrU   r   model_configr   r3   r4   rV   rG   c           	         s   t    || _|| _|| _t|dd}t|||j|j|j	|||| dd	| _
||jk r<t|j|j|| dd| _nt||| d|d| _t|j|jd| _t|j|jd| _d S )	Nr   r   z
.self_attn)	rU   r   r\   r   r   r   r   r3   r4   .feed_forwardr1   r2   r3   r4   rU   r3   r4   rV   r   )r<   r=   r4   rU   r   getattrr   r\   num_attention_headsnum_key_value_heads	self_attnnum_dense_layersr/   r]   feed_forwardrT   r   r   operator_normffn_norm)	rA   rU   r   r   r   r3   r4   rV   r   rB   rD   rE   r=     s>   



z%Lfm2MoeAttentionDecoderLayer.__init__r   r   residualc                 K   sT   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| ||fS )N)r   r   )r   r   r   r   )rA   r   r   r   kwargsrD   rD   rE   rK   D  s   z$Lfm2MoeAttentionDecoderLayer.forwardNNNr0   F)rL   rM   rN   r    rO   r   r   r   rP   r   r=   rQ   rR   tuplerK   rS   rD   rD   rB   rE   r     s@    	/r   c                       sp   e Zd Z					ddedededB dedB dedB d	ed
e	ddf fddZ
dejdejdB fddZ  ZS )Lfm2MoeShortConvDecoderLayerNr0   FrU   r   r   r   r3   r4   rV   rG   c                    s   t    || _t||j|||| dd| _||jk r+t|j|j|| dd| _	nt
||| d|d| _	t|j|jd| _t|j|jd| _d S )Nz.conv)rU   r1   r   r   r   r4   r   r   r   r   )r<   r=   r   r   r\   
short_convr   r/   r]   r   rT   r   r   r   r   )rA   rU   r   r   r   r3   r4   rV   rB   rD   rE   r=   V  s2   


	
z%Lfm2MoeShortConvDecoderLayer.__init__r   r   c                 K   s`   |d u r|}|  |}n|  ||\}}t|}| || | ||\}}| |}||fS rH   )r   rQ   
empty_liker   r   r   )rA   r   r   r   r   rD   rD   rE   rK   }  s   

z$Lfm2MoeShortConvDecoderLayer.forwardr   )rL   rM   rN   r    rO   r   r   r   rP   r   r=   rQ   rR   rK   rS   rD   rD   rB   rE   r   U  s8    	'r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )Lfm2MoeModelr0   r4   r   r4   c                   s   t    |jj|j|j |j|j}|j|j}|j	_	_
j_tjjjd_dtf fdd}tj|| dd\___tddgj_t jrhtjjd	_d S t _d S )
N)org_num_embeddingsr4   c              	      s:   t | }jj| dk}|rtnt}|| | dS )Nfull_attention)r3   r4   rV   )r*   rU   layer_typesr   r   )r4   r   is_attnlayer_classr   rU   rV   r   r3   rA   rD   rE   	get_layer  s   z(Lfm2MoeModel.__init__.<locals>.get_layerz.layersr   r   r   r   )r<   r=   r   	hf_configr   r3   rp   rV   rq   rc   rU   
vocab_sizer   r\   embed_tokensrP   r-   num_hidden_layersstart_layer	end_layerlayersr,   make_empty_intermediate_tensorsr
   is_last_rankr   r   embedding_normr)   )rA   r   r4   rp   rq   r   rB   r   rE   r=     s0   

zLfm2MoeModel.__init__	input_idsrG   c                 C   s
   |  |S rH   )r   rA   r   rD   rD   rE   embed_input_ids     
zLfm2MoeModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]}||||d\}}q*t  jsAt||dS | 	||\}}|S )Nr   r   )r   r   r   )r   r   )
r
   is_first_rankr   r   r   r   r   r   r   r   )	rA   r   r   r   r   r   r   layerrJ   rD   rD   rE   rK     s(   
zLfm2MoeModel.forwardc                 C   s   t j| ddd| jj| jdS )Nr>   r?   w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerZ   rc   )r   make_expert_params_mappingrU   rZ   rc   rA   rD   rD   rE   get_expert_mapping  s   zLfm2MoeModel.get_expert_mappingweightsc              	   C   s  g d}t |  }t }|  }|D ]\}}d|v r!|dd}d|v r,|ddd}|D ]<\}}	}
|	|vr8q.d|v rA||vrAq.||	|}|dsQ|d	rV||vrVq.t|| r\q.|| }|j}||||
  n`|D ]9}|\}}	}}
|	|vrzqm||	|}t|| rqm|ds|d	r||vrqm|| }|j}|||||
|d
  n$|ds|d	r||vrqt|| rq|| }t|dt	}||| |
| q|S )N))r   z.q_projr   )r   z.k_projr   )r   z.v_projr   )r6   r6   r   )r6   z.w3r!   expert_biaszgate.e_score_correction_biasz.conv.z.short_conv.r!   zfeed_forward.experts.z.bias_bias)shard_id	expert_idweight_loader)dictnamed_parameterssetr   replaceendswithr+   r   r   r   add)rA   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   mappingr   rD   rD   rE   load_weights  s   



zLfm2MoeModel.load_weightsNN)rL   rM   rN   r   rP   r=   rQ   rR   r   r   rK   listr   rO   r   r   r   r   rS   rD   rD   rB   rE   r     s$    3
,
r   c                       sd  e Zd Zg dddgdgdZdddZed	d
deejdf fddZ	ed	d
deee
e
f  fddZedee fddZddd	ededdf fddZdejdejfddZde
de
ddfddZ		d-dejdB d ejd!edB d"ejdB dejf
d#d$Zd%ejdejfd&d'Zd(eeeejf  dee fd)d*Zdeeeee
ef  fd+d,Z  ZS ).Lfm2MoeForCausalLM)q_projk_projv_projr>   r   in_proj)r   r>   r  input_embeddingsoutput_embeddings)r   lm_headr   r   rG   .c                 C   s   t |jj|jjS rH   )r   short_conv_state_dtyper   rX   r   mamba_cache_dtype)clsr   rD   rD   rE   !get_mamba_state_dtype_from_configd  s   z4Lfm2MoeForCausalLM.get_mamba_state_dtype_from_configc                 C   s$   |j }|jj}tj|j|j|jdS )zCalculate shapes for LFM2's convolutional cache.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
        )tp_world_sizer]   conv_kernel)rp   r   r   r   short_conv_state_shapetensor_parallel_sizer\   conv_L_cache)r  r   rp   r   rD   rD   rE   !get_mamba_state_shape_from_confign  s   z4Lfm2MoeForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rH   )r   short_conv_state_copy_func)r  rD   rD   rE   get_mamba_state_copy_func  s   z,Lfm2MoeForCausalLM.get_mamba_state_copy_funcr0   r   r4   Nc                   sR  |j j}|j}|j}|jdkrtdt   || _t	|t
|dd| _t jrAt|j|j|t
|dd| _| j| jj| _nt | _t|j| _| jj| _g | _g | _d }| jjD ]"}t|trdq\t|ttfsmJ t|jtr~|j}| j |jj! q\|d u rt"dt#| j| _$d| _%d	| _&|j'| _(|j)| _*|j+| _,|j-| _.|j/| _0d S )
NallzfLfm2Moe currently does not support 'all' prefix caching, please use '--mamba-cache-mode=align' insteadmodel)r   r4   r	  )r3   r4   z9No Lfm2MoeSparseMoeBlock layer found in the model.layers.r!   r   )1r   r   r3   r   mamba_cache_modeNotImplementedErrorr<   r=   rU   r   r.   r  r
   r   r   r   r\   r	  tie_weightsr   r)   r   logits_processorr   expert_weights
moe_layersr   
isinstancer   r   r   rT   appendr   RuntimeErrorlennum_moe_layersnum_expert_groupsnum_shared_expertsrr   num_logical_expertsrt   num_physical_expertsru   num_local_physical_expertsrn   num_routed_expertsrs   rc   )rA   r   r4   rU   r3   r   example_layerr   rB   rD   rE   r=     sb   



zLfm2MoeForCausalLM.__init__r   c                 C   s   | j |S rH   )r  r   r   rD   rD   rE   r     s   z"Lfm2MoeForCausalLM.embed_input_idsr&  r'  c                 C   sh   | j |ksJ || _|| _ || j | _| jjD ]}t|jtr1|j}||_	||_
| j|_|j  qd S rH   )r'  r&  r%  rc   r  r   r  r   rT   ru   rt   rs   r   update_expert_map)rA   r&  r'  r   moerD   rD   rE    update_physical_experts_metadata  s   
z3Lfm2MoeForCausalLM.update_physical_experts_metadatar   r   r   c                 K   s   |  ||||}|S rH   )r  )rA   r   r   r   r   r   r   rD   rD   rE   rK     s   zLfm2MoeForCausalLM.forwardr   c                 C   s   |  | j|}|S rH   )r  r	  )rA   r   logitsrD   rD   rE   compute_logits  s   z!Lfm2MoeForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r(   rU   tie_word_embeddingsr   )rA   r   loaderrD   rD   rE   r     s
   
zLfm2MoeForCausalLM.load_weightsc                 C   s
   | j  S rH   )r  r   r   rD   rD   rE   r     r   z%Lfm2MoeForCausalLM.get_expert_mappingr   )rL   rM   rN   packed_modules_mappingembedding_modulesclassmethodr   rQ   rX   r  rO   r  r   r  r   rP   r=   rR   r   r,  r   rK   r.  r   r   r   r  r   rS   rD   rD   rB   rE   r  H  sb    
	 @

$&r  )Lcollections.abcr   	itertoolsr   rQ   torch.nnrz   vllm.compilation.decoratorsr   vllm.configr   r   r   r   vllm.distributedr	   r
   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   +vllm.model_executor.layers.mamba.short_convr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   vllm.transformers_utils.configsr    
interfacesr"   r#   r$   r%   r&   r'   utilsr(   r)   r*   r+   r,   r-   r.   Moduler/   rT   r   r   r   r   r  rD   rD   rD   rE   <module>   sP    $ Z[A= 
6