o
    
۾iK^                     @   s  d dl mZ d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZmZmZ d dlmZ d dl m!Z!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z= ee>Z?G dd dej@ZAG dd dej@ZBG dd dej@ZCG dd dej@ZDeG d d! d!ej@ZEG d"d# d#ej@e5e8e7e6ZFd$e3d%eGd&eHdB fd'd(ZIdS ))    )IterableN)nn)support_torch_compile)CacheConfigModelConfigParallelConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)init_logger)
SiluAndMul)FusedMoE)KimiDeltaAttention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)
MLAModulesMultiHeadLatentAttentionWrapper)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)KimiLinearConfig   )HasInnerStateIsHybridMixtureOfExperts
SupportsPP)PPMissingLayeris_pp_missing_parametermake_layersmaybe_prefixc                       sN   e Zd Z			ddededededB ded	ed
df fddZdd Z  Z	S )KimiMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixreturnc                    sj   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _d S )
N   F.gate_up_projbiasr1   r3   z
.down_proj)r8   r1   r2   r3   siluUnsupported activation: !. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr.   r/   r0   r1   r2   r3   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/kimi_linear.pyr=   ;   s*   
	
zKimiMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r>   rA   r?   )rB   xgate_up_rE   rE   rF   forward[   s   
zKimiMLP.forward)NTr-   )
__name__
__module____qualname__intstrr   boolr=   rK   __classcell__rE   rE   rC   rF   r,   :   s(     r,   c                	       sP   e Zd Z			ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )KimiMoENr-   r   configr1   r3   	layer_idxc           
         s  t    |j}|j}|j}|j}|j}	t | _|j	| _	|j
| _
|| _|jdkr1td|j dt||dd | dd| _tt|| j_t||j||d|	||j|j|j| d|j| jjd| _| j
d ur|| j
 }t|j||j|d| d	d
| _d S d S )Nr9   r:   r;   Fz.gater7   z.experts)num_expertstop_kr.   r/   r2   renormalizer1   use_grouped_topknum_expert_group
topk_groupr3   scoring_funce_score_correction_biasz.shared_experts)r.   r/   r0   r1   r2   r3   )r<   r=   r.   r/   moe_intermediate_sizerV   moe_renormalizer
   tp_sizerouted_scaling_factornum_shared_expertsrU   r0   r@   r   gater   	Parametertorchemptyr]   r   num_experts_per_tokenrY   rZ   r[   moe_router_activation_funcexpertsr,   shared_experts)
rB   rT   r1   r3   rU   r.   r/   r^   rV   r_   rC   rE   rF   r=   c   s`   



zKimiMoE.__init__hidden_statesr4   c                 C   sz   |j \}}|d|}| jd ur| |}| |\}}| j||d| j }|d ur.|| }| jdkr7t|}|||S )N)rk   router_logitsr#   )	shapeviewrb   rj   rc   ri   ra   r`   r   )rB   rk   
num_tokensr.   shared_outputrm   rJ   final_hidden_statesrE   rE   rF   rK      s   



zKimiMoE.forward)Nr-   r   )rL   rM   rN   r"   r   rP   rO   r=   re   TensorrK   rR   rE   rE   rC   rF   rS   b   s    >rS   c                       s   e Zd ZdZ				ddedededed	ed
ededB dedededB dedB de	ddf fddZ
dejdejdejddfddZ  ZS )KimiMLAAttentionz8
    Main reference: DeepseekV2 vllm Implementation
    FNr-   rT   r.   	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rankuse_nopecache_configr1   r3   r4   c                    s  t    || _|| _|| _|| | _|| _|| _|| _|| _	t
 }|| | _| jd | _|	| _| jdu s7J | jd u s>J || dksFJ t| j| j| j d|| dd| _t| j| j	| j d|| dd| _t| j|jd| _t| j| j	| j| j  d|| d	d| _t| j	| j | jd|| d
d| _t| j| jd | jd | jd d | jd dd d}t| j| j| j| j| j| j| j| j||
||| _d S )Ng      Tr   Fz.kv_a_proj_with_mqar7   z.q_projepsz
.kv_b_projz.o_proj)kv_a_layernorm	kv_b_proj
rotary_embo_projfused_qkv_a_projkv_a_proj_with_mqaq_a_layernormq_b_projq_projindexer	is_sparsetopk_indices_buffer)r<   r=   r.   rv   rw   qk_head_dimrx   ry   rz   ru   r
   num_local_headsscalingr{   r   r   r   r   r   rms_norm_epsr   r   r   r   r   r   mla_attn)rB   rT   r.   ru   rv   rw   rx   ry   rz   r{   r|   r1   r3   kwargsr`   mla_modulesrC   rE   rF   r=      s   






zKimiMLAAttention.__init__	positionsrk   outputc                 C   s   |  |||d d < d S rG   )r   )rB   r   rk   r   rE   rE   rF   rK     s   zKimiMLAAttention.forward)FNNr-   )rL   rM   rN   __doc__r"   rO   rQ   r   r   rP   r=   re   rs   rK   rR   rE   rE   rC   rF   rt      sT    	
^rt   c                       s   e Zd Z					ddedededB dedB dedB dedB d	e	d
df fddZ
dejdejdejdB d
eejejf fddZ  ZS )KimiDecoderLayerNr-   rT   rU   r|   r1   parallel_configmodel_configr3   r4   c           	         s  t    |j| _|j| _||r"t||j|||| dd| _nt|| j|j|||| d||j	|j
|j|j|j|jd| _| jrd|jd urd||jkrd||j dkrdt||| dd| _| j| _nt| j|j|j|| dd| _t|j|jd	| _t|j|jd	| _d S )
Nz
.self_attn)rU   r.   r1   r|   r   r3   )rU   r.   ru   r1   r|   r   r3   rT   rv   rw   rx   ry   rz   r{   r   z.block_sparse_moe)rT   r1   r3   z.mlp)r.   r/   r0   r1   r3   r}   )r<   r=   r.   is_moeis_kda_layerr   	self_attnrt   num_attention_headsrv   rw   rx   ry   rz   mla_use_noperV   first_k_dense_replacemoe_layer_freqrS   block_sparse_moemlpr,   r/   r0   r   r   input_layernormpost_attention_layernorm)	rB   rT   rU   r|   r1   r   r   r3   r   rC   rE   rF   r=      sd   


	


zKimiDecoderLayer.__init__r   rk   residualc                 K   sh   |d u r|}|  |}n|  ||\}}t|}| j|||d |}| ||\}}| |}||fS )N)rk   r   r   )r   re   
empty_liker   r   r   )rB   r   rk   r   r   attn_outputrE   rE   rF   rK   d  s   

zKimiDecoderLayer.forward)NNNNr-   )rL   rM   rN   r"   rO   r   r   r   r   rP   r=   re   rs   tuplerK   rR   rE   rE   rC   rF   r     s@    
Dr   c                       sx   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB dejf
ddZ
  ZS )KimiLinearModelr-   r3   vllm_configr3   c                   s   t    |jj|j|j |j|j| _j| _	j
| _
t jr2tj
j| dd| _nt | _i dtf fdd}tj|| dd\| _| _| _t jretjjd| _nt | _t }j| dkswJ d	d S )
Nz.embed_tokensr   r3   c                    s0   t | ddd }t| | fi S )N.r#   )rO   rsplitr   )r3   rU   r|   rT   extra_kwargsr   r   r1   rE   rF   	get_layer  s   z+KimiLinearModel.__init__.<locals>.get_layerz.layersr}   r   z3num_attention_heads must be divisible by world_size)r<   r=   r   hf_text_configr|   r1   r   rT   pad_token_idpadding_idx
vocab_sizer	   is_first_rankr   r.   embed_tokensr(   rP   r*   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   r   normr
   r   )rB   r   r3   r   
world_sizerC   r   rF   r=     s<   

zKimiLinearModel.__init__	input_idsr4   c                 C   s
   |  |S rG   )r   rB   r   rE   rE   rF   embed_input_ids  s   
zKimiLinearModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           
      K   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| j D ]\}}	|	|||d\}}q,t  jsEt||dS | 	||\}}|S )Nrk   r   )r   rk   r   )rk   r   )
r	   r   r   	enumerater   r   r   r   r!   r   )
rB   r   r   r   r   r   rk   r   rJ   layerrE   rE   rF   rK     s(   
zKimiLinearModel.forwardrG   )rL   rM   rN   r   rP   r=   re   rs   r   r!   rK   rR   rE   rE   rC   rF   r     s    5r   c                       s>  e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
edddeejejejejf fddZedddeeedf eedf eedf eedf f fddZedeeeeef fddZdejdejdB fddZdeeeejf  fddZ  ZS ) KimiLinearForCausalLMr-   r   r   r3   c                   s   t    |j| _|| _| jj| _|j}|| _t|t|dd| _	t
 jr6t| jj| jj|t|dd| _nt | _t| jdd}t| jj|d| _d S )Nmodel)r   r3   lm_head)r1   r3   logit_scaleg      ?)scale)r<   r=   r   r   	hf_configrT   r1   r   r+   r   r	   r   r   r   r.   r   r(   getattrr   logits_processor)rB   r   r3   r1   r   rC   rE   rF   r=     s*   



zKimiLinearForCausalLM.__init__r   r4   c                 C   s   | j |S rG   )r   r   r   rE   rE   rF   r     s   z%KimiLinearForCausalLM.embed_input_idsNr   r   r   c                 K   s   | j ||||fi |}|S rG   )r   )rB   r   r   r   r   r   rk   rE   rE   rF   rK     s   zKimiLinearForCausalLM.forwardr   c                 C   s   t |jj|jjS rG   )r   kda_state_dtyper   dtyper|   mamba_cache_dtype)clsr   rE   rE   rF   !get_mamba_state_dtype_from_config  s   z7KimiLinearForCausalLM.get_mamba_state_dtype_from_config.c                 C   sL   |j }|jj}|j}|jr|jjnd}tj||jd |jd |jd |dS )Nr   ru   head_dimshort_conv_kernel_size)conv_kernel_sizenum_spec)	r   r   r   tensor_parallel_sizespeculative_confignum_speculative_tokensr   kda_state_shapelinear_attn_config)r   r   r   r   r`   r   rE   rE   rF   !get_mamba_state_shape_from_config  s   
z7KimiLinearForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rG   )r   kda_state_copy_func)r   rE   rE   rF   get_mamba_state_copy_func%  s   z/KimiLinearForCausalLM.get_mamba_state_copy_funcrk   c                 C   s   |  | j|S rG   )r   r   )rB   rk   rE   rE   rF   compute_logits-  s   z$KimiLinearForCausalLM.compute_logitsweightsc              	   C   s  ddg}| j jrtj| ddd| j jd}ng }t|  }t }|D ]}|d d \}}t|dkr6|d ni }	d|v r=q"t	| j |}
|
d urHq"d	|v sPd
|v rQq"|D ]7\}}}||vr]qSd|v rf||vrfqS|
||}|drv||vrvqSt|| r|qS|| }|j}||||  n`t|D ]*\}\}}}}||vrq|
||}t|| rq|| }|j}||||||d  n1|dr||vr| j jsq"t||}|d u rq"t|| rq"|| }t|dt}|||fi |	 || q"d S )N)r6   z
.gate_projr   )r6   z.up_projr#   w1w2w3)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerV   r5   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmlp.experts.z.bias)	expert_idshard_idweight_loader)rT   r   r   make_expert_params_mappingrV   dictnamed_parameterssetlen#get_spec_layer_idx_from_weight_namereplaceendswithr)   r   r   is_linear_attnr    r   r   add)rB   r   stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsargsnameloaded_weightr   
spec_layer
param_nameweight_namer   paramr   idxr   rE   rE   rF   load_weights3  s   



z"KimiLinearForCausalLM.load_weights)NN)rL   rM   rN   r   rP   r=   re   rs   r   r!   rK   classmethodr   r   r   rO   r   r   r   r   r   r   rR   rE   rE   rC   rF   r     sP    
.

$r   rT   r   r4   c                 C   sP   t | dr&| jdkr&| j}t| jD ]}|d||  dr%||   S qd S )Nnum_nextn_predict_layersr   zmodel.layers.r   )hasattrr   r   range
startswith)rT   r   rU   irE   rE   rF   r     s   

r   )Jcollections.abcr   re   r   vllm.compilation.decoratorsr   vllm.configr   r   r   r   vllm.distributedr	   r
   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   vllm.model_executor.layers.kdar   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   vllm.model_executor.layers.mlar   r   3vllm.model_executor.layers.quantization.base_configr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.sequencer!   +vllm.transformers_utils.configs.kimi_linearr"   
interfacesr$   r%   r&   r'   utilsr(   r)   r*   r+   rL   loggerModuler,   rS   rt   r   r   r   rP   rO   r   rE   rE   rE   rF   <module>   sP   (Qla
\ 7