o
    -i\                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z? G dd de	j@ZAG dd de	j@ZBG dd  d e	j@ZCG d!d" d"e	j@ZDeG d#d$ d$e	j@ZEG d%d& d&e	j@e8e9e7ZFdS )'zAInference-only MiniCPM model compatible with HuggingFace weights.    N)Iterable)islice)Any)nn)PretrainedConfig)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)FatreluAndMul
SiluAndMul)fused_experts
fused_topk)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)set_weight_attrs)current_platform)IntermediateTensors   )SupportsEagle3SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s   e Zd ZdZ			ddededededejdB d	edB d
ef fddZde	j
dejdedefddZdejdejfddZ  ZS )
MiniCPMMoEzA tensor-parallel MoE implementation that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_khidden_sizeintermediate_sizeparams_dtypetp_sizeprefixc              	      s   t    |p	t | _|| _|| _|| _|| j | _|d u r"t	 }|| _
t| j| jd| j
d | dd| _ttj| jd| j | jtj| j
d| _ttj| j| j| jtj| j
d| _t| jd| ji t| jd| ji d S )NFz.gate)biasr0   quant_configr2      )devicedtypeweight_loader)super__init__r   r1   num_total_expertsr-   r.   r/   torchget_default_dtyper0   r   gater   	Parameteremptyr   device_typewsw2sr   r8   )selfr,   r-   r.   r/   r0   r1   r2   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/minicpm.pyr:   U   sZ   

		
zMiniCPMMoE.__init__paramloaded_weightweight_name	expert_idc           	      C   s   t  }|j}| j}t|| |d | }|dr*||d d f ||d|d d f< |drB||d d f |||d| d d f< |drZ|d d |f ||d d d d f< d S d S )Nr!   z	w1.weightr   z	w3.weightr5   z	w2.weight)r   datar/   sliceendswith)	rD   rI   rJ   rK   rL   tp_rank
param_data
shard_sizeshardrG   rG   rH   r8      s   
"


&zMiniCPMMoE.weight_loaderhidden_statesreturnc           	      C   st   |j \}}|d| j}| |\}}t||| jdd\}}}t|| j| j||dd}| j	dkr4t
|}|||S )NT)renormalize)inplacer!   )shapeviewr.   r>   r   r-   r   rB   rC   r1   r   )	rD   rT   
num_tokensr.   router_logits_topk_weightstopk_idsfinal_hidden_statesrG   rG   rH   forward   s   


zMiniCPMMoE.forwardNNr+   )__name__
__module____qualname____doc__intr<   r7   strr:   r   r?   Tensorr8   ra   __classcell__rG   rG   rE   rH   r*   L   s<    >
r*   c                       sL   e Zd Z		ddedededededB ded	df fd
dZdd Z  Z	S )
MiniCPMMLPNr+   r.   r/   
hidden_acthidden_act_paramr4   r2   rU   c                    s   t    t||gd d|| dd| _t||d|| dd| _|dkr,t | _d S |dkr8t|d| _d S t	d	| d
)Nr5   Fz.gate_up_projr3   r4   r2   z
.down_projsilufatrelu)	thresholdzUnsupported activation: z.. Only silu and fatrelu are supported for now.)
r9   r:   r   gate_up_projr   	down_projr   act_fnr   
ValueError)rD   r.   r/   rl   rm   r4   r2   rE   rG   rH   r:      s,   
	
zMiniCPMMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)rr   rt   rs   )rD   xgate_upr]   rG   rG   rH   ra      s   
zMiniCPMMLP.forward)Nr+   )
rc   rd   re   rg   rh   floatr   r:   ra   rj   rG   rG   rE   rH   rk      s&    "rk   c                       s~   e Zd Z					ddedededeeef dB ded	edB d
edB deddf fddZ	de
jde
jde
jfddZ  ZS )MiniCPMAttentionN    r+   r.   	num_headsnum_kv_headsrope_parametersmax_position_embeddingscache_configr4   r2   rU   c	           
   	      s>  t    || _t }	|| _| j|	 dksJ | j|	 | _|| _| j|	kr/| j|	 dks.J n	|	| j dks8J td| j|	 | _|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _t|| j	| j| jd|| dd| _t| j| j	 |d|| dd| _t| j	||d| _t| j| j	| j| j||| d	d
| _d S )Nr   r!   g      Fz	.qkv_projrn   z.o_proj)max_positionr~   z.attn)r}   r   r4   r2   )r9   r:   r.   r   total_num_headsr|   total_num_kv_headsmaxr}   head_dimq_sizekv_sizescalingr   r   qkv_projr   o_projr   
rotary_embr   attn)
rD   r.   r|   r}   r~   r   r   r4   r2   r1   rE   rG   rH   r:      s\   

	
zMiniCPMAttention.__init__	positionsrT   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )NrV   )dim)r   splitr   r   r   r   r   )
rD   r   rT   qkvr]   qkvattn_outputoutputrG   rG   rH   ra   (  s    zMiniCPMAttention.forward)Nr{   NNr+   )rc   rd   re   rg   dictrh   r   r	   r   r:   r<   ri   ra   rj   rG   rG   rE   rH   rz      s@    	
Brz   c                       s   e Zd Z			ddededB dedB deddf
 fdd	Zd
d Zdd Z	de
jde
jde
jdB dee
je
jf fddZ  ZS )MiniCPMDecoderLayerNr+   configr   r4   r2   rU   c                    sL   t    || _|| _|| _|j| _t|dd| _|| _| 	  | 
  d S )Nr   r{   )r9   r:   r   r   r4   r.   getattrr   r2   _init_attn_block_init_ffn_block)rD   r   r   r4   r2   rE   rG   rH   r:   6  s   
zMiniCPMDecoderLayer.__init__c              
   C   sP   t | jj| jjd| _t| j| jj| jj| jj| j	| j
| j| j dd| _d S )Nepsz
.self_attn)r.   r|   r}   r~   r   r   r4   r2   )r   r   r.   rms_norm_epsinput_layernormrz   num_attention_headsnum_key_value_headsr~   r   r   r4   r2   	self_attnrD   rG   rG   rH   r   G  s   
z$MiniCPMDecoderLayer._init_attn_blockc                 C   s   t | jj| jjd| _t| jdd| _| jdkr4t| j| jj| jj	t| jdd| j
| j dd| _d S t| jj| jj| jj| jj| j dd| _d S )	Nr   r,   r   rm   g        z.mlp)r.   r/   rl   rm   r4   r2   )r,   r-   r.   r/   r2   )r   r   r.   r   post_attention_layernormr   r,   rk   r/   rl   r4   r2   mlpr*   num_experts_per_tokr   rG   rG   rH   r   V  s(   

	
z#MiniCPMDecoderLayer._init_ffn_blockr   rT   residualc                 C   sx   |}|  |}| j||d}||| jjt| jj   }|}| |}| |}||| jjt| jj   }|d fS )N)r   rT   )	r   r   r   scale_depthmathsqrtnum_hidden_layersr   r   )rD   r   rT   r   rG   rG   rH   ra   m  s    


zMiniCPMDecoderLayer.forwardrb   )rc   rd   re   r   r	   r   rh   r:   r   r   r<   ri   tuplera   rj   rG   rG   rE   rH   r   5  s4    r   c                       s   e Zd Zdddedef fddZdededed	B d
ed	B fddZ	de
jde
jfddZ				dde
jde
jded	B de
jd	B de
jeB ee
jee
j f B f
ddZdeeee
jf  dee fddZ  ZS )MiniCPMModelr+   r2   vllm_configr2   c                   s   t    |jj}|j}|j}|| _|| _|| _|j| _t| j|j	| _
t| jdd| _| |||| t|j	|jd| _ttdf  | _tddg| jj	| _d S )Nr,   r   r   .rT   r   )r9   r:   model_config	hf_configr   r4   r   
vocab_sizer   r.   embed_tokensr   r,   _init_layersr   r   normr   rg   aux_hidden_state_layersr'   make_empty_intermediate_tensors)rD   r   r2   r   r   r4   rE   rG   rH   r:     s&   

zMiniCPMModel.__init__r   r   Nr4   c                    s2   t j fdd| dd\| _| _| _d S )Nc                    s   t  | dS )Nr   )r   r   r   r   r4   rG   rH   <lambda>  s    z+MiniCPMModel._init_layers.<locals>.<lambda>z.layersr   )r(   r   start_layer	end_layerlayers)rD   r2   r   r   r4   rG   r   rH   r     s
   zMiniCPMModel._init_layers	input_idsrU   c                 C   s   |  |}|| jj S rv   )r   r   	scale_emb)rD   r   	embeddingrG   rG   rH   embed_input_ids  s   
zMiniCPMModel.embed_input_idsr   intermediate_tensorsinputs_embedsc           
      C   s   t  jr|d ur|}n| |}d }n|d }|d }g }tt| j| j| jD ]\}}	|| jv r>|	|d ur;|| n| |	|||\}}q(t  j
sRt||dS | |}t|dkra||fS |S )NrT   r   )rT   r   r   )r   is_first_rankr   	enumerater   r   r   r   r   appendis_last_rankr    r   len)
rD   r   r   r   r   rT   r   aux_hidden_statesidxlayerrG   rG   rH   ra     s8   



zMiniCPMModel.forwardweightsc                 C   sX  g d}dd t | jD }t|  }t }|D ]\}}d|v r"qd|v s*d|v r+q|D ].\}}	}
|	|vr7q-||	|}|drG||vrGq-t|| rMq-|| }|j}||||
  nH|D ]&\}}	}|	|vrhq^||	|}t|| rtq^|| }|j}||||	|d  n|dr||vrqt|| rq|| }t	|d	t
}||| || q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )rr   	gate_projr   )rr   up_projr!   c              	   S   s:   g | ]}d D ]}|dv rdndd| d| d|fqqS ))w1w2w3)r   r   rB   rC   zexperts..z.weightrG   ).0rL   rK   rG   rG   rH   
<listcomp>  s    z-MiniCPMModel.load_weights.<locals>.<listcomp>zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedz.bias)rL   r8   )ranger,   r   named_parameterssetreplacerO   r&   r8   r   r   add)rD   r   stacked_params_mappingexpert_params_mappingparams_dictloaded_paramsnamerJ   
param_namerK   shard_idrI   r8   rL   rG   rG   rH   load_weights  s\   




zMiniCPMModel.load_weightsNN)rc   rd   re   r
   rh   r:   r   r	   r   r   r<   ri   r   r    r   listra   r   r   r   rj   rG   rG   rE   rH   r     s4    

,*r   c                       s0  e Zd Zg dddgdZdddZdd	d
edef fddZdd	d
edefddZde	j
de	j
fddZdeedf ddfddZdeedf fddZ		d&de	j
de	j
dedB de	j
dB de	j
eB ee	j
ee	j
 f B f
ddZd e	j
de	j
dB fd!d"Zd#eeee	j
f  dee fd$d%Z  ZS )'MiniCPMForCausalLM)r   r   r   r   r   )r   rr   input_embeddingsoutput_embeddings)r   lm_headr+   r   r   r2   c                   s   t    |jj}|j}|j}|j}|| _|| _|| _	|| _|| _| j
|t|dd| _t|j|j|t|dd| _|jrF| j| jj| _| j	j| j	j | _t|j| _| jj| _|jrit|dddkrktdd S d S )Nmodelr   r2   r   )r4   r2   r,   r   z&EPLB is not supported for MiniCPM yet.)r9   r:   r   r   r   r4   parallel_configr2   r   r   _init_modelr)   r   r   r   r.   r   tie_word_embeddingstie_weightsr   dim_model_basescale_widthr   logits_processorr   enable_eplbr   NotImplementedError)rD   r   r2   r   r   r4   r   rE   rG   rH   r:   <  s8   

zMiniCPMForCausalLM.__init__c                C   s   t ||dS )Nr   )r   )rD   r   r2   rG   rG   rH   r   `     zMiniCPMForCausalLM._init_modelr   rU   c                 C   s   | j |S rv   )r   r   )rD   r   rG   rG   rH   r   c  r   z"MiniCPMForCausalLM.embed_input_idsr   .Nc                 C   s   || j _d S rv   )r   r   )rD   r   rG   rG   rH   set_aux_hidden_state_layersf  r   z.MiniCPMForCausalLM.set_aux_hidden_state_layersc                 C   s   t | jj}d|d |d fS )Nr5      )r   r   r   )rD   
num_layersrG   rG   rH   "get_eagle3_aux_hidden_state_layersi  s   z5MiniCPMForCausalLM.get_eagle3_aux_hidden_state_layersr   r   r   c                 C   s\   |  ||||}t|tr t|dkr |\}}|| j }||fS t|tr'|S || j }|S )Nr5   )r   
isinstancer   r   r   r    )rD   r   r   r   r   model_outputrT   r   rG   rG   rH   ra   m  s   


zMiniCPMForCausalLM.forwardrT   c                 C   s   |  | j|}|S rv   )r   r   )rD   rT   logitsrG   rG   rH   compute_logits  s   z!MiniCPMForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r%   r   r   r   )rD   r   loaderrG   rG   rH   r     s
   
zMiniCPMForCausalLM.load_weightsr   )rc   rd   re   packed_modules_mappingembedding_modulesr
   rh   r:   r   r<   ri   r   r   rg   r   r   r    r   ra   r   r   r   r   rj   rG   rG   rE   rH   r   )  sB    $

,r   )Grf   r   collections.abcr   	itertoolsr   typingr   r<   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr	   r
   vllm.distributedr   r   r   r   %vllm.model_executor.layers.activationr   r   $vllm.model_executor.layers.fused_moer   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.model_executor.utilsr   vllm.platformsr   vllm.sequencer    
interfacesr"   r#   r$   utilsr%   r&   r'   r(   r)   Moduler*   rk   rz   r   r   r   rG   rG   rG   rH   <module>   sD   	o*PT  