o
    
۾il                     @   s  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4 ddl5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z; ee<Z=G dd dej>Z?G dd dej>Z@G dd dej>ZAG dd  d ej>ZBedd!ddd"d#G d$d% d%ej>ZCG d&d' d'e2ZDG d(d) d)ej>e4e3eDZEd*e
d+eFd,eGdB fd-d.ZHdS )/zSInference-only GLM-4.5, GLM-4.6, GLM-4.7 model
compatible with HuggingFace weights.    N)CallableIterable)islice)nn)Glm4MoeConfig)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)init_logger)
SiluAndMul)	Attention)SharedFusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )MixtureOfExpertsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sN   e Zd Z			ddededededB ded	ed
df fddZdd Z  Z	S )
Glm4MoeMLPNT hidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixreturnc                    sj   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _d S )
N   Fz.gate_up_projbiasr-   r/   z
.down_proj)r3   r-   r.   r/   siluUnsupported activation: !. Only silu is supported for now.)	super__init__r   gate_up_projr   	down_proj
ValueErrorr   act_fn)selfr*   r+   r,   r-   r.   r/   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/glm4_moe.pyr8   O   s*   
	
zGlm4MoeMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r9   r<   r:   )r=   xgate_up_r@   r@   rA   forwardn   s   
zGlm4MoeMLP.forward)NTr)   )
__name__
__module____qualname__intstrr   boolr8   rF   __classcell__r@   r@   r>   rA   r(   N   s(    r(   c                	       sP   e Zd Z			ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )Glm4MoENr)   Fconfigr-   r/   enable_eplbc                    s  t    t | _|j| _t j| _t j| _	| j
 | _|j| _|j| _|jdkr3td|j dtj|j|jdtjd| _ttj|jtjd| j_t }|jj}|| _|j| _| j| _| j| j | _ | j | j | _!| j	| j! | _"| j"| j! | _#|jd ur|j$|j }t%|j||j|d| dd| _&nd | _&t'di d	| j&d
|jd|j(d|jd|j$ddd|j)d|ddd|j*d|j+d| dddddd| jjd| jd| jdtj| _,d S ) Nr4   r5   r6   F)r3   dtyperQ   z.shared_experts)r*   r+   r,   r-   r.   r/   shared_expertsnum_expertstop_kr*   r+   r.   renormalizer-   use_grouped_topkTnum_expert_group
topk_groupr/   z.expertsscoring_funcsigmoidrouted_scaling_factorg      ?e_score_correction_biasrP   num_redundant_expertsrouter_logits_dtyper@   )-r7   r8   r   tp_sizer\   r   device_groupep_grouprank_in_groupep_ranksizeep_sizen_routed_expertsn_shared_expertsr,   r;   r   Linearr*   torchfloat32gate	Parameteremptyr]   r
   parallel_configeplb_configrP   r^   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endmoe_intermediate_sizer(   rS   r   num_experts_per_toknorm_topk_probn_grouprY   experts)r=   rO   r-   r/   rP   vllm_configrp   r+   r>   r@   rA   r8   v   s   






		


zGlm4MoE.__init__hidden_statesr0   c                 C   s   |j \}}|d|}| |jtjd}| j||d}| jd ur3|\}}|d us+J || j | }n|| j }| j	dkrC| j
|}|||S )NrR   )r}   router_logitsr   )shapeviewrl   torj   rk   r{   rS   r\   r`   &maybe_all_reduce_tensor_model_parallel)r=   r}   
num_tokens
hidden_dimr   fused_moe_outshared_outputfinal_hidden_statesr@   r@   rA   rF      s"   



zGlm4MoE.forward)Nr)   F)rG   rH   rI   r   r   rK   rL   r8   rj   TensorrF   rM   r@   r@   r>   rA   rN   u   s    UrN   c                       s   e Zd Z								ddededed	ed
ededB dededededB dedB de	ddf fddZ
dejdejdejfddZ  ZS )Glm4MoeAttention   Nh㈵>Fr)   rO   r*   	num_headsnum_kv_headsmax_position_embeddingshead_dimrms_norm_epsqkv_biasuse_qk_normcache_configr-   r/   r0   c              	      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|pG|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|	| _t|| j	| j| j||| dd| _t| j| j	 |d|| dd| _|jdd	 t| j	||jd
| _t| j| j	| j| j|
|| dd| _| jrt| j	|d| _t| j	|d| _d S d S )Nr   r   g      z	.qkv_projr2   Fz.o_projpartial_rotary_factorg      ?)max_positionrope_parametersz.attn)r   r   r-   r/   eps)r7   r8   r*   r   total_num_headsr   total_num_kv_headsmaxr   r   q_sizekv_sizescalingr   r   r   qkv_projr   o_projr   
setdefaultr   
rotary_embr   attnr   q_normk_norm)r=   rO   r*   r   r   r   r   r   r   r   r   r-   r/   r`   r>   r@   rA   r8      sh   




zGlm4MoeAttention.__init__	positionsr}   c           
      C   s   |  |\}}|j| j| j| jgdd\}}}| jr:| |d| j| j|j	}| 
|d| j| j|j	}| |||\}}| |||}| |\}	}|	S )Nr~   )dim)r   splitr   r   r   r   reshaper   r   r   r   r   r   r   r   )
r=   r   r}   qkvrE   qkvattn_outputoutputr@   r@   rA   rF   3  s    zGlm4MoeAttention.forward)r   Nr   FFNNr)   )rG   rH   rI   r   rJ   floatrL   r   r   rK   r8   rj   r   rF   rM   r@   r@   r>   rA   r      sV    	
Lr   c                       sx   e Zd Z				ddededB dedB deded	df fd
dZde	j
de	j
de	j
dB d	ee	j
e	j
f fddZ  ZS )Glm4MoeDecoderLayerNr)   FrO   r   r-   r/   rP   r0   c                    s   t    |j| _t|dd}t|jddd }|| _t|| j|j|j	||j
|j|j||| d|jd| _|jd urN||jkrNt||| d|d	| _nt|j|j|j|| dd
| _t|j|jd| _t|j|jd| _|j| _d S )Nr   r   .)sepr~   z
.self_attn)rO   r*   r   r   r   r   r   r   r   r-   r/   r   z.mlp)rO   r-   r/   rP   )r*   r+   r,   r-   r/   r   )r7   r8   r*   getattrrJ   r   	layer_idxr   num_attention_headsnum_key_value_headsr   r   attention_biasr   	self_attnrg   first_k_dense_replacerN   mlpr(   r+   r,   r   input_layernormpost_attention_layernormr\   )r=   rO   r   r-   r/   rP   r   r   r>   r@   rA   r8   I  sN   



zGlm4MoeDecoderLayer.__init__r   r}   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   r}   )r   r   r   r   )r=   r   r}   r   r@   r@   rA   rF     s   
zGlm4MoeDecoderLayer.forward)NNr)   F)rG   rH   rI   r   r   r   rK   rL   r8   rj   r   tuplerF   rM   r@   r@   r>   rA   r   H  s6    8r   r~   )	input_idsr   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )Glm4MoeModelr)   r/   r|   r/   c                   s   t    |jj|j |j|jj| _j	| _	t
 jr,tj	j| dd| _nt | _tj fdd| dd\| _| _| _t
 jrVtjjd| _nt | _tddgj| _d S )	Nz.embed_tokensr   c                    s   t  | dS )N)rO   r   r-   r/   rP   )r   r   r   rO   rP   r-   r@   rA   <lambda>  s    z'Glm4MoeModel.__init__.<locals>.<lambda>z.layersr   r}   r   )r7   r8   model_config	hf_configr   r-   ro   rP   rO   
vocab_sizer   is_first_rankr   r*   embed_tokensr#   r&   num_hidden_layersstart_layer	end_layerlayersis_last_rankr   r   normr%   make_empty_intermediate_tensors)r=   r|   r/   r>   r   rA   r8     s.   



zGlm4MoeModel.__init__r   r0   c                 C   s
   |  |S rB   )r   r=   r   r@   r@   rA   embed_input_ids     
zGlm4MoeModel.embed_input_idsNr   r   r   c           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nr}   r   )r}   r   )
r   r   r   r   r   r   r   r   r   r   )	r=   r   r   r   r   r}   r   layerrE   r@   r@   rA   rF     s    
zGlm4MoeModel.forwardc                 C   s   t j| ddd| jjdS )N	gate_projr:   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerT   )r   make_expert_params_mappingrO   rg   r=   r@   r@   rA   get_expert_mapping  s   zGlm4MoeModel.get_expert_mappingweightsc              
   C   s  g d}t |  }t }|  }|D ]\}}t| j|}|d ur"q|D ]7\}	}
}|
|vr.q$d|v r7||vr7q$||
|	}|drG||vrGq$t|| rMq$|| }|j	}||||  nkd}|D ]:}|\}	}
}}|
|vrmq`d}||
|	}t|| r{q`|| }t
tdtf |j	}||||||dd}|r|} n-q`|rq|dr||vrqt||}|d u rqt|| rq|| }t|dt}||| || q|S )	N))r   q_projr   )r   k_projr   )r   v_projr   )r9   r   r   )r9   r   r   zmlp.experts.z.biasFT.)shard_id	expert_idreturn_successweight_loader)dictnamed_parameterssetr   #get_spec_layer_idx_from_weight_namerO   replaceendswithr$   r   typingcastr   rL   r   r   r   add)r=   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
spec_layer
param_nameweight_namer   paramr   is_expert_weightmappingr   name_mappedsuccessr@   r@   rA   load_weights  s~   	




zGlm4MoeModel.load_weightsNN)rG   rH   rI   r	   rK   r8   rj   r   r   r   rF   listr   rJ   r   r   r   r   rM   r@   r@   r>   rA   r     s$    	&
,r   c                   @   s8   e Zd ZdedB ddfddZdededdfdd	ZdS )
Glm4MixtureOfExpertsexample_moeNr0   c                 C   sD   |d u rt d|j| _|j| _|j| _|j| _|j	| _
|j| _d S )Nz'No Glm4MoE layer found in model.layers.)RuntimeErrorrr   num_logical_expertsrs   num_physical_expertsrt   num_local_physical_expertsrg   num_routed_expertsrh   num_shared_expertsrq   r^   )r=   r  r@   r@   rA   extract_moe_parametersV  s   z+Glm4MixtureOfExperts.extract_moe_parametersr  r  c                 C   sT   | j |ksJ || _|| _ || j | _| jD ]}||_||_| j|_|j	  qd S rB   )
r  r  r  r^   moe_mlp_layersrt   rs   rq   r{   update_expert_map)r=   r  r  moer@   r@   rA    update_physical_experts_metadataa  s   
z5Glm4MixtureOfExperts.update_physical_experts_metadata)rG   rH   rI   rN   r  rJ   r  r@   r@   r@   rA   r  U  s    r  c                       s   e Zd Zg dddgdZdZddded	ef fd
dZdej	dej	fddZ
		ddej	dB dej	dedB dej	dB dej	eB f
ddZdej	dej	dB fddZdeeeej	f  dee fddZdeeeeeef  fddZ  ZS )Glm4MoeForCausalLM)r   r   r   r   r   )r   r9   Fr)   r   r|   r/   c                   s  t    |jj}|j}|| _|| _t|t|dd| _t	 j
r/t|j|j|t|dd| _nt | _t|j| _| jj| _g | _|j|j | _|j| _g | _g | _d }| jjD ]'}t|tr`qXt|tsgJ t|jtr|j}| j |j | j |jj! qX| "| d S )Nmodel)r|   r/   lm_head)r-   r/   )#r7   r8   r   r   r-   rO   r   r'   r  r   r   r   r   r*   r  r#   r   logits_processorr   expert_weightsr   r   num_moe_layersrz   num_expert_groups
moe_layersr  r   
isinstancer   r   rN   appendr{   r  )r=   r|   r/   rO   r-   r  r   r>   r@   rA   r8     sF   



zGlm4MoeForCausalLM.__init__r   r0   c                 C   s   | j |S rB   )r  r   r   r@   r@   rA   r     s   z"Glm4MoeForCausalLM.embed_input_idsNr   r   r   c                 C   s   |  ||||}|S rB   )r  )r=   r   r   r   r   r}   r@   r@   rA   rF     s   zGlm4MoeForCausalLM.forwardr}   c                 C   s   |  | j|}|S rB   )r  r  )r=   r}   logitsr@   r@   rA   compute_logits  s   z!Glm4MoeForCausalLM.compute_logitsr   c                 C   s   t | }||S rB   )r"   r   )r=   r   loaderr@   r@   rA   r     s   
zGlm4MoeForCausalLM.load_weightsc                 C   s
   | j  S rB   )r  r   r   r@   r@   rA   r     r   z%Glm4MoeForCausalLM.get_expert_mappingr  )rG   rH   rI   packed_modules_mappingfall_back_to_pt_during_loadr	   rK   r8   rj   r   r   r   rF   r  r   r   r   r   r  rJ   r   rM   r@   r@   r>   rA   r  q  s:    -

$&r  rO   r   r0   c                 C   sN   t | dr%| jdkr%| j}t| jD ]}d||  d|v r$||   S qd S )Nnum_nextn_predict_layersr   zlayers.r   )hasattrr  r   range)rO   r   r   ir@   r@   rA   r     s   

r   )I__doc__r   collections.abcr   r   	itertoolsr   rj   r   transformers.models.glm4_moer   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r    r!   utilsr"   r#   r$   r%   r&   r'   rG   loggerModuler(   rN   r   r   r   r  r  rK   rJ   r   r@   r@   r@   rA   <module>   s^    	'qbJ <Z