o
    i|8                     @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z* ddl(m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 G dd dej1Z2G dd dej1Z3de3iZ4eddddddG d d! d!e+Z5G d"d# d#ej1e&e'Z6d$ed%e7d&e8dB fd'd(Z9dS ))zDInference-only GLM-4-0414 model compatible with HuggingFace weights.    )IterableN)nn)
Glm4Config)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)	Attention)RMSNorm)QKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHead)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors)AttentionType   )SupportsLoRA
SupportsPP)LlamaMLP)
LlamaModel)AutoWeightsLoaderPPMissingLayeris_pp_missing_parametermaybe_prefixc                       s   e Zd Zddddddejfdedededed	ed
edB dededB de	dB de
de
ddf fddZdejdejdejfddZ  ZS )Glm4Attentioni   NF confighidden_size	num_headsnum_kv_headsmax_positionhead_dimqkv_biascache_configquant_configprefix	attn_typereturnc              
      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J t|dd }t|t	rQd|v rQ|j
d|d  n|j
dd td| j| | _|pg|| j | _| j| j | _| j| j | _| jd | _t|| j| j| j||	|
 dd| _t| j| j |d	|	|
 d
d| _t| j||j
d	d| _t| j| j| j| j||	|
 d|d| _d S )Nr   rope_parameterspartial_rotary_factorg      ?r   g      	.qkv_proj)biasr)   r*   Fz.o_proj)r%   r-   is_neox_stylez.attn)r$   r(   r)   r*   r+   )super__init__r"   r	   total_num_headsr#   total_num_kv_headsgetattr
isinstancedictr-   
setdefaultmaxr$   r&   q_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr
   attn)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   tp_sizerope_params	__class__ U/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/glm4.pyr3   =   sj   

	
zGlm4Attention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)r>   splitr;   r<   r@   rA   r?   )
rB   rI   rJ   qkv_qkvattn_outputoutputrG   rG   rH   forward   s    zGlm4Attention.forward)__name__
__module____qualname__r   DECODERr   intboolr   r   strr3   torchTensorrU   __classcell__rG   rG   rE   rH   r   <   sP    	
Lr   c                
       sh   e Zd Z		ddedededB ddf fddZd	ejd
ejdejdB de	ejejf fddZ
  ZS )Glm4DecoderLayerr    Nvllm_configr*   r!   r,   c                    s   t    |p
|jj}|j}|j}|j| _t|| j|j|j	|j
t|ddt|dd ||| dtjd| _t| j|j|j|| dd| _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )	Nattention_biasFr&   z
.self_attn)r!   r"   r#   r%   r$   r'   r&   r(   r)   r*   r+   z.mlp)r"   intermediate_size
hidden_actr)   r*   )eps)r2   r3   model_config	hf_configr(   r)   r"   r   num_attention_headsmax_position_embeddingsnum_key_value_headsr6   r   rY   	self_attnGlm4MLPrc   rd   mlpr   rms_norm_epsinput_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernorm)rB   ra   r*   r!   r(   r)   rE   rG   rH   r3      sB   


zGlm4DecoderLayer.__init__rI   rJ   residualc                 C   sl   |d u r|}|  |}n|  ||\}}| j||d}| |}| ||\}}| |}| |}||fS )N)rI   rJ   )ro   rk   rq   rp   rm   rr   )rB   rI   rJ   rs   rG   rG   rH   rU      s   


zGlm4DecoderLayer.forward)r    N)rV   rW   rX   r   r\   r   r3   r]   r^   tuplerU   r_   rG   rG   rE   rH   r`      s*    +r`   	attentionrK   )	input_idsrI   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       sP   e Zd Zdddedef fddZdeeeej	f  de
e fd	d
Z  ZS )	Glm4Modelr    r*   ra   r*   c                   s   t  j||td d S )N)ra   r*   
layer_type)r2   r3   r`   )rB   ra   r*   rE   rG   rH   r3      s   
zGlm4Model.__init__weightsr,   c                 C   s  g d}t |  }t }|D ]\}}t| j|}|d urqd|v r#qd|v s+d|v r,q| jd urZ| j| }rZ|| }	t|	dt}
|	 dkrK|n|d }|
|	| |
| qd|v sbd|v rlt||}|d u rlq|D ].\}}}||vrxqn|||}|d	r||vrqnt|| rqn|| }	|	j}
|
|	||  n|d	r||vrqt|| rq|| }	t|	dt}
|
|	| |
| q|S )
N))r/   z.q_projrP   )r/   z.k_projrQ   )r/   z.v_projrR   ).gate_up_projz
.gate_projr   )r~   z.up_projr   zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedweight_loaderr   scale
zero_pointz.bias)r8   named_parametersset#get_spec_layer_idx_from_weight_namer!   r)   get_cache_scaler6   r   rL   addr   replaceendswithr   r   )rB   r}   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
spec_layer
scale_nameparamr   
param_nameweight_nameshard_idrG   rG   rH   load_weights   s\   






zGlm4Model.load_weights)rV   rW   rX   r   r\   r3   r   rt   r]   r^   r   r   r_   rG   rG   rE   rH   rz      s    	,rz   c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			ddejdB dejde
dB dejdB deje
B f
ddZdejdejdB fddZdeeeejf  dee fddZ  ZS )Glm4ForCausalLM)q_projk_projv_proj	gate_projup_proj)r>   gate_up_projr    r{   ra   r*   c                   s   t    |jj}|j}|| _|| _t|t|dd| _t	 j
r8|jr)| jj| _nt|j|j|t|dd| _nt | _t|j| _| jj| _d S )Nmodel)ra   r*   lm_head)r)   r*   )r2   r3   rf   rg   r)   r!   rz   r   r   r   is_last_ranktie_word_embeddingsembed_tokensr   r   
vocab_sizer"   r   r   logits_processormake_empty_intermediate_tensors)rB   ra   r*   r!   r)   rE   rG   rH   r3   A  s*   


zGlm4ForCausalLM.__init__rv   r,   c                 C   s   | j |S N)r   embed_input_ids)rB   rv   rG   rG   rH   r   `  s   zGlm4ForCausalLM.embed_input_idsNrI   rw   rx   c                 C   s   |  ||||}|S r   )r   )rB   rv   rI   rw   rx   rJ   rG   rG   rH   rU   c  s   zGlm4ForCausalLM.forwardrJ   c                 C   s   |  | j|}|S r   )r   r   )rB   rJ   logitsrG   rG   rH   compute_logitso  s   zGlm4ForCausalLM.compute_logitsr}   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   r!   r   r   )rB   r}   loaderrG   rG   rH   r   v  s
   
zGlm4ForCausalLM.load_weights)NN)rV   rW   rX   packed_modules_mappingr   r\   r3   r]   r^   r   r   rU   r   r   rt   r   r   r_   rG   rG   rE   rH   r   4  s6    

,r   r!   r   r,   c                 C   sN   t | dr%| jdkr%| j}t| jD ]}d||  d|v r$||   S qd S )Nnum_nextn_predict_layersr   zlayers..)hasattrr   num_hidden_layersrange)r!   r   	layer_idxirG   rG   rH   r   ~  s   

r   ):__doc__collections.abcr   r]   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr   r	   $vllm.model_executor.layers.attentionr
   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   vllm.v1.attention.backendr   
interfacesr   r   llamar   rl   r   utilsr   r   r   r   Moduler   r`   ALL_DECODER_LAYER_TYPESrz   r   r\   rZ   r   rG   rG   rG   rH   <module>   sT   ZHJJ