o
    -iL)                     @   sr  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z' ddl%m(Z( ddl)m*Z*m+Z+m,Z, G dd dej-Z.G dd dej-Z/de/iZ0e
ddddddG dd  d e(Z1G d!d" d"ej-e#e$Z2dS )#zDInference-only GLM-4-0414 model compatible with HuggingFace weights.    )IterableN)nn)
Glm4Config)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)RMSNorm)QKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHead)IntermediateTensors)AttentionType   )SupportsLoRA
SupportsPP)LlamaMLP)
LlamaModel)AutoWeightsLoaderPPMissingLayermaybe_prefixc                       s   e Zd Zddddddejfdedededed	ed
edB dededB de	dB de
de
ddf fddZdejdejdejfddZ  ZS )Glm4Attentioni   NF confighidden_size	num_headsnum_kv_headsmax_positionhead_dimqkv_biascache_configquant_configprefix	attn_typereturnc              
      sP  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J |jdd t	d| j| | _
|pN|| j | _| j| j | _| j
| j | _| jd | _t|| j| j| j||	|
 dd| _t| j| j |d|	|
 d	d| _t| j||jdd
| _t| j| j| j| j
||	|
 d|d| _d S )Nr   partial_rotary_factorg      ?r   g      z	.qkv_proj)biasr&   r'   Fz.o_proj)r"   rope_parametersis_neox_stylez.attn)r!   r%   r&   r'   r(   )super__init__r   r
   total_num_headsr    total_num_kv_headsr,   
setdefaultmaxr!   r#   q_sizekv_sizescalingr   qkv_projr   o_projr   
rotary_embr   attn)selfr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   tp_size	__class__ \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/glm4.pyr/   4   s`   

	
zGlm4Attention.__init__	positionshidden_statesc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)r7   splitr4   r5   r9   r:   r8   )
r;   rA   rB   qkv_qkvattn_outputoutputr?   r?   r@   forwardx   s    zGlm4Attention.forward)__name__
__module____qualname__r   DECODERr   intboolr   r   strr/   torchTensorrM   __classcell__r?   r?   r=   r@   r   3   sP    	
Dr   c                
       sh   e Zd Z		ddedededB ddf fddZd	ejd
ejdejdB de	ejejf fddZ
  ZS )Glm4DecoderLayerr   Nvllm_configr'   r   r)   c                    s   t    |p
|jj}|j}|j}|j| _t|| j|j|j	|j
t|ddt|dd ||| dtjd| _t| j|j|j|| dd| _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )	Nattention_biasFr#   z
.self_attn)r   r   r    r"   r!   r$   r#   r%   r&   r'   r(   z.mlp)r   intermediate_size
hidden_actr&   r'   )eps)r.   r/   model_config	hf_configr%   r&   r   r   num_attention_headsmax_position_embeddingsnum_key_value_headsgetattrr   rQ   	self_attnGlm4MLPr[   r\   mlpr   rms_norm_epsinput_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernorm)r;   rY   r'   r   r%   r&   r=   r?   r@   r/      sB   


zGlm4DecoderLayer.__init__rA   rB   residualc                 C   sl   |d u r|}|  |}n|  ||\}}| j||d}| |}| ||\}}| |}| |}||fS )N)rA   rB   )rh   rd   rj   ri   rf   rk   )r;   rA   rB   rl   r?   r?   r@   rM      s   


zGlm4DecoderLayer.forward)r   N)rN   rO   rP   r   rT   r   r/   rU   rV   tuplerM   rW   r?   r?   r=   r@   rX      s*    +rX   	attentionrC   )	input_idsrA   intermediate_tensorsinputs_embeds)dynamic_arg_dimsc                       s,   e Zd Zdddedef fddZ  ZS )	Glm4Modelr   r'   rY   r'   c                   s   t  j||td d S )N)rY   r'   
layer_type)r.   r/   rX   )r;   rY   r'   r=   r?   r@   r/      s   
zGlm4Model.__init__)rN   rO   rP   r   rT   r/   rW   r?   r?   r=   r@   rs      s    $	rs   c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			ddejdejde
dB dejdB deje
B f
ddZdejdejdB fddZdeeeejf  dee fddZ  ZS )Glm4ForCausalLM)q_projk_projv_proj	gate_projup_proj)r7   gate_up_projr   rt   rY   r'   c                   s   t    |jj}|j}|| _|| _t|t|dd| _t	 j
r8|jr)| jj| _nt|j|j|t|dd| _nt | _t|j| _| jj| _d S )Nmodel)rY   r'   lm_head)r&   r'   )r.   r/   r^   r_   r&   r   rs   r   r}   r	   is_last_ranktie_word_embeddingsembed_tokensr~   r   
vocab_sizer   r   r   logits_processormake_empty_intermediate_tensors)r;   rY   r'   r   r&   r=   r?   r@   r/      s*   


zGlm4ForCausalLM.__init__ro   r)   c                 C   s   | j |S N)r}   embed_input_ids)r;   ro   r?   r?   r@   r     s   zGlm4ForCausalLM.embed_input_idsNrA   rp   rq   c                 C   s   |  ||||}|S r   )r}   )r;   ro   rA   rp   rq   rB   r?   r?   r@   rM     s   zGlm4ForCausalLM.forwardrB   c                 C   s   |  | j|}|S r   )r   r~   )r;   rB   logitsr?   r?   r@   compute_logits  s   zGlm4ForCausalLM.compute_logitsweightsc                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r   r   r   load_weights)r;   r   loaderr?   r?   r@   r   "  s
   
zGlm4ForCausalLM.load_weights)NN)rN   rO   rP   packed_modules_mappingr   rT   r/   rU   rV   r   r   rM   r   r   rm   setr   rW   r?   r?   r=   r@   rv      s6    

,rv   )3__doc__collections.abcr   rU   r   transformersr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   vllm.sequencer   vllm.v1.attention.backendr   
interfacesr   r   llamar   re   r   utilsr   r   r   Moduler   rX   ALL_DECODER_LAYER_TYPESrs   rv   r?   r?   r?   r@   <module>   sD   RH