o
    پi[                     @   s  d Z ddlZddlmZmZmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2 dZ3e4e5Z6G dd de
j7Z8G dd de
j7Z9G dd de
j7Z:G dd de
j7Z;G dd de
j7Z<e<gZ=dS )zDInference-only GLM-4-0414 model compatible with HuggingFace weights.    N)AnyDictIterableOptionalTupleUnion)nn)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)is_dp_attention_enabled)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loaderkv_cache_scales_loader)
add_prefixmake_layersc                       sZ   e Zd Z			ddedededee ded	ed
df fddZ		ddefddZ	  Z
S )Glm4MLPN Thidden_sizeintermediate_size
hidden_actquant_configprefixreduce_resultsreturnc                    sj   t    t||gd d|td|d| _t||d|td||d| _|dkr/td| d	t | _	d S )
N   Fgate_up_projbiasr'   r(   	down_proj)r.   r'   r(   r)   siluzUnsupported activation: z!. Only silu is supported for now.)
super__init__r   r    r,   r   r/   
ValueErrorr   act_fn)selfr$   r%   r&   r'   r(   r)   	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/glm4.pyr2   >   s*   
	
zGlm4MLP.__init__Fuse_reduce_scatterc                 C   s.   |  |\}}| |}| j||d\}}|S )N)skip_all_reduce)r,   r4   r/   )r5   xforward_batchr:   gate_up_r8   r8   r9   forward]   s   

zGlm4MLP.forward)Nr#   T)NF)__name__
__module____qualname__intstrr   r   boolr2   r@   __classcell__r8   r8   r6   r9   r"   =   s0    "r"   c                       s   e Zd Z										dded	ed
edee dededeeeef  dedee	 dee
eef  dedededdf fddZdejdejdedejfddZ  ZS )Glm4AttentionNr   @B          ?Tr#   r$   	num_headsnum_kv_headshead_dimlayer_id
rope_thetarope_scalingmax_position_embeddingsr'   dual_chunk_attention_configpartial_rotary_factorr.   r(   r*   c              
      sf  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _|d urI|| _	n|| j | _	| j| j	 | _
| j| j	 | _| j	d | _|| _|| _|| _t|| j	| j| j||	td|d| _t| j| j	 |d|	td|d| _t| j	| j	||||
|dd| _t| j| j	| j| j||	td	|d
| _d S )Nr      g      qkv_projr-   Fo_proj)
rotary_dimmax_positionbaserQ   rS   rT   is_neox_styleattn)rM   rO   r'   r(   )r1   r2   r$   r   total_num_headsrL   total_num_kv_headsmaxrM   rN   q_sizekv_sizescalingrP   rR   rT   r   r    rV   r   rW   r   
rotary_embr   r\   )r5   r$   rL   rM   rN   rO   rP   rQ   rR   r'   rS   rT   r.   r(   tp_sizer6   r8   r9   r2   m   sn   

	

zGlm4Attention.__init__	positionshidden_statesr=   c                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )N)dim)rV   splitr`   ra   rc   r\   rW   )r5   re   rf   r=   qkvr?   qkvattn_outputoutputr8   r8   r9   r@      s    zGlm4Attention.forward)
Nr   rI   NrJ   NNrK   Tr#   )rA   rB   rC   rD   r   floatr   rE   r   r   dictrF   r2   torchTensorr   r@   rG   r8   r8   r6   r9   rH   l   sb    	
PrH   c                       s   e Zd ZdZ				ddededee ded	ee	j
j d
df fddZde	jde	jdedee	j d
ee	je	jf f
ddZ  ZS )Glm4DecoderLayerzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    r   Nr#   configrO   r'   r(   
alt_streamr*   c                    sN  t    |j| _t|dd }t|tr/|dt|dd}|dt|dd}t|dd }	nt|dd}t|dd }	t|dd}t|dd}
t|d	d
}t|dd }t|dd }t| j|j|j	||||	|||||
t
d|d| _t|j|j|j|t
d|d| _t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )Nrope_parametersrP   rI   rT   rK   rQ   attention_biasTrR   i   rN   rS   	self_attn)r$   rL   rM   rN   rO   rP   rQ   rR   r'   rS   rT   r.   r(   mlp)r%   r&   r'   r(   eps)r1   r2   r$   getattr
isinstancerq   getrH   num_attention_headsnum_key_value_headsr    ry   r"   r%   r&   rz   r   rms_norm_epsinput_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernorm)r5   ru   rO   r'   r(   rv   rprP   rT   rQ   r.   rR   rN   rS   r6   r8   r9   r2      sb   


zGlm4DecoderLayer.__init__re   rf   r=   residualc                 C   sn   |d u r|}|  |}n|  ||\}}| j|||d}| |}| ||\}}| |}| |}||fS )N)re   rf   r=   )r   ry   r   r   rz   r   )r5   re   rf   r=   r   r8   r8   r9   r@     s   


zGlm4DecoderLayer.forward)r   Nr#   N)rA   rB   rC   __doc__
Glm4ConfigrD   r   r   rE   rr   cudaStreamr2   rs   r   r   r@   rG   r8   r8   r6   r9   rt      s<    	
@rt   c                       s   e Zd Zddedfdedee dedee	j
 deejj ddf fd	d
Zde	jfddZ		ddejdejdedejdee deejef fddZdeddfddZ  ZS )	Glm4ModelNr#   ru   r'   r(   decoder_layer_typerv   r*   c                    s   t    | _j| _j| _t | _| jjr)t	jj
t td|d| _nt | _p0ttj fdd| jj| jjtd|d\| _| _| _| jjr]tj
jd| _ntdd	| _g | _d S )
Nembed_tokens)r'   use_attn_tp_groupr(   c                    s   | | dS )N)rO   ru   r'   r(   rv   r8   )idxr(   rv   ru   r   r'   r8   r9   <lambda>L  s    z$Glm4Model.__init__.<locals>.<lambda>layers)pp_rankpp_sizer(   r{   T)return_tuple)r1   r2   ru   pad_token_idpadding_idx
vocab_sizer	   pp_groupis_first_rankr   r$   r   r    r   r   rt   r!   num_hidden_layersrank_in_group
world_sizer   start_layer	end_layeris_last_rankr   r   normlayers_to_capture)r5   ru   r'   r(   r   rv   r6   r   r9   r2   /  s4   


zGlm4Model.__init__c                 C   s   | j S N)r   r5   r8   r8   r9   get_input_embeddings_  s   zGlm4Model.get_input_embeddings	input_idsre   r=   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }t| j| jD ]"}	|	| jv r>||d ur;|| n| | j|	 }
|
||||\}}q*| j j	sXt
||dS |jd dkrq|d u ri| |}n| ||\}}t|dkry|S ||fS )Nrf   r   )rf   r   r   )r   r   r   ranger   r   r   appendr   r   r   shaper   len)r5   r   re   r=   r   r   rf   r   aux_hidden_statesilayerr?   r8   r8   r9   r@   b  sD   


zGlm4Model.forwardquantization_param_pathc                 C   sv   t  }t }t|||| jj| jjjD ]%\}}t| j| t	j
s&| j| j}t|jdr5||j_||j_qtdd S )Nk_scalez8Self attention has no KV cache scaling factor attribute!)r   r
   r   ru   r   r7   
model_typer~   r   r   Identityry   hasattrr\   r   v_scaleRuntimeError)r5   r   rd   tp_rank	layer_idxscaling_factorlayer_self_attnr8   r8   r9   load_kv_cache_scales  s$   
zGlm4Model.load_kv_cache_scales)NN)rA   rB   rC   rt   r   r   r   rE   typer   Modulerr   r   r   r2   	Embeddingr   rs   r   r   r   r@   r   rG   r8   r8   r6   r9   r   .  sF    
0
6r   c                       s.  e Zd Z		d'dedee deddf fddZd	ej	dej	fd
dZ
dejfddZe 			d(d	ej	dej	dedej	dedee dej	fddZe 	d)d	ej	dej	dedeeef dej	f
ddZedd Zedd Zdeeeej	f  fddZd d! Zd"d# Zd$eddfd%d&Z  ZS )*Glm4ForCausalLMNr#   ru   r'   r(   r*   c                    s  t    t | _|| _|| _t||td|d| _| jj	r<| jj
dkr-|jr-| jj| _nt|j|j|td|d| _nt | _| jj
dkrw|jrw| jjr[| jj| jjj| jjd n| jj|j|jft| j j| jjd}| jj| t|| _ttj dd| _!d	| _"d S )
Nmodel)r'   r(   rU   lm_head)dst)sizedtypesrcT)pooling_type	normalizeF)#r1   r2   r	   r   ru   r'   r   r    r   r   r   tie_word_embeddingsr   r   r   r   r$   r   r   sendweight	last_rankrecvnext
parametersr   
first_rankcopy_r   logits_processorr   r   LASTpoolercapture_aux_hidden_states)r5   ru   r'   r(   emb_token_weightr6   r8   r9   r2     s>   




zGlm4ForCausalLM.__init__r   c                 C   s   | j |S r   )r   get_input_embedding)r5   r   r8   r8   r9   r     s   z#Glm4ForCausalLM.get_input_embeddingc                 C      | j jS r   )r   r   r   r8   r8   r9   r     s   z$Glm4ForCausalLM.get_input_embeddingsFre   r=   r   get_embeddingr   c           	      C   sV   | j |||||d}d }| jr|\}}| jjr)|s#| ||| j||S | ||S |S )N)r   )r   r   r   r   r   r   r   )	r5   r   re   r=   r   r   r   rf   r   r8   r8   r9   r@     s*   
zGlm4ForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr|d u r| j ||_n||_t||D ]}| j j| }	|	||j||j\|_|_q|| j jjkrS| j |j|j\}
}|
|_| 	||j| j
|}|S d }|S )Nr   )r   r   rf   r   r   r   ru   r   r   r   r   )r5   r   re   r=   r   r   startendr   r   rf   r?   resultr8   r8   r9   forward_split_prefill  s0   	z%Glm4ForCausalLM.forward_split_prefillc                 C   r   r   )r   r   r   r8   r8   r9   r   0     zGlm4ForCausalLM.start_layerc                 C   r   r   )r   r   r   r8   r8   r9   r   4  r   zGlm4ForCausalLM.end_layerweightsc                 C   sb  g d}t |  }|D ]\}}t|}|d ur+t| jdr+|| jjk s*|| jjkr+qd|v s3d|v r4q| jjrUd|v rU| j	j
dkrT| j	jrTttdd |d }|}nq|D ]-\}}	}
|	|vraqW||	|}|d	rq||vrqqW||vrvqW|| }|j}||||
  n)|d	r||vrq|| v r|| }t|d
t}||| qtd| d qd S )N))	.qkv_projz.q_projrk   )r   z.k_projrl   )r   z.v_projrm   ).gate_up_projz.up_projrU   )r   z
.gate_projr   r   zrotary_emb.inv_freq	projectorzlm_head.weightrU   c                 S   s   | d dkS )Nr   zmodel.embed_tokens.weightr8   )r<   r8   r8   r9   r   V  s    z.Glm4ForCausalLM.load_weights.<locals>.<lambda>z.biasweight_loaderz
Parameter z not found in params_dict)rq   named_parametersr   r   r   r   r   ru   r   r   r   r   r   filterreplaceendswithr   keysr}   r   loggerwarning)r5   r   stacked_params_mappingparams_dictnameloaded_weightrO   embed_token_weights
param_nameweight_nameshard_idparamr   r8   r8   r9   load_weights8  sX   	
zGlm4ForCausalLM.load_weightsc                 C   s   | j jj| jjfS r   )r   r   r   r   r   r8   r8   r9   get_embed_and_headw  s   z"Glm4ForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )r   r   r   r   rr   r   empty_cachesynchronize)r5   embedheadr8   r8   r9   set_embed_and_headz  s   

z"Glm4ForCausalLM.set_embed_and_headr   c                 C   s   | j | d S r   )r   r   )r5   r   r8   r8   r9   r     s   z$Glm4ForCausalLM.load_kv_cache_scales)Nr#   )NFNr   )rA   rB   rC   r   r   r   rE   r2   rr   rs   r   r   r   r   no_gradr   rF   r   r@   r   rD   r   propertyr   r   r   r   r   r   r   rG   r8   r8   r6   r9   r     sl    0"
(

?r   )>r   loggingtypingr   r   r   r   r   r   rr   r   sglang.srt.distributedr	   r
   r   sglang.srt.layers.activationr   sglang.srt.layers.dp_attentionr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr    r!   r   	getLoggerrA   r   r   r"   rH   rt   r   r   
EntryClassr8   r8   r8   r9   <module>   s:    
/_c 
Z