"""Inference-only ChatGLM model compatible with THUDM weights."""

from typing import Iterable, Optional, Tuple

import torch
from torch import nn
from torch.nn import LayerNorm

from sglang.srt.configs import ChatGLMConfig
from sglang.srt.distributed import get_tensor_model_parallel_world_size
from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix

LoraConfig = None


class GLMAttention(nn.Module):
    def __init__(
        self,
        config,
        layer_id: int = 0,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.hidden_size = config.hidden_size
        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = config.num_attention_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size
        self.multi_query_attention = config.multi_query_attention
        self.total_num_kv_heads = (
            config.multi_query_group_num
            if config.multi_query_attention
            else config.num_attention_heads
        )
        if self.total_num_kv_heads >= tp_size:
            # More KV heads than ranks: partition the KV heads across ranks.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Fewer KV heads than ranks: replicate the KV heads across ranks.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = config.hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5

        self.query_key_value = QKVParallelLinear(
            self.hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=config.add_bias_linear or config.add_qkv_bias,
            quant_config=quant_config,
            prefix=add_prefix("query_key_value", prefix),
        )
        self.dense = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            config.hidden_size,
            bias=config.add_bias_linear,
            quant_config=quant_config,
            prefix=add_prefix("dense", prefix),
        )

        rope_ratio = getattr(config, "rope_ratio", 1.0)
        max_positions = getattr(config, "seq_length", 8192)
        # ChatGLM applies interleaved (non-NeoX style) rotary embeddings over
        # half of the head dimension.
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim // 2,
            max_position=max_positions,
            base=10000 * rope_ratio,
            is_neox_style=False,
        )
        self.attn = RadixAttention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            layer_id=layer_id,
            quant_config=quant_config,
            prefix=add_prefix("attn", prefix),
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_ids: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        qkv, _ = self.query_key_value(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(position_ids, q, k)
        context_layer = self.attn(q, k, v, forward_batch)
        attn_output, _ = self.dense(context_layer)
        return attn_output
}|
S )N)dim)r   splitr6   r7   r<   r)   r    )r=   rE   rF   rG   qkv_qkvcontext_layerattn_outputrC   rC   rD   forwardx   s    zGLMAttention.forward)r   Nr   )__name__
__module____qualname__intr   r   strr,   torchTensorr   rS   __classcell__rC   rC   rA   rD   r   1   s(    Fr   c                       s<   e Zd ZdZ		d
dee def fddZdd	 Z  Z	S )GLMMLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    Nr   r   r   c                    sd   t    |j| _t|j|jgd |j|td|d| _t	 | _
t|j|j|j|td|d| _d S )Nr$   dense_h_to_4hr   dense_4h_to_h)r+   r,   r9   add_biasr   r-   ffn_hidden_sizer   r]   r	   activation_funcr   r^   r=   r>   r   r   rA   rC   rD   r,      s"   

zGLMMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r]   ra   r^   )r=   rE   intermediate_parallelrM   outputrC   rC   rD   rS      s   
zGLMMLP.forwardNr   )
class GLMBlock(nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    """

    def __init__(
        self,
        config,
        layer_id: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.apply_residual_connection_post_layernorm = (
            config.apply_residual_connection_post_layernorm
        )
        self.fp32_residual_connection = config.fp32_residual_connection

        layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
        # Layernorm on the input data.
        self.input_layernorm = layer_norm_func(
            config.hidden_size, eps=config.layernorm_epsilon
        )

        # Self attention.
        self.self_attention = GLMAttention(
            config,
            layer_id,
            quant_config,
            prefix=add_prefix("self_attention", prefix),
        )
        self.hidden_dropout = config.hidden_dropout

        # Layernorm on the attention output.
        self.post_attention_layernorm = layer_norm_func(
            config.hidden_size, eps=config.layernorm_epsilon
        )

        # MLP.
        self.mlp = GLMMLP(config, quant_config, prefix=add_prefix("mlp", prefix))

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_ids: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        # Layer norm at the beginning of the transformer layer.
        layernorm_output = self.input_layernorm(hidden_states)
        # Self attention.
        attention_output = self.self_attention(
            hidden_states=layernorm_output,
            position_ids=position_ids,
            forward_batch=forward_batch,
        )

        # Residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = hidden_states

        layernorm_input = residual + attention_output

        # Layer norm post the self attention.
        layernorm_output = self.post_attention_layernorm(layernorm_input)

        # Second residual connection.
        if self.apply_residual_connection_post_layernorm:
            residual = layernorm_output
        else:
            residual = layernorm_input

        output = self.mlp(layernorm_output) + residual
        return output
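
# Control flow of GLMBlock.forward, for reference. With
# apply_residual_connection_post_layernorm=False (the usual setting in
# released ChatGLM configs), the residual is taken from the un-normalized
# stream:
#
#     h1  = x  + attn(input_layernorm(x))
#     out = h1 + mlp(post_attention_layernorm(h1))
#
# With the flag set, the normalized activations feed the residuals instead.
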
zGLMBlock.forwardrf   )rT   rU   rV   rg   rW   r   r   rX   r,   rY   rZ   r   rS   r[   rC   rC   rA   rD   rh      s(    
"rh   c                       sT   e Zd ZdZ		ddee def fddZdej	d	ej	d
e
dej	fddZ  ZS )GLMTransformerzTransformer class.Nr   r   r   c                    sl   t     j| _ j| _t fddt| jD | _| jr4 jr't	nt
}| j jd| _d S d S )Nc              	      s(   g | ]}t  |td | dqS )zlayers.rl   )rh   r   ).0ir>   r   r   rC   rD   
<listcomp>  s    z+GLMTransformer.__init__.<locals>.<listcomp>ri   )r+   r,   post_layer_norm
num_layersr   
ModuleListrangelayersrp   r
   r   r-   rq   final_layernorm)r=   r>   r   r   ru   rA   r~   rD   r,     s   
zGLMTransformer.__init__rE   rF   rG   rH   c                 C   s<   t | jD ]}| j| }||||d}q| jr| |}|S rv   )r   r   r   r   r   )r=   rE   rF   rG   r}   layerrC   rC   rD   rS   ,  s   

zGLMTransformer.forwardrf   )rT   rU   rV   rg   r   r   rX   r,   rY   rZ   r   rS   r[   rC   rC   rA   rD   r{   	  s$     r{   c                       sP   e Zd Z		ddee def fddZdejdejd	e	d
ejfddZ
  ZS )ChatGLMMNr   r   r   c                    sn   t    t|j|jtd|d| _|j| _|j| _|j	| _	t
||td|| _t|j|jtd|d| _d S )N	embeddingrl   encoderoutput_layer)r+   r,   r   padded_vocab_sizer-   r   r   r   r2   kv_channelsr{   r   r   r   rb   rA   rC   rD   r,   A  s"   
zChatGLMM.__init__	input_idsrF   rG   rH   c                 C   s   |  |}| j|||d}|S rv   )r   r   )r=   r   rF   rG   inputs_embedsrE   rC   rC   rD   rS   \  s   
zChatGLMM.forwardrf   )rT   rU   rV   r   r   rX   r,   rY   rZ   r   rS   r[   rC   rC   rA   rD   r   @  s"    r   c                	       s   e Zd ZdgdgdZg dZi Zg Z		ddedee	 d	e
f fd
dZe dejdejdedejfddZdeee
ejf  fddZ  ZS )ChatGLMForCausalLMr   r]   )r   r]   )r   r    r]   r^   Nr   r>   r   r   c                    sR   t    || _|| _t|dd| _t||td|d| _| jj	| _
t|| _d S )Nmax_sequence_lengthr#   transformerrl   )r+   r,   r>   r   r;   max_position_embeddingsr   r   r   r   lm_headr   logits_processorrb   rA   rC   rD   r,   |  s   

zChatGLMForCausalLM.__init__r   	positionsrG   rH   c                 C   s    |  |||}| ||| j|S rc   )r   r   r   )r=   r   r   rG   rE   rC   rC   rD   rS     s   
zChatGLMForCausalLM.forwardweightsc                 C   sr   t | jdd}|D ],\}}d|v rq
d|v r|dd}|dr'||vr'q
|| }t|dt}||| q
d S )	NF)remove_duplicatezrotary_pos_emb.inv_freqword_embeddingsz.word_embeddingsr   z.biasweight_loader)dictnamed_parametersreplaceendswithr;   r   )r=   r   params_dictnameloaded_weightparamr   rC   rC   rD   load_weights  s   zChatGLMForCausalLM.load_weightsrf   )rT   rU   rV   packed_modules_mappingsupported_lora_modulesembedding_modulesembedding_padding_modulesr   r   r   rX   r,   rY   no_gradrZ   r   rS   r   r   r   r[   rC   rC   rA   rD   r   m  s6    $r   c                   @   s   e Zd ZdS )ChatGLMModelN)rT   rU   rV   rC   rC   rC   rD   r     s    r   )0rg   typingr   r   r   rY   r   torch.nnr   sglang.srt.configsr   sglang.srt.distributedr   sglang.srt.layers.activationr	   sglang.srt.layers.layernormr
   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   
LoraConfigModuler   r\   rh   r{   r   r   r   
EntryClassrC   rC   rC   rD   <module>   s6   Z/O7-:

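
# Usage sketch (hypothetical arguments; assumes a working SGLang install and
# access to the checkpoint):
#
#     import sglang as sgl
#
#     llm = sgl.Engine(model_path="THUDM/chatglm3-6b", trust_remote_code=True)
#     print(llm.generate("What is the capital of France?"))
#
# The model loader matches the checkpoint's `architectures` field
# ("ChatGLMModel") against the classes listed in EntryClass above to select
# this implementation.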