o
    -iD                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 G dd dej6Z7G dd dej6Z8G dd dej6Z9G dd dej6Z:eG d d! d!ej6e.Z;G d"d# d#ej6Z<G d$d% d%e<e,e-e.Z=dS )&z;Inference-only ChatGLM model compatible with THUDM weights.    N)Iterable)islice)nn)	LayerNorm)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors)ChatGLMConfig   )SupportsLoRA
SupportsPPSupportsQuant)AutoWeightsLoaderWeightsMapperis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                	       sZ   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )GLMAttentionN configcache_configquant_configprefixc           
   	      s  t    |j| _t }|j| _| j| dksJ | j| | _|j| _|jr)|jn|j| _	| j	|kr<| j	| dks;J n	|| j	 dksEJ t
d| j	| | _|j| j | _| j| j | _| j| j | _| jd | _t| j| j| j| j	|jpw|j|| dd| _t| j| j |j|j|| dd| _t|dd}t|d	d
}dd| dd}|j }	t| j|||	d| _t| j| j| j| j||| dd| _d S )Nr   r   g      z.query_key_valuebiasr'   r(   z.dense
rope_ratiog      ?
seq_length    defaulti'  g      ?)	rope_type
rope_thetapartial_rotary_factor)max_positionrope_parametersis_neox_stylez.attn)num_kv_headsr&   r'   r(   )super__init__hidden_sizer   num_attention_headstotal_num_heads	num_headsmulti_query_attentionmulti_query_group_numtotal_num_kv_headsmaxr5   head_dimq_sizekv_sizescalingr   add_bias_linearadd_qkv_biasquery_key_valuer   densegetattroriginal_roper   
rotary_embr   attn)
selfr%   r&   r'   r(   tp_sizer+   max_positionsr3   r4   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/chatglm.pyr7   1   sr   


	
	zGLMAttention.__init__hidden_statesposition_idsreturnc           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )N)dim)rF   splitrA   rB   rJ   rK   rG   )
rL   rS   rT   qkv_qkvcontext_layerattn_outputrQ   rQ   rR   forward~   s    zGLMAttention.forwardNNr$   )__name__
__module____qualname__r   r   r   strr7   torchTensorr`   __classcell__rQ   rQ   rO   rR   r#   0   s(    Mr#   c                       s@   e Zd ZdZ		ddededB def fddZd	d
 Z  Z	S )GLMMLPzMLP.

    MLP will take the input with h hidden state, project it to 4*h
    hidden dimension, perform nonlinear transformation, and project the
    state back into h hidden dimension.
    Nr$   r%   r'   r(   c                    sd   t    |j| _t|j|jgd |j|| dd| _t | _	t
|j|j|j|| dd| _d S )N   z.dense_h_to_4hr)   z.dense_4h_to_h)r6   r7   rD   add_biasr   r8   ffn_hidden_sizedense_h_to_4hr   activation_funcr   dense_4h_to_h)rL   r%   r'   r(   rO   rQ   rR   r7      s"   

zGLMMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)rm   rn   ro   )rL   rS   intermediate_parallelrZ   outputrQ   rQ   rR   r`      s   
zGLMMLP.forward)Nr$   )
rb   rc   rd   __doc__r   r   re   r7   r`   rh   rQ   rQ   rO   rR   ri      s    
ri   c                	       s^   e Zd ZdZ			ddededB dedB def fdd	Zd
e	j
de	j
de	j
fddZ  ZS )GLMBlockzA single transformer layer.

    Transformer layer takes input with size [s, b, h] and returns an
    output of the same size.
    Nr$   r%   r&   r'   r(   c                    s   t    |j| _|j| _|jrtnt}||j|jd| _	t
|||| dd| _|j| _||j|jd| _t||| dd| _d S )Nepsz.self_attentionr(   z.mlp)r6   r7   (apply_residual_connection_post_layernormfp32_residual_connectionrmsnormr   r   r8   layernorm_epsiloninput_layernormr#   self_attentionhidden_dropoutpost_attention_layernormri   mlprL   r%   r&   r'   r(   layer_norm_funcrO   rQ   rR   r7      s    
zGLMBlock.__init__rS   rT   rU   c                 C   s\   |  |}| j||d}| jr|}n|}|| }| |}| jr#|}n|}| || }|S )NrS   rT   )r|   r}   rx   r   r   )rL   rS   rT   layernorm_outputattention_outputresiduallayernorm_inputrr   rQ   rQ   rR   r`      s   

zGLMBlock.forwardra   )rb   rc   rd   rs   r   r   r   re   r7   rf   rg   r`   rh   rQ   rQ   rO   rR   rt      s*    	"rt   c                	       sb   e Zd ZdZ			ddededB dedB def fdd	Zd
e	j
de	j
de	j
eB fddZ  ZS )GLMTransformerzTransformer class.Nr$   r%   r&   r'   r(   c                    s   t    j| _j| _t| j fdd| dd\| _| _| _| jr7jr,t	nt
}|jjd| _tdgj| _d S )Nc                    s   t  | dS )Nrw   )rt   rw   r&   r%   r'   rQ   rR   <lambda>  s    z)GLMTransformer.__init__.<locals>.<lambda>z.layersrw   ru   rS   )r6   r7   post_layer_norm
num_layersr!   start_layer	end_layerlayersrz   r   r   r8   r{   final_layernormr    make_empty_intermediate_tensorsr   rO   r   rR   r7   
  s    

zGLMTransformer.__init__rS   rT   rU   c                 C   sL   t | j| j| jD ]}|||d}q	t jstd|iS | jr$| |}|S )Nr   rS   )	r   r   r   r   r
   is_last_rankr   r   r   )rL   rS   rT   layerrQ   rQ   rR   r`   )  s   
zGLMTransformer.forwardra   )rb   rc   rd   rs   r   r   r   re   r7   rf   rg   r   r`   rh   rQ   rQ   rO   rR   r     s*    r   c                       s   e Zd ZdddgiZdddedef fdd	Zd
ejdejfddZ			dd
ejdejde
dB dejdB dedeje
B fddZdeeeejf  dee fddZ  ZS )ChatGLMModellinear_proj.merged_projlinear_proj.gate_projlinear_proj.dense_h_to_4hr$   rw   vllm_configr(   c                   s   t    |jj}|j}|j}|| _t|j|j	|| dd| _
|j| _|j| _|j| _t|||| dd| _t|j|j	|| dd| _| jj| _d S )Nz
.embedding)r'   r(   z.encoderrw   z.output_layer)r6   r7   model_config	hf_configr&   r'   r%   r   padded_vocab_sizer8   	embeddingr   r=   kv_channelsr   encoderr   output_layerr   )rL   r   r(   r%   r&   r'   rO   rQ   rR   r7   F  s2   
zChatGLMModel.__init__	input_idsrU   c                 C   s
   |  |S rp   )r   rL   r   rQ   rQ   rR   embed_input_idsh  s   
zChatGLMModel.embed_input_idsN	positionsintermediate_tensorsinputs_embedskwargsc                 K   sH   t  jr|d ur|}n| |}n
|d usJ |d }| j||d}|S )NrS   r   )r
   is_first_rankr   r   )rL   r   r   r   r   r   rS   rQ   rQ   rR   r`   k  s   zChatGLMModel.forwardweightsc                 C   s   ddg}t |  }t }|D ]^\}}|D ].\}}}	||vrq|||}|dr/||vr/qt|| r5q|| }
|
j}||
||	  n$d|v rIq|drS||vrSqt|| rYq|| }
t|
dt}||
| |	| q|S )N)r   r   r   )r   r   r   z.biaszrotary_pos_emb.inv_freqweight_loader)
dictnamed_parameterssetreplaceendswithr   r   rH   r   add)rL   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   rQ   rQ   rR   load_weights  s:   


zChatGLMModel.load_weightsNN)rb   rc   rd   packed_modules_mappingr	   re   r7   rf   rg   r   r   objectr`   r   tupler   r   rh   rQ   rQ   rO   rR   r   =  s0    "
,r   c                	       s   e Zd ZeddidZdeddededee dd	f fd
dZ	de
jde
jfddZde
jde
jd	B fddZdeeee
jf  fddZ  ZS )ChatGLMBaseModelz.word_embeddingsr$   )orig_to_new_substr)r(   transformer_typer   r(   r   rU   Nc                   s   t    |jj}|j}|jj}|| _|| _|| _t|dd| _||t	|dd| _
| jjr6| j
jj| j
j_| j
j| _t|j| _| j
j| _d S )Nmax_sequence_lengthr-   transformerr   r(   )r6   r7   r   r   r'   multimodal_configr%   rH   max_position_embeddingsr"   r   tie_word_embeddingsr   weightr   lm_headr   r   logits_processorr   )rL   r   r(   r   r%   r'   r   rO   rQ   rR   r7     s"   


zChatGLMBaseModel.__init__r   c                 C   s   | j |S rp   )r   r   r   rQ   rQ   rR   r     s   z ChatGLMBaseModel.embed_input_idsrS   c                 C   s   |  | j|}|S rp   )r   r   )rL   rS   logitsrQ   rQ   rR   compute_logits  s   zChatGLMBaseModel.compute_logitsr   c                 C   s   t | }|j|| jdS )N)mapper)r   r   hf_to_vllm_mapper)rL   r   loaderrQ   rQ   rR   r     s   zChatGLMBaseModel.load_weights)rb   rc   rd   r   r   r   r	   re   typer7   rf   rg   r   r   r   r   r   rh   rQ   rQ   rO   rR   r     s,    
$r   c                       sr   e Zd ZdgdgdZdddedef fdd	Z	
	
ddejdejde	d
B dejd
B deje	B f
ddZ
  ZS )ChatGLMForCausalLMrF   rm   )rF   rm   r$   rw   r   r(   c                   sF   |j j}t|drddgi}tdt| dt j||d d S )Nvision_configarchitecturesGLM4VForCausalLMzThe configuration of this model indicates that it supports vision inputs, but you instantiated the text-only version of this model. Please use the vision model by setting `--hf-overrides 'z'`r   )r   r   hasattrRuntimeErrorjsondumpsr6   r7   )rL   r   r(   r%   hf_overridesrO   rQ   rR   r7     s   

zChatGLMForCausalLM.__init__Nr   r   r   r   rU   c                 C   s   |  ||||}|S rp   )r   )rL   r   r   r   r   rS   rQ   rQ   rR   r`     s   zChatGLMForCausalLM.forwardr   )rb   rc   rd   r   r	   re   r7   rf   rg   r   r`   rh   rQ   rQ   rO   rR   r     s$    r   )>rs   r   collections.abcr   	itertoolsr   rf   r   torch.nnr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   vllm.transformers_utils.configsr   
interfacesr   r   r   utilsr   r   r   r    r!   r"   Moduler#   ri   rt   r   r   r   r   rQ   rQ   rQ   rR   <module>   s>    
[/M6k0