o
    پiX/                     @   sX  d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/ d dl0m1Z1 e. Z2e- Z3e 4e5Z6ee1Z7G dd de'Z8e8gZ9dS )    N)	lru_cache)IterableOptionalTuple)Glm4vMoeConfig)"get_moe_expert_parallel_world_size$get_tensor_model_parallel_world_size)get_pp_group)vision_utils)LogitsProcessor)get_moe_a2a_backend)FusedMoE)PoolerPoolingType)QuantizationConfig)PPMissingLayer)ParallelLMHead)default_weight_loader)Glm4MoeModel)Glm4vForConditionalGenerationGlm4vVisionModel)get_global_server_args)
add_prefixget_device_smis_cudalog_info_on_rank0)get_processorc                	   @   sV   e Zd Z		ddedee deddfddZd	d
 Zdde	e
eejf  fddZdS ) Glm4vMoeForConditionalGenerationN configquant_configprefixreturnc                 C   s  t j|  t | _|| _t j| _t	
| j t | _|| _d| _|   t||td|d| _t|j|td|| jd| _| jjre| jjdkrS| jjrS| jj| _nt|j|j|td|t jd| _nt | _t || _!t"t#j$d	d
| _%d| jj&v | _'d| _(d S )Nr   language_model)r!   visual)r    r!   use_data_parallel   lm_head)r    r!   use_attn_tp_groupT)pooling_type	normalizemrope_sectionF))nnModule__init__r	   pp_groupr   r   mm_enable_dp_encoderr%   r
   "update_vit_attn_dummy_heads_configr   tp_sizer    num_fused_shared_experts"determine_num_fused_shared_expertsr   r   modelr   vision_configr$   is_last_rank
world_sizetie_word_embeddingsembed_tokensr'   r   
vocab_sizehidden_sizeenable_dp_lm_headr   r   logits_processorr   r   LASTpoolerrope_scalingis_mrope_enabledcapture_aux_hidden_states)selfr   r    r!    rE   O/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/glm4v_moe.pyr.   &   sF   

	

z)Glm4vMoeForConditionalGeneration.__init__c                 C   s   t  jrd S d }t| jdd sd}n!tsd}ntr$td ur$tdk r$d}nt dkr,d}nt  r3d}|d urEd	t  _t	t
| d
 d S | jj| _| jdksSJ dt	t
d d S )Nn_shared_expertsz,No shared experts are defined in the config.z6Shared experts fusion currently requires CUDA devices.P   z2Shared experts fusion requires SM80 or newer GPUs.r&   zLShared experts fusion is not supported together with expert parallelism yet.zJShared experts fusion is not supported when Deepep MoE backend is enabled.Tz0 Shared experts fusion optimization is disabled.zLOnly 1 fused shared expert is supported for Glm4vMoeForConditionalGenerationz+Shared experts fusion optimization enabled.)r   disable_shared_experts_fusiongetattrr   _is_cuda
_device_smr   r   	is_deepepr   loggerrG   r3   )rD   disable_reasonrE   rE   rF   r4   Y   s2   


zCGlm4vMoeForConditionalGeneration.determine_num_fused_shared_expertsFweightsc              	   C   s,  |r%t | jdr!| jj}|dksJ d| jjdkrdn| jj}ntdg d}tjddd	| jj| j d
}|rBd| }g d}t	| 
 }	g }
|D ]F\}}|
| | jdkrjd|v rj|dd| jj }|st | jdr| jj}|dkr|dr|d}t|dkrt|d | jjkrqLn,||sqLd|v sd|v rqLd}|D ]}||v r||d}d} nq|r||d}d|v r|dd}d|v r|dd}d|v rqL|D ]3\}}}||vrqd|v rq|||}|dr||	vrq||	vrq|	| }|j}||||  nd}|D ]/}|\}}}}||vr$qd}|||}||	vr3q|	| }|j}||||||d   nN|rIqLd!|v rT|d"d#}|dr`||	vr`qL||	vrfqL||	 v r|	| }t|d$t}d!|v rt| j||}||| qLtd%| d& qLd S )'Nnum_nextn_predict_layersr&   zOnly 1 nextn layer is supportedr   z-num_nextn_predict_layers is not in the config))qkv_projq_projq)rR   k_projk)rR   v_projv)gate_up_proj	gate_projr   )rY   up_projr&   rZ   	down_projr[   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertszmodel.layers.)zshared_head.normeh_projenormhnormzmlp.shared_expertszmlp.experts.zmodel.layers.      zshared_head.headr:   Tr5   Fzmodel.decoderzlanguage_model.r   zmodel.visual.zvisual.zrotary_emb.inv_freqzmlp.expertsz.bias)shard_id	expert_idr$   z	attn.qkv.zattn.qkv_proj.weight_loaderz
Parameter z not found in params_dict)hasattrr   rQ   num_hidden_layers
ValueErrorr   make_expert_params_mappingn_routed_expertsr3   dictnamed_parametersappendreplace
startswithsplitlenintendswithri   keysrJ   r   r
   pad_vit_attn_dummy_headsrN   warning)rD   rP   is_nextnnum_nextn_layersnextn_layer_idstacked_params_mappingexpert_params_mappingnextn_layer_prefixnextn_spec_weight_namesparams_dictweight_namesnameloaded_weight	name_list
is_decoderweight_name
param_namerg   paramri   is_expert_weightmappingrh   rE   rE   rF   load_weightsw   s   	









z-Glm4vMoeForConditionalGeneration.load_weights)Nr   )F)__name__
__module____qualname__r   r   r   strr.   r4   r   r   torchTensorr   rE   rE   rE   rF   r   %   s    
3"r   ):logging	functoolsr   typingr   r   r   r   torch.nnr,   5transformers.models.glm4v_moe.configuration_glm4v_moer   sglang.srt.distributedr   r   %sglang.srt.distributed.parallel_stater	   sglang.srt.layers.attentionr
   "sglang.srt.layers.logits_processorr   sglang.srt.layers.moer   ,sglang.srt.layers.moe.fused_moe_triton.layerr   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   sglang.srt.layers.utilsr   *sglang.srt.layers.vocab_parallel_embeddingr   $sglang.srt.model_loader.weight_utilsr   sglang.srt.models.glm4_moer   sglang.srt.models.glm4vr   r   sglang.srt.server_argsr   sglang.srt.utilsr   r   r   r   &sglang.srt.utils.hf_transformers_utilsr   rK   rL   	getLoggerr   rN   cached_get_processorr   
EntryClassrE   rE   rE   rF   <module>   s:    
 
y