o
    -iZ                     @   s  d dl Z d dlmZmZ d dl mZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZ d d	lmZmZmZmZmZmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d dlImJZJ d dlKmLZL deMfddZNG dd  d ejOZPG d!d" d"ejOZQG d#d$ d$ejOZRG d%d& d&ejOZSG d'd( d(ejOZTG d)d* d*ejOZUeG d+d, d,ejOZVG d-d. d.ejOe9e8ZWG d/d0 d0eWe7ZXG d1d2 d2eWZYG d3d4 d4eYZZG d5d6 d6eXZ[G d7d8 d8eXZ\dS )9    N)CallableIterable)Any)nn)PretrainedConfig)	AttentionAttentionType)support_torch_compile)CacheConfigParallelConfig
VllmConfig)get_ep_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group tensor_model_parallel_all_gather)
SiluAndMul)StaticSinkAttention)SharedFusedMoE)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)
MLAModulesMultiHeadLatentAttentionWrapper)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)MixtureOfExpertsSupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixsequence_parallel_chunk)set_weight_attrs)current_platform)IntermediateTensors)set_default_rope_theta)FlashAttentionDiffKVBackendact_fnc                 C   s   | dkrt d|  dd S )NsiluzUnsupported activation: z!. Only silu is supported for now.)
ValueError)r5    r8   a/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/openpangu.pycheck_ffn_act_fnZ   s
   
r:   c                       sd   e Zd Z					ddededededB d	ed
ededdf fddZdej	dej	fddZ
  ZS )OpenPanguMLPNFT hidden_sizeintermediate_size
hidden_actquant_configbiasreduce_resultsprefixreturnc	           	   	      s^   t    t||gd |||| dd| _t||||||| dd| _t| t | _d S )N   .gate_up_proj)rA   r@   
disable_tprC   z
.down_proj)rA   r@   rB   rG   rC   )	super__init__r   gate_up_projr   	down_projr:   r   r5   )	selfr=   r>   r?   r@   rA   rB   is_sequence_parallelrC   	__class__r8   r9   rI   b   s(   

zOpenPanguMLP.__init__xc                 C   s   |  | | |d d S )Nr   )rK   r5   rJ   )rL   rP   r8   r8   r9   forward   s   zOpenPanguMLP.forward)NFTFr<   )__name__
__module____qualname__intstrr   boolrI   torchTensorrQ   __classcell__r8   r8   rN   r9   r;   a   s0    	
!r;   c                	       sN   e Zd Z		ddedededB def fddZd	ej	d
ej	fddZ
  ZS )OpenPanguMoENr<   configparallel_configr@   rC   c              	      s  t    t | _t j| _|j| _t j	| _
| j
 | _| j
 | _|j| _|j| _|j| _t|j t|j|jdd | dd| _t|drZ|jrZttj| jtjd| j_nd | j_|j }|j!| _!|j"| _#| j| _$| j$| j# | _%| j%| j | _&| j| j& | _'| j'| j& | _(|jd ur|j)|j }t*|j||j|| jd| dd| _+nd | _+t,di d| j+d	|jd
|j-d|jd|j)ddd|j.d|ddddddd| dddddd| jjd| j!d| j#d| j| _/d S ) NFz.gaterA   r@   rC   router_enable_expert_biasdtypez.shared_experts)r=   r>   r?   r@   rM   rB   rC   shared_expertsnum_expertstop_kr=   r>   rB   renormalizer@   use_grouped_topkTnum_expert_group   
topk_grouprC   z.expertsscoring_funcsigmoidrouted_scaling_factor      ?e_score_correction_biasenable_eplbnum_redundant_expertsrM   r8   )0rH   rI   r   tp_sizer   rank_in_grouptp_rankrl   r   device_groupep_grouprankep_ranksizeep_sizen_routed_expertsn_shared_expertsuse_sequence_parallel_moerM   r:   r?   r   r=   gatehasattrr_   r   	ParameterrX   emptyfloat32rn   eplb_configro   rp   n_redundant_expertsn_logical_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endmoe_intermediate_sizer;   rb   r   num_experts_per_toknorm_topk_probexperts)rL   r\   r]   r@   rC   r   r>   rN   r8   r9   rI      s   








	


zOpenPanguMoE.__init__hidden_statesrD   c           	      C   s   |j \}}|d|}| jrt|}| |\}}| j||d}|\}}| jd u r/|d u s/J |jtj	kr;|| j
9 }n| jd urM|d usFJ |d| j
 9 }| jd ur\|d usXJ ||7 }| jrkt|d}|d | }n| jdkrv| j|}|||S )N)r   router_logitsrm   r   rh   )shapeviewrM   r/   r}   r   rb   ra   rX   float16rl   r   rq   &maybe_all_reduce_tensor_model_parallel)	rL   r   
num_tokens
hidden_dimr   _fused_moe_outshared_outputfinal_hidden_statesr8   r8   r9   rQ      s:   




zOpenPanguMoE.forward)Nr<   )rR   rS   rT   r   r   r   rV   rI   rX   rY   rQ   rZ   r8   r8   rN   r9   r[      s"    Xr[   c                       s   e Zd Z				ddededededed	ed
edB dedededB dedB deddf fddZde	j
de	j
de	j
fddZ  ZS )OpenPanguMLAAttention    Nr<   r\   r=   	num_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimq_lora_rankkv_lora_rankmax_position_embeddingscache_configr@   rC   rD   c                    s  t    || _|| _|| _|| _|| | _|| _|| _|| _	t
 | _|| j dkr6td| d| j d|| j | _| jd | _|	| _|| _| jd ur~t| j| j| j	| j gd|| ddd	| _t| j|jd
| _t|| j| j d|| dd| _n$t| j| j| j d|| dd| _t| j| j	| j d|| dd| _t| j	|jd
| _t| j	| j| j| j  d|| dd| _t| j| j | jd|| dd| _t|dd |j d ddddd|	ddd	}t!||	|dd| _"t#| j| j| j"| j| jd ur| jnd | jd u r| jnd | jd ur| jnd | jd ur | jnd | jd u r*| jnd d dd d}t$| j| j| j| j| j| j| j| j	||
||| _%d S )Nr   z
num_heads  is not divisible by tp_size .      F.fused_qkv_a_projT)rA   r@   rC   rG   epsz	.q_b_projr^   .q_proj.kv_a_proj_with_mqaz
.kv_b_proj.o_proji'  )default_theta
rope_theta    rh   rm   yarndeepseek_yarn)	r   	beta_fast	beta_slowfactormscalemscale_all_dim original_max_position_embeddingstype	rope_typemax_positionrope_parametersis_neox_style)kv_a_layernorm	kv_b_proj
rotary_embo_projfused_qkv_a_projkv_a_proj_with_mqaq_a_layernormq_b_projq_projindexer	is_sparsetopk_indices_buffer)&rH   rI   r=   r   r   r   qk_head_dimr   r   r   r   rq   r7   num_local_headsscalingr   rC   r   r   r   rms_norm_epsr   r   r   r   r   r   r   r   r   r   r3   r   r    r   r   r   mla_attn)rL   r\   r=   r   r   r   r   r   r   r   r   r@   rC   r   mla_modulesrN   r8   r9   rI     s   







	
zOpenPanguMLAAttention.__init__	positionsr   c                 C   s   |  ||S N)r   )rL   r   r   r8   r8   r9   rQ     s   zOpenPanguMLAAttention.forward)r   NNr<   )rR   rS   rT   r   rU   r
   r   rV   rI   rX   rY   rQ   rZ   r8   r8   rN   r9   r     sP    	
 r   c                       s   e Zd Zddddddejfdedededed	ed
edB dedede	dB de
de
ddf fddZdejdejdejfddZded
edB ddfddZ  ZS )OpenPanguEmbeddedAttentionr   NFr<   r\   r=   r   num_kv_headsr   r@   rA   bias_o_projr   rC   	attn_typerD   c                    s  t    t|
}|| _t }|| _| j| dkr%td| j d| d| j| | _|| _| j|krF| j| dkrFtd| j d| d| j|k r^|| j dkr^td| d| j dt	d| j| | _
t|d	d }|d u rw| j| j }|| _| j| j | _| j
| j | _| jd
 | _|| _t|| j| j| j|||
 dd| _t| j| j ||||
 dd| _| j||d t|dr|j}t|tr|}nt|tr|t| }|| }ntt| dd }t| j| j| j| j
|	||||
 dd	| _d S )Nr   total_num_heads r   r   CNumber of KV heads is greater than TP size, but total_num_kv_heads z5Number of KV heads is less than TP size, but tp_size z( is not divisible by total_num_kv_heads rh   head_dimr   	.qkv_proj)r=   	head_sizetotal_num_headstotal_num_kv_headsrA   r@   rC   r   
input_sizeoutput_sizerA   r@   rC   )r@   interleaved_sliding_window1 for interleaved_sliding_window is not supported..attn)r   r   r@   per_layer_sliding_windowr   rC   )rH   rI   r*   r=   r   r   r7   r   r   maxr   getattrr   q_sizekv_sizer   r   r   qkv_projr   r   _init_rotary_embr~   r   
isinstancerU   listlenr   r   attn)rL   r\   r=   r   r   r   r@   rA   r   r   rC   r   	layer_idxrq   r   r   sliding_windowsw_idxrN   r8   r9   rI     s   







z#OpenPanguEmbeddedAttention.__init__r   r   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )Nr   dim)r   splitr   r   r   r   r   
rL   r   r   qkvr   qkvattn_outputoutputr8   r8   r9   rQ     s    z"OpenPanguEmbeddedAttention.forwardc                 C   sB   d}|o	|  dk}|r|jdkrd}t| j| j|j|d| _d S )NTggufPanguEmbeddedFr   )get_name
model_typer    r   r   r   r   )rL   r\   r@   r   is_ggufr8   r8   r9   r     s   z+OpenPanguEmbeddedAttention._init_rotary_emb)rR   rS   rT   r   DECODERr   rU   r   rW   r
   rV   rI   rX   rY   rQ   r   rZ   r8   r8   rN   r9   r     s^    	
d
r   c                       s   e Zd Zdddddddejfdedededed	eee	f dB d
ede
dB dedededB dededdf fddZdejdejfddZdejdejdejfddZded	eee	f dB de
dB ddfddZd ddZ  ZS )!OpenPanguSinkAttentionNr   Fr<   r\   r=   r   r   r   r   r@   rA   r   r   rC   r   rD   c                    sf  t    t|}|| _t | _t | _|| _| j| j dkr,t	d| j d| j d| j| j | _
|| _| j| jkrQ| j| j dkrQt	d| j d| j d| j| jk rdt	d| j d| j dtd	| j| j | _t|d
d | _t|dd | _t|dd | _| j| j | _| j
| j | _| j| j | _| j| j | _| jd | _|| _t|dd| _t|dd| _t|dd | _t|dd| _t|| j| j | j| j | j| j g||| dd| _t| j| j ||	|| dd| _t | j|j!d| _"| j#|||d t$|dr.|j%}t&|t'r|}nt&|t(r%|t)| }|| }nt	t*| dd }t+,| j t-| j
| j| j| j| j|
|||| dt+| jd| _.| jdkrt/j01t/j2| j| j| jft34 |j5d| _6t7| j6d	| j8d | jrt/j01t/j2| j| j| jft34 |j5d| _9t7| j9d	| j8d nt/j:| j| j| jft34 |j5d| _9| ;  d S )Nr   r   r   r   r   zNumber of KV heads z is less than TP size z*, KV heads replication is not support yet.rh   qk_nope_dimqk_rope_dim
v_channelsr   param_sink_numberparam_sink_with_valueFparam_sink_scalarparam_sink_of_head_dimr   )r   output_sizesrA   r@   rC   r   r   r   )r   r@   r   r   r   )	sink_lenr   r   r@   r   r   rC   attn_backendhead_size_v)devicera   )
output_dimweight_loader)<rH   rI   r*   r=   r   rq   r   rs   r   r7   r   r   r   r   r   r  r  r  r   r   k_sizev_sizer   r   r  r  r  param_sink_of_head_numr   r   r   r   r   r   k_layernormr   r~   r   r   rU   r   r   r   r4   set_head_size_vr   r   rX   r   r   r   r1   current_devicetorch_dtypeparam_sink_keyr0   r  param_sink_valuezerospost_weight_load)rL   r\   r=   r   r   r   r   r@   rA   r   r   rC   r   r   r   r   r   rN   r8   r9   rI   $  s
  








zOpenPanguSinkAttention.__init__paramloaded_weightc                 C   s  t |dd }t |dd}t |dd}|p|}t |dd}t |dd}|r)| |_|rVt|tjrVt|j}|d urN|| | j dksEJ || | j ||< |j	||j
d |j}	|d urp|sp|	j| }
| j|
 }||||
}t|jdkr||d	}|	j|jksJ |	| d S )
Nr  is_sharded_weightFuse_bitsandbytes_4bitis_gguf_weightis_gguf_weight_typer   r`   rh   )r   itemweight_typer   r   UninitializedParameterr   r   rq   materializera   datars   narrowr   reshapecopy_)rL   r  r  r  r  r  r  r   final_shape
param_data
shard_size	start_idxr8   r8   r9   r    s.   




z$OpenPanguSinkAttention.weight_loaderr   r   c           
   	   C   s   |  |\}}|j| j| j| jgdd\}}}| |d| j| j}| 	|||\}}|d| j}|d| j}| j
|||t|jd |jd | j | j gd}| |\}	}|	S )Nr   r   r   rh   )output_shape)r   r   r   r  r  r  r   r   r   r   r   rX   Sizer   r  r   r   r8   r8   r9   rQ     s     zOpenPanguSinkAttention.forwardc                 C   s.   d}d| j | j i}t| j| j||d| _d S )NFpartial_rotary_factorr   )r  r   r    r   r   )rL   r\   r   r@   r   r8   r8   r9   r     s   z'OpenPanguSinkAttention._init_rotary_embc                 C   s<   t | dr| jd ur| | j}n| j}| j|| j d S )Nr  )r~   r  r  r   update_sink_kvr  )rL   r  r8   r8   r9   r    s   z'OpenPanguSinkAttention.post_weight_loadrD   N)rR   rS   rT   r   r   r   rU   dictrV   r   r   rW   r
   rI   r   r   rX   rY   r  rQ   r   r  rZ   r8   r8   rN   r9   r  #  sn    	
 ,%

r  c                       sT   e Zd Zdedededdf fddZdejd	ejd
ejdB dejfddZ	  Z
S )OpenPanguDecoderLayerr\   rC   vllm_configrD   Nc                    s  t    |d u r|jj}|j}|j}|j}|j| _t|dd}t	|j
ddd }|| _t|do@t|do@t|do@t|d	| _t|d
oK|jdk| _| jrst|| j|j|j|j|jt|drd|jnd |j|||| dd| _n| jrt|ddpt|dd}	|	}
t|dr|j}	t|ddrtj}n	td|j dt|dd }|d u rd|jd}t|| j|jt|d|j||||	|
|| d|d| _n=t|ddpt|dd}	|	}
t|dr|j}	t|ddrtj}ntj}t || j|jt|d|j|||	|
|| d|d| _t|dd d ur$||j!kr$t"|||| dd| _#nt$| j|j%|j&|t|d d| dd!| _#t|d"d#| _'|j(| _(t|d$| j(| _!t)|j|j*d%| _+t)|j|j*d%| _,t- j.| _/t|d&d| _0| j0rt)|j|j*d%| _1t)|j|j*d%| _2d S d S )'Nr   r   r   )sepr   r   r   r   r   r  r   r   z
.self_attn)r\   r=   r   r   r   r   r   r   r   r   r@   rC   attention_biasFrA   qkv_bias	is_causalTz
is_causal=z' is not support for attention with sinkrope_scalingdefault)r   r   num_key_value_heads)r\   r=   r   r   r   r   r@   rA   r   r   rC   r   )r\   r=   r   r   r   r@   rA   r   r   rC   r   rz   z.mlp)r\   r]   r@   rC   mlp_bias)r=   r>   r?   r@   rA   rC   rl   rm   first_k_dense_replacer   sandwich_norm)3rH   rI   model_config	hf_configr   r@   r]   r=   r   rU   r   r   r~   use_mlar  use_sink_attentionr   num_attention_headsr   r   r   r   r   	self_attnr7  r   r   r7   r8  r   r  ENCODER_ONLYr   r=  r[   mlpr;   r>   r?   rl   num_hidden_layersr   r   input_layernormpost_attention_layernormr   rt   tp_groupr>  pre_mlp_layernormpost_mlp_layernorm)rL   r\   rC   r4  r   r@   r]   r   r   r6  r   r   r   rN   r8   r9   rI   &  s   








zOpenPanguDecoderLayer.__init__r   r   residualc                 C   s   |d u r|  }| |}n| ||\}}| j||d}| jd ur;|jtjkr;|d| j 9 }| jdkr;|d| j 9 }| jrL| 	|}| 
||\}}n| 	||\}}| |}| jd urqt| jtrq|jtjkrq|d| j 9 }| jry| |}||fS )N)r   r   rm   r   )clonerH  rD  rl   ra   rX   r   r   r>  rI  rK  rF  r   r;   rL  )rL   r   r   rM  r8   r8   r9   rQ     s8   






zOpenPanguDecoderLayer.forward)rR   rS   rT   r   rV   r   rI   rX   rY   rQ   rZ   r8   r8   rN   r9   r3  %  s(     r3  c                       s4  e Zd ZdZdddedef fddZdejd	ejfd
dZ		d"dejdejde
dB dejdB d	eje
B f
ddZdeeeeef  deeef dedejdee d	efddZdeeeeeef  deeef dedejdee deeef d	efddZdeeeejf  d	ee fddZd#d d!Z  ZS )$OpenPanguModelFr<   rC   r4  rC   c                   s   t    jj j}jj} | _|j| _ j	| _
 j| _t js* jr9t jr9t j j|| dd| _nt | _t j fdd| dd\| _| _| _t jrat j jd| _nt | _tdd	g j| _d S )
Nz.embed_tokensr@   rC   c                    s   t  | S r   )r3  rP  r\   r4  r8   r9   <lambda>  s    z)OpenPanguModel.__init__.<locals>.<lambda>z.layersrP  r   r   rM  )rH   rI   r?  r@  r@   r]   r   r\   rp   pad_token_idpadding_idx
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr"   r=   embed_tokensr)   r-   rG  start_layer	end_layerlayersr   r   normr,   make_empty_intermediate_tensors)rL   r4  rC   r@   r   rN   rR  r9   rI     s>   



zOpenPanguModel.__init__	input_idsrD   c                 C   s
   |  |S r   )rZ  rL   r`  r8   r8   r9   embed_input_ids  s   
zOpenPanguModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           
      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| jD ]}| j| }||||\}}q(t  jsCt||dS | 	||\}}	|S )Nr   rM  )r   rM  )
r   rW  rb  ranger[  r\  r]  rY  r2   r^  )
rL   r`  r   rc  rd  r   rM  ilayerr   r8   r8   r9   rQ   !  s"   

zOpenPanguModel.forwardattn_mlp_replace_mappingparams_dictweight_namer  loaded_paramsc                 C   s   |D ]G\}}}||vsd|v r||vrq| ||}	|dkr#|	|vr#q|	}|dr/||vr/qt|| r5q|| }
|
j}||
|| ||  dS dS )Nzmlp.experts.r   .biasTF)replaceendswithr+   r  add)rL   rh  ri  rj  r  rk  
param_nameorigin_nameshard_idweight_name_mappedr  r  r8   r8   r9   load_attn_mlp_weight?  s&   

z#OpenPanguModel.load_attn_mlp_weightexpert_merge_mapping	flag_dictc              	   C   s   |D ]B}|\}}	}
}|	|vrqd|d< | |	|}t|| rq|| }ttdtf |j}||||||
dd}|rD|}||  dS qdS )NTis_expert_weight.)rr  	expert_idreturn_successF)rm  r+   typingcastr   rW   r  ro  )rL   ru  ri  rj  r  rk  rv  mappingrp  rq  rx  rr  rs  r  r  successr8   r8   r9   load_expert_weight`  s0   	

z!OpenPanguModel.load_expert_weightweightsc              	   C   s  g d}t | jd}|rtj| ddd| jj| jd}t|  }t }|D ]\}}d|v r.q%| jj	r7d|v r7q%d	|v rft | jd
rf| jj
dkrft|dd dd }	|	| jj }
|
dkrf|
| jj
k rfq%ddi}| |||||s|r| ||||||rq%|d rq%|dr||vrq%t||}|dr|dd}|d u rq%t|| rq%|| }t|dt}||| || q%|   |S )N))r   r   r   )r   z.k_projr   )r   z.v_projr   )r   z	.q_a_projr   )r   r   rh   )rF   z
.gate_projr   )rF   z.up_projrh   rz   	gate_projrK   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namerc   rp   zrotary_emb.inv_freqzlm_head.weightr]  num_nextn_predict_layersr   zlayers.r   r   rw  Frl  rn   zgate.e_score_correction_biasr  )r~   r\   r   make_expert_params_mappingrz   rp   r2  named_parameterssetrX  r  rU   r   rG  rt  r~  rn  r$   rm  r+   r   r#   ro  r  )rL   r  rh  has_expertsru  ri  rk  namer  r   mtp_idxrv  r  r  r8   r8   r9   load_weights  s   		
	



zOpenPanguModel.load_weightsc                 C   s2   |   D ]\}}|| u rqt|dr|  qd S )Nr  )named_modulesr~   r  )rL   r  moduler8   r8   r9   r    s   
zOpenPanguModel.post_weight_loadr   r1  )rR   rS   rT   fall_back_to_pt_during_loadr   rV   rI   rX   rY   rb  r2   rQ   r   tuplerU   r2  r   r  rW   rt  r~  r   r  r  rZ   r8   r8   rN   r9   rO    s\    &


!


$!RrO  c                       s   e Zd Zg dddgdZdddedef fd	d
ZdejdejfddZ			ddejdejde
dB dejdB deje
B f
ddZdejdejdB fddZdeeeejf  dee fddZ  ZS )OpenPanguModelBase)r   k_projv_projr  r  )r   rJ   r<   rP  r4  rC   c                   s   t    |jj}|j}|| _|| _t|do|jd u| _| jr'ddg| j	d< t
|t|dd| _t jrNt|j|j|t|dd| _|jrM| jjj| j_nt | _t|j| _| jj| _d S )	Nr   q_a_projr   r   modelr4  rC   lm_headrQ  )rH   rI   r?  r@  r@   r\   r~   r   fuse_qkv_a_projpacked_modules_mappingrO  r.   r  r   rY  r!   rV  r=   r  rX  rZ  weightr)   r   logits_processorr_  )rL   r4  rC   r\   r@   rN   r8   r9   rI     s8   


zOpenPanguModelBase.__init__r`  rD   c                 C   s   | j |S r   )r  rb  ra  r8   r8   r9   rb    s   z"OpenPanguModelBase.embed_input_idsNr   rc  rd  c                 C   s   |  ||||}|S r   )r  )rL   r`  r   rc  rd  r   r8   r8   r9   rQ     s   zOpenPanguModelBase.forwardr   c                 C   s   |  | j|}|S r   )r  r  )rL   r   logitsr8   r8   r9   compute_logits  s   z!OpenPanguModelBase.compute_logitsr  c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.)skip_prefixes)r(   r\   rX  r  )rL   r  loaderr8   r8   r9   r    s
   
zOpenPanguModelBase.load_weights)NN)rR   rS   rT   r  r   rV   rI   rX   rY   rb  r2   rQ   r  r   r  r  r  rZ   r8   r8   rN   r9   r    s2    #

,r  c                       sB   e Zd Zdddedef fddZdeded	d
fddZ  ZS )OpenPanguMoEModelr<   rP  r4  rC   c                   s   t  j||d |jj}g | _|j|j | _d| _g | _	d }| j
jD ] }t|tr*q"t|ts1J t|jtrB|j}| j	|jj q"|d u rKtd|j| _|j| _|j| _|j| _|j| _|j| _d S )Nr  rh   z#No MOE layer found in model.layers.)rH   rI   r?  r@  expert_weightsrG  r=  num_moe_layersnum_expert_groups
moe_layersr  r]  r   r)   r3  rF  r[   appendr   RuntimeErrorr   num_logical_expertsr   num_physical_expertsr   num_local_physical_expertsrz   r{   r   rp   )rL   r4  rC   r\   example_moerg  rN   r8   r9   rI   #  s.   
zOpenPanguMoEModel.__init__r  r  rD   Nc                 C   sh   | j |ksJ || _|| _ || j | _| jjD ]}t|jtr1|j}||_	||_
| j|_|j  qd S r   )r  r  r  rp   r  r]  r   rF  r[   r   r   r   r   update_expert_map)rL   r  r  rg  moer8   r8   r9    update_physical_experts_metadataB  s   
z2OpenPanguMoEModel.update_physical_experts_metadata)	rR   rS   rT   r   rV   rI   rU   r  rZ   r8   r8   rN   r9   r  "  s    r  c                       s,   e Zd Zdddedef fddZ  ZS )OpenPanguEmbeddedModelr<   rP  r4  rC   c                   s   t  j||d d S )Nr  )rH   rI   )rL   r4  rC   rN   r8   r9   rI   U  s   zOpenPanguEmbeddedModel.__init__)rR   rS   rT   r   rV   rI   rZ   r8   r8   rN   r9   r  T  s    $r  c                   @      e Zd ZdS )PanguEmbeddedForCausalLMNrR   rS   rT   r8   r8   r8   r9   r  Y      r  c                   @   r  )PanguUltraMoEForCausalLMNr  r8   r8   r8   r9   r  ]  r  r  c                   @   r  )PanguProMoEV2ForCausalLMNr  r8   r8   r8   r9   r  a  r  r  )]rz  collections.abcr   r   r   rX   r   transformersr   vllm.attention.layerr   r   vllm.compilation.decoratorsr	   vllm.configr
   r   r   vllm.distributedr   r   r   r   r   r   %vllm.model_executor.layers.activationr   :vllm.model_executor.layers.attention.static_sink_attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   r   +vllm.model_executor.layers.logits_processorr   vllm.model_executor.layers.mlar   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr    3vllm.model_executor.layers.vocab_parallel_embeddingr!   r"   -vllm.model_executor.model_loader.weight_utilsr#   r$   %vllm.model_executor.models.interfacesr%   r&   r'    vllm.model_executor.models.utilsr(   r)   r*   r+   r,   r-   r.   r/   vllm.model_executor.utilsr0   vllm.platformsr1   vllm.sequencer2   vllm.transformers_utils.configr3   ,vllm.v1.attention.backends.flash_attn_diffkvr4   rV   r:   Moduler;   r[   r   r   r  r3  rO  r  r  r  r  r  r  r8   r8   r8   r9   <module>   sd    (
&      P gG2