o
    
۾ioZ                     @   s  d Z ddlmZ ddlmZ ddlZddlm  mZ	 ddlmZ ddl
mZ ddlmZ ddlmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 G dd dej8Z9G dd dej8Z:G dd dej8Z;G dd dej8Z<eG d d! d!ej8Z=G d"d# d#ej8e0e/Z>G d$d% d%e>Z?dS )&zDInference-only BailingMoE model compatible with HuggingFace weights.    )Iterable)isliceN)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
SiluAndMul)	Attention)SharedFusedMoE)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s`   e Zd Z				ddededB dedB dedef
 fd	d
Zde	j
de	j
de	j
fddZ  ZS )BailingAttentionNT configcache_configquant_configreduce_resultsprefixc              	      s  t    |j| _|j| _|j| _t }| j| dksJ | j| jks%J | j| | _|j	p3| j| j | _	| j	| j | _
td| j| | _| j| j	 | _| j	d | _t|dd| _t|dd| _t| j| j	| j| j|jpn|j|| dd| _| jr| jrt| j	|jd	ntj| j	d
d	| _| jrt| j	|jd	ntj| j	d
d	| _t| j| j	 | j|j||| dd| _t|d| j	}|| j	 |jd< t| j	|j |jdd| _!t"| j| j	| j| j|| dd| _#d S )Nr   r   g      use_qk_normFuse_rmsnormz.query_key_valuebiasr'   r)   epsgư>z.denser-   r'   r(   r)   
rotary_dimpartial_rotary_factorT)max_positionrope_parametersis_neox_stylez.attn)num_kv_headsr&   r)   )$super__init__hidden_sizenum_attention_headstotal_num_headsnum_key_value_headstotal_kv_headsr   	num_headshead_dimq_size_per_rankmaxr6   kv_size_per_rankscalegetattrr*   r+   r   use_biasuse_qkv_biasquery_key_valuer   rms_norm_epsr   	LayerNormquery_layernormkey_layernormr   denser4   r   max_position_embeddings
rotary_embr   attn)selfr%   r&   r'   r(   r)   tp_sizer1   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/bailing_moe.pyr8   I   sr   



	zBailingAttention.__init__hidden_statesposition_idsreturnc           
      C   s   |  |\}}|j| j| j| jgdd\}}}| jrD|d| j| j}|d| j| j}| 	|}| 
|}|d| j}|d| j}| |||\}}| |||}| |\}	}|	S )N)dim)rG   splitr@   rB   r*   viewr>   r?   r6   rJ   rK   rN   rO   rL   )
rP   rV   rW   qkv_qkvcontext_layerattn_outputrT   rT   rU   forward   s   

zBailingAttention.forward)NNTr$   )__name__
__module____qualname__r   r   r   boolstrr8   torchTensorrd   __classcell__rT   rT   rR   rU   r#   H   s.    Lr#   c                       sN   e Zd Z			ddedededB dedB ded	df fd
dZdd Z	  Z
S )
BailingMLPNTr$   intermediate_sizer%   r'   r(   r)   rX   c                    sZ   t    t|j|gd |j|| dd| _t||j|j||| dd| _t | _	d S )N   z.gate_up_projr,   z
.down_projr0   )
r7   r8   r   r9   rE   gate_up_projr   	down_projr   act_fn)rP   rn   r%   r'   r(   r)   rR   rT   rU   r8      s"   
zBailingMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)rp   rr   rq   )rP   xr^   rT   rT   rU   rd      s   
zBailingMLP.forwardNTr$   )re   rf   rg   intr   r   rh   ri   r8   rd   rl   rT   rT   rR   rU   rm      s$    rm   c                       sX   e Zd Z			ddedededB dedB def
 fd	d
Zde	j
de	j
fddZ  ZS )
BailingMoENTr$   rn   r%   r'   r(   r)   c                    s  t    t | _t | _|j| _|j| _|j	| _
|j| _|| _|j| _t|dd | _t|dd | _t|dd | _| jd uoB| jd u| _t|dd| _t|dd }|d u rYd | _n|dkrbtj| _ntj| _tj| j| jd| jd	| _t|d
drttj|jftjd| j_nd | j_| jjd ur| jjjnd | _| jd ur| jdkr| jd u s| jdkr| jd usJ dnd| _| jdkrt |dr|j!}n|j"}||j9 }t#|||d| dd| _$nd | _$t%| j$| j| j| j|j"d| j
|| d| j| jj| j| j| j| jd| _&d S )Nscore_functionn_group
topk_grouprouted_scaling_factorg      ?router_dtypefp32F)r-   dtypemoe_router_enable_expert_bias)r~   softmaxsigmoidzdscore_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)r   #moe_shared_expert_intermediate_sizez.shared_experts)rn   r%   r'   r(   r)   z.experts)shared_expertsnum_expertstop_kr9   rn   r(   renormalizer'   r)   scoring_funce_score_correction_biasnum_expert_grouprz   use_grouped_topkrouter_logits_dtype)'r7   r8   r   rQ   r
   tp_rankr   num_experts_per_tokr   norm_topk_probnorm_expert_probr9   r'   num_shared_expertsrD   rx   ry   rz   r   r{   r|   rj   float32bfloat16r   Lineargate	Parameteremptyexpert_biasdatacorrection_biashasattrr   moe_intermediate_sizerm   r   r   experts)rP   rn   r%   r'   r(   r)   r|   rR   rT   rU   r8      s   







zBailingMoE.__init__rV   rX   c                 C   s   |j \}}|d|}| || j}||j}| j||d}| jd ur+|\}}nd }|| j9 }|d ur:|| }| j	dkrE| j
|}|||S )NrY   )rV   router_logitsr   )shaper\   r   tor|   r~   r   r   r{   rQ   &maybe_all_reduce_tensor_model_parallel)rP   rV   
num_tokensr9   r   final_hidden_statesshared_outputrT   rT   rU   rd   2  s$   




zBailingMoE.forwardru   )re   rf   rg   rv   r   r   rh   ri   r8   rj   rk   rd   rl   rT   rT   rR   rU   rw      s     `rw   c                	       sd   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	dB dej	fddZ
  ZS )BailingMoeBlockNr$   r%   r&   r'   r)   c           	         s   t    t|dd }|| _|j}|j}t||jd| _	t
|||| dd| _t||jd| _||jk r;t}nt}||||d| dd| _d S )N.rY   r.   z
.attentionr)   Tz.mlp)r7   r8   rv   r[   r%   r9   rn   r   rH   input_layernormr#   	attentionpost_attention_layernormfirst_k_dense_replacerm   rw   mlp)	rP   r%   r&   r'   r)   	layer_idxr9   rn   	mlp_classrR   rT   rU   r8   P  s    

zBailingMoeBlock.__init__rV   rW   residualrX   c                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rV   rW   )r   r   r   r   )rP   rV   rW   r   rT   rT   rU   rd   m  s   
zBailingMoeBlock.forward)NNr$   )re   rf   rg   r   r   r   ri   r8   rj   rk   rd   rl   rT   rT   rR   rU   r   O  s,    r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )BailingMoeModelr$   r   vllm_configr)   c                   s   t    |jj|j |j| _j| _j| _	t
dd| _t js,| jr;t jr;t| j| j	| dd| _nt | _tjj| _tj fdd| dd\| _| _| _td	d
gj| _t jrvt| j	jd| _d S t | _d S )Ntie_word_embeddingsFz.word_embeddingsr'   r)   c                    s   t  | dS )N)r%   r&   r'   r)   )r   r   r&   r%   r'   rT   rU   <lambda>  s    z*BailingMoeModel.__init__.<locals>.<lambda>z.layersr   rV   r   r.   ) r7   r8   model_config	hf_configr&   r'   r%   
vocab_sizer9   	embed_dimrD   r   r	   is_first_rankis_last_rankr   word_embeddingsr   rj   r   Dropoutembedding_dropoutr!   num_hidden_layersstart_layer	end_layerlayersr    make_empty_intermediate_tensorsr   rH   norm)rP   r   r)   rR   r   rU   r8     s@   


zBailingMoeModel.__init__	input_idsrX   c                 C   s
   |  |S rs   )r   rP   r   rT   rT   rU   embed_input_ids     
zBailingMoeModel.embed_input_idsNrW   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS |d u rK| 	|}|S | 	||\}}|S )NrV   r   )rV   r   )
r	   r   r   r   r   r   r   r   r   r   )	rP   r   rW   r   r   rV   r   layerr^   rT   rT   rU   rd     s.   


zBailingMoeModel.forwardc                 C   s   t j| ddd| jjdS )N	gate_projrq   up_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   )r   make_expert_params_mappingr%   r   rP   rT   rT   rU   get_expert_mapping  s   z"BailingMoeModel.get_expert_mappingweightsc              	   C   s  ddg}t | jdd}t }|  }|D ]\}}t| jdr0| jjr0d|v r0tj|ddd	d
}|D ]8\}}	}
|	|vr<q2d|v rAq2|	|	|}|
drQ||vrQq2||vrVq2t|| r\q2|| }|j}||||
  nV|D ]/}|\}}	}}
|	|vrzqm|	|	|}t|| rqm||vrqm|| }|j}|||||
|d  n$|
dr||vrq||vrqt|| rq|| }t|dt}||| || q|S )N)rp   r   r   )rp   r   r   F)remove_duplicate	norm_headzlm_head.weightr   ro   gHz>)rZ   pr/   zmlp.expertsz.bias)shard_id	expert_idweight_loader)dictnamed_parameterssetr   r   r%   r   F	normalizereplaceendswithr   r   rD   r   add)rP   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   mappingr   rT   rT   rU   load_weights  sz   




zBailingMoeModel.load_weightsrs   )re   rf   rg   r   ri   r8   rj   rk   r   r   rd   listtuplerv   r   r   r   r   rl   rT   rT   rR   rU   r     s,    2
$,	r   c                       s   e Zd ZdgddgdZdddeded	d
f fddZdejd	ejfddZ		
	
ddejd
B dejde
d
B dejd
B d	eje
B f
ddZdejd	ejd
B fddZdeeeejf  d	ee fddZd	eeeeeef  fddZ  ZS )BailingMoeForCausalLMrG   r   r   )rG   rp   r$   r   r   r)   rX   Nc                   s   t    |jj }||j_|j}|| _|| _|j| _t|t	|dd| _
t|dd| _t jrO| jr:| j
j| _nt|j|j|t	|dd| _t|j| _nt | _| j
j| _d S )Nmodel)r   r)   r   Flm_headr   )r7   r8   r   r   get_text_configr'   r%   rM   r   r"   r   rD   r   r	   r   r   r   r   r   r9   r   logits_processorr   r   )rP   r   r)   r%   r'   rR   rT   rU   r8   <  s0   

zBailingMoeForCausalLM.__init__r   c                 C   s   | j |S rs   )r   r   r   rT   rT   rU   r   b  s   z%BailingMoeForCausalLM.embed_input_ids	positionsr   r   c                 C   s   |  ||||}|S rs   )r   )rP   r   r   r   r   model_outputrT   rT   rU   rd   e  s   zBailingMoeForCausalLM.forwardrV   c                 C   s   |  | j|}|S rs   )r   r   )rP   rV   logitsrT   rT   rU   compute_logitsq  s   z$BailingMoeForCausalLM.compute_logitsr   c                 C   s"   t | | jrdgnd d}||S )Nzlm_head.)skip_prefixes)r   r   r   )rP   r   loaderrT   rT   rU   r   x  s
   
z"BailingMoeForCausalLM.load_weightsc                 C   s
   | j  S rs   )r   r   r   rT   rT   rU   r     r   z(BailingMoeForCausalLM.get_expert_mapping)NN)re   rf   rg   packed_modules_mappingr   ri   r8   rj   rk   r   r   rd   r   r   r   r   r   r   rv   r   rl   rT   rT   rR   rU   r   3  sF    &

$&r   c                   @   s   e Zd ZdS )BailingMoeV2ForCausalLMN)re   rf   rg   rT   rT   rT   rU   r     s    r   )@__doc__collections.abcr   	itertoolsr   rj   torch.nn.functionalr   
functionalr    transformers.configuration_utilsr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   r   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r   r    r!   r"   Moduler#   rm   rw   r   r   r   r   rT   rT   rT   rU   <module>   s@    
g"~4 0P