o
    
۾iU                     @   s  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z* ddl(m+Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 G dd dej8Z9G dd dej8Z:eG dd dej8Z;G dd dej8e.e/Z<dS )zLInference-only K-EXAONE-236B-A22B model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfigget_current_vllm_config)get_ep_groupget_pp_group$get_tensor_model_parallel_world_size)FusedMoE)RMSNorm)ReplicatedLinear)LogitsProcessor)QuantizationConfig)DEFAULT_VOCAB_PADDING_SIZEParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )Exaone4Attention)Exaone4GatedMLP)SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayerextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                	       sP   e Zd Z			ddededB dedef fdd	Zd
ej	dej	fddZ
  ZS )	ExaoneMoeN Fconfigquant_configprefixenable_eplbc                    s  t    t | _|j| _t j| _| j | _	| j
 | _|j| _| j|jkr5td| j d|j dt|j|jdd | dd| _ttj|jtjd| _t }|jj}|| _| j| _|jd urg|jnd|_|j| _| j| j | _| j| j | _ | j	| j  | _!| j!| j  | _"t#di d	| jd
|j$d|jd|j%ddd|j&d|ddd|j'd|j(d| dddd| jd| jd| jd| j| _)t*|dddkr|j%|j+ }t,|j||j-|| j). | dd| _/d S d | _/d S ) NzTensor parallel size z' is greater than the number of experts .Fz.gate)biasr(   r)   )dtyper   num_expertstop_khidden_sizeintermediate_sizereduce_resultsrenormalizer(   use_grouped_topkTnum_expert_group
topk_groupr)   z.expertsscoring_funcsigmoidrouted_scaling_factore_score_correction_biasr*   num_redundant_expertsnum_shared_expertsz.shared_experts)r0   r1   
hidden_actr(   r2   r)    )0super__init__r   tp_sizer9   r   device_groupep_grouprankep_ranksizeep_sizer.   n_routed_experts
ValueErrorr   r0   gater   	Parametertorchemptyfloat32r:   r
   parallel_configeplb_configr*   n_logical_expertsr;   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endr   num_experts_per_tokmoe_intermediate_sizenorm_topk_probn_groupr6   expertsgetattrr<   ExaoneMoeGatedMLPr=   !must_reduce_shared_expert_outputsshared_experts)selfr'   r(   r)   r*   vllm_configrP   r1   	__class__r>   Y/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/exaone_moe.pyr@   @   s   




	


	zExaoneMoe.__init__hidden_statesreturnc                 C   st   |j }|j d }|d|}| |\}}| j||d}| jd ur*| |}|| }| jdkr5| j|}||S )N)re   router_logitsr   )shapeviewrJ   r[   r_   rA   &maybe_all_reduce_tensor_model_parallel)r`   re   
orig_shape
hidden_dimrh   _final_hidden_statesshared_outputr>   r>   rd   forward   s   




zExaoneMoe.forward)Nr&   F)__name__
__module____qualname__r   r   strboolr@   rL   Tensorrq   __classcell__r>   r>   rb   rd   r%   ?   s    Wr%   c                       sx   e Zd Z				ddededB dedB dededdf fd	d
Zde	j
de	j
de	j
dB dee	j
e	j
f fddZ  ZS )ExaoneMoeDecoderLayerNr&   r'   cache_configr(   	mtp_layerr)   rf   c           	         s   t    t|}|j| _t|dd}t|ddpt|dd}t|| j|jt|d|j||||| dd	| _|j| rJ|sJt	||| d	d
| _
nt| j|j|j|t|dd| d	d| _
t|j|jd| _t|j|jd| _d S )Nmax_position_embeddingsi    attention_biasFr,   num_key_value_headsz
.self_attn)	r'   r0   	num_headsnum_kv_headsr|   r(   r,   rz   r)   z.mlp)r'   r(   r)   mlp_bias)r0   r1   r=   r(   r,   r)   eps)r?   r@   r    r0   r\   ExaoneMoeAttentionnum_attention_heads	self_attnis_moe_layerr%   mlpr]   r1   r=   r   rms_norm_epsinput_layernormpost_attention_layernorm)	r`   r'   rz   r(   r{   r)   	layer_idxr|   r}   rb   r>   rd   r@      sH   


zExaoneMoeDecoderLayer.__init__	positionsre   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   re   )r   r   r   r   )r`   r   re   r   r>   r>   rd   rq      s   
zExaoneMoeDecoderLayer.forward)NNNr&   )rr   rs   rt   r   r   r   rv   ru   r@   rL   rw   tuplerq   rx   r>   r>   rb   rd   ry      s6    2ry   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )ExaoneMoeModelr&   r)   ra   r)   c                   s   t    |jj|j |j|j}|jjj	| _	| _
| _|r(|j|jp&d nd}j| | _t js;jrHt jrHt| jjjd| _nt | _tj fdd| dd\| _| _| _t jrqtjjd| _nt | _td	d
gj| _d S )Nr   r   )org_num_embeddingsr(   c                    s   t  | dS )N)r'   rz   r(   r)   )ry   r   rz   r'   r(   r>   rd   <lambda>  s    z)ExaoneMoeModel.__init__.<locals>.<lambda>z.layersr   r   re   r   ) r?   r@   model_config	hf_configrz   r(   lora_configrO   rP   r;   r'   lora_extra_vocab_size	max_loras
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr   r0   embed_tokensr   r#   num_hidden_layersstart_layer	end_layerlayersr   r   normr"   make_empty_intermediate_tensors)r`   ra   r)   r   
lora_vocabrb   r   rd   r@      sJ   




zExaoneMoeModel.__init__	input_idsrf   c                 C   s
   |  |S N)r   r`   r   r>   r>   rd   embed_input_ids/  s   
zExaoneMoeModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nre   r   )re   r   )
r   r   r   r   r   r   r   r   r   r   )	r`   r   r   r   r   re   r   layerrn   r>   r>   rd   rq   2  s(   

zExaoneMoeModel.forwardc                 C   s   t j| ddd| jj| jdS )N	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer.   r;   )r   make_expert_params_mappingr'   r.   r;   )r`   r>   r>   rd   get_expert_mappingQ  s   z!ExaoneMoeModel.get_expert_mappingweightsc              
   C   s@  g d}d}t |  }t }|  }|D ]\}}|dr qd|v r%qd|v s-d|v r.q| jd ur\| j| }	r\||	 }
t|
dt}|	 dkrM|n|d }||
| |
|	 q|D ]8\}}}||vrhq^d	|v rmq^|||}|d
r}||vr}q^t|| rq^||vrq^|| }
|
j}||
||  nd}|D ]D}|\}}}}||vrqd}|||}t|| rq||r||vrq|| }
ttdtf |
j}||
||||dd}|r|} n9q|rq|d
r||vrq||r||vrqt||}|d u rqt|| r	q|| }
t|
dt}||
| |
| q|S )N))	.qkv_projz.q_projq)r   z.k_projk)r   z.v_projv).gate_up_projz
.gate_projr   )r   z.up_projr   )
.bias_biasz.k_scale_k_scalez.v_scale_v_scalez.weight_scale_weight_scalez.input_scale_input_scalemtp.zrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedweight_loaderr   zmlp.expertsr   FT.)shard_id	expert_idreturn_success)dictnamed_parameterssetr   
startswithr(   get_cache_scaler\   r   dimaddreplaceendswithr!   r   typingcastr   rv   r   )r`   r   stacked_params_mappingignore_suffixesparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
scale_nameparamr   
param_nameweight_namer   is_expert_weightmappingr   name_mappedsuccessr>   r>   rd   load_weights]  s   









zExaoneMoeModel.load_weightsr   )rr   rs   rt   r	   ru   r@   rL   rw   r   r   rq   listr   intr   r   r   r   rx   r>   r>   rb   rd   r      s"    1
,r   c                       s   e Zd Zg dddgdZdddZdgZd	d
dedef fddZde	j
de	j
fddZ		dde	j
dB de	j
dedB de	j
dB de	j
eB f
ddZde	j
de	j
dB fddZdeeee	j
f  dee fddZ  ZS ) ExaoneMoeForCausalLM)q_projk_projv_projr   r   )qkv_projgate_up_projinput_embeddingsoutput_embeddings)r   lm_headr   r&   r   ra   r)   c                   s   t    |jj }|j}|j}|| _|| _|| _t|t	|dd| _
t jrb|j| _|r6|  j|j7  _t| j|j|j|sAtn|j|d| _|jrR| j
jj| j_t|dd}t| j|j|| _nt | _| j
j| _d S )Nmodel)ra   r)   )r   padding_sizer(   logit_scaleg      ?)r?   r@   r   r   get_text_configr(   r   r'   r   r$   r   r   r   r   unpadded_vocab_sizer   r   r0   r   lora_vocab_padding_sizer   r   r   weightr\   r   logits_processorr   r   )r`   ra   r)   r'   r(   r   r   rb   r>   rd   r@     sB   

zExaoneMoeForCausalLM.__init__r   rf   c                 C   s   | j |S r   )r   r   r   r>   r>   rd   r   #  s   z$ExaoneMoeForCausalLM.embed_input_idsNr   r   r   c                 C   s   |  ||||}|S r   )r   )r`   r   r   r   r   model_outputr>   r>   rd   rq   &  s   zExaoneMoeForCausalLM.forwardre   c                 C   s   |  | j|}|S r   )r   r   )r`   re   logitsr>   r>   rd   compute_logits2  s   z#ExaoneMoeForCausalLM.compute_logitsr   c                 C   s(   t | | jjr
ddgndgd}||S )Nzlm_head.r   )skip_prefixes)r   r'   r   r   )r`   r   loaderr>   r>   rd   r   9  s
   
	z!ExaoneMoeForCausalLM.load_weights)NN)rr   rs   rt   packed_modules_mappingembedding_modulesembedding_padding_modulesr	   ru   r@   rL   rw   r   r   rq   r   r   r   r   r   rx   r>   r>   rb   rd   r     s>    +

,r   )=__doc__r   collections.abcr   r   	itertoolsr   rL   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   r   r   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   exaone4r   r   r   r]   
interfacesr   r   utilsr   r   r    r!   r"   r#   r$   Moduler%   ry   r   r   r>   r>   r>   rd   <module>   s8   $qL h