o
    
۾iLp                     @   s  d Z ddlZddlmZmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ee7Z8G dd de
Z9G dd dej:Z;G dd dej:Z<G dd  d ej:Z=G d!d" d"ej:Z>eG d#d$ d$ej:Z?G d%d& d&ej:e/e0Z@dS )'z?Inference-only Flash model compatible with HuggingFace weights.    N)CallableIterable)islice)nn)PretrainedConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group)init_logger)
SiluAndMul)FusedMoEZeroExpertFusedMoE)RMSNorm)MergedColumnParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)block_dequant)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)DeepseekV2MLAAttention)IntermediateTensors   )SupportsLoRA
SupportsPP)PPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sx   e Zd ZdZdZdgZ								
					
	
																									d fdd	Z  ZS )FlashConfigzFlash model configuration.longcat_flashpast_key_values             N`      r         @   F{Gz?h㈵>T順 顆         bfloat16float32      ?r   c'           *         sh  t  jd||||||| |"|!|&d
|' || _|| _|| _|d ur#|n|| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _|d u rG|}|| _|| _|| _|| _|| _|'dd }(|(pc|pcddi}|'dd})d|vrr|)|d< || _|| _|| _|| _|| _|$| _|%| _|#| _d| _t| dr| jn|| _ t| d	r| j!| _!d S t| d
r| j"| _!d S | j | _!d S )N)
pad_token_idbos_token_ideos_token_idtie_word_embeddingsdtypeparams_dtyperouter_dtypetopk_methodrouter_biasnextn_use_scmoerope_scaling	rope_typedefault
rope_thetag    .Asiluffn_hidden_sizemoe_intermediate_sizeexpert_ffn_hidden_size )#super__init__
vocab_sizemax_position_embeddingshidden_sizenum_hidden_layersnum_attention_headsep_sizekv_lora_rankq_lora_rankqk_rope_head_dim
v_head_dimqk_nope_head_dimnum_experts_per_toknorm_topk_probnum_key_value_headsinitializer_rangerms_norm_epspretraining_tp	use_cachepoprope_parametersattention_biasattention_dropoutmla_scale_q_loramla_scale_kv_lorazero_expert_numzero_expert_typerouted_scaling_factor
hidden_acthasattrrF   intermediate_sizerG   rH   )*selfrL   rN   ri   
num_layersrO   rP   rY   rQ   rR   rS   rT   rU   rV   rW   rX   rM   rZ   r[   r]   r7   r8   r9   r\   r:   r_   r`   ra   rb   rc   r;   r<   r=   r?   r>   rf   rd   re   r@   kwargsrA   rD   	__class__rI   \/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/longcat_flash.pyrK   U   st   *

zFlashConfig.__init__)&r&   r'   r(   r)   Nr*   r+   r   r,   r-   r.   r+   r+   NFr(   r/   r0   TNr1   r2   r   FNFr3   FFr4   r4   r5   FNr6   r   NF)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencerK   __classcell__rI   rI   rm   ro   r#   O   sV    r#   c                       s`   e Zd ZdZ			ddededededB d	ed
eddf fddZde	j
de	j
fddZ  ZS )FlashMLPzFlash MLP layer.NT rN   ri   rg   quant_configreduce_resultsprefixreturnc                    sj   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _d S )
N   F.gate_up_proj)biasry   r{   z
.down_proj)r   ry   rz   r{   rE   zUnsupported activation: z!. Only silu is supported for now.)	rJ   rK   r   gate_up_projr   	down_proj
ValueErrorr   act_fn)rj   rN   ri   rg   ry   rz   r{   rm   rI   ro   rK      s*   
	
zFlashMLP.__init__xc                 C   s:   |  dkr|S | |\}}| |}| |\}}|S )Nr   )numelr   r   r   )rj   r   gate_up_rI   rI   ro   forward   s   
zFlashMLP.forward)NTrx   )rp   rq   rr   rs   intstrr   boolrK   torchTensorr   rv   rI   rI   rm   ro   rw      s*    rw   c                	       s<   e Zd Z	d
dededejdef fddZdd	 Z	  Z
S )LongcatRouterrx   configrd   rounter_params_dtyper{   c                    sp   t    t|dr|jn|jd | _| j| | _t|j| j|j|d | dd| _t	
tj| j|d| _d S )Nn_routed_expertsr   z.classifier)r   r<   ry   r{   )r;   )rJ   rK   rh   r   num_expertsr   rN   r?   
classifierr   	Parameterr   zerose_score_correction_bias)rj   r   rd   r   r{   rm   rI   ro   rK      s"   

zLongcatRouter.__init__c                 C   s   |  |\}}|S N)r   )rj   hidden_stateslogitsr   rI   rI   ro   r     s   zLongcatRouter.forward)rx   )rp   rq   rr   r#   r   r   r;   r   rK   r   rv   rI   rI   rm   ro   r      s    r   c                       sl   e Zd Z				ddededededed	ejdB d
edB dede	f fddZ
dejdejfddZ  ZS )
LongcatMoeNrx   Fr   r   top_krN   ri   r<   ry   r{   enable_eplbc
           
         s   t    || _|| _|jdkrtj| _t||j| j| dd| _	|jd us)J |j
d us0J t|j|j
| j	||||d|d|| d|	|j| jd| _d S )Nr5   z.gate)r   rd   r   r{   TFz.experts)rd   re   routerr   r   rN   ri   rz   r<   renormalizery   r{   r   rf   router_logits_dtype)rJ   rK   rN   r   r=   r   r5   r   rd   r   re   r   rf   experts)
rj   r   r   r   rN   ri   r<   ry   r{   r   rm   rI   ro   rK     s<   

zLongcatMoe.__init__r   r|   c                 C   s   |j \}}|d|}| jj}||k r#tjjj|d|| fddd}n|}| |	| j
}| j||d}||krA|dd |f }|||S )Nr   constantr3   )modevalue)r   router_logits.)shapeviewr   rN   r   r   
functionalpadr   tor   )rj   r   
num_tokens
hidden_dimpadded_hiddenhidden_states_paddedrouter_logits_fullfinal_hidden_statesrI   rI   ro   r   ;  s*   


zLongcatMoe.forwardNNrx   F)rp   rq   rr   r#   r   r   r;   r   r   r   rK   r   r   rv   rI   rI   rm   ro   r     s2    	
.r   c                       s   e Zd ZdZ				ddedededB dedB d	ed
e	ddf fddZ
dejdejdejdB deejejf fddZ  ZS )FlashDecoderLayerz:Flash decoder layer with dual attention and MLP structure.Nrx   Fvllm_configr   cache_configry   r{   r   r|   c              	      s  t    tjddd _j_tddt fddt	dD _
tfd	dt	dD _tfd
dt	dD _tfddt	dD _ttdrijnjj tdrvjnjjj dd_d S )N.)sepr   rM   r(   c                    sh   g | ]0}t jjjjjtd rjndj dt	dg v r(dn d| dqS )rS   N	self_attndisable_quant_modulez.self_attn.)r   r   rN   	num_headsrV   rT   rU   rS   rR   rM   r   ry   r{   )
r   rN   rP   rV   rT   rU   rh   rS   rR   getattr.0ir   r   rM   r{   ry   rj   r   rI   ro   
<listcomp>q  s(    z.FlashDecoderLayer.__init__.<locals>.<listcomp>r}   c                       g | ]
}t  j jd qS epsr   rN   r[   r   r   rI   ro   r         c                    r   r   r   r   r   rI   ro   r     r   c                    sB   g | ]}t j j jd t dg v rdn d| dqS )mlpsr   Nz.mlps.)rN   ri   rg   ry   r{   )rw   rN   ri   rg   r   r   )r   r{   ry   rj   rI   ro   r     s    
r   moe_topkz.mlp)r   r   r   rN   ri   ry   r{   )rJ   rK   r   split	layer_idxrN   r   r   
ModuleListranger   input_layernormpost_attention_layernormr   r   rh   r   r   r   rW   rG   mlp)rj   r   r   r   ry   r{   r   rm   r   ro   rK   a  sB   
	

zFlashDecoderLayer.__init__	positionsr   residualc                 C   s   |d u r|}| j d |}n
| j d ||\}}| jd ||d d}| jd ||\}}| }| |}| jd |}| j d ||\}}| jd ||d d}| jd ||\}}| jd |}|| }||fS )Nr   )r   r   llama_4_scalingr   )r   r   r   cloner   r   )rj   r   r   r   hidden_states_copymoe_hidden_statesrI   rI   ro   r     s6   
zFlashDecoderLayer.forwardr   )rp   rq   rr   rs   r	   r#   r   r   r   r   rK   r   r   tupler   rv   rI   rI   rm   ro   r   ^  s<    Lr   c                       s   e Zd ZdZdddedef fddZdejd	ejfd
dZ			ddejdB dejde
dB dejdB d	eje
B f
ddZ  ZS )
FlashModelzFlash model.rx   r{   r   r{   c                   s   t    td
i jjjj j| _t	dd | _
j| _t jr5tjjt|dd| _nt | _tj fdd| dd\| _| _| _t jr_tjjd| _nt | _tdd	gj| _d S )Nr7   embed_tokensr   c                    s   t  | dS )N)r   ry   r{   )r   r   r   r   ry   r   rI   ro   <lambda>  s    z%FlashModel.__init__.<locals>.<lambda>z.layersr   r   r   rI   )rJ   rK   r#   model_config	hf_config__dict__r   ry   r   r   padding_idxrL   r
   is_first_rankr   rN   r"   r   r   r!   rO   start_layer	end_layerlayersis_last_rankr   r[   normr    make_empty_intermediate_tensors)rj   r   r{   rm   r   ro   rK     s2   



zFlashModel.__init__	input_idsr|   c                 C   s
   |  |S r   )r   rj   r   rI   rI   ro   embed_input_ids  s   
zFlashModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS | 	||\}}|S )Nr   r   )r   r   )
r
   r   r   r   r   r   r   r   r   r   )	rj   r   r   r   r   r   r   layerr   rI   rI   ro   r   
  s(   

zFlashModel.forwardNN)rp   rq   rr   rs   r	   r   rK   r   r   r   r   r   rv   rI   rI   rm   ro   r     s"    %r   c                       s   e Zd ZdZg dddgdZddded	ef fd
dZdej	dej	fddZ
		ddej	dB dej	dedB dej	dB dej	eB f
ddZdej	dej	dB fddZdeeeeeef  fddZdeeeej	f  dee fddZ  ZS )LongcatFlashForCausalLMz)Flash model for causal language modeling.)q_projk_projv_proj	gate_projup_proj)qkv_projr   rx   r   r   r{   c                   s   t    tdi |jjj}|j}|| _t|dr|j	n|j
|_
|| _t|t|dd| _t jrAt|j|j|t|dd| _nt | _t|j| _| jj| _d S )NrF   model)r   r{   lm_head)ry   r{   rI   )rJ   rK   r#   r   r   r   ry   r   rh   rF   ri   r   r"   r   r
   r   r   rL   rN   r   r   r   logits_processorr   )rj   r   r{   r   ry   rm   rI   ro   rK   ;  s.   


z LongcatFlashForCausalLM.__init__r   r|   c                 C   s   | j |S r   )r   r   r   rI   rI   ro   r   \  s   z'LongcatFlashForCausalLM.embed_input_idsNr   r   r   c                 C   s   |  ||||}|S r   )r   )rj   r   r   r   r   r   rI   rI   ro   r   _  s   zLongcatFlashForCausalLM.forwardr   c                 C   s   |  | j|}|S r   )r   r   )rj   r   r   rI   rI   ro   compute_logitsk  s   z&LongcatFlashForCausalLM.compute_logitsc                 C   s4   t j| dddt| jdr| jjdS | jjd dS )Nr   r   r   r   r   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer   )r   make_expert_params_mappingrh   r   r   r   )rj   rI   rI   ro   get_expert_mappingr  s   

z*LongcatFlashForCausalLM.get_expert_mappingweightsc              
   C   sX  g d}|   }t }t|  }|D ]\}}d|v rq|D ]A\}}	}
|	|vr(qd|v r1d|vr1q||	|}|dsA|drF||vrFqd|v rKqt|| rQq|| }|j}||||
  nd}|D ]Q}|\}}	}}
|	|vrqqdd	}||	|}d|v r~qd|ds|dr||vrqdt|| rqd|| }|j}t	t
d
tf |j}|||||
|d	d}|r|} n7qd|rq|dr||vrq|dr||vrqd|v rq|d u rqt|| rq|| }t|dt}||| || qt| jjD ]}tdD ]}t| jj| trq| jj| j| }t| jdrK|jjjtjtjfv rK| jj}|d urJt|jds9J t  }t!|jj|jj"|#|}n|jj}|$dd|j%|j& fj'|j%|j&gdd\}}|(dd) (dd|_*|) (dd|_+| jj,r|j-j j.| jj/| jj0 d 9  _.| jj1r|j2j j.| jj/| jj3 d 9  _.qq|S )N))fused_qkv_a_projq_a_projr   )r   kv_a_proj_with_mqar   )r~   z
.gate_projr   )r~   z.up_projr   zrotary_emb.inv_freqr   r   z.bias_biasz.mtp.FT.)shard_id	expert_idreturn_successz	.kv_scaleweight_loaderr}   weight_block_sizeweight_scale_invr   r   r   )dimg      ?)4r   setdictnamed_parametersreplaceendswithr   r  typingcastr   r   r   r   addr   r   rO   
isinstancer   r   r   r   rh   ry   	kv_b_projweightr;   r   float8_e4m3fnfloat8_e4m3fnuzr  get_default_dtyper   r  r   	unflattenrV   rU   r   	transpose
contiguousw_kcw_vcrb   q_a_layernormdatarN   rS   rc   kv_a_layernormrR   )rj   r   stacked_params_mappingexpert_params_mappingloaded_paramsparams_dictnameloaded_weight
param_nameweight_namer  paramr  is_expert_weightmappingr  name_mappedsuccesslayer_idr   r   r  r;   wr  r  rI   rI   ro   load_weights  s   









#z$LongcatFlashForCausalLM.load_weightsr   )rp   rq   rr   rs   packed_modules_mappingr	   r   rK   r   r   r   r   r   r   listr   r   r   r   r
  r/  rv   rI   rI   rm   ro   r   ,  s:    !

,r   )Ars   r  collections.abcr   r   	itertoolsr   r   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r	   vllm.distributedr
   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   8vllm.model_executor.layers.quantization.utils.int8_utilsr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   &vllm.model_executor.models.deepseek_v2r   vllm.sequencer   
interfacesr   r   utilsr   r   r    r!   r"   rp   loggerr#   Modulerw   r   r   r   r   r   rI   rI   rI   ro   <module>   sD   !q, R M