o
    it                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC eeDZEG dd de'ZFG dd dejGZHG dd  d ejGZIG d!d" d"ejGZJG d#d$ d$ejGZKe
G d%d& d&ejGZLG d'd( d(ejGe:e9ZMd)ed*eNd+eOdB fd,d-ZPdS ).zInference-only Jurassic model.    )Iterable)AnyN)nn)	Parameter)support_torch_compile)CacheConfigModelConfig
VllmConfig)get_dp_groupget_ep_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group)init_logger)
SiluAndMulSwigluStepAndMul)	Attention)FusedMoE)SharedFusedMoE)GemmaRMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors)AttentionType   )MixtureOfExperts
SupportsPP)AutoWeightsLoaderPPMissingLayerWeightsMapperextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s@   e Zd ZdZdejdejeejedB f B f fddZ  Z	S )FP32ReplicatedLinearz(
    Use FP32 for higher precision.
    xreturnNc                    s$   | j tjksJ t |tjS N)params_dtypetorchfloat32superforwardto)selfr0   	__class__ X/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/step3p5.pyr7   B   s   zFP32ReplicatedLinear.forward)
__name__
__module____qualname____doc__r4   Tensortupler   r7   __classcell__r<   r<   r:   r=   r/   =   s    r/   c                       s`   e Zd Z			ddedededededB d	ed
eddf fddZde	j
de	j
fddZ  ZS )
Step3p5MLPNT confighidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixr1   c           	         s   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _|| _	|| _
d | _t|}|jr`|j| d urb|j| d
krd|j| | _t| jd| _d S d S d S d S )N   Fz.gate_up_projbiasrK   rM   z
.down_proj)rP   rK   rL   rM   siluzUnsupported activation: z!. Only silu is supported for now.r   )limit)r6   __init__r   gate_up_projr   	down_proj
ValueErrorr   act_fnrM   rH   rR   r*   swiglu_limits_sharedr   )	r9   rG   rH   rI   rJ   rK   rL   rM   	layer_idxr:   r<   r=   rS   K   sD   

	
zStep3p5MLP.__init__hidden_statesc                 C   s*   |  |\}}| |}| |\}}|S r2   )rT   rW   rU   )r9   rZ   gate_up_intermediate_actoutputr<   r<   r=   r7   w   s   
zStep3p5MLP.forward)NTrF   )r>   r?   r@   r   intstrr   boolrS   r4   rB   r7   rD   r<   r<   r:   r=   rE   J   s,    	,rE   c                )       s   e Zd Zdddddddddejdddddddfded	ed
edededB dededeee B dB de	dB de
dB deeef dB dedededB dedededededB def( fddZdejdejd ejfd!d"Z  ZS )#Step3p5Attentioni   Ngư>Fi'  rF         ?rH   	num_headsnum_kv_headsmax_positionhead_dimrms_norm_epsqkv_bias
rope_thetacache_configrK   rope_scalingrM   	attn_typesliding_windowuse_head_wise_attn_gatelayer_typesuse_rope_layersyarn_only_typesswa_num_attention_headspartial_rotary_factorc                    s  t    || _|| _t }t|| _|r|| j dk}n| jd dk}|r/|| j |vr/d }|d urA|rA|}|d ur@|}|| _nd }t|trM|| j }t	 | _
|| _| j| dks]J | j| | _|| _| j|kru| j| dkstJ n	|| j dks~J td| j| | _|p|| j | _| j| j | _| j| j | _| jd | _|| _t|| j| j| j||
| dd| _t| j| j |d|
| d	d| _|d urt|tstd
|d urt|ni }|dd | j|d< ||d< t| j||d| _t| j|| _t| j|| _ || _!|rt"|| jd| dd| _#d| _$|r%|| j | _$t%| j| j| j| j|	|
| d||d	| _&|| _'| jdksK| jdksKJ | jdkrW| j| _(d S | jd | _(d S )Nsliding_attentionrN   r   r$   g      z	.qkv_projrO   Fz.o_projz1rope_scaling must be a dict for Step3p5Attention.	rope_typedefaultrj   rt   )	head_sizerf   rope_parametersz.g_proj)rP   rM   Tz.attn)re   rk   rK   rM   per_layer_sliding_windowrm   g      ?))r6   rS   rH   total_num_headsr   r*   rY   
isinstancelistr   rankrt   rd   total_num_kv_headsmaxre   rg   q_sizekv_sizescalingrj   r   qkv_projr   o_projdictrV   
setdefaultr   
rotary_embr   q_normk_normro   r   g_projuse_roper   attnmax_position_embeddings
rotary_dim)r9   rH   rd   re   rf   rg   rh   ri   rj   rk   rK   rl   rM   rm   rn   ro   rp   rq   rr   rs   rt   tp_sizeenable_sliding_windowry   r:   r<   r=   rS      s   




	

zStep3p5Attention.__init__	positionsrZ   r1   c                 C   sN  |  |\}}|j| j| j| jgdd\}}}|jg |jd d |jd | j | jR  }| | }||j}|jg |jd d |jd | j | jR  }	| 	|	 }	|	|j}| j
rm| |||\}}| |||}
| jr| |\}}|
jg |
jd d | j| jR  |d  }|j|
j }
| |
\}}|S )N)dim)r   splitr   r   viewshaperg   r   
contiguousr   r   r   r   ro   r   rd   	unsqueezesigmoidr   )r9   r   rZ   qkvr\   qkv	q_by_head	k_by_headattn_output
extra_dimsr^   r<   r<   r=   r7     s(    00$zStep3p5Attention.forward)r>   r?   r@   r#   DECODERr_   floatra   r}   r   r   r   r`   r   rS   r4   rB   r7   rD   r<   r<   r:   r=   rb   ~   s    	
 rb   c                       s@   e Zd Z	d
dedef fddZdejdejfdd	Z  Z	S )FusedMoEBlockrF   vllm_configrM   c           	   	      sf  t    t | _t|| _t j | _	t j
 | _|jj}|j}|j}|j| _|j| _|j| _| j| _|jj| _| j| j | _| j| j	 | _| j| j | _| j| j | _| j|jkrjtd| j d|j dt|j|jdd tj| dd| _ |j!| _!| j!sJ d|j"| _#t$j%tj&|jtjddd	| _'|j(| _(| j(sJ d
d}|j)pg }| jt*|k r|| j nd }|dvrt+|}|dksJ dd}t,-d| j|| t.|| j|j/dd|| dd| _0t1d'i d| j0d| j d|jd|j2d|jd|j3ddd|j4d|d|d| ddt5|d d!d"| j'd#|j"d$| jd%| jd&tj| _6d S )(NzTensor parallel size z' is greater than the number of experts .Fz.gate)rP   rK   r3   rM   z)Only support use_moe_router_bias is true.)dtype)requires_gradz>Router logits must use FP32 precision for numerical stability.rQ   )Nr   g      @z4Swiglu limit in fused moe block only suport 7.0 now.
swiglustepz0step3p5 layer_idx: %s, activation: %s, limit: %sz.share_expert)rG   rH   rI   rJ   rL   rK   rM   shared_expertsgatenum_expertstop_krH   rI   rL   renormalizerK   
activationrM   z.expertsscoring_funcmoe_router_activationr   e_score_correction_biasrouted_scaling_factorenable_eplbnum_redundant_expertsrouter_logits_dtyper<   )7r6   rS   r   r   r*   rY   r   device_groupsizeep_sizer~   ep_rankmodel_config	hf_configrK   parallel_configrH   r   moe_num_expertsn_routed_expertsn_logical_expertseplb_configr   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endrV   r/   r4   r5   r   use_moe_router_biasmoe_router_scaling_factorr   r   r   zerosrouter_biasneed_fp32_gateswiglu_limitslenr   loggerdebugrE   share_expert_dimshare_expertr   	moe_top_kmoe_intermediate_sizenorm_expert_weightgetattrexperts)	r9   r   rM   rG   rK   r   r   r   swiglu_limitr:   r<   r=   rS   !  s   







		


zFusedMoEBlock.__init__rZ   r1   c           	      C   s   |j \}}|d|}| jjr| j||d}n| |\}}| j||d}|\}}| jd u r4|d u s4J | jd urC|d us?J ||7 }| jdkrN| j|}|||S )Nr   )rZ   router_logitsr$   )r   r   r   is_internal_routerr   r   r   &maybe_all_reduce_tensor_model_parallel)	r9   rZ   
num_tokens
hidden_dimfused_moe_outr   r\   shared_outputfinal_hidden_statesr<   r<   r=   r7     s*   



zFusedMoEBlock.forwardrF   )
r>   r?   r@   r	   r`   rS   r4   rB   r7   rD   r<   r<   r:   r=   r      s    hr   c                       sf   e Zd Z	ddededdf fddZdejd	ejdejfd
dZdejdejdejfddZ	  Z
S )Step3p5DecoderLayerrF   r   rM   r1   Nc              	      s  t    |jj}|j| _t|}|| _|j}|j}|d ur!d |_	|j
dkrd }d }d }	t|dd rQt|dg rQ|j| |jd krQ|jd }|jd }|jd }	t|dg }
td,i d	| jd
|rc|n|jd|jd|ro|n|jd|jd|jdt|ddd|	r|	nt|dd d|d|dt|dd dt|dd dt|dddt|dg dt|dg dt|dg d|
r|
| ndd| d| _ntd|j
 d| _t | _t dkot jdk| _| jrtd ntd  t|d!d }|d urd"d# |  d$D }nd%d# t!d|j"D }||v r.t#|| d&d'| _$d(| _nt%||j|j&d)|d(| d*d+| _'t(|j|j| _)t(|j|j| _*|| _+d S )-NGQAattention_other_settingrp   attention_typenum_attention_headsnum_attention_groupsrg   partial_rotary_factorsrH   rd   rf   re   rj   rh   ri   attention_biasFrk   rK   rl   rn   ro   rq   rr   rt   rc   rM   z
.self_attnz&Unsupported attention implementation: r$   z!Enable custom fused all reduce...z"Disable custom fused all reduce...moe_layers_enumc                 S   s   g | ]}t |qS r<   )r_   .0ir<   r<   r=   
<listcomp>      z0Step3p5DecoderLayer.__init__.<locals>.<listcomp>,c                 S   s   g | ]}|qS r<   r<   r   r<   r<   r=   r     s    z.moerM   TrQ   z.mlp)rG   rH   rI   rJ   rK   rL   rM   r<   ),r6   rS   r   r   rH   r*   rY   rk   rK   rn   att_impl_typer   rp   r   rb   r   r   r   rj   rh   	self_attnrV   use_moer   tp_groupr   r
   
world_sizeuse_fused_all_reducer   warning_oncestripr   rangenum_hidden_layersr   moerE   rI   mlpr   input_layernormpost_attention_layernormrM   )r9   r   rM   rG   rY   rk   rK   r   r   rg   r   r   moe_layers_idxr:   r<   r=   rS     s   




	








	
zStep3p5DecoderLayer.__init__in1in2c                 C   s   | j s|| S | j|| S r2   )r   r   
all_reduce)r9   r  r  r<   r<   r=    add_and_maybe_inplace_all_reduce  s   z4Step3p5DecoderLayer.add_and_maybe_inplace_all_reducer   rZ   c                 C   sZ   |}|  |}| j||d}||7 }|}| |}| jr"| |}n| |}|| }|S )N)r   rZ   )r   r   r   r   r   r   )r9   r   rZ   residual
ffn_outputr<   r<   r=   r7     s   


zStep3p5DecoderLayer.forwardr   )r>   r?   r@   r	   r`   rS   r4   rB   r  r7   rD   r<   r<   r:   r=   r     s.    f
r   c                       s   e Zd Zddededdf fddZdejdejfd	d
Z		ddejdejde	dB dejdB dejf
ddZ
deeeejf  dee fddZ  ZS )Step3p5ModelrF   r   rM   r1   Nc                    s   t     | _ jj}|j| _|| _|j| _t j	s"|j
r+t jr+t| j|j| _nt | _t|j fdd| dd\| _| _| _t jrQt|j|j| _nt | _tdg|j| _d S )Nc                    s   t  | dS )Nr   )r   r   r   r<   r=   <lambda>E  s    z'Step3p5Model.__init__.<locals>.<lambda>z.layersr   rZ   )r6   rS   r   r   r   
vocab_sizerG   r   r   is_first_ranktie_word_embeddingsis_last_rankr    rH   embed_tokensr(   r-   r   start_layer	end_layerlayersr   rh   normr,   make_empty_intermediate_tensors)r9   r   rM   rG   r:   r  r=   rS   /  s6   


zStep3p5Model.__init__	input_idsc                 C   s
   |  |S r2   )r  r9   r  r<   r<   r=   embed_input_idsT  s   
zStep3p5Model.embed_input_idsr   intermediate_tensorsinputs_embedsc                 C   sv   t  jr|d ur|}n| |}n
|d usJ |d }t| j| jD ]}| j| }|||}q"t  js9td|iS |S )NrZ   )	r   r  r  r   r  r  r  r  r"   )r9   r  r   r  r  rZ   r   layerr<   r<   r=   r7   W  s   
zStep3p5Model.forwardweightsc              
      s  | j }|jdksJ dg }g d}t|  }t }g d}dd |D }|D ]^\}	}
|	dr=|	tdd   |	}n|	 |	rFd|	 nd}t||}|d urRq(|d	rt|d
}t|dkrt|d 	 rtt
|d }||jkrtq(|D ]:\}}}| vrqvt fdd|D rqv ||}t|| rqv||vrqv|| }|j}|||
| ||  n|D ]W\}}}| vrq ||}t|| rq|ds|dr||vrq||vrq|| }|j}| j}|
jd |ksJ t|D ]}|
| }||||||d q||  n||D ]M\}}}}| vrq ||}t|| r(q||vr/q|| }|j|j }t
|| }t
|| }||j||| }||
 ||  n,t | rbq(d v rmtd q( |vrsq(|  }t|dt}|||
 |  q(|S )Nr$   zOnly support GQA))r   q_projr   )r   k_projr   )r   v_projr   )rT   	gate_projr   )rT   up_projr$   )).moe.experts.w13_weightz.moe.gate_proj.weightw1)r   z.moe.up_proj.weightw3)z.moe.experts.w2_weightz.moe.down_proj.weightw2c                 S   s   g | ]}|d  qS )r$   r<   )r   datar<   r<   r=   r     r   z-Step3p5Model.load_weights.<locals>.<listcomp>zmodel.modelmodel.layers.r   rN   c                 3   s    | ]}| v V  qd S r2   r<   )r   disable_moe_stacked_param
local_namer<   r=   	<genexpr>  s
    
z,Step3p5Model.load_weights.<locals>.<genexpr>z.bias_biasr   )shard_id	expert_idexpert_biaszignore expert_biasweight_loader)rG   r   r   named_parametersset
startswithr   #get_spec_layer_idx_from_weight_namer   isdigitr_   r   anyreplacer+   r/  addendswithr   r   r   
output_dimnarrowcopy_r   r   r   r!   )r9   r  rG   qkv_params_mappingstacked_params_mappingparams_dictloaded_paramsexpert_params_mappingdisable_moe_stacked_paramsnameloaded_weight	full_name
spec_layerpartsrY   
param_nameweight_namer,  replaced_nameparamr/  moe_expert_numr-  loaded_weight_expert	start_idxend_idxr   	begin_idxparam_slicer<   r(  r=   load_weightss  s   	

















zStep3p5Model.load_weightsr   NN)r>   r?   r@   r	   r`   rS   r4   rB   r  r"   r7   r   rC   r1  rQ  rD   r<   r<   r:   r=   r  -  s"    %
,r  c                
       s   e Zd ZeddidZdddedef fdd	Z	
	
d#dej	dej	de
d
B dej	d
B fddZdej	dej	fddZdej	dej	fddZdej	dej	dej	dd
fddZdededd
fddZd eeeej	f  dee fd!d"Z  ZS )$Step3p5ForCausalLMz.share_expert.z.moe.share_expert.)orig_to_new_substrrF   r   r   rM   c                   s(  t    |jj}t|t|dd| _t jr-t	|j
|j|jt|dd| _t|j
| _nt | _| jj| _g | _| jjD ]!}t|trEq=t|tsLJ t|dr^t|jtr^| j|j q=g | _t| jdksmJ d| jd }t| j| _d| _d| _|j| _ |j!| _"|j#| _$|j%| _&|j'| _(d S )	Nr%  )r   rM   lm_head)rK   rM   r   r   z!No MoE layers found in the model.r$   ))r6   rS   r   r   r  r.   r%  r   r  r   r
  rH   rK   rU  r   logits_processorr(   r  
moe_layersr  r|   r   hasattrr   r   appendexpert_weightsr   num_moe_layersnum_expert_groupsnum_shared_expertsr   num_logical_expertsr   num_physical_expertsr   num_local_physical_expertsr   num_routed_expertsr   r   )r9   r   rM   rG   r  example_layerr:   r<   r=   rS     sF   



zStep3p5ForCausalLM.__init__Nr  r   r  r  c                 C   s   |  ||||}|S r2   )r%  )r9   r  r   r  r  rZ   r<   r<   r=   r7   +  s   zStep3p5ForCausalLM.forwardrZ   r1   c                 C   s   | j |}| | j|}|S r2   )r%  r  rV  rU  )r9   rZ   logitsr<   r<   r=   compute_logits7  s   z!Step3p5ForCausalLM.compute_logitsc                 C   s   | j |S r2   )r%  r  r  r<   r<   r=   r  <  s   z"Step3p5ForCausalLM.embed_input_idsexpert_load_viewlogical_to_physical_maplogical_replica_countc                 C   sN   t | jD ]\}}|j}t|tsJ | j|  |j||||d qd S )N)moe_layer_idxre  rf  rg  )		enumeraterW  r   r|   r   rZ  rY  get_expert_weightsset_eplb_state)r9   re  rf  rg  rY   r  r   r<   r<   r=   rk  ?  s   z!Step3p5ForCausalLM.set_eplb_stater_  r`  c                 C   sb   | j |ksJ || _|| _ || j | _| jD ]}t|tsJ ||_||_| j|_	|j
  qd S r2   )r`  r_  r^  r   rW  r|   r   r   r   r   r   update_expert_map)r9   r_  r`  r  r<   r<   r=    update_physical_experts_metadataQ  s   
z3Step3p5ForCausalLM.update_physical_experts_metadatar  c                 C   s   t | }|j|| jdS )N)mapper)r'   rQ  hf_to_vllm_mapper)r9   r  loaderr<   r<   r=   rQ  a  s   zStep3p5ForCausalLM.load_weightsrR  )r>   r?   r@   r)   ro  r	   r`   rS   r4   rB   r"   r7   rd  r  rk  r_   rm  r   rC   r1  rQ  rD   r<   r<   r:   r=   rS    sP    3


,rS  rG   rH  r1   c                 C   sf   t | dr1| jdkr1| j}t| jD ]}|d||  ds*|d||  dr0||   S qd S )Nnum_nextn_predict_layersr   zlayers.r   r&  )rX  rq  r   r   r2  )rG   rH  rY   r   r<   r<   r=   r3  f  s   

r3  )QrA   collections.abcr   typingr   r4   r   torch.nn.parameterr   vllm.compilation.decoratorsr   vllm.configr   r   r	   vllm.distributedr
   r   r   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   r   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   5vllm.model_executor.layers.fused_moe.shared_fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   r   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.quantization.base_configr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r    -vllm.model_executor.model_loader.weight_utilsr!   vllm.sequencer"   vllm.v1.attention.backendr#   
interfacesr%   r&   utilsr'   r(   r)   r*   r+   r,   r-   r.   r>   r   r/   ModulerE   rb   r   r   r  rS  r`   r_   r3  r<   r<   r<   r=   <module>   sZ    (4 # 
  Jo