o
    -i                     @   s"  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD eeEZFG dd de'ZGG dd dejHZIG dd  d ejHZJG d!d" d"ejHZKG d#d$ d$ejHZLeG d%d& d&ejHZMG d'd( d(ejHe;e:ZNd)ed*eOd+ePdB fd,d-ZQdS ).zInference-only Jurassic model.    )Iterable)AnyN)nn)	Parameter)	Attention)support_torch_compile)CacheConfigModelConfig
VllmConfig)get_dp_groupget_ep_groupget_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_sizeget_tp_group)init_logger)
SiluAndMulSwigluStepAndMul)FusedMoE)SharedFusedMoE)GemmaRMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)DEFAULT_VOCAB_PADDING_SIZEParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors)AttentionType   )MixtureOfExperts
SupportsPP)AutoWeightsLoaderPPMissingLayerWeightsMapperextract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       s@   e Zd ZdZdejdejeejedB f B f fddZ  Z	S )FP32ReplicatedLinearz(
    Use FP32 for higher precision.
    xreturnNc                    s$   | j tjksJ t |tjS N)params_dtypetorchfloat32superforwardto)selfr1   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/step3p5.pyr8   C   s   zFP32ReplicatedLinear.forward)
__name__
__module____qualname____doc__r5   Tensortupler   r8   __classcell__r=   r=   r;   r>   r0   >   s    r0   c                       s`   e Zd Z			ddedededededB d	ed
eddf fddZde	j
de	j
fddZ  ZS )
Step3p5MLPNT confighidden_sizeintermediate_size
hidden_actquant_configreduce_resultsprefixr2   c           	         s   t    t||gd d|| dd| _t||d||| dd| _|dkr/td| d	t | _|| _	|| _
d | _t|}|jr`|j| d urb|j| d
krd|j| | _t| jd| _d S d S d S d S )N   Fz.gate_up_projbiasrL   rN   z
.down_proj)rQ   rL   rM   rN   siluzUnsupported activation: z!. Only silu is supported for now.r   )limit)r7   __init__r   gate_up_projr   	down_proj
ValueErrorr   act_fnrN   rI   rS   r+   swiglu_limits_sharedr   )	r:   rH   rI   rJ   rK   rL   rM   rN   	layer_idxr;   r=   r>   rT   L   sD   

	
zStep3p5MLP.__init__hidden_statesc                 C   s*   |  |\}}| |}| |\}}|S r3   )rU   rX   rV   )r:   r[   gate_up_intermediate_actoutputr=   r=   r>   r8   x   s   
zStep3p5MLP.forward)NTrG   )r?   r@   rA   r	   intstrr   boolrT   r5   rC   r8   rE   r=   r=   r;   r>   rF   K   s,    	,rF   c                )       s   e Zd Zdddddddddejdddddddfded	ed
edededB dededeee B dB de	dB de
dB deeef dB dedededB dedededededB def( fddZdejdejd ejfd!d"Z  ZS )#Step3p5Attentioni   Ngư>Fi'  rG         ?rI   	num_headsnum_kv_headsmax_positionhead_dimrms_norm_epsqkv_bias
rope_thetacache_configrL   rope_scalingrN   	attn_typesliding_windowuse_head_wise_attn_gatelayer_typesuse_rope_layersyarn_only_typesswa_num_attention_headspartial_rotary_factorc                    s  t    || _|| _t }t|| _|r|| j dk}n| jd dk}|r/|| j |vr/d }|d urA|rA|}|d ur@|}|| _nd }t|trM|| j }t	 | _
|| _| j| dks]J | j| | _|| _| j|kru| j| dkstJ n	|| j dks~J td| j| | _|p|| j | _| j| j | _| j| j | _| jd | _|| _t|| j| j| j||
| dd| _t| j| j |d|
| d	d| _|d urt|tstd
|d urt|ni }|dd | j|d< ||d< t| j||d| _t| j|| _t| j|| _ || _!|rt"|| jd| dd| _#d| _$|r%|| j | _$t%| j| j| j| j|	|
| d||d	| _&|| _'| jdksK| jdksKJ | jdkrW| j| _(d S | jd | _(d S )Nsliding_attentionrO   r   r%   g      z	.qkv_projrP   Fz.o_projz1rope_scaling must be a dict for Step3p5Attention.	rope_typedefaultrk   ru   )	head_sizerg   rope_parametersz.g_proj)rQ   rN   Tz.attn)rf   rl   rL   rN   per_layer_sliding_windowrn   g      ?))r7   rT   rI   total_num_headsr   r+   rZ   
isinstancelistr   rankru   re   total_num_kv_headsmaxrf   rh   q_sizekv_sizescalingrk   r   qkv_projr   o_projdictrW   
setdefaultr   
rotary_embr   q_normk_normrp   r   g_projuse_roper   attnmax_position_embeddings
rotary_dim)r:   rI   re   rf   rg   rh   ri   rj   rk   rl   rL   rm   rN   rn   ro   rp   rq   rr   rs   rt   ru   tp_sizeenable_sliding_windowrz   r;   r=   r>   rT      s   




	

zStep3p5Attention.__init__	positionsr[   r2   c                 C   sN  |  |\}}|j| j| j| jgdd\}}}|jg |jd d |jd | j | jR  }| | }||j}|jg |jd d |jd | j | jR  }	| 	|	 }	|	|j}| j
rm| |||\}}| |||}
| jr| |\}}|
jg |
jd d | j| jR  |d  }|j|
j }
| |
\}}|S )N)dim)r   splitr   r   viewshaperh   r   
contiguousr   r   r   r   rp   r   re   	unsqueezesigmoidr   )r:   r   r[   qkvr]   qkv	q_by_head	k_by_headattn_output
extra_dimsr_   r=   r=   r>   r8     s(    00$zStep3p5Attention.forward)r?   r@   rA   r$   DECODERr`   floatrb   r~   r   r   r   ra   r   rT   r5   rC   r8   rE   r=   r=   r;   r>   rc      s    	
 rc   c                       s@   e Zd Z	d
dedef fddZdejdejfdd	Z  Z	S )FusedMoEBlockrG   vllm_configrN   c           	   	      s^  t    t | _t|| _t j | _	t j
 | _|jj}|j}|j}|j| _|j| _|j| _| j| _|jj| _| j| j | _| j| j	 | _| j| j | _| j| j | _| j|jkrjtd| j d|j dt|j|jdd tj| dd| _ |j!| _!| j!sJ d|j"| _#t$j%tj&|jtjddd	| _'|j(| _(| j(sJ d
d}|j)pg }| jt*|k r|| j nd }|dvrt+|}|dksJ dd}t,-d| j|| t.|| j|j/dd|| dd| _0t1d&i d| j0d| j d|jd|j2d|jd|j3ddd|j4d|d|d| ddt5|d d!d"| j'd#|j"d$| jd%| j| _6d S )'NzTensor parallel size z' is greater than the number of experts .Fz.gate)rQ   rL   r4   rN   z)Only support use_moe_router_bias is true.)dtype)requires_gradz>Router logits must use FP32 precision for numerical stability.rR   )Nr   g      @z4Swiglu limit in fused moe block only suport 7.0 now.
swiglustepz0step3p5 layer_idx: %s, activation: %s, limit: %sz.share_expert)rH   rI   rJ   rK   rM   rL   rN   shared_expertsgatenum_expertstop_krI   rJ   rM   renormalizerL   
activationrN   z.expertsscoring_funcmoe_router_activationr   e_score_correction_biasrouted_scaling_factorenable_eplbnum_redundant_expertsr=   )7r7   rT   r   r   r+   rZ   r   device_groupsizeep_sizer   ep_rankmodel_config	hf_configrL   parallel_configrI   r   moe_num_expertsn_routed_expertsn_logical_expertseplb_configr   n_redundant_expertsn_physical_expertsn_local_physical_expertsphysical_expert_startphysical_expert_endrW   r0   r5   r6   r   use_moe_router_biasmoe_router_scaling_factorr   r   r   zerosrouter_biasneed_fp32_gateswiglu_limitslenr   loggerdebugrF   share_expert_dimshare_expertr   	moe_top_kmoe_intermediate_sizenorm_expert_weightgetattrexperts)	r:   r   rN   rH   rL   r   r   r   swiglu_limitr;   r=   r>   rT   "  s   







		


zFusedMoEBlock.__init__r[   r2   c           	      C   s   |j \}}|d|}| jjr| j||d}n| |\}}| j||d}|\}}| jd u r4|d u s4J | jd urC|d us?J ||7 }| jdkrN| j|}|||S )Nr   )r[   router_logitsr%   )r   r   r   is_internal_routerr   r   r   &maybe_all_reduce_tensor_model_parallel)	r:   r[   
num_tokens
hidden_dimfused_moe_outr   r]   shared_outputfinal_hidden_statesr=   r=   r>   r8     s*   



zFusedMoEBlock.forwardrG   )
r?   r@   rA   r
   ra   rT   r5   rC   r8   rE   r=   r=   r;   r>   r   !  s    gr   c                       sf   e Zd Z	ddededdf fddZdejd	ejdejfd
dZdejdejdejfddZ	  Z
S )Step3p5DecoderLayerrG   r   rN   r2   Nc              	      s  t    |jj}|j| _t|}|| _|j}|j}|d ur!d |_	|j
dkrd }d }d }	t|dd rQt|dg rQ|j| |jd krQ|jd }|jd }|jd }	t|dg }
td,i d	| jd
|rc|n|jd|jd|ro|n|jd|jd|jdt|ddd|	r|	nt|dd d|d|dt|dd dt|dd dt|dddt|dg dt|dg dt|dg d|
r|
| ndd| d| _ntd|j
 d| _t | _t dkot jdk| _| jrtd ntd  t|d!d }|d urd"d# |  d$D }nd%d# t!d|j"D }||v r.t#|| d&d'| _$d(| _nt%||j|j&d)|d(| d*d+| _'t(|j|j| _)t(|j|j| _*|| _+d S )-NGQAattention_other_settingrq   attention_typenum_attention_headsnum_attention_groupsrh   partial_rotary_factorsrI   re   rg   rf   rk   ri   rj   attention_biasFrl   rL   rm   ro   rp   rr   rs   ru   rd   rN   z
.self_attnz&Unsupported attention implementation: r%   z!Enable custom fused all reduce...z"Disable custom fused all reduce...moe_layers_enumc                 S   s   g | ]}t |qS r=   )r`   .0ir=   r=   r>   
<listcomp>      z0Step3p5DecoderLayer.__init__.<locals>.<listcomp>,c                 S   s   g | ]}|qS r=   r=   r   r=   r=   r>   r     s    z.moerN   TrR   z.mlp)rH   rI   rJ   rK   rL   rM   rN   r=   ),r7   rT   r   r   rI   r+   rZ   rl   rL   ro   att_impl_typer   rq   r   rc   r   r   r   rk   ri   	self_attnrW   use_moer   tp_groupr   r   
world_sizeuse_fused_all_reducer   warning_oncestripr   rangenum_hidden_layersr   moerF   rJ   mlpr   input_layernormpost_attention_layernormrN   )r:   r   rN   rH   rZ   rl   rL   r   r   rh   r   r   moe_layers_idxr;   r=   r>   rT     s   




	








	
zStep3p5DecoderLayer.__init__in1in2c                 C   s   | j s|| S | j|| S r3   )r   r   
all_reduce)r:   r  r  r=   r=   r>    add_and_maybe_inplace_all_reduce  s   z4Step3p5DecoderLayer.add_and_maybe_inplace_all_reducer   r[   c                 C   sZ   |}|  |}| j||d}||7 }|}| |}| jr"| |}n| |}|| }|S )N)r   r[   )r   r   r   r   r   r   )r:   r   r[   residual
ffn_outputr=   r=   r>   r8     s   


zStep3p5DecoderLayer.forwardr   )r?   r@   rA   r
   ra   rT   r5   rC   r  r8   rE   r=   r=   r;   r>   r     s.    f
r   c                       s   e Zd Zddededdf fddZdejdejfd	d
Z		ddejdejde	dB dejdB dejf
ddZ
deeeejf  dee fddZ  ZS )Step3p5ModelrG   r   rN   r2   Nc                    s   t     | _ jj}|j| _|| _|j| _t j	s"|j
r+t jr+t| j|j| _nt | _t|j fdd| dd\| _| _| _t jrQt|j|j| _nt | _tdg|j| _d S )Nc                    s   t  | dS )Nr   )r   r   r   r=   r>   <lambda>E  s    z'Step3p5Model.__init__.<locals>.<lambda>z.layersr   r[   )r7   rT   r   r   r   
vocab_sizerH   r   r   is_first_ranktie_word_embeddingsis_last_rankr!   rI   embed_tokensr)   r.   r   start_layer	end_layerlayersr   ri   normr-   make_empty_intermediate_tensors)r:   r   rN   rH   r;   r  r>   rT   /  s6   


zStep3p5Model.__init__	input_idsc                 C   s
   |  |S r3   )r  r:   r  r=   r=   r>   embed_input_idsT  s   
zStep3p5Model.embed_input_idsr   intermediate_tensorsinputs_embedsc                 C   sv   t  jr|d ur|}n| |}n
|d usJ |d }t| j| jD ]}| j| }|||}q"t  js9td|iS |S )Nr[   )	r   r  r  r   r  r  r  r  r#   )r:   r  r   r  r  r[   r   layerr=   r=   r>   r8   W  s   
zStep3p5Model.forwardweightsc              
      s  | j }|jdksJ dg }g d}t|  }t }g d}dd |D }|D ]^\}	}
|	dr=|	tdd   |	}n|	 |	rFd|	 nd}t||}|d urRq(|d	rt|d
}t|dkrt|d 	 rtt
|d }||jkrtq(|D ]:\}}}| vrqvt fdd|D rqv ||}t|| rqv||vrqv|| }|j}|||
| ||  n|D ]W\}}}| vrq ||}t|| rq|ds|dr||vrq||vrq|| }|j}| j}|
jd |ksJ t|D ]}|
| }||||||d q||  n||D ]M\}}}}| vrq ||}t|| r(q||vr/q|| }|j|j }t
|| }t
|| }||j||| }||
 ||  n,t | rbq(d v rmtd q( |vrsq(|  }t|dt}|||
 |  q(|S )Nr%   zOnly support GQA))r   q_projr   )r   k_projr   )r   v_projr   )rU   	gate_projr   )rU   up_projr%   )).moe.experts.w13_weightz.moe.gate_proj.weightw1)r   z.moe.up_proj.weightw3)z.moe.experts.w2_weightz.moe.down_proj.weightw2c                 S   s   g | ]}|d  qS )r%   r=   )r   datar=   r=   r>   r     r   z-Step3p5Model.load_weights.<locals>.<listcomp>zmodel.modelmodel.layers.r   rO   c                 3   s    | ]}| v V  qd S r3   r=   )r   disable_moe_stacked_param
local_namer=   r>   	<genexpr>  s
    
z,Step3p5Model.load_weights.<locals>.<genexpr>z.bias_biasr   )shard_id	expert_idexpert_biaszignore expert_biasweight_loader)rH   r   r   named_parametersset
startswithr   #get_spec_layer_idx_from_weight_namer   isdigitr`   r   anyreplacer,   r/  addendswithr   r   r   
output_dimnarrowcopy_r   r   r   r"   )r:   r  rH   qkv_params_mappingstacked_params_mappingparams_dictloaded_paramsexpert_params_mappingdisable_moe_stacked_paramsnameloaded_weight	full_name
spec_layerpartsrZ   
param_nameweight_namer,  replaced_nameparamr/  moe_expert_numr-  loaded_weight_expert	start_idxend_idxr   	begin_idxparam_slicer=   r(  r>   load_weightss  s   	

















zStep3p5Model.load_weightsr   NN)r?   r@   rA   r
   ra   rT   r5   rC   r  r#   r8   r   rD   r1  rQ  rE   r=   r=   r;   r>   r  -  s"    %
,r  c                
       s   e Zd ZeddidZdddedef fdd	Z	
	
d#dej	dej	de
d
B dej	d
B fddZdej	dej	fddZdej	dej	fddZdej	dej	dej	dd
fddZdededd
fddZd eeeej	f  dee fd!d"Z  ZS )$Step3p5ForCausalLMz.share_expert.z.moe.share_expert.)orig_to_new_substrrG   r   r   rN   c                   s^  t    |jj}|j}|| _|| _t|t|dd| _	g | _
| j	jD ]!}t|tr+q#t|ts2J t|drDt|jtrD| j
|j q#t jrq|j| _|rW|  j|j7  _t| j|j|j|sbtn|jd| _t| j|j| _nt | _| j	j| _g | _ t!| j
dksJ d| j
d }t!| j
| _"d| _#d| _$|j%| _&|j'| _(|j)| _*|j+| _,|j-| _.d S )Nr%  )r   rN   r   )org_num_embeddingspadding_sizer   z!No MoE layers found in the model.r%   )/r7   rT   r   r   lora_configrH   r   r  r/   r%  
moe_layersr  r}   r)   r   hasattrr   r   appendr   r  r
  unpadded_vocab_sizelora_extra_vocab_sizer    rI   r   lora_vocab_padding_sizelm_headr   logits_processorr  expert_weightsr   num_moe_layersnum_expert_groupsnum_shared_expertsr   num_logical_expertsr   num_physical_expertsr   num_local_physical_expertsr   num_routed_expertsr   r   )r:   r   rN   rH   rW  r  example_layerr;   r=   r>   rT     sZ   



zStep3p5ForCausalLM.__init__Nr  r   r  r  c                 C   s   |  ||||}|S r3   )r%  )r:   r  r   r  r  r[   r=   r=   r>   r8   7  s   zStep3p5ForCausalLM.forwardr[   r2   c                 C   s   | j |}| | j|}|S r3   )r%  r  r_  r^  )r:   r[   logitsr=   r=   r>   compute_logitsC  s   z!Step3p5ForCausalLM.compute_logitsc                 C   s   | j |S r3   )r%  r  r  r=   r=   r>   r  H  s   z"Step3p5ForCausalLM.embed_input_idsexpert_load_viewlogical_to_physical_maplogical_replica_countc                 C   sN   t | jD ]\}}|j}t|tsJ | j|  |j||||d qd S )N)moe_layer_idxrk  rl  rm  )		enumeraterX  r   r}   r   r`  rZ  get_expert_weightsset_eplb_state)r:   rk  rl  rm  rZ   r  r   r=   r=   r>   rq  K  s   z!Step3p5ForCausalLM.set_eplb_statere  rf  c                 C   sb   | j |ksJ || _|| _ || j | _| jD ]}t|tsJ ||_||_| j|_	|j
  qd S r3   )rf  re  rd  r   rX  r}   r   r   r   r   r   update_expert_map)r:   re  rf  r  r=   r=   r>    update_physical_experts_metadata]  s   
z3Step3p5ForCausalLM.update_physical_experts_metadatar  c                 C   s   t | }|j|| jdS )N)mapper)r(   rQ  hf_to_vllm_mapper)r:   r  loaderr=   r=   r>   rQ  m  s   zStep3p5ForCausalLM.load_weightsrR  )r?   r@   rA   r*   ru  r
   ra   rT   r5   rC   r#   r8   rj  r  rq  r`   rs  r   rD   r1  rQ  rE   r=   r=   r;   r>   rS    sP    ?


,rS  rH   rH  r2   c                 C   sf   t | dr1| jdkr1| j}t| jD ]}|d||  ds*|d||  dr0||   S qd S )Nnum_nextn_predict_layersr   zlayers.r   r&  )rY  rw  r   r   r2  )rH   rH  rZ   r   r=   r=   r>   r3  r  s   

r3  )RrB   collections.abcr   typingr   r5   r   torch.nn.parameterr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r	   r
   vllm.distributedr   r   r   r   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   r   $vllm.model_executor.layers.fused_moer   5vllm.model_executor.layers.fused_moe.shared_fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   r   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.quantization.base_configr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r    r!   -vllm.model_executor.model_loader.weight_utilsr"   vllm.sequencer#   vllm.v1.attention.backendr$   
interfacesr&   r'   utilsr(   r)   r*   r+   r,   r-   r.   r/   r?   r   r0   ModulerF   rc   r   r   r  rS  ra   r`   r3  r=   r=   r=   r>   <module>   sZ    (4 # 	  J{