o
    iT                  	   @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@ G dd dejAZBG dd dejAZCG dd  d ejAZDeDeCd!ZEe
G d"d# d#ejAZFG d$d% d%ejAe5e7e9e6e8ZGG d&d' d'eGZHdS )(zInference-only Jamba model.    )Iterable)isliceN)nn)JambaConfig)support_torch_compile)CacheConfigModelConfig
VllmConfig)$get_tensor_model_parallel_world_size)get_pp_group)	Attention)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)
MambaMixer)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)DispatchPooler)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)LlamaMLP)IntermediateTensors   )HasInnerStateIsHybridSupportsLoRASupportsMambaPrefixCaching
SupportsPP)AutoWeightsLoaderWeightsMapperis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       st   e Zd Z						ddededB dedB dejdB dedB dedB d	ef fd
dZ	dej
dej
fddZ  ZS )JambaMoEN confignum_expertstop_kparams_dtypetp_sizequant_configprefixc                    s   t    |p	|j| _|p|j| _|j| _|j| _| jdkr.t| j| jdd || dd| _	t
| j| j| j| j||ddd|| dd| _d S )Nr   Fz.router)biasr2   r0   r3   Tz.experts)r1   r0   reduce_resultsrenormalizeuse_grouped_topkr2   r3   )super__init__r.   num_total_expertsnum_experts_per_tokr/   hidden_sizeintermediate_sizer   routerr   experts)selfr-   r.   r/   r0   r1   r2   r3   	__class__ V/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/jamba.pyr9   <   s6   


	zJambaMoE.__init__hidden_statesreturnc                 C   sb   |j }|d| j}| jdkr| |\}}ntj|j d df|j|jd}| 	||}||S )Nr   r   )devicedtype)
shapeviewr<   r:   r>   torchonesrH   rI   r?   )r@   rE   
orig_shaperouter_logits_rC   rC   rD   forwardd   s   

zJambaMoE.forward)NNNNNr,   )__name__
__module____qualname__r   intrL   rI   r   strr9   TensorrQ   __classcell__rC   rC   rA   rD   r+   ;   s.    (r+   c                       st   e Zd Z					ddedededB dedB dedB d	edB d
e	ddf fddZ
dejdejdB fddZ  ZS )JambaMambaDecoderLayerNFr,   r-   	layer_idxmodel_configcache_configr2   is_lora_enabledr3   rF   c           
         s   t    || _|| _t|j|j|j|j|j |j	|j
|jd|j|j| j||| dd| _|j| }	|	dkrCt||| dd| _nt|j|j|j|| dd| _t|j|jd| _t|j|jd| _d S )NTz.mixer)r<   ssm_state_sizeconv_kernel_sizer=   time_step_rankuse_conv_biasuse_biasuse_rms_normrms_norm_eps
activationr]   r[   r\   r3   r   .feed_forwardr2   r3   eps)r8   r9   r-   r]   r   r<   mamba_d_statemamba_d_convmamba_expandmamba_dt_rankmamba_conv_biasmamba_proj_biasrd   
hidden_actmambalayers_num_expertsr+   feed_forwardJambaMLPr=   r   input_layernormpre_ff_layernorm)
r@   r-   rZ   r[   r\   r2   r]   r3   kwargsr.   rA   rC   rD   r9   u   sF   



zJambaMambaDecoderLayer.__init__rE   residualc                 K   s`   |d u r|}|  |}n|  ||\}}t|}| || | ||\}}| |}||fS N)ru   rL   
empty_likerq   rv   rs   )r@   rE   rx   rw   outputrC   rC   rD   rQ      s   

zJambaMambaDecoderLayer.forward)NNNFr,   )rR   rS   rT   r   rU   r   r   r   boolrV   r9   rL   rW   rQ   rX   rC   rC   rA   rD   rY   t   s8    
1rY   c                       s   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jd	e
jfddZde
jde
jde
jdB fddZ  ZS )JambaAttentionDecoderLayerNr,   r-   rZ   r[   r\   r2   r3   rF   c           
   	      s  t    |j| _t }|j| _| j| dksJ | j| | _|j| _| j|kr2| j| dks1J n	|| j dks;J t	d| j| | _
|j| j | _| j| j | _| j
| j | _| jd | _t|j| j| j| jd|| dd| _t| j| j |jd|| dd| _t| j| j| j| j
|| dd	| _|j| }	|	dkrt||| d
d| _nt|j|j|j|| d
d| _t|j|jd| _t|j|jd| _d S )Nr   r   g      Fz	.qkv_proj)r4   r2   r3   z.o_projz.attn)num_kv_headsr\   r3   rf   rg   rh   )r8   r9   r<   r
   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr~   head_dimq_sizekv_sizescalingr   qkv_projr   o_projr   attnrr   r+   rs   rt   r=   rp   r   rd   ru   rv   )
r@   r-   rZ   r[   r\   r2   r3   rw   r1   r.   rA   rC   rD   r9      sn   


	

	
z#JambaAttentionDecoderLayer.__init__	positionsrE   c                 K   sN   |  |\}}|j| j| j| jgdd\}}}| |||}	| |	\}
}|
S )NrG   )dim)r   splitr   r   r   r   )r@   r   rE   rw   qkvrP   qkvattn_outputr{   rC   rC   rD   self_attention  s
    z)JambaAttentionDecoderLayer.self_attentionrx   c                 K   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   rE   )ru   r   rv   rs   )r@   r   rE   rx   rw   rC   rC   rD   rQ     s   
z"JambaAttentionDecoderLayer.forward)NNNr,   )rR   rS   rT   r   rU   r   r   r   rV   r9   rL   rW   r   rQ   rX   rC   rC   rA   rD   r}      sD    	K
r}   )	attentionrq   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
deeeeeef  fddZdeeeejf  dee fddZ  ZS )
JambaModelr,   r3   vllm_configr3   c                   s   t    |jj|j|j |j| _j| _t| jj	| _
dt|jidtf fdd}tj|| dd\| _| _| _tddgj	| _tj	jd	| _d S )
Nr]   r3   c                    s>   t | ddd }tj|  }|| f| dS )N.r   rg   )rU   rsplitALL_DECODER_LAYER_TYPESlayers_block_type)r3   rZ   layer_classr\   r-   extra_kwargsr[   r2   rC   rD   	get_layerD  s   z&JambaModel.__init__.<locals>.get_layerz.layersr   rE   rx   rh   )r8   r9   r[   	hf_configr\   r2   r-   
vocab_sizer   r<   embed_tokensr|   lora_configrV   r)   num_hidden_layersstart_layer	end_layerlayersr(   make_empty_intermediate_tensorsr   rd   final_layernorm)r@   r   r3   r   rA   r   rD   r9   1  s(   

zJambaModel.__init__	input_idsrF   c                 C   s
   |  |S ry   )r   r@   r   rC   rC   rD   embed_input_idsZ     
zJambaModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]}||||d\}}q*t  jsAt||dS | 	||\}}|S )NrE   rx   )r   rE   rx   )rE   rx   )
r   is_first_rankr   r   r   r   r   is_last_rankr   r   )	r@   r   r   r   r   rE   rx   layerrP   rC   rC   rD   rQ   ]  s$   
zJambaModel.forwardc                 C   s   t j| ddd| jjdS )N	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer.   )r   make_expert_params_mappingr-   r.   r@   rC   rC   rD   get_expert_mapping{  s   zJambaModel.get_expert_mappingweightsc              	   C   sH  g d}t |  }t }|  }|D ]\}}d|v rq|D ]3\}}	}
|	|vr(qd|v r-q||	|}|dr=||vr=qt|| rCq|| }|j}||||
  nJ|D ](\}}	}}
|	|vr_qTt|| reqT||	|}|| }|j}|||||
|d  n|dr||vrqt|| rq|| }t|dt	}||| |
| q|S )N))r   q_projr   )r   k_projr   )r   v_projr   ).gate_up_projz
.gate_projr   )r   z.up_projr   zrotary_emb.inv_freqr?   z.bias)shard_id	expert_idweight_loader)dictnamed_parameterssetr   replaceendswithr'   r   getattrr   add)r@   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   r   rC   rC   rD   load_weights  sj   	



zJambaModel.load_weightsNN)rR   rS   rT   r	   rV   r9   rL   rW   r   r   rQ   listtuplerU   r   r   r   r   rX   rC   rC   rA   rD   r   /  s$    )
,r   c                
       s  e Zd ZeddddZg dddgdgd	Zd
ddZdddedef fddZ	de
jde
jfddZ		d1de
jdB de
jdedB de
jdB fddZdd Zdefd d!Zedd"dee
je
jf fd#d$Zedd"deeeef eeef f fd%d&Zedeeef fd'd(Zd)e
jde
jdB fd*d+Zd,eeee
jf  dee fd-d.Zdeeeeeef  fd/d0Z  ZS )2JambaForCausalLMr   z.A)z.self_attn.z.A_log)orig_to_new_substr)r   r   r   r   r   in_proj)r   gate_up_projr   input_embeddingsoutput_embeddings)r   lm_headr,   r   r   r3   c                   sz   |j j}|j}t   || _|| _|j | _ || _t|t|dd| _	t
|j|jt|dd| _t|j| _| j	j| _d S )Nmodelr   r3   r   r   )r[   r   scheduler_configr8   r9   r-   r   r   r*   r   r   r   r<   r   r   logits_processorr   )r@   r   r3   r-   r   rA   rC   rD   r9     s$   

zJambaForCausalLM.__init__r   rF   c                 C      | j |S ry   )r   r   r   rC   rC   rD   r        z JambaForCausalLM.embed_input_idsNr   r   r   c                 K   s   |  ||||}|S ry   )r   )r@   r   r   r   r   rw   rE   rC   rC   rD   rQ     s   zJambaForCausalLM.forwardc                 K   s   | j j|fi |S ry   )mamba_cachecopy_inputs_before_cuda_graphs)r@   input_buffersrw   rC   rC   rD   r     s   z/JambaForCausalLM.copy_inputs_before_cuda_graphs
batch_sizec                 C   r   ry   )r   "get_seqlen_agnostic_capture_inputs)r@   r   rC   rC   rD   r     r   z3JambaForCausalLM.get_seqlen_agnostic_capture_inputsr	   c                 C   s   t |jj|jj|jjS ry   )r   mamba1_state_dtyper[   rI   r\   mamba_cache_dtypemamba_ssm_cache_dtype)clsr   rC   rC   rD   !get_mamba_state_dtype_from_config  s
   z2JambaForCausalLM.get_mamba_state_dtype_from_configc                 C   s2   |j }|jj}|j}tj|j|j| |j|j	dS )N)tp_world_sizer=   
state_sizeconv_kernel)
parallel_configr[   r   r<   r   mamba1_state_shapetensor_parallel_sizerl   rj   rk   )r   r   r   r   r<   rC   rC   rD   !get_mamba_state_shape_from_config#  s   z2JambaForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S ry   )r   mamba1_state_copy_func)r   rC   rC   rD   get_mamba_state_copy_func3  s   z*JambaForCausalLM.get_mamba_state_copy_funcrE   c                 C   s   |  | j|}|S ry   )r   r   )r@   rE   logitsrC   rC   rD   compute_logits7  s   zJambaForCausalLM.compute_logitsr   c                 C   s   t | }|j|| jdS )N)mapper)r%   r   hf_to_vllm_mapper)r@   r   loaderrC   rC   rD   r   >  s   zJambaForCausalLM.load_weightsc                 C   s
   | j  S ry   )r   r   r   rC   rC   rD   r   B  r   z#JambaForCausalLM.get_expert_mappingr   ) rR   rS   rT   r&   r   packed_modules_mappingembedding_modulesr	   rV   r9   rL   rW   r   r   rQ   r   rU   r   classmethodr   rI   r   r   r   r   r   r   r   r   r   r   rX   rC   rC   rA   rD   r     s^    


$&r   c                       s0   e Zd ZdZdddedef fddZ  ZS )JambaForSequenceClassificationTr,   r   r   r3   c                   sn   t  j||d |jj}|j}t|dd}tj|j|||jj	d| _
|jj}|d us,J tj|| j
d| _d S )Nr   
score_biasF)r4   rI   )
classifier)r8   r9   r[   r   
num_labelsr   r   Linearr<   
head_dtypescorepooler_configr   for_seq_clspooler)r@   r   r3   r-   r  r  r  rA   rC   rD   r9   I  s   z'JambaForSequenceClassification.__init__)rR   rS   rT   is_pooling_modelr	   rV   r9   rX   rC   rC   rA   rD   r  F  s    $r  )I__doc__collections.abcr   	itertoolsr   rL   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   r	   vllm.distributedr
   vllm.distributed.parallel_stater   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   ,vllm.model_executor.layers.mamba.mamba_mixerr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   !vllm.model_executor.layers.poolerr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr    vllm.model_executor.models.llamar   rt   vllm.sequencer   
interfacesr    r!   r"   r#   r$   utilsr%   r&   r'   r(   r)   r*   Moduler+   rY   r}   r   r   r   r  rC   rC   rC   rD   <module>   sV    
9Fp 
y