o
    
۾iE                  
   @   s  d Z ddlmZ ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8m9Z9m:Z:m;Z; G dd dej<Z=G dd dej<Z>G dd dej<Z?e?e>dZ@eG d d! d!ej<ZAG d"d# d#ej<e0e2e4e1e5e3	ZBdS )$zInference-only Bamba model.    )IterableN)nn)BambaConfig)support_torch_compile)CacheConfigModelConfig
VllmConfig)$get_tensor_model_parallel_world_size)get_pp_group)
SiluAndMul)	Attention)RMSNorm)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)MambaMixer2)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )HasInnerStateIsHybridSupportsLoRASupportsMambaPrefixCaching
SupportsPPSupportsQuant)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sF   e Zd Z			ddededB dededdf
 fd	d
Zdd Z  Z	S )BambaMLPNF configquant_configbiasprefixreturnc                    st   t    t|j|jgd ||| dd| _t|j|j||| dd| _|jdkr4t	d|j dt
 | _d S )	N   z.gate_up_proj)
input_sizeoutput_sizesr-   r,   r.   z
.down_proj)r1   output_sizer-   r,   r.   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__r   hidden_sizeintermediate_sizegate_up_projr   	down_proj
hidden_act
ValueErrorr   act_fn)selfr+   r,   r-   r.   	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/bamba.pyr6   ;   s(   


zBambaMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r9   r=   r:   )r>   x_rA   rA   rB   forwardX   s   
zBambaMLP.forward)NFr*   )
__name__
__module____qualname__r   r   boolstrr6   rF   __classcell__rA   rA   r?   rB   r)   :   s     r)   c                       sj   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jdB fddZ  ZS )BambaMixerDecoderLayerNr*   r+   	layer_idxmodel_configcache_configr,   r.   r/   c                    s   t    || _t|j|j|j|j|j |j|j	|j
|j|j|j|j|||| dd| _t||| dd| _t|j|jd| _t|j|jd| _d S )Nz.mixer)r7   ssm_state_sizeconv_kernel_sizer8   use_conv_biasuse_biasn_groups	num_headshead_dimrms_norm_eps
activationrO   rP   r,   r.   .feed_forwardr,   r.   eps)r5   r6   r+   r   r7   mamba_d_statemamba_d_convmamba_expandmamba_conv_biasmamba_proj_biasmamba_n_groupsmamba_n_headsmamba_d_headrX   r;   mambar)   feed_forwardr   input_layernormpre_ff_layernorm)r>   r+   rN   rO   rP   r,   r.   r?   rA   rB   r6   `   s0   
	
zBambaMixerDecoderLayer.__init__hidden_statesresidualc                 K   sT   |d u r|}|  |}n|  ||\}}| |}| ||\}}| |}||fS rC   )rh   rf   ri   rg   )r>   rj   rk   kwargsoutputrA   rA   rB   rF      s   

zBambaMixerDecoderLayer.forwardNNNr*   )rG   rH   rI   r   intr   r   r   rK   r6   torchTensorrF   rL   rA   rA   r?   rB   rM   _   s2    #rM   c                       s   e Zd Z				ddedededB dedB dedB ded	df fd
dZ	de
jde
jd	e
jfddZde
jde
jde
jdB fddZ  ZS )BambaAttentionDecoderLayerNr*   r+   rN   rO   rP   r,   r.   r/   c           
   	      s  t    t|dd}|j| _t }|j| _| j| dksJ | j| | _|j| _	| j	|kr8| j	| dks7J n	|| j	 dksAJ t
d| j	| | _|j| j | _| j| j | _| j| j | _| jd | _|| _t|d| j}	|	| j |jd< t| j||jdt d	| _t|j| j| j| j	d
|| dd| _t| j| j |jd
|| dd| _t| j| j| j| j|| dd| _t||| dd| _t|j|jd| _ t|j|jd| _!d S )Nmax_position_embeddingsi    r   r   g      attn_rotary_embpartial_rotary_factorT)	head_sizemax_positionrope_parametersis_neox_styledtypeFz	.qkv_proj)r-   r,   r.   z.o_projz.attn)num_kv_headsrP   r.   rZ   r[   r\   )"r5   r6   getattrr7   r	   num_attention_headstotal_num_headsrV   num_key_value_headstotal_num_kv_headsmaxr{   rW   q_sizekv_sizescalingrs   rx   r   rp   get_default_dtype
rotary_embr   qkv_projr   o_projr   attnr)   rg   r   rX   rh   ri   )
r>   r+   rN   rO   rP   r,   r.   rs   tp_size
rotary_dimr?   rA   rB   r6      sn   
	
	
	z#BambaAttentionDecoderLayer.__init__	positionsrj   c                 K   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}	| |	\}
}|
S )N)dim)r   splitr   r   r   r   r   )r>   r   rj   rl   qkvrE   qkvattn_outputrm   rA   rA   rB   self_attention   s    z)BambaAttentionDecoderLayer.self_attentionrk   c                 K   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)r   rj   )rh   r   ri   rg   )r>   r   rj   rk   rl   rA   rA   rB   rF      s   
z"BambaAttentionDecoderLayer.forwardrn   )rG   rH   rI   r   ro   r   r   r   rK   r6   rp   rq   r   rF   rL   rA   rA   r?   rB   rr      sD    K
rr   )	attentionrf   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
deeeejf  dee fddZ  ZS )
BambaModelr*   r.   vllm_configr.   c                   s   t    |jj|j|j |j| _j| _t| jj	| _
dtf fdd}tj|| dd\| _| _| _tddgj	| _tj	jd| _d S )	Nr.   c                    s6   t | ddd }tj|  }|| | dS )N.r   r[   )ro   rsplitALL_DECODER_LAYER_TYPESlayers_block_type)r.   rN   layer_classrP   r+   rO   r,   rA   rB   	get_layer   s   z&BambaModel.__init__.<locals>.get_layerz.layersr   rj   rk   r\   )r5   r6   rO   	hf_configrP   r,   r+   
vocab_sizer   r7   embed_tokensrK   r'   num_hidden_layersstart_layer	end_layerlayersr&   make_empty_intermediate_tensorsr   rX   final_layernorm)r>   r   r.   r   r?   r   rB   r6     s&   

zBambaModel.__init__	input_idsr/   c                 C   s
   |  |S rC   )r   r>   r   rA   rA   rB   embed_input_ids5  s   
zBambaModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc           
      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }d }t| jD ]\}}||||d\}}q(t  jsAt||dS | ||\}}	|S )Nrj   rk   )r   rj   rk   )rj   rk   )r
   is_first_rankr   	enumerater   is_last_rankr   r   )
r>   r   r   r   r   rj   rk   ilayerrE   rA   rA   rB   rF   8  s*   
zBambaModel.forwardweightsc                 C   s  g d}t |  }t }|D ]r\}}d|v rqd|v r"|dd}d|v r,|dd}|D ].\}}}	||vr8q.|||}|drH||vrHq.t|| rNq.|| }
|
j}||
||	  n|drg||vrgqt|| rmq|| }
t|
d	t}||
| |	| q|S )
N))r   q_projr   )r   k_projr   )r   v_projr   )r9   	gate_projr   )r9   up_projr   zrotary_emb.inv_freqA_logAz.self_attn.z
.self_attnr*   z.biasweight_loader)
dictnamed_parameterssetreplaceendswithr%   r   r|   r   add)r>   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   rA   rA   rB   load_weightsY  s>   	


zBambaModel.load_weightsNN)rG   rH   rI   r   rK   r6   rp   rq   r   r   rF   r   tupler   r   rL   rA   rA   r?   rB   r     s"    &
,!r   c                
       s:  e Zd Zg dddgdZdddZedd	d
eejejf fddZ	edd	d
eee
e
f ee
e
e
f f fddZed
eeef fddZdddedef fddZdejd
ejfddZ		d%dejdB dejdedB dejdB fddZdejd
ejdB fd d!Zd"eeeejf  d
ee fd#d$Z  ZS )&BambaForCausalLM)r   r   r   r   r:   )r   r9   input_embeddingsoutput_embeddings)r   lm_headr   r   r/   c                 C   s   t |jj|jj|jjS rC   )r   mamba2_state_dtyperO   rz   rP   mamba_cache_dtypemamba_ssm_cache_dtype)clsr   rA   rA   rB   !get_mamba_state_dtype_from_config  s
   z2BambaForCausalLM.get_mamba_state_dtype_from_configc              	   C   s>   |j }|jj}|j|j }tj||j|j|j	|j
|j|jdS )a3  Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config

        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        )r8   tp_world_sizerU   rV   rW   
state_sizeconv_kernel)parallel_configrO   r   r`   r7   r   mamba2_state_shapetensor_parallel_sizerc   rd   re   r^   r_   )r   r   r   r   r8   rA   rA   rB   !get_mamba_state_shape_from_config  s   z2BambaForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rC   )r   mamba2_state_copy_func)r   rA   rA   rB   get_mamba_state_copy_func  s   z*BambaForCausalLM.get_mamba_state_copy_funcr*   r   r.   c                   s   |j j}|| _|j | _ |j}|j| _t   || _|| _t|t	|dd| _
t|j|jt	|dd| _t|j| _| j
j| _d S )Nmodel)r   r.   r   r   )rO   r   r   scheduler_configr,   r5   r6   r+   r   r(   r   r   r   r7   r   r   logits_processorr   )r>   r   r.   r+   r   r?   rA   rB   r6     s&   

zBambaForCausalLM.__init__r   c                 C   s   | j |S rC   )r   r   r   rA   rA   rB   r     s   z BambaForCausalLM.embed_input_idsNr   r   r   c                 K   s   |  ||||}|S rC   )r   )r>   r   r   r   r   rl   rj   rA   rA   rB   rF     s   zBambaForCausalLM.forwardrj   c                 C   s   |  | j|}|S rC   )r   r   )r>   rj   logitsrA   rA   rB   compute_logits  s   zBambaForCausalLM.compute_logitsr   c                 C   s   t | }||S rC   )r$   r   )r>   r   loaderrA   rA   rB   r     s   
zBambaForCausalLM.load_weightsr   )rG   rH   rI   packed_modules_mappingembedding_modulesclassmethodr   rp   rz   r   ro   r   r   r   r   rK   r6   rq   r   r   rF   r   r   r   r   rL   rA   rA   r?   rB   r     sP    



,r   )C__doc__collections.abcr   rp   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   r   vllm.distributedr	   vllm.distributed.parallel_stater
   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   -vllm.model_executor.layers.mamba.mamba_mixer2r   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   r    r!   r"   r#   utilsr$   r%   r&   r'   r(   Moduler)   rM   rr   r   r   r   rA   rA   rA   rB   <module>   sP    	%7r
~