o
    
۾iÔ                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 ddl7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZB ddlCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZN dd lOmPZP dd!lQmRZR dd"lSmTZT G d#d$ d$eZUd%eUd&eVd'eWfd(d)ZXeYd*G d+d, d,e&eZZd-ej[d.ej[d/e\d'dfd0d*Z]d-ej[d.ej[d/e\d'dfd1d2Z^ePd*e]d.ge^d3 G d4d5 d5ej_Z`G d6d7 d7ej_ZaG d8d9 d9ej_ZbG d:d; d;ejj_Zce
G d<d= d=ejj_ZdG d>d? d?ejj_e?eAeBe@ZedS )@zInference-only PLaMo2 model.    )Iterable)isliceN)nn)PretrainedConfig)support_torch_compile)
VllmConfigget_current_vllm_config)divide$get_tensor_model_parallel_world_size)get_pp_group)ForwardContextget_forward_context)PluggableLayer)
SiluAndMul)	Attention)RMSNorm)ColumnParallelLinearMergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)	MambaBase)MambaStateCopyFuncMambaStateCopyFuncCalculatorMambaStateDtypeCalculatorMambaStateShapeCalculator)causal_conv1d_fncausal_conv1d_update)selective_state_update) mamba_chunk_scan_combined_varlen)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)composed_weight_loaderdefault_weight_loadersharded_weight_loader)HasInnerStateIsHybridSupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefix)set_weight_attrs)current_platform)IntermediateTensors)direct_register_custom_op)AttentionMetadata)Mamba2AttentionMetadatac                   @   sz   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< eed	< eed
< eed< eed< eed< eed< dS )Plamo2Configplamo2
model_typehidden_sizenum_hidden_layersrms_norm_epsnum_attention_headshidden_size_per_headnum_key_value_headsmamba_d_statemamba_d_convmamba_num_heads
mamba_stepintermediate_size
vocab_sizeN)__name__
__module____qualname__r8   str__annotations__intfloat rL   rL   U/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/plamo2.pyr6   L   s   
 r6   configireturnc                 C   s@   | j dksJ | j| j d kr|| jd kS || j  | j d kS )N      )rB   r:   )rN   rO   rL   rL   rM   is_mambaa   s   rS   plamo2_mamba_mixerc                       s   e Zd Zdddededdf fddZd	d
 ZdejdejfddZ	dejdejfddZ
deejejf fddZdeeedf eedf f fddZedefddZ  ZS )Plamo2MambaMixer prefixvllm_configrX   rP   Nc             	      s  t    |jj| _|j| _|j| _|j| _t|j| _	| jj
| _
| jj| _| jj| _| jj| jj | _t | _| jj| _| jj| _td| j
d | _t| j| jd| ddd| _| jjjd| jj_t| j
| jgd d| j| ddd	| _t| j| j| jd  d| j| d
dd	| _ t| j| jd| j| ddd	| _!t"#t$j%t&| j| jt$j'd| _(t"#t$)t&| j| j| _*t"#t$)t&| j| j| _+t,| j*dt-di t.t-ddd }t,| j(d|i t,| j+dt-di t| j| j
dd| j| ddd| _/d| _0t1| j| jj2d| _3t1| j| jj2d| _4t1| j| jj2d| _5| jj6| _7t8 j9}||j:v r;t;d| | |j:|< t$<g t$<g f| _=| j7dksUJ d|| _>d S )N@      Fz.conv1d)
input_sizeoutput_sizebiasrX   return_biasrQ   rR   z.in_proj)r^   quant_configrX   r_   z
.bcdt_projz.dt_proj)dtypeweight_loaderr   c                 S   s   t |   S N)torchexprK   )xrL   rL   rM   <lambda>   s    z+Plamo2MambaMixer.__init__.<locals>.<lambda>Tz	.out_proj)r^   input_is_parallelr`   rX   r_   siluepszDuplicate layer name: zchunk_size must be set for v1)?super__init__model_config	hf_configrN   cache_configr`   boollora_configis_lora_enabledr9   r?   ssm_state_sizer@   conv_kernel_sizerA   r=   rC   r
   tp_sizehead_dim	num_headsmaxtime_step_rankr   conv1dweightdata	unsqueezer   in_projr   	bcdt_projdt_projr   	Parameterrd   emptyr	   float32AonesDdt_biasr0   r&   r$   out_proj
activationr   r;   dt_normB_normC_normmamba_chunk_size
chunk_sizer   compilation_configstatic_forward_context
ValueErrortensorkv_cacherX   )selfrY   rX   kwargsa_weight_loaderr   	__class__rL   rM   rn   r   s   







		



zPlamo2MambaMixer.__init__c                 C   s   | j r| | }n| |}tj|| j| j| jgdd\}}}| | }| | }| 	| }| 
|}|||fS Nrl   dim)rt   r   
contiguousrd   splitru   r{   r   r   r   r   )r   hidden_statesssm_parametersBC	time_stepdtrL   rL   rM   _project_ssm_parameters   s   


z(Plamo2MambaMixer._project_ssm_parametersr   outputc                 K   s   t jj||| j d S rc   )rd   opsvllmrT   rX   )r   r   r   r   rL   rL   rM   forward   s
   zPlamo2MambaMixer.forwardc           ,      K   sD  t  }|j}|d urGt|tsJ || j }t|tsJ | j|j }|d dd}|d }|j	}	|j
}
|j}|j}|j}|j}|j}|j}| |}|jddd\}}| jj| jjd| jjd}|d u r|dd dd }| ||d d < d S |j}|j}|j}|dk}|dk}|| }tj|d | ||gdd\}}tj|d | ||gdd\}}tj|	||gdd\}}tj|| | j | j! | j" g|j#|j$d} tj| ||gdd\}!}"|rz|dd}#t%|#|| jj&| j'||
|||d	}|dd}|d | }| }| (|\}$}%}&d }'|
d ur2|r2t)|
d d d d d f || d}'t*||| j | j! | j"|&| j+|$|dd|%|ddf|| j,||| j | j! | j"| j-|||||'d	d
t.df|"|d| j"|j#d}(|(||< |rt/|||| jj&| j'|d}t01 r| }| (|\}$}%}&| j+d d d df d d d d d f 2d| j"| j3j4})|&d d d d d f 2dd| j"}&| j-d d d df 2d| j"}*| j,d d d df 2d| j"}+|$5d}$|%5d}%|d| j | j! | j"}t6|||&|)|$|%|+|7|d| j"|*d	||!|d| j"d | | |d |< d S )Nr   rl   rQ   rR   r   )ra   device)r   conv_stateshas_initial_statecache_indicesmetadataquery_start_locTg        inf)r   r   zr   seq_idx
cu_seqlenscu_chunk_seqlenslast_chunk_indicesinitial_statesdt_softplusdt_limitoutstate_dtype)conv_state_indices.)r   r   r   state_batch_indicesr   )8r   attn_metadata
isinstancedictrX   r5   r   virtual_engine	transposestate_indices_tensorhas_initial_states_pprep_initial_statesr   	seq_idx_pquery_start_loc_pcu_chunk_seqlen_plast_chunk_indices_pr   chunkr|   r}   viewsizecloner   r   num_prefillsnum_decode_tokensnum_prefill_tokensrd   r   r   ry   rw   rx   ra   r   r   r^   r   r   wherer   r   r   r   rK   r   r1   is_rocmexpandrN   r?   r   r   reshape),r   r   r   r   forward_contextr   self_kv_cache
conv_state	ssm_stater   r   r   r   r   r   r   r   projected_statesgateconv_weightsr   num_decodesr   has_prefill
has_decodenum_actual_tokenshidden_states_dhidden_states_pgate_dgate_pstate_indices_tensor_dstate_indices_tensor_ppreallocated_ssm_outpreallocated_ssm_out_dpreallocated_ssm_out_prf   r   r   r   r   varlen_stater   r   r   rL   rL   rM   forward_impl   s&  








&"

	zPlamo2MambaMixer.forward_implc                 C   s6   | j d usJ | jd usJ t| j j| jj| jjS rc   )ro   rq   r   mamba2_state_dtypera   mamba_cache_dtypemamba_ssm_cache_dtyper   rL   rL   rM   get_state_dtype  s   z Plamo2MambaMixer.get_state_dtype.c              	   C   s$   t j| jt d| j| j| j| jdS )Nr   rC   tp_world_sizen_groupsry   rx   
state_sizeconv_kernel)r   mamba2_state_shaperC   r
   ry   rx   ru   rv   r   rL   rL   rM   get_state_shape  s   z Plamo2MambaMixer.get_state_shapec                 C   s   dS )Nmamba2rL   r   rL   rL   rM   
mamba_type  s   zPlamo2MambaMixer.mamba_type)rE   rF   rG   r   rH   rn   r   rd   Tensorr   r   tuplera   r   rJ   r   propertyr   __classcell__rL   rL   r   rM   rU   n   s$     g

 R&	rU   r   r   
layer_namec                 C   s"   t  }|j| }|j| |d d S )N)r   r   )r   no_compile_layersr   )r   r   r   r   r   rL   rL   rM   rT     s   
c                 C   s   d S rc   rL   )r   r   r   rL   rL   rM   plamo2_mamba_mixer_fake  s   r   )op_nameop_funcmutates_args	fake_implc                	       sN   e Zd Z		ddededB deddf fddZd	ejdejfd
dZ	  Z
S )DenseMLPNrV   rN   r`   rX   rP   c                    sl   t    |j| _|j| _t| j| jgd d| d|dd| _t | _t| j| jd| d|dd| _	d S )NrR   Fz.gate_up_proj)r^   rX   r`   r_   z
.down_proj)
rm   rn   r9   rC   r   gate_up_projr   actr   	down_proj)r   rN   r`   rX   r   rL   rM   rn     s(   

zDenseMLP.__init__r   c                 C   s   |  |}| |}| |S rc   )r  r  r  )r   r   hrL   rL   rM   r     s   


zDenseMLP.forward)NrV   )rE   rF   rG   r6   r    rH   rn   rd   r   r   r   rL   rL   r   rM   r    s    r  c                       sL   e Zd Zdddededdf fddZd	ejd
ejdejfddZ  Z	S )Plamo2AttentionMixerrV   rW   rY   rX   rP   Nc          	   	      s  t    |jj}|j}|j}|j| _t }|j| _	| j	| dks#J | j	| | _
|j| _| j|kr<| j| dks;J n	|| j dksEJ td| j| | _|j| _| j
| j | _| j| j | _| jd | _t|j| j| j	| jd|| dd| _t| j	| j |jd|| dd| _|j}t|jdrt|jjtrt||jj}t| j||jd	| _ t!|j|j"d
| _#t$j%&t$'| j
|jf| j#_(t)| j#j(dt*di t!|j|j"d
| _+t$j%&t$'| j|jf| j+_(| jdkrt)| j+j(dt*di t,| j
| j| j| j|| dd| _-d S )Nr   rQ   g      Fz	.qkv_proj)r^   r`   rX   z.o_projmax_model_len)max_positionrope_parametersrj   rb   z.attn)num_kv_headsrq   rX   ).rm   rn   ro   rp   rq   r`   r9   r
   r<   total_num_headsry   r>   total_num_kv_headsrz   r  r=   rx   q_sizekv_sizescalingr   qkv_projr   o_projmax_position_embeddingshasattrr   r  rJ   minr!   r  
rotary_embr   r;   q_normrd   r   r   r   r}   r0   r&   k_normr   attn)	r   rY   rX   r   rN   rq   r`   rw   r  r   rL   rM   rn     s   

	

zPlamo2AttentionMixer.__init__	positionsr   c                 K   s   |  |\}}|j| j| j| jgdd\}}}|j}	||	d d | jjj }| j||	}|j}
||
d d | j	jj }| j	||
}| 
|||\}}| |||}| |\}}|S r   )r  r   r  r  shaper   r  r}   forward_nativer  r  r  r  )r   r  r   r   qkv_qkvq_shapek_shapeattn_outputr   rL   rL   rM   r   p  s    zPlamo2AttentionMixer.forward
rE   rF   rG   r   rH   rn   rd   r   r   r   rL   rL   r   rM   r
    s     Qr
  c                	       sR   e Zd Z	ddedededdf fddZd	ejd
ejdejdB fddZ	  Z
S )Plamo2DecoderLayerrV   rY   	layer_idxrX   rP   Nc                    s   t    |jj}|j}t||| _| jr t|| dd| _n
t|| dd| _t	||| dd| _
t|j|jd| _t|j|jd| _t|j|jd| _t|j|jd| _d S )Nz.mixerrY   rX   z.mlp)rN   r`   rX   rj   )rm   rn   ro   rp   r`   rS   rU   mixerr
  r  mlpr   r9   r;   pre_mixer_normpost_mixer_normpre_mlp_normpost_mlp_norm)r   rY   r*  rX   r   rN   r`   r   rL   rM   rn     s$   



zPlamo2DecoderLayer.__init__r  r   residualc                 K   s   |d u r|}|  |}n|  ||\}}| jr!t|}d|i}nd|i}| jdd|i|}| jr4|}| |}| ||\}}| |}| |}||fS )Nr   r  r   rL   )	r.  rS   rd   
empty_liker,  r/  r0  r-  r1  )r   r  r   r2  r   r   mixer_kwargsrL   rL   rM   r     s,   



zPlamo2DecoderLayer.forward)rV   )rE   rF   rG   r   rJ   rH   rn   rd   r   r   r   rL   rL   r   rM   r)    s$    r)  c                       sV   e Zd Zdddededdf fddZd	ejd
ejdejdB dejfddZ  Z	S )Plamo2DecoderrV   rW   rY   rX   rP   Nc                   sZ   t    jj}dtji dtf fdd}t|j|| dd\| _	| _
| _d S )Nrt   rX   c                    s*   t | ddd }td|| d S )N.rQ   )rY   r*  rX   rL   )rJ   rsplitr)  )rX   r*  extra_kwargsrY   rL   rM   	get_layer  s   z)Plamo2Decoder.__init__.<locals>.get_layer.layersrW   )rm   rn   ro   rp   rr   rs   rH   r.   r:   start_layer	end_layerlayers)r   rY   rX   rN   r:  r   r8  rM   rn     s   
	zPlamo2Decoder.__init__r  r   r2  c                 C   s2   t | j| j| jD ]}||||d\}}q	||fS )Nr  r   r2  )r   r>  r<  r=  )r   r  r   r2  layerrL   rL   rM   r     s   zPlamo2Decoder.forwardr(  rL   rL   r   rM   r5    s     r5  c                       sz   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdB dejde	dB dejdB dejf
ddZ
  ZS )Plamo2ModelrV   rW   rY   rX   c                   s~   t    |jj}|| _|j| _|j| _t| j|j	| dd| _
tddg|j	| _t|| dd| _t|j	|jd| _d S )Nz.embed_tokensrW   r   r2  r;  r+  rj   )rm   rn   ro   rp   rN   pad_token_idpadding_idxrD   r#   r9   embed_tokensr-   make_empty_intermediate_tensorsr5  r>  r   r;   norm)r   rY   rX   rN   r   rL   rM   rn     s   

zPlamo2Model.__init__	input_idsrP   c                 C   s
   |  |S rc   )rD  r   rG  rL   rL   rM   embed_input_ids  s   
zPlamo2Model.embed_input_idsNr  intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }| j|||d\}}t  js6t||dS | ||\}}|S )Nr   r2  r?  )r   r2  )r   is_first_rankrI  r>  is_last_rankr2   rF  )r   rG  r  rJ  rK  r   r2  r!  rL   rL   rM   r      s&   

zPlamo2Model.forwardNN)rE   rF   rG   r   rH   rn   rd   r   rI  r2   r   r   rL   rL   r   rM   rA    s     rA  c                
       s,  e Zd ZdgdgdgdZdddeded	d
f fddZdejd	ejfddZ		
	
d"dejd
B dejde
d
B dejd
B fddZeddd	eejejf fddZeddd	eeeef eeeef f fddZed	eeef fddZdejd	ejd
B fddZdeeeejf  fd d!Z  ZS )#Plamo2ForCausalLMr  r  r   )r  r  r   rV   rW   rY   rX   rP   Nc                   s   t    |jj}|j}|| _|| _|j| _|| _| jj| j_t	|t
|dd| _| jj| _t| j| jj| dd| _| jjrI| j| jj| _t|j| jj| _| jj| _d S )Nmodelr+  z.lm_headrW   )rm   rn   ro   rp   scheduler_configrN   rY   r=   rx   rA  r/   rP  rD   r"   r9   lm_headtie_word_embeddingstie_weightsrD  r   logits_processorrE  )r   rY   rX   rN   rQ  r   rL   rM   rn   (  s0   



zPlamo2ForCausalLM.__init__rG  c                 C   s   | j |S rc   )rP  rI  rH  rL   rL   rM   rI  J  s   z!Plamo2ForCausalLM.embed_input_idsr  rJ  rK  c                 K   s   |  ||||}|S rc   )rP  )r   rG  r  rJ  rK  r   r   rL   rL   rM   r   M  s   zPlamo2ForCausalLM.forwardr   c                 C   s   t |jj|jj|jjS rc   )r   r   ro   ra   rq   r   r   )clsrY   rL   rL   rM   !get_mamba_state_dtype_from_configZ  s
   z3Plamo2ForCausalLM.get_mamba_state_dtype_from_configc              	   C   s<   |j }|jj}|j|j }tj||jd|j|j|j|j	dS )a1  Calculate shapes for Mamba's convolutional and state caches.
        Args:
            vllm_config: vLLM config
        Returns:
            Tuple containing:
            - conv_state_shape: Shape for convolutional state cache
            - temporal_state_shape: Shape for state space model cache
        r   r   )
parallel_configro   rp   rA   r=   r   r   tensor_parallel_sizer?   r@   )rV  rY   rX  rp   rC   rL   rL   rM   !get_mamba_state_shape_from_confige  s   z3Plamo2ForCausalLM.get_mamba_state_shape_from_configc                 C   s   t  S rc   )r   mamba2_state_copy_func)rV  rL   rL   rM   get_mamba_state_copy_func  s   z+Plamo2ForCausalLM.get_mamba_state_copy_funcr   c                 C   s   |  | j|}|S rc   )rU  rR  )r   r   logitsrL   rL   rM   compute_logits  s   z Plamo2ForCausalLM.compute_logitsweightsc                    s  t |  }|D ]\ } dkr| jjrd|vsJ qt fddtjD r(qdddddd	d
}| D ]\}}| v rC || q5d v sTd v sTd v sTd v rd v r^|	dd}|
|jd | jjd}|jddd\}}|
|jd d}|
|jd d}tj||gdd}d v r|	dd}d v r|d7 }n#d v r|d7 }nd v r|d7 }nd v r|d7 }nd v r|d7 }t | rq|  }	t|	dt}
|
|	| qd S )Nzlm_head.weightc                 3   s    | ]}| v V  qd S rc   rL   ).0substrnamerL   rM   	<genexpr>  s
    
z1Plamo2ForCausalLM.load_weights.<locals>.<genexpr>z.Az.B_norm.weightz.C_norm.weightz.dt_norm.weightz.q_norm.weightz.k_norm.weight)z.A_logz.B_norm_weightz.C_norm_weightz.dt_norm_weightz	.q_weightz	.k_weightz.mixer.in_proj.weightzmixer.in_proj.qweightzmixer.in_proj.scaleszmixer.in_proj.qzeroszmixer.in_proj.weightr   rQ   rl   rR   r   z.pre_mixer_normg      ?z.post_mixer_normg?z.pre_mlp_normz.post_mlp_normgWfѷ?zmodel.norm.weightrb   )r   named_parametersrN   rS  anyr+   ROTARY_EMBEDS_UNUSED_WEIGHTSitemsreplacer   r   r  rA   r   rd   catr,   getattrr%   )r   r_  params_dictloaded_weightreplacementsoldnewgate_weighthidden_states_weightparamrb   rL   rb  rM   load_weights  sj   






zPlamo2ForCausalLM.load_weightsrN  )rE   rF   rG   packed_modules_mappingr   rH   rn   rd   r   rI  r2   r   classmethodr   ra   rW  rJ   rZ  r   r\  r^  r   rt  r   rL   rL   r   rM   rO    sL     "


$rO  )f__doc__collections.abcr   	itertoolsr   rd   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   vllm.distributed.parallel_stater   vllm.forward_contextr   r   vllm.model_executor.custom_opr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   )vllm.model_executor.layers.mamba.abstractr   ,vllm.model_executor.layers.mamba.mamba_utilsr   r   r   r   2vllm.model_executor.layers.mamba.ops.causal_conv1dr   r   .vllm.model_executor.layers.mamba.ops.mamba_ssmr   1vllm.model_executor.layers.mamba.ops.ssd_combinedr   'vllm.model_executor.layers.quantizationr    +vllm.model_executor.layers.rotary_embeddingr!   3vllm.model_executor.layers.vocab_parallel_embeddingr"   r#   -vllm.model_executor.model_loader.weight_utilsr$   r%   r&   %vllm.model_executor.models.interfacesr'   r(   r)   r*    vllm.model_executor.models.utilsr+   r,   r-   r.   r/   vllm.model_executor.utilsr0   vllm.platformsr1   vllm.sequencer2   vllm.utils.torch_utilsr3   vllm.v1.attention.backendr4   &vllm.v1.attention.backends.mamba2_attnr5   r6   rJ   rr   rS   registerrU   r   rH   rT   r   Moduler  r
  r)  r5  rA  rO  rL   rL   rL   rM   <module>   s     u


"h?"
7