o
    پil                     @   sn  d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	m
Z
 d dlmZmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZm Z  d dl!m"Z"m#Z#m$Z$m%Z% e# r~d dl&m'Z'm(Z( d dl)m'Z* d dl)m(Z+ ne$ rd dl,m-Z' d dl,m.Z( eej/ej/gdf Z0deee1e1e2f  de1de1de0fddZ3G dd dejj4Z5dS )    )CallableListOptionalTupleN)Mamba2CacheParamsextra_groups_for_head_shards)divideget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)Mamba2Metadata)Mixer2RMSNormGated)mamba_chunk_scan_combinedselective_state_update)ColumnParallelLinearMergedColumnParallelLinearRowParallelLinear)QuantizationConfig)	MambaPool)composed_weight_loadersharded_weight_loader)is_cpuis_cudais_npuset_weight_attrs)causal_conv1d_fncausal_conv1d_update)r   )r   )causal_conv1d_fn_npu)causal_conv1d_update_npu
shard_spectp_sizetp_rankreturnc                    s&   dt jdt jddf fdd}|S )zCreate a weight loader for mamba v2. This ensures that the projections
    are correctly sharded so that they can be split into x, B, C. It also
    ensures the the all the groups corresponding to a head shard is placed
    together with it.
    paramloaded_weightr!   Nc                    s  d\}}t  r1d}g }g } D ]\}}}|| }|| q|D ]}|t|| |d  q  D ]\}}	}
| }|
r@dn}|| }|| }t|||	 | }t  r|d dkrdd l}||}tj||dd\}}}t	|d |d  |d|d
|j}t	|d |d  |d|d
|j}tj||fdd}tj||fdd}tj||fdd}tj||fdd}tj||fdd}||||  | j||| df< ||7 }|||	 7 }q3d S )N)r   r   r   dim      .)r   appendintsizemincopydeepcopytorchsplitzerostodtypecatdata)r"   r#   boundaryloaded_boundaryfull_dim_sumfull_dim_listweight_full_dim_listfull_dim_extraduplicate_groups
shard_sizerankloaded_skiploaded_start_idxtaker,   loaded_weight_qkvpad_qkpad_vloaded_weight_qkr   r    r    [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/mamba/mamba.pyloaderC   sl   	

z.mamba_v2_sharded_weight_loader.<locals>.loader)r.   Tensor)r   r   r    rM   rK   rJ   rL   mamba_v2_sharded_weight_loader8   s   "UrO   c                       s   e Zd ZdZ						d!ded	ed
edededededede	e
 def fddZddddejdejdejdede	ej defddZedefdd Z  ZS )"MambaMixer2u  
    Compute ∆, A, B, C, and D the state space parameters and compute
    the `contextualized_states`. A, D are input independent
    (see Mamba paper [1] Section 3.5.2 "Interpretation of A"
    for why A isn't selective) ∆, B, C are input-dependent
    (this is a key difference between Mamba and the linear time
    invariant S4, and is why Mamba is called
    **selective** state spaces)
    r&   h㈵>siluTN cache_paramshidden_sizeuse_conv_biasuse_biasn_groupsrms_norm_eps
activationuse_rms_normquant_configprefixc              	      s\  t    t | _t | _|jj | _}|jj| _|| j dks$J d|| j dks3|dks3J d|| j dksG| jdksG|	d u sGJ d|jj	| _	|| _
|jj}|jj | _}|| _|| j dkrot|| j}|| | _| j| j	 | _|jj| _|| j dkrt||| j| jg|d |
 dd| _t|||| j| j| jg||	|
 dd| _nt|| j|d |
 dd	| _t||| j | j ||	|
 dd	| _| j| j| | j	 |dkf}|dd
f}| jdd
f}t| jjd t| jjdt|||g| j| ji t| jjd t| jjdt|||g| j| ji |	d u r6t| jjd t| jjdt|||||g| j| ji | jjjd| jj_ttjt || jtj!d| _"tt#|| j | _$tt#|| j | _%|| _&t| j$dt'di t(t'ddd }t| j"d|i t| j%dt'di t)|||d|	|
 dd| _*t+||| j&|d| _,|
| _-d S )Nr   z1Tensor parallel world size must divide num heads.r&   zWIf tensor parallel world size does not divide num_groups, then num_groups must equal 1.zoTensor parallel currently supported for quantized models only if tensor parallel world size divides num groups.z.conv1d)
input_sizeoutput_sizesbiasr\   r]   z.in_proj)r^   output_sizer`   r\   r]   Fweight_loaderr2   c                 S   s   t |   S )N)r.   expfloat)xrK   rK   rL   <lambda>r  s    z&MambaMixer2.__init__.<locals>.<lambda>Tz	.out_proj)r`   input_is_parallelr\   r]   )eps).super__init__r
   r   r	   r    shape	num_headshead_dimssm_state_sizerZ   conv_kernelintermediate_sizerX   r   groups_ssm_state_sizeconv_dimr   conv1din_projr   delattrr`   r   rO   weightr4   	unsqueezenn	Parameterr.   emptyr   float32AonesDdt_biasr[   r   r   r   out_projr   normr]   )selfrT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   rm   conv_kernel_sizerq   groupsgroup_shard_settingsintermediate_settingshead_settingsa_weight_loader	__class__rK   rL   rk      s  

"






	

zMambaMixer2.__init__F)
mup_vectoruse_triton_causal_convhidden_statesoutputlayer_cachemetadatar   r   c          :         s  |j }|jd }|j}	|j}
 |\}}|d ur|| }tj| j j  j	 j  j
 j gdd\}}} jj jjd jjd} fdd}|j}|j}|jr]||j n|}|j}|dk}|dk}|| }||jd kswJ tj|||gdd\}}tj|||gdd\}}tj|||gdd\}}|r|
d |d  nd }tj|jd  j
 j  j g|j|jd} tj| ||gdd\}!}"|rw|j}#|#d usJ |#j}$|#j}%|}&|dd}'|stnt}(|(|'| jj j ||$|&||#j!d		ddd | }||\})}*}+d },|$d ur$|%r$t"|$d d d d d f |	| d},t#|)d| j
 j  j|$d j%|*d| j& j d|+d| j& j df|#j' j(d  j)|#j*|#j+|#j,||,d
dd
dt-df|!d|d j|	jd}-|-|	|< |r|j}.|.r|sJ dt.|t/j0sJ d|j}/tj1|tj2|jd _3|||/ddd}0t4|0|| jj j |d | |j5d  j3|j6|j7|j8d}1|1dd|d}n|st9nt4}2|2||| jj j |d}||\}3}4}5 j& j }6 j%d d d df d d d d d f :d j j;j<tj=d}7|d d d d d f :dd j} j)d d d df :d j}8 j(d d d df :d j}9|4d|6|4jd |6 }4|5d|6|5jd |6 }5|3d j
 j  j}3|.rt>|	|3||/ j
 j  j|||/ j
 j  j|7|4||/|6d|5||/|6d|9d |8d
|d | |"||/ j
 j  jd
|j?|/|j8 j3d nt>|	|3||7|4|5|9d |8d
||"|d jd  @| |d | } A|\|d |< }d S )Nr   r$   r'   c                    s.   t j|  j j  j j  j j gddS )Nr   r$   )r.   r/   rq   r   rr   )hidden_states_B_Cr   rK   rL   rg     s    


z%MambaMixer2.forward.<locals>.<lambda>r&   )r2   device)rZ   conv_stateshas_initial_statecache_indicesquery_start_locseq_lens_cpuTFg        inf)
chunk_sizer   zr   seq_idxchunk_indiceschunk_offsets
cu_seqlensinitial_statesreturn_varlen_statesreturn_final_statesdt_softplusdt_limitoutstate_dtypezXSpeculative decoding requires use_triton_causal_conv=True for intermediate state supportz=layer_cache must be SpeculativeState for speculative decoding)conv_state_indicesintermediate_conv_windowintermediate_state_indicesretrieve_next_tokenretrieve_next_siblingretrieve_parent_token)r   .rc   )
r   r   r   state_batch_indicesr   disable_state_updateintermediate_states_buffercache_stepsr   r   )r   r   r   r   r   )Bmamba_cache_indicesconvtemporalr   ru   r.   r/   rq   r   rs   rm   rt   rw   viewr*   num_prefillsnum_decodesis_target_verifydraft_token_numnum_prefill_tokensrl   r{   rn   r2   r   mixed_metadatahas_initial_statesprep_initial_states	transposer   causal_conv1d_fn_tritonr`   rZ   extend_seq_lens_cpuwherer   rx   r}   rX   r   r   r   r   r   r   re   
isinstancer   SpeculativeStatearangeint32r   causal_conv1d_update_tritonr   r   r   r   r   expandro   r1   r|   r   intermediate_ssmr   r   ):r   r   r   r   r   r   r   state_indices_tensor
conv_state	ssm_stater   projected_statesr;   gater   dtconv_weightssplit_hidden_states_B_C_fnr   r   num_decode_tokensr   has_prefill
has_decodenum_actual_tokenshidden_states_B_C_phidden_states_B_C_ddt_pdt_dstate_indices_tensor_pstate_indices_tensor_dquery_start_loc_ppreallocated_ssm_outpreallocated_ssm_out_ppreallocated_ssm_out_dr   has_initial_states_pr   r   rf   ccfnhidden_states_pB_pC_pr   varlen_stater   r   hidden_states_B_C_d_reshapedhidden_states_B_C_d_processedccuhidden_states_dB_dC_drX   A_dr   D_drK   r   rL   forward  s  



	









	$"



#zMambaMixer2.forwardr!   c                 C   s   dS )Nmamba2rK   r   rK   rK   rL   
mamba_type  s   zMambaMixer2.mamba_type)r&   rQ   rR   TNrS   )__name__
__module____qualname____doc__r   r)   boolre   strr   r   rk   r.   rN   r   Stater   r   propertyr   __classcell__rK   rK   r   rL   rP      sb    	
 h
  3rP   )6typingr   r   r   r   r.   torch.nnry   sglang.srt.configs.mamba_utilsr   r   sglang.srt.distributedr   r	   r
   1sglang.srt.layers.attention.mamba.mamba2_metadatar   7sglang.srt.layers.attention.mamba.mixer2_rms_norm_gatedr   %sglang.srt.layers.attention.mamba.opsr   r   sglang.srt.layers.linearr   r   r   *sglang.srt.layers.quantization.base_configr    sglang.srt.mem_cache.memory_poolr   $sglang.srt.model_loader.weight_utilsr   r   sglang.srt.utilsr   r   r   r   /sglang.srt.layers.attention.mamba.causal_conv1dr   r   6sglang.srt.layers.attention.mamba.causal_conv1d_tritonr   r   "sgl_kernel_npu.mamba.causal_conv1dr   r   rN   LoaderFunctionr)   re   rO   ModulerP   rK   rK   rK   rL   <module>   s>    
c