o
    پim                     @  s  d dl mZ 	 d dlmZ d dlmZ d dlmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZm Z  eryd dl!m"Z" d dl#m$Z$ d dl%m&Z& d dlmZ ej'( rd dl)Z)ej*j+e)j,d dej-j._/e rd dl0m1Z1m2Z2 eG dd dZ3eG dd dZ4da5G dd dZ6G dd deZ7G dd dZ8G dd  d Z9G d!d" d"Z:d8d6d7Z;dS )9    )annotations)	dataclass)partial)TYPE_CHECKINGCallableOptionalUnionN)is_in_piecewise_cuda_graph)envs)AttentionBackend)#create_flashinfer_kv_indices_triton)get_attention_tp_size)ForwardBatchForwardMode)get_global_server_args)	SpecInput)is_flashinfer_availableis_sm100_supportednext_power_of_2)FlashInferMlaAttnBackend)RadixAttention)ModelRunner)dynamoT)BatchMLAPagedAttentionWrapper$BatchPrefillWithRaggedKVCacheWrapperc                   @  s   e Zd ZU ded< dS )DecodeMetadatar   decode_wrapperN__name__
__module____qualname____annotations__ r"   r"   f/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/flashinfer_mla_backend.pyr   7   s   
 r   c                   @  s   e Zd ZU ded< ded< dS )PrefillMetadatar   prefill_wrapperbool
use_raggedNr   r"   r"   r"   r#   r$   <   s   
 r$   c                   @  s8   e Zd ZdddZdd	d
Z	ddddZdddZdS ) FlashInferMhaChunkKVRunnermodel_runnerr   attn_backendr   c                 C  sp   |j jt  | _|j j| _|j j| _|j j| _|j| _|j| _	|j
| _
|j| _|j| _|j| _g | _|j| _d S N)model_confignum_attention_headsr   num_local_headsqk_nope_head_dimqk_rope_head_dim
v_head_dimdtype	data_typeq_data_type	qo_indptr	kv_indptrworkspace_bufferfmha_backendchunk_ragged_wrappersprefill_wrapper_raggedragged_wrapperselfr)   r*   r"   r"   r#   __init__G   s   


z#FlashInferMhaChunkKVRunner.__init__num_prefix_chunksintc                 C  sB   |t | jkrt| jd| jd}| j| |t | jksd S d S )NNHDbackend)lenr9   r   r7   r8   append)r=   r?   r;   r"   r"   r#   update_prefix_chunks]   s   
z/FlashInferMhaChunkKVRunner.update_prefix_chunksFforward_batchr   disable_flashinfer_raggedr&   c                 C  s2  |j d usJ |j }| | |j}|j}t|}| j}tj|| dd|d|d < |d |d  }t|j D ]6}|j	d usAJ |j
d usHJ |jd usOJ |j
| }	| j| }
|
j||	| j| j| j| j | j| jdd q8|s|jsv|n| jd |d  }	| jj||	| j| j| j| j | j| jdd d S d S )Nr   dim   Fr5   r6   num_qo_headsnum_kv_headshead_dim_qkhead_dim_vor4   causalT)r?   rF   extend_prefix_lensseq_lensrD   r5   torchcumsumrangeprefix_chunk_idxprefix_chunk_cu_seq_lensprefix_chunk_max_seq_lensr9   begin_forwardr.   r/   r0   r1   r4   mha_one_shotr6   r;   )r=   rG   rH   r?   prefix_lensrS   bsr5   	chunk_idxr6   wrapperr"   r"   r#   update_wrapperd   sR   





z)FlashInferMhaChunkKVRunner.update_wrapperqtorch.Tensorkvlayerr   c                 C  s   |j }|jr?|j}|dksJ | j| }|j|d|j|j|d|j|j	|j
|d|j|j	|j
d|j|d}	|	S |jrF| jjn| jj}
|
|d|j|j|d|j|j	|j
|d|j|j	|j
d|j|d}	|	S )Nr   FrQ   sm_scalelogits_soft_capT)	logit_capattn_attend_prefix_cacherW   r9   forward_return_lseviewtp_q_head_numhead_dimtp_k_head_numtor2   tp_v_head_numr1   scalingmha_return_lser;   forward)r=   ra   rc   rd   re   rG   ri   r^   r_   oru   r"   r"   r#   ru      s6   

z"FlashInferMhaChunkKVRunner.forwardN)r)   r   r*   r   )r?   r@   FrG   r   rH   r&   )
ra   rb   rc   rb   rd   rb   re   r   rG   r   )r   r   r    r>   rF   r`   ru   r"   r"   r"   r#   r(   F   s    


5r(   c                      s   e Zd ZdZ			d9d: fddZd;ddZ	d<d=ddZd>d!d"Zd?d%d&Zd'd( Z		d@dAd*d+Z
	,		dBdCd5d6Z	,		dBdCd7d8Z  ZS )DFlashInferMLAAttnBackendzFlashinfer attention kernels.FNr)   r   skip_prefillr&   kv_indptr_bufOptional[torch.Tensor]q_indptr_decode_bufc                   s  t    |jj| _|j| _|| _| o#t jdko#t j	 o#t j
 | _|j| _td u r:tjtj tj|jdat| _|jj}|d u rTtj|d ftj|jd| _n|| _| jshtj|d ftj|jd| _|d u r{tjd|d tj|jd| _n|| _t rd| _nd| _t| jd| jd| _ | jst!| jdd| _"t!| jdd| _#t!| jdd| _$|st%|| | _&| jrt'|| | _(t)|| | _*d | _+i | _,i | _-d S )	Ndecoder2   devicerK   r   cutlassautorA   rB   ).superr>   r,   context_lenmax_context_lenr   rz   r   disaggregation_modedisable_chunked_prefix_cacheflashinfer_mla_disable_raggedenable_chunk_kv	page_sizeglobal_workspace_bufferrT   emptyr
    SGLANG_FLASHINFER_WORKSPACE_SIZEgetuint8r7   req_to_token_poolsizezerosint32r6   r5   arangeq_indptr_decoder   r8   r   r:   r   prefill_wrapper_pagedprefill_wrapper_verifyr   "FlashInferMLAIndicesUpdaterPrefillindices_updater_prefillr(   mha_chunk_kv_cache!FlashInferMLAIndicesUpdaterDecodeindices_updater_decodeforward_metadatadecode_cuda_graph_metadataprefill_cuda_graph_metadata)r=   r)   rz   r{   r}   max_bs	__class__r"   r#   r>      s   






z!FlashInferMLAAttnBackend.__init__rG   r   c              	   C  s  |j  r| jj|j|j|j| jdd t| j| _	d S |j 
 r<| jj|j|j|jd | jd|jd t| jd| _	d S |j  r\| jj|j|j|jd | jd|jd t| jd| _	d S |j}t|j }t j oo|oot  }| jj|j|j|j|| j|d t| j|| _	d S )NF)r   init_metadata_replayr\   r   r'   	spec_info)r   r'   )forward_modeis_decode_or_idler   updatereq_pool_indicesrS   seq_lens_sumr   r   r   is_draft_extendr   r   r   r$   is_target_verifyr   rR   anyextend_prefix_lens_cpur   r   r	   )r=   rG   r\   extend_no_prefixr'   r"   r"   r#   init_forward_metadata   sb   

	
	

z.FlashInferMLAAttnBackend.init_forward_metadatar   r@   max_num_tokenskv_indices_bufc                 C  s   |d u rt j|| j ft jdd}n|}|| _| j | _| j | _	t j
|ft j| jd| _| jd| _| j	d| _| j| j| jd| _d S )Ncudar   cpu)qo_indptr_cpukv_indptr_cpu
kv_indices)rT   r   r   r   cuda_graph_kv_indicesr   clonecuda_graph_qo_indptrr6   cuda_graph_kv_indptronesr   cuda_graph_kv_lensrq   cuda_graph_qo_indptr_cpucuda_graph_kv_indptr_cpufast_decode_kwargs)r=   r   r   r   r   r"   r"   r#   init_cuda_graph_stateV  s&   
z.FlashInferMLAAttnBackend.init_cuda_graph_stater]   
num_tokensr   rb   rS   encoder_lensr   r   r   Optional[SpecInput]c              	   C  s  |  rHt| jd| jd |d  | jd |d  | j| jd | dd}|  }	| j	j
|||	|d|d || j|< t|| _tt||_d S | rt| jd| jd |d  | jd |d  | j| jd | dd}
|  }	| jj
|||	d |
d|d |
| j|< t|
d| _d S | rt| jd| jd |d  | jd |d  | j| jd | dd}|  }	| jj
|||	d |d|d || j|< t|d| _d S td|)	NTrK   r   )use_cuda_graphr5   r6   r   
kv_len_arrrC   Fr   r   r   r   zInvalid mode: forward_mode=)r   r   r7   r   r   r   r   sumitemr   r   r   r   r   r   fast_mla_decode_planplanr   r   r   r$   r   
ValueError)r=   r]   r   r   rS   r   r   r   r   r   verify_wrapperdraft_extend_wrapperr"   r"   r#   (init_forward_metadata_capture_cuda_graphu  s   



	
		
	zAFlashInferMLAAttnBackend.init_forward_metadata_capture_cuda_graphr   seq_lens_cpuc	           
   	   C  s.  |  rT|d us
J |d | }	tj|	dd| jd|d < | j| jd |d  | jd |d  |	d | jj|d | |d | |f| j| d|d| j d S |	 rr| j
j|d | |d | |d | j| d|d d S | r| j
j|d | |d | |d | j| d|d d S td	|)
Nr   rI   rK   )r   r   kv_len_arr_cpuTr   Fr   z#Invalid forward mode: forward_mode=)r   rT   rU   r   r   r   r   r   r   r   r   r   r   r   )
r=   r]   r   rS   r   r   r   r   r   r   r"   r"   r#   'init_forward_metadata_replay_cuda_graph  sX   


	


	



z@FlashInferMLAAttnBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS NrK   r"   r=   r"   r"   r#   !get_cuda_graph_seq_len_fill_value  s   z:FlashInferMLAAttnBackend.get_cuda_graph_seq_len_fill_valuerH   c                 C  s   | j || dS )z%Init the metadata for a forward pass.N)r   r`   )r=   rG   rH   r"   r"   r#   init_mha_chunk_metadata  s   z0FlashInferMLAAttnBackend.init_mha_chunk_metadataTra   rc   rd   re   r   save_kv_cacheq_ropek_ropec	              	   C  s>  |j d ur%t|jr%| jsJ |d u sJ |d u sJ | j|||||S |j}	|j}
| jj	}|rT|d urT|d us;J |rT|d urK|j
||	|| n	|j
||	|| |d urm|d|j|j}|d|j|j|j }| jjr|d ur~tj||gdd}|d|j|j}|d urtj||gdd}| jj||d|j|j|j|d|j|j|jd|j|
d}n]|j
|j|j}|d u r|d|j|j}|d d d d d |jf |d d d d |jd f }}||j}|j|||d d d d d |jf |d d d d |jd f |d}|d|j|j S )Nrf   rI   Trg   out)rk   r   r   r   r   ru   out_cache_locrj   r   r%   token_to_kv_poolset_mla_kv_bufferset_kv_bufferrm   rn   r1   ro   r'   rT   catr:   rp   rq   r2   rs   get_key_bufferlayer_id	new_emptyshaperun)r=   ra   rc   rd   re   rG   r   r   r   	cache_locri   r   qallrv   k_bufr"   r"   r#   forward_extend  sl   

z'FlashInferMLAAttnBackend.forward_extendc	              	   C  sL  | j j}	|j}
|d ur*|d usJ |r*|d ur!|j||
|| n	|j||
|| |d urD|d|j|j}|d|j|j	|j }n%|d|j|j	}|d d d d d |jf }|d d d d |jd f }|j
|j|j}||j}|	j|||d d d d d |jf |d d d d |jd f |d}|d|j|j S )Nrf   r   )r   r   r   r   r   r   rm   rn   r1   ro   r   r   rq   r2   r   r   r   )r=   ra   rc   rd   re   rG   r   r   r   r   r   q_nope
reshaped_qk_bufferrv   r"   r"   r#   forward_decodeS  sL   z'FlashInferMLAAttnBackend.forward_decode)FNN)r)   r   rz   r&   r{   r|   r}   r|   rG   r   r+   )r   r@   r   r@   r   r|   )r]   r@   r   r@   r   rb   rS   rb   r   r|   r   r   r   r   )r]   r@   r   rb   rS   rb   r   r@   r   r|   r   r   r   r   r   r|   rw   rx   )TNN)ra   rb   rc   rb   rd   rb   re   r   rG   r   r   r&   r   r|   r   r|   )r   r   r    __doc__r>   r   r   r   r   r   r   r   r   __classcell__r"   r"   r   r#   ry      s,    
^:

P9Sry   c                   @  s6   e Zd ZdddZ		ddddZ		dd ddZdS )!r   r)   r   r*   r   c                 C  sd   |j jt  | _|j j| _|j j| _|j j| _|j j| _|j| _	|| _
|j| _|jj| _|j| _d S r+   )r,   r-   r   r.   kv_lora_rankr/   r0   rs   r2   r3   r*   r6   r   req_to_tokenr   q_indptrr<   r"   r"   r#   r>     s   




z*FlashInferMLAIndicesUpdaterDecode.__init__FNr   rb   rS   r   r@   r   r   r   r&   r   r   c              	   K  s2   |p| j }| j||||| j| j||fi | d S r+   )r   call_begin_forwardr   r6   )r=   r   rS   r   r   r   r   r   r"   r"   r#   r     s   

	
z(FlashInferMLAIndicesUpdaterDecode.updater_   paged_kernel_lenspaged_kernel_lens_sumr   r6   c	                 K  s  t |}
|d |
d  }|tj}| j}|d u rPtj|dd|d|
d < |d |
d  }|s9tj|tjddn|	d }t|
f | j|||d || jj	d  n|j
|j}}|sp|||||| j| j| jdd|| j| j d S ||	d |	d	 ||	d
 | j| j| jdd|| j| j d S )NrK   r   rI   r   r   r   Fr   r   r   )rD   rq   rT   r   rs   rU   r   r   r   r   r6   r   r   r.   r   r0   r3   )r=   r_   r   r   r   r   r6   r   r   r   r]   kv_lensrh   r   r"   r"   r#   r     sd   

z4FlashInferMLAIndicesUpdaterDecode.call_begin_forwardr)   r   r*   r   )FN)r   rb   rS   rb   r   r@   r   r   r   r&   r   r   )r_   r   r   rb   r   rb   r   r@   r   rb   r6   rb   r   r&   r   r   r   r   r    r>   r   r   r"   r"   r"   r#   r     s    
r   c                   @  s2   e Zd Zd ddZ	d!d"ddZ	d!d#ddZdS )$r   r)   r   r*   r   c                 C  s~   |j jt  | _|j j| _|j j| _|j j| _|j j| _|j j| _|j	| _
|j	| _|| _|j| _|j| _|jj| _|j| _d S r+   )r,   r-   r   r.   r   r/   r0   r1   rs   r2   r3   r4   r*   r6   r5   r   r   r:   r<   r"   r"   r#   r>     s   





z+FlashInferMLAIndicesUpdaterPrefill.__init__Nr   torch.TnesorrS   rb   r   r@   r\   r   r   r'   r&   r   r   c           
      C  sF   |r|}|   }	n|}|}	| | j||||	||| j| j|| d S r+   )r   r   r   r:   r6   r5   )
r=   r   rS   r   r\   r   r'   r   r   r   r"   r"   r#   r     s$   
z)FlashInferMLAIndicesUpdaterPrefill.updatewrapper_raggedr   wrapper_pagedr   r   r6   r5   c                 C  s`  t |}| j}|d u r`t |t |ksJ tj|dd|d|d < |d |d  }tj|tj|jd}t|f | j|||d || jj	d  tj|| dd|	d|d < |	d |d  }	d }nt
|tsgJ ||||| j\}}}	}|
r|j|	|	| j| j| j| j | j| jdd d S |dd  |d d  }||	|||| j| j| jdd|| j| j d S )Nr   rI   rK   r   TrL   rf   )rD   rs   rT   rU   r   r   r   r   r   r   
isinstancer   generate_attn_arg_prefillrZ   r.   r/   r0   r1   r4   r   r   r3   )r=   r   r   r   r   r   rS   r\   r6   r5   r'   r   r]   rh   r   custom_maskr   r"   r"   r#   r   1  sr   
	
	

z5FlashInferMLAIndicesUpdaterPrefill.call_begin_forwardr   r+   )r   r   rS   rb   r   r@   r\   rb   r   r   r'   r&   r   r   )r   r   r   r   r   rb   r   rb   r   r@   rS   rb   r\   rb   r6   rb   r5   rb   r'   r&   r   r   r   r"   r"   r"   r#   r     s    
+r   c                   @  sL   e Zd ZdZdddZdddZdddZd ddZdddZd!ddZ	dS )""FlashInferMLAMultiStepDraftBackendzs
    Wrap multiple flashinfer mla attention backends as one for multiple consecutive
    draft decoding steps.
    r)   r   topkr@   speculative_num_stepsc              	   C  s   ddl m} |dkrtd|| _|| _|| _|jj| j }tj| j|d ftj	|j
d| _tjd|d tj	|j
d| _g | _t| jd D ]}| jt|d| j| | jd qF| jd j| _|jjjd | _|jj| _d S )Nr   ) generate_draft_decode_kv_indicesrK   zFCurrently Flashinfer MLA only supports topk=1 for speculative decodingr   T)rz   r{   r}   )!sglang.srt.speculative.spec_utilsr  r   r  r  r   r   rT   r   r   r   r6   r   r   attn_backendsrV   rE   ry   r   r   r   pool_lenserver_argsr   )r=   r)   r  r  r  r   ir"   r"   r#   r>     s@   	z+FlashInferMLAMultiStepDraftBackend.__init__rG   r   kv_indices_bufferrb   call_fnr   c                 C  s   |j }| j| }|j}| j| j|| jf |j|jj|j|| j	|j
| j|jd | j	jd t|t| jt|| j |jd us@J |j sGJ t| jd D ]'}| j	|d |d f |j_	|| d || j ||d    |j_||| qNd S r   )
batch_sizer  r   r  r  r   r   r   rS   r6   	positionsr  r   r   r   r   is_draft_inputrV   r   )r=   rG   r	  r
  num_seqsr]   r   r  r"   r"   r#   common_template  s:   

z2FlashInferMLAMultiStepDraftBackend.common_templatec                   sD   t j j|j j  j ft jdd} fdd} ||| d S )Nr   r   c                   s4   |j j |j _|j j |j _ j|  | d S r+   )r   r6   r   r   r  r   r  rG   r   r"   r#   r
    s
   

zIFlashInferMLAMultiStepDraftBackend.init_forward_metadata.<locals>.call_fn)rT   r   r  r  r  r   r   r  )r=   rG   r   r
  r"   r   r#   r     s   		z8FlashInferMLAMultiStepDraftBackend.init_forward_metadatar   r   c                 C  sT   t j| j|| j ft jdd| _t| jd D ]}| j| j||| j| d qd S )Nr   r   rK   )r   )	rT   r   r  r   r   r   rV   r  r   )r=   r   r   r  r"   r"   r#   r     s   
z8FlashInferMLAMultiStepDraftBackend.init_cuda_graph_statec                   s     fdd}  | j| d S )Nc              	     s4    j |  j|j|j j |j|jd tj|jd d S )N)r   r   r   )	r  r   r  r  r   rS   r   DECODEr   r  r   r"   r#   r
    s   


z\FlashInferMLAMultiStepDraftBackend.init_forward_metadata_capture_cuda_graph.<locals>.call_fnr  r   )r=   rG   r
  r"   r   r#   r     s   zKFlashInferMLAMultiStepDraftBackend.init_forward_metadata_capture_cuda_graphr]   c                   s"    fdd} |j| d S )Nc              
     s.   j |  j |j|jdd tj|j|jd d S )Nrf   )r   r   r   r   r   )r  r   r   rS   r   r  r   r   r  r]   r=   r"   r#   r
    s   

z[FlashInferMLAMultiStepDraftBackend.init_forward_metadata_replay_cuda_graph.<locals>.call_fnr  )r=   rG   r]   r
  r"   r  r#   r     s   zJFlashInferMLAMultiStepDraftBackend.init_forward_metadata_replay_cuda_graphN)r)   r   r  r@   r  r@   )rG   r   r	  rb   r
  r   r   )r   r@   r   r@   )rG   r   r]   r@   )
r   r   r    r   r>   r  r   r   r   r   r"   r"   r"   r#   r     s    

.
&

r   r   rb   r   r   r   	num_headsr@   head_dim_ckvhead_dim_kper   rQ   r&   rh   floatr4   torch.dtypekv_data_typereturnNonec                 C  sb   |	| _ || _|
| _z| j| j| j| j||||||		 W dS  ty0 } zt	d| d}~ww )zA faster version of BatchMLAPagedAttentionWrapper::plan,
    for skipping the stream synchronization in original plan function during
    cuda graph replaying.
    zError in alternate MLA plan: N)
_causal
_page_size	_sm_scale_cached_moduler   _float_workspace_buffer_int_workspace_buffer _pin_memory_int_workspace_buffer	ExceptionRuntimeError)r=   r   r   r   r   r  r  r  r   rQ   rh   r4   r  er"   r"   r#   r     s&   
r   )r   rb   r   rb   r   rb   r   rb   r  r@   r  r@   r  r@   r   r@   rQ   r&   rh   r  r4   r  r  r  r  r  )<
__future__r   dataclassesr   	functoolsr   typingr   r   r   r   rT   0sglang.srt.compilation.piecewise_context_managerr	   sglang.srt.environr
   -sglang.srt.layers.attention.base_attn_backendr   .sglang.srt.layers.attention.flashinfer_backendr   sglang.srt.layers.dp_attentionr   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.server_argsr    sglang.srt.speculative.spec_infor   sglang.srt.utilsr   r   r   2sglang.srt.layers.attention.flashinfer_mla_backendr   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr   SGLANG_ENABLE_TORCH_COMPILEr   logging_loggingset_logsERROR_dynamoconfigsuppress_errors
flashinferr   r   r   r$   r   r(   ry   r   r   r   r   r"   r"   r"   r#   <module>   sT    	

y   Tm  