o
    پi[                     @  s   d dl mZ 	 d dlmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ er_d dlmZ d dlmZ d dlmZ dZ eG dd dZ!G dd deZ"G dd dZ#dS )    )annotations)	dataclass)TYPE_CHECKINGCallableOptionalTupleUnionN)flash_mla_with_kvcacheget_mla_metadata)FlashInferMLAAttnBackend)!create_flashmla_kv_indices_triton)get_attention_tp_size)scaled_fp8_quant)ForwardBatchForwardMode)RadixAttention)ModelRunner)	SpecInput@   c                   @  sD   e Zd ZU dZded< dZded< dZded< 			d	d
ddZdS )FlashMLADecodeMetadataN+Optional[Tuple[torch.Tensor, torch.Tensor]]flashmla_metadataOptional[torch.Tensor]
num_splitsblock_kv_indicesc                 C  s   || _ || _|| _d S N)r   r   r   )selfr   r   r    r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/flashmla_backend.py__init__&   s   
zFlashMLADecodeMetadata.__init__)NNN)r   r   r   r   r   r   )__name__
__module____qualname__r   __annotations__r   r   r   r   r   r   r   r       s   
 r   c                      s   e Zd ZdZ			d4d5 fddZd6 fddZ	d7d8ddZd9 fd!d"Zd: fd%d&Zd'd( Z		)d;d<d0d1Z
	)d;d< fd2d3Z  ZS )=FlashMLABackendzFlashmla attention kernels.FNmodel_runnerr   skip_prefillboolkv_indptr_bufr   kv_last_page_len_bufc                   s   t  |||| |jjt  | _|jj| _|jjt  | _d | _	|jj
| _
|jj| _|jj| _|jj| _|jj| _|j| _|j| _| j
| j | _| jtjtjhv | _|jj| _d S r   )superr   model_confignum_attention_headsr   num_q_headsreq_to_token_poolreq_to_tokennum_local_headsforward_metadatakv_lora_rankqk_nope_head_dimqk_rope_head_dim
v_head_dimscalingkv_cache_dtype	data_typedtypeq_data_typekv_cache_dimtorchfloat8_e4m3fnfloat8_e5m2is_fp8_kvcacheserver_argsspeculative_num_draft_tokensnum_draft_tokens)r   r%   r&   r(   r)   	__class__r   r   r   4   s,   





zFlashMLABackend.__init__forward_batchr   c           	   	     s\  |j }|j rOt|j  t}t	j
||fdt	j|jjd}t|f | j|j|jd || jd| t|jt	j| jd| jd\}}t|||| _d S |j r|j| j }|j| j }t|  t}t	j
||fdt	j|jd}t|f | j|j|d || jd| t|t	j| j| j d| jd\}}t|||| _d S t | d S )Nr9   devicer      r?   )
batch_sizeforward_modeis_decode_or_idletritoncdivseq_lens_cpumaxitem	PAGE_SIZEr<   fullint32seq_lensrH   r   r/   req_pool_indicesstrider
   tor-   r?   r   r1   is_target_verifyrB   r*   init_forward_metadata)	r   rE   bsmax_seqlen_padr   mla_metadatar   rP   rV   rC   r   r   r[   W   s|   

	



	



z%FlashMLABackend.init_forward_metadatamax_bsintmax_num_tokensr   c                 C  s   |d u rt j|| jt t fdt jdd}n|}| jr5tt j|t j|jd| j| j	 d| j
d\| _| _ntt j|t j|jd| j	d| j
d\| _| _|| _d S )NrI   cudarG   rJ   )r<   rT   max_context_lenrS   rU   rB   r
   onesrH   r-   r?   cuda_graph_mla_metadatacuda_graph_num_splitscuda_graph_kv_indices)r   r_   ra   r   rg   r   r   r   init_cuda_graph_state   s4   

	

z%FlashMLABackend.init_cuda_graph_stater\   
num_tokensrW   torch.TensorrV   encoder_lensrL   r   	spec_infoOptional[SpecInput]c              
     s  |  rht|  t}t|f | j||d | j| j	d| j	d | j
| jp+d }	t|tj|	d| jd\}
}| j|
 | jd |d  | t| j| jd |d  | jd |d |f | _d S | r|| j }t|  t}t|f | j||d | j| j	d| j	d t|tj| j| j
 d| jd\}
}| j|
 | jd |d  | t| j| jd |d  | jd |d |f | _d S t ||||||| d S Nr   rI   rJ   )rM   rN   rO   rQ   rR   rS   r   r/   rg   rX   r-   rB   r
   rY   r<   rU   r?   re   copy_rf   r   r1   rZ   r*   (init_forward_metadata_capture_cuda_graph)r   r\   ri   rW   rV   rk   rL   rl   r]   r-   r^   r   rC   r   r   rp      sv   


	





	



z8FlashMLABackend.init_forward_metadata_capture_cuda_graphseq_lens_sumrP   c	              
     s$  |  r|d us
J |d | }|d | }t|  t}	t|f | j|d | |d | j| j	d| j	d | j
| jpAd }
t|tj|
d| jd\}}| j| | jd |d  | | j| j_| jd |d  | j_| jd |d |	f | j_d S | r|d | | j }|d | | j }t|  t}	t|f | j|d | |d | j| j	d| j	d t|tj| j| j
 d| jd\}}| j| | jd |d  | | j| j_| jd |d  | j_| jd |d |	f | j_d S t |||||||| d S rn   )rM   rN   rO   rQ   rR   rS   r   r/   rg   rX   r-   rB   r
   rY   r<   rU   r?   re   ro   rf   r1   r^   r   r   rZ   r*   'init_forward_metadata_replay_cuda_graph)r   r\   rW   rV   rq   rk   rL   rl   rP   r]   r-   r^   r   rC   r   r   rr   	  s   


	






	



z7FlashMLABackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS NrI   r   r   r   r   r   !get_cuda_graph_seq_len_fill_valueW  s   z1FlashMLABackend.get_cuda_graph_seq_len_fill_valueTqkvlayerr   save_kv_cachec                 C  s  |j }|d ur|d usJ |r|j|||| |j}|j|j}	||d|j|j}
| j	r|j
d urD|j
}|j
d}|j
d}ntjdtj|
jd}tjdtj|
jd}tjdtj|
jd}|
j}|
d|d }t||\}}||}t||	dtd| j| jjd | |jtj| j| jj| jj|jd||d\}}|d|j|j S t|
|	dtd| j| jjd | |jtj| j| jj| jj|jdd	\}}|d|j|j S NrF   rI   )rI   rG   T)rv   k_cacheblock_tablecache_seqlens
head_dim_vtile_scheduler_metadatar   softmax_scalecausal	descale_q	descale_k)	rv   r|   r}   r~   r   r   r   r   r   )out_cache_loctoken_to_kv_poolset_kv_bufferrK   get_key_bufferlayer_idviewtp_q_head_numhead_dimr?   k_scalereshaper<   rd   float32rH   shaper   r	   rS   r;   r1   r   rV   rY   rU   r2   r   r   r6   r5   r   rv   rw   rx   ry   rE   rz   	cache_locr\   r|   	reshape_qq_scaler   r   q_shapereshape_q_2dreshape_q_fp8_2d_reshape_q_fp8or   r   r   forward_decodeZ  sn   	





zFlashMLABackend.forward_decodec                   s  |j tjks|j tjkrt ||||||S |j}|d ur/|d us$J |r/|j|||| |j	}|j
|j}	||d|j|j}
| jr|jd ur[|j}|jd}|jd}ntjdtj|
jd}tjdtj|
jd}tjdtj|
jd}|
j}|
d|d }t||\}}||}t||	dtd| j| jjd | |jtj| j  | j!| jj"| jj#|j$d||d\}}n*t|
|	dtd| j| jjd | |jtj| j  | j!| jj"| jj#|j$dd	\}}|d|j|j% S r{   )&rL   r   EXTENDDRAFT_EXTENDr*   forward_extendr   r   r   rK   r   r   r   r   r   r?   r   r   r<   rd   r   rH   r   r   r	   rS   r;   r1   r   rV   rY   rU   rB   r2   r   r   r6   r5   r   rC   r   r   r     st   






zFlashMLABackend.forward_extend)FNN)r%   r   r&   r'   r(   r   r)   r   rE   r   r   )r_   r`   ra   r`   r   r   )r\   r`   ri   r`   rW   rj   rV   rj   rk   r   rL   r   rl   rm   )r\   r`   rW   rj   rV   rj   rq   r`   rk   r   rL   r   rl   rm   rP   r   )T)rv   rj   rw   rj   rx   rj   ry   r   rE   r   rz   r'   )r    r!   r"   __doc__r   r[   rh   rp   rr   ru   r   r   __classcell__r   r   rC   r   r$   1   s     #I$IN
Tr$   c                   @  sL   e Zd ZdZdddZdddZdddZdddZdddZdddZ	dS ) FlashMLAMultiStepDraftBackendzm
    Wrap multiple flashmla attention backends as one for multiple consecutive
    draft decoding steps.
    r%   r   topkr`   speculative_num_stepsc              	   C  s   |dkrt d|| _|| _|jj| j }tj| j|d ftj|jd| _	g | _
t| jd D ]}| j
t|d| j	| d d q/d S )NrI   z@Currently FlashMLA only supports topk=1 for speculative decodingrG   T)r&   r(   r)   )
ValueErrorr   r   r.   sizer<   zerosrU   rH   	kv_indptrattn_backendsrangeappendr$   )r   r%   r   r   r_   ir   r   r   r     s2   	z&FlashMLAMultiStepDraftBackend.__init__rE   r   call_fnr   c                 C  s0   |j d usJ t| jd D ]}||| qd S rs   )rl   r   r   )r   rE   r   r   r   r   r   common_template   s   z-FlashMLAMultiStepDraftBackend.common_templatec                       fdd}  || d S )Nc                   s"   |j d usJ  j|  | d S r   )rl   r   r[   r   rE   rt   r   r   r   +  s   zDFlashMLAMultiStepDraftBackend.init_forward_metadata.<locals>.call_fnr   r   rE   r   r   rt   r   r[   *  s   z3FlashMLAMultiStepDraftBackend.init_forward_metadatar_   ra   c                 C  s.   t | jd D ]}| j| j||d d qd S )NrI   )r   )r   r   r   rh   )r   r_   ra   r   r   r   r   rh   1  s
   
z3FlashMLAMultiStepDraftBackend.init_cuda_graph_statec                   r   )Nc              	     s4    j |  j|j|j j |j|jd tj|jd d S )N)rk   rL   rl   )	r   rp   rK   r   rW   rV   r   DECODErl   r   rt   r   r   r   8  s   


zWFlashMLAMultiStepDraftBackend.init_forward_metadata_capture_cuda_graph.<locals>.call_fnr   r   r   rt   r   rp   7  s   zFFlashMLAMultiStepDraftBackend.init_forward_metadata_capture_cuda_graphr\   c                   s    fdd} || d S )Nc              
     s.   j |  j |j|jdd tj|j|jd d S )NrF   )rq   rk   rL   rl   rP   )r   rr   rW   rV   r   r   rl   rP   r   r\   r   r   r   r   H  s   

zVFlashMLAMultiStepDraftBackend.init_forward_metadata_replay_cuda_graph.<locals>.call_fnr   )r   rE   r\   r   r   r   r   rr   E  s   zEFlashMLAMultiStepDraftBackend.init_forward_metadata_replay_cuda_graphN)r%   r   r   r`   r   r`   )rE   r   r   r   r   )r_   r`   ra   r`   )rE   r   r\   r`   )
r    r!   r"   r   r   r   r[   rh   rp   rr   r   r   r   r   r     s    

!



r   )$
__future__r   dataclassesr   typingr   r   r   r   r   r<   rN   sgl_kernel.flash_mlar	   r
   2sglang.srt.layers.attention.flashinfer_mla_backendr   !sglang.srt.layers.attention.utilsr   sglang.srt.layers.dp_attentionr   )sglang.srt.layers.quantization.fp8_kernelr   ,sglang.srt.model_executor.forward_batch_infor   r   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   rS   r   r$   r   r   r   r   r   <module>   s0       K