o
    پiy&                     @  s   d dl mZ 	 d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZ d d	lmZ erSd d
lmZ d dlmZ d dlmZ e Zer`d dlmZmZ dZeG dd dZ G dd deZ!dS )    )annotations)	dataclass)TYPE_CHECKINGOptionalUnionN)FlashInferMLAAttnBackend)!create_flashmla_kv_indices_triton)get_attention_tp_size)ForwardBatchForwardMode)is_cuda)RadixAttention)ModelRunner)	SpecInput)cutlass_mla_decodecutlass_mla_get_workspace_size   c                   @  s6   e Zd ZU dZded< dZded< 		ddddZdS )	CutlassMLADecodeMetadataNOptional[torch.Tensor]	workspaceblock_kv_indicesc                 C  s   || _ || _d S N)r   r   )selfr   r    r   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/cutlass_mla_backend.py__init__'   s   
z!CutlassMLADecodeMetadata.__init__)NN)r   r   r   r   )__name__
__module____qualname__r   __annotations__r   r   r   r   r   r   r   "   s   
 r   c                      s|   e Zd ZdZ			d4d5 fddZd6 fddZ	d7d8ddZd9 fd!d"Zd: fd%d&Zd'd( Z		)		d;d<d2d3Z
  ZS )=CutlassMLABackendzCutlass attention kernels.FNmodel_runnerr   skip_prefillboolkv_indptr_bufr   kv_last_page_len_bufc                   s   t  |||| |jjt  | _|jt | _|jj	| _	|jjt  | _
d | _|jj| _|jj| _|jj| _|jj| _|jj| _|j| _|j| _| j| j | _d S r   )superr   model_confignum_attention_headsr	   num_q_headsget_num_kv_headsnum_kv_headsreq_to_token_poolreq_to_tokennum_local_headsforward_metadatakv_lora_rankqk_nope_head_dimqk_rope_head_dim
v_head_dimscalingkv_cache_dtype	data_typedtypeq_data_typekv_cache_dim)r   r!   r"   r$   r%   	__class__r   r   r   3   s(   





zCutlassMLABackend.__init__forward_batchr
   c              
     s   |j }|j}|j r`|d u rXt|j  t	}t
j||fdt
j|jjd}t|f | j|j|jd || jd|t	d t|t	 |dd}t
j|dt
jd}t||| _d S t | d S t | d S )	Nr7   devicer   
PAGED_SIZE   num_kv_splitscudar?   r7   )
batch_size	spec_infoforward_modeis_decode_or_idletritoncdivseq_lens_cpumaxitem	PAGE_SIZEtorchfullint32seq_lensr?   r   r-   req_pool_indicesstrider   emptyuint8r   r/   r&   init_forward_metadata)r   r<   bsrH   max_seqlen_padr   workspace_sizer   r:   r   r   rY   R   sF   




z'CutlassMLABackend.init_forward_metadatamax_bsintmax_num_tokensr   c                 C  sf   |d u rt j|| jt t fdt jdd}n|}t|jd t |dd}t j|dt jd| _	|| _
d S )NrB   rE   r>   rC   rF   )rQ   rR   max_context_lenrP   rS   r   shaperW   rX   cuda_graph_mla_workspacecuda_graph_kv_indices)r   r]   r_   r   rc   r\   r   r   r   init_cuda_graph_statez   s   
z'CutlassMLABackend.init_cuda_graph_staterZ   
num_tokensrU   torch.TensorrT   encoder_lensrI   r   rH   Optional[SpecInput]c           	   
     s   |  r;|d u r9| jjd }t|f | j||d | j| jd| jdtd t| j| jd |d |f | _	d S d S t
 ||||||| d S )NrB   r   r@   )rJ   rc   ra   r   r-   rV   rP   r   rb   r/   r&   (init_forward_metadata_capture_cuda_graph)	r   rZ   re   rU   rT   rg   rI   rH   r[   r:   r   r   ri      s6   




z:CutlassMLABackend.init_forward_metadata_capture_cuda_graphseq_lens_sumrM   c	           	   
     s|   |  r/|d us
J |d | }t|f | j|d | |d | j| jd| jdtd d S t |||||||| d S )Nr   r@   )rJ   r   r-   rc   rV   rP   r&   'init_forward_metadata_replay_cuda_graph)	r   rZ   rU   rT   rj   rg   rI   rH   rM   r:   r   r   rk      s.   



z9CutlassMLABackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS )NrB   r   )r   r   r   r   !get_cuda_graph_seq_len_fill_value   s   z3CutlassMLABackend.get_cuda_graph_seq_len_fill_valueTqkvlayerr   save_kv_cacheq_ropek_ropec	              
   C  s<  |j }	|d ur&|d usJ |r&|d ur|j||	|| n	|j||	|| |d ur@|d|j|j}
|d|j|j|j }n%|d|j|j}|d d d d d |jf }
|d d d d |jd f }|
| j	}
|| j	}|j
|j}t|
||dt| j|jtj| jj| jj|jdd}|d|j|j S )Nr=   rB   )q_nopeq_pekv_c_and_k_pe_cacherT   
page_tabler   sm_scalerD   )out_cache_loctoken_to_kv_poolset_mla_kv_bufferset_kv_bufferviewtp_q_head_numr3   head_dimtor8   get_key_bufferlayer_idr   rP   r9   rT   rQ   rS   r/   r   r   r4   )r   rm   rn   ro   rp   r<   rq   rr   rs   	cache_locrt   
reshaped_qk_cacheor   r   r   forward_decode   sN   z CutlassMLABackend.forward_decode)FNN)r!   r   r"   r#   r$   r   r%   r   )r<   r
   r   )r]   r^   r_   r^   r   r   )rZ   r^   re   r^   rU   rf   rT   rf   rg   r   rI   r   rH   rh   )rZ   r^   rU   rf   rT   rf   rj   r^   rg   r   rI   r   rH   rh   rM   r   )TNN)rm   rf   rn   rf   ro   rf   rp   r   r<   r
   rq   r#   rr   r   rs   r   )r   r   r   __doc__r   rY   rd   ri   rk   rl   r   __classcell__r   r   r:   r   r    0   s     ,'&
r    )"
__future__r   dataclassesr   typingr   r   r   rQ   rK   2sglang.srt.layers.attention.flashinfer_mla_backendr   !sglang.srt.layers.attention.utilsr   sglang.srt.layers.dp_attentionr	   ,sglang.srt.model_executor.forward_batch_infor
   r   sglang.srt.utilsr   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   _is_cuda
sgl_kernelr   r   rP   r   r    r   r   r   r   <module>   s,    