o
    پiN                     @  s   d dl mZ d dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlmZ d dlmZmZ erFd d	lmZ d d
lmZ d dlmZ d dlmZmZ G dd de	ZdS )    )annotations)TYPE_CHECKINGOptionalN)AttentionArch)AttentionBackend)FlashAttentionMetadata$make_local_attention_virtual_batchesmerge_state_v2_wrapper"prepare_swa_spec_page_table_triton)get_global_server_args)ForwardBatchForwardMode)RadixAttention)ModelRunner)merge_state_v2)flash_attn_varlen_funcflash_attn_with_kvcachec                      s~   e Zd ZdZ				d+d, fdd	Zd-ddZ				d.d/ddZ				d.d0ddZdd  Zd1d$d%Z		d2d3d)d*Z
  ZS )4XPUAttentionBackendaG  XPU FlashAttention backend, currently based on FlashAttentionBackend, will be refactored later.

    TODO:
    - Prefill and Decode disaggregation, currently only chunked prefill is supported
    - Speculative Decoding support
    - XPU Graph support, see https://github.com/pytorch/pytorch/issues/162143
    - MLA support
    Fr   model_runnerr   skip_prefillboolc                   s  t    |jd ur|jjrJ dd | _d | _|jj| _|j	| _	i | _
i | _|jj| _|j| _|jj| _|j| _|jjtjk| _| jdu sJJ d|| _|j| _| jrY|jj| _|jjp^d| _|| _|jj| _|| _t|drs|j nd | _ |j| _| jd uo| jdk| _!d S )Nz=Sliding window and cross attention are not supported togetherFz[XPUAttentionBackend doesn't support MLA yet, please use --attention-backend triton instead.r   attention_chunk_size)"super__init__sliding_window_sizemodel_configis_encoder_decoderforward_metadata#forward_metadata_spec_decode_expandcontext_lenmax_context_lendevicedecode_cuda_graph_metadatatarget_verify_metadatareq_to_token_poolreq_to_tokenkv_cache_dtypeserver_argskv_cache_dtype_str	page_sizeattention_archr   MLAuse_mlar   is_hybrid_swatoken_to_kv_poolfull_to_swa_index_mappingspeculative_eagle_topktopkspeculative_num_stepsspeculative_num_draft_tokensspeculative_step_idhasattrr   has_swa)selfr   r   r5   r2   r3   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/xpu_backend.pyr   $   sJ   




zXPUAttentionBackend.__init__forward_batchr   c                 C  sl  t  }|j}|j}|j}|j r]|jdur	J d|	t
j|_|j  |_t
jd|d t
j|d|_t
jjt
j|dt
jdd|_|jj|jd|jf |_| $||| n|j% r| jdkr|j| j& 	t
j|_| j&|_|j  | j& |_t
jd|| j& d | j&t
j|d|_t
jjt
j|jdt
jdd|_|jj|jd|jf |_| $||| n|j	t
j|_| j&|_|j  |_t
jd|| j& d | j&t
j|d	|_t
jjt
j|jdt
jdd|_|jj|jd|jf |_t  }d|_t
jd|j | j& d t
j|d|_t
j| j&|d'd}	|	(|j d|j'd }
t
jjt
j|j| j& )| j&ddddd }|
j)| j&dd|dddf   dd}|jj*|  d| j&}|	(|j+d | j&}t
,|||| j& }t
j-|dd\}}|jj|jddf .d|
j)| j&dd}|.d||_|j/dd	t
j|_t
jjt
j|jdt
jdd|_|| _#| j0r| 1|| no|j2 r<|	t
j|_|j  |_t
jjt
j|dt
jdd|_|jj|jd|jf |_t3|j4s|jt5j6kr&|j7}t|j8|_t
jjt
j|dt
jdd|_n|j|_|j|_|jt5j9kr<| $||| |j:dur|j: dksNJ d|j:	t
j|_;t
jjt
j|j;dt
jdd|_<|j;  |_=|jj|jd|j=f |_>|jj|j|j=|j=|j f |_| j?dkrt
jd|jj+d | j?| jd| _@|jdd| j@f | j? |_|| _AdS )zNInitialize forward metadata hence all layers in the forward pass can reuse it.NFzlXPUAttentionBackend doesn't support speculative decoding yet, please use --attention-backend triton instead.   r   )dtyper"   dimr?   r>   r   )stepr?   r"   )r"   r?   r   )r"   )rA   z(Only encoder size 1 is supported for now)Br   seq_lens
batch_sizer"   forward_modeis_decode_or_idle	spec_infor2   r5   totorchint32cache_seqlens_int32seq_lens_cpumaxitemmax_seq_len_karangecu_seqlens_qnn
functionalpadcumsumcu_seqlens_kr%   r&   req_pool_indices
page_tablemax_seq_len_qfullnumelout_cache_locviewr3   
contiguousr   _init_local_attn_metadatais_target_verifyr4   	unsqueezeexpandrepeat_interleavecustom_maskshapewheresortgathersumr7   '_init_sliding_window_attn_spec_metadata"is_extend_or_draft_extend_or_mixedanyextend_prefix_lens_cpur   DRAFT_EXTENDextend_seq_lensextend_seq_lens_cpuEXTENDencoder_lensencoder_lens_int32encoder_cu_seqlens_kencoder_max_seq_len_kencoder_page_tabler*   strided_indicesr   )r8   r=   metadataseqlens_in_batchrE   r"   metadata_expanddecode_length	cache_locoffsetscolscum_lenmask_extraction_indicesmaskcol_indiceskeys_
sort_ordernon_masked_page_tablerp   r;   r;   r<   init_forward_metadata]   sx  

K



	






z)XPUAttentionBackend.init_forward_metadataTNqtorch.Tensorkvlayerr   q_ropeOptional[torch.Tensor]k_ropesinksc
           .      C  sH  |d ur/|d us
J |r/|j s|jn|j}
| js&|j||
|||j|j n	|j||
|| | j	}|j
d uo;|j
dk}|rC|j
dfnd}d\}}|j  }| jd uo^|jd uo^t|do^|j}|j ok| jdkok| }i }|	d urv|	|d< |r|j}|j}|j}|j}|j}n)|r|jd ur|j}|j}|j}|j}|j}|j}n|j}|j}|j}|j}|j}| jsX|j|j\}}|d| j|j |j!}|d| j|j"|j!}|j r|j#}|j$}|j%}d}t&d|' d|j(|j!||||||s|nd ||j)|rdn|||j*|||d	|}|rT|^}}} t&d|' d|j(|j!||| j+j| j+j| j+j| j+j| j+j|j)d||j*||d
d	|^}!}"}#t,||j-' |!|"j-' \}}$n|}n|j.d ur|j s|j/ s|j.rt0 j1ruJ |j2d us}J |j3d usJ |j4d usJ |j2}%|%dksJ |j5sJ t6|d|j(|j!|d|j |j!7|j8|d|j |j97|j8|j|j3|% |j|j4|% |j)dd
d
}&n1t6|d|j(|j!|d|j |j!7|j8|d|j |j97|j8|j|j|j|j|j)d
|j5d
}&|j5r|&^}&}'} t:;|'dd' }'|&|'fS |&S |j<|j7|j8}(|(d d d d |j9d f }|(d d d d d |j9f })|d| j|j |j!|j9 }*|)d| j|j"|j9}+|d urv|d|j(|j9},|d|j(|j!|j9 }n'|' d|j(|j!}-|-d d d d d |j9f },|-d d d d |j9d f }t&||*|+|,||||s|nd ||j)|rdn||j*|||d}|r|^}}} t&di d|d|*d|+d|,d| j+jd| j+jd| j+jd| j+jd| j+jd|j)ddd|d|j*d|d|dd
^}!}"}#t,||j-' |!|"j-' \}}$n|}|d|j(|j9 S )Nr   r   r   r   NN	use_iroper>   r   Fr   k_cachev_cacherY   cache_seqlensrR   cu_seqlens_k_newmax_seqlen_qsoftmax_scalecausalwindow_sizesoftcap	k_descale	v_descalereturn_softmax_lseT)
r   r   r   rR   rW   r   max_seqlen_kr   r   r   r   r   r   qvrY   r   rR   r   r   r   r   r   r   r   r   r   r   r   r   rY   r   rR   r   r   r   r   r   r   r   r   r   r;   )=is_cross_attentionr]   encoder_out_cache_locr-   r/   set_kv_bufferk_scalev_scaleset_mla_kv_bufferr   r   r   local_attn_metadatar6   r   rF   ra   r2   local_block_tablelocal_query_start_loclocal_seqused_klocal_max_query_lenswa_spec_metadatarY   rR   rL   rZ   rW   get_kv_bufferlayer_idr^   r*   tp_k_head_numhead_dimtp_v_head_numrw   rt   ru   r   r_   tp_q_head_numscaling	logit_capr   r	   Tattn_attend_prefix_cacheis_draft_extendr   disable_chunked_prefix_cacheprefix_chunk_idxprefix_chunk_cu_seq_lensprefix_chunk_max_seq_lensmha_return_lser   rI   r?   
v_head_dimrJ   	transposeget_key_buffer).r8   r   r   r   r   r=   save_kv_cacher   r   r   r}   ry   r.   r   r   r   r   use_local_attnuse_cascade_attnkwargslocal_metadatarY   rR   r   r   r   rW   	key_cachevalue_cacheresultosoftmax_lseresto_expandsoftmax_lse_expandrest_expandr   	chunk_idxoutputlsekv_cachec_kvk_rope_cache
c_kv_cacheq_nopeq_allr;   r;   r<   forward_extend|  s  








	

z"XPUAttentionBackend.forward_extendreturnc
           *      C  sb  |d ur/|d us
J |r/|j s|jn|j}
| js&|j||
|||j|j n	|j||
|| | j	}t
|dd }| jd uoH|d uoHt|doH|j}|jd uoR| jdk}|jd urb|jdkrb|jdfnd}|j  }i }|	d urr|	|d< d\}}| jd	kr|jd
kr|jd ur|j|jf}|j|}|j|}|| j}|d ur|| jnd }|d ur|| jnd }| js|j|j\}}|d| j|j|j}|d| j|j|j}|j rtd | d|j|j|||j |j!|j"|j#d|j$dd|j%||d|}n|r-td | d|j|j|||j&|j'|j(d |j)|j$dd|j%||d|}nz|j*}|j+}|j,}|j-}| d|j|j}td ||||||j"|||j$|rVdn|||j%|||d|}|r|^}}}td |||| j.j*| j.j+| j.j"| j.j,| j.j-|j$d||j%||dd|^} }!}"t/||j0 | |!j0 \}}#n|}n|j1|j|j2}$|$d d d d |j3d f }|$d d d d d |j3f }%|d| j|j|j|j3 }&|%d| j|j|j3}'|d ur|d|j|j3}(|d|j|j|j3 }n'| d|j|j})|)d d d d d |j3f }(|)d d d d |j3d f }|j-}t||&|'|(|j*|j+|j"|j,||j$|r@dn||j%|||d}|r|^}}}td i d|d|&d|'d|(d| j.j*d| j.j+d| j.j"d| j.j,d| j.j-d|j$ddd|d|j%d|d|dd^} }!}"t/||j0 | |!j0 \}}#n|}|d|j|j3 S )!Nr   r   r>   r   r   r   r   r   auto   F)r   r   r   rY   r   rR   r   r   r   r   r   r   r   r   Tr   r   r   r   r   r   rY   r   rR   r   r   r   r   r   r   r   r   r   r;   )4r   r]   r   r-   r/   r   r   r   r   r   getattrr   r6   r   rH   r2   r   r)   r   rE   r   rc   rI   r'   r   r   r^   r*   r   r   r_   r   rw   rt   rR   ru   r   r   r   r   r   r   rY   rL   rW   rZ   r   r   r   r   r?   r   )*r8   r   r   r   r   r=   r   r   r   r   r}   ry   r   r   r   r   r   r   r   r   descale_shaper   r   r   rY   r   rW   r   
q_reshapedr   r   r   r   r   r   r   r   r   r   r   r   r   r;   r;   r<   forward_decode  s  
	








	


z"XPUAttentionBackend.forward_decodec                 C  s   dS )z5Get the fill value for sequence length in CUDA graph.r>   r;   )r8   r;   r;   r<   !get_cuda_graph_seq_len_fill_value  s   z5XPUAttentionBackend.get_cuda_graph_seq_len_fill_valueforwardbatchry   r   c                 C  s   | j du r
d|_dS |j}|j}| jr| j|j tj	}n|j}|du s-|du s-|du r2d|_dS |
  }|
  }t| j |||| j\}	}
}}tjt|
|t||||t|	 t| d}||_dS )zVCentralized utility to initialize local_attn_metadata if chunked attention is enabled.N)r   r   r   r   local_max_seq_len)r   r   rR   rL   r.   r0   rY   rI   rJ   rK   cpunumpyr   r*   r   LocalAttentionMetadata
from_numpyintrN   )r8   r   ry   r"   rR   rL   rY   cu_seqlens_q_npseq_lens_npseqlens_q_local_npcu_seqlens_q_local_npseqlens_k_local_npblock_table_localr   r;   r;   r<   r`     sF   
	


z-XPUAttentionBackend._init_local_attn_metadatar{   metadata_swa Optional[FlashAttentionMetadata]c                 C  s   | j dks	J d|j| j|j }tjjtj|dtj	dd}|j
d }|d u r:|j||j|jj
d  fn|j}t||j|j|j|j| j |d u rct }d|_|j|_||_||_||_n|j| |j| ||_d S )Nr>   zpFlashAttention backend doesn't support topk > 1 speculative decoding with page size > 1 sliding window attentionr   r@   rB   )r*   rL   rd   r4   rJ   rS   rT   rU   rV   rK   rf   rY   	new_zerosrP   r
   r   rZ   rR   rW   copy_r   )r8   ry   r{   r   rL   rW   bsrY   r;   r;   r<   rk     sJ   
	
z;XPUAttentionBackend._init_sliding_window_attn_spec_metadata)Fr   r   r   )r   r   r   r   )r=   r   )TNNN)r   r   r   r   r   r   r   r   r=   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r=   r   r   r   r   r   r   r   r   r   )r   r   ry   r   )N)ry   r   r{   r   r   r   )__name__
__module____qualname____doc__r   r   r   r   r   r`   rk   __classcell__r;   r;   r9   r<   r      s8    
9  (  -  
0r   )
__future__r   typingr   r   rJ   sglang.srt.configs.model_configr   -sglang.srt.layers.attention.base_attn_backendr   2sglang.srt.layers.attention.flashattention_backendr   r   r	   r
   "sglang.srt.managers.schedule_batchr   ,sglang.srt.model_executor.forward_batch_infor   r   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr   
sgl_kernelr   sgl_kernel.flash_attnr   r   r   r;   r;   r;   r<   <module>   s    