o
    پiF                  
   @  s  d dl mZ d dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ e Ze Ze Z e Z!er{zd dl"Z"W n e#yz Z$ ze$Z"W Y dZ$[$ndZ$[$ww e rd dl%Z%d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z, d dl-m.Z. d dl/m0Z0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> erd dl?m@Z@ erdnd ZAG dd deZBd!ddZCG dd  d eZDdS )"    )annotationsN)ABCabstractmethod)TYPE_CHECKINGAnyDictListOptionalTuple)	rearrange)envs)	LayerNorm)is_fp8_fnuz)MultiPlatformOp)
add_prefix
ceil_alignis_cudais_hipis_npu)get_indexer_weight_stream)$get_attn_context_model_parallel_rank*get_attn_context_model_parallel_world_size)get_pp_group)deep_gemm_wrapper)cp_all_gather_rerange_outputis_nsa_enable_prefill_cpis_nsa_prefill_cp_in_seq_split)ReplicatedLinear)QuantizationConfig)get_rope_wrapper)get_is_capture_mode)ForwardBatch)get_global_server_args)NSATokenToKVPooli   c                   @  sz   e Zd ZedddZedddZedddZedd	d
ZdddZdddZ	dddZ
dddZedddZdS )BaseIndexerMetadatareturntorch.Tensorc                 C     dS )z4
        Return: (batch_size,) int32 tensor
        N selfr(   r(   _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/nsa/nsa_indexer.pyget_seqlens_int329       z%BaseIndexerMetadata.get_seqlens_int32c                 C  r'   )zw
        Return: (batch_size, num_blocks) int32, page table.
                The page size of the table is 64.
        Nr(   r)   r(   r(   r+   get_page_table_64?   r-   z%BaseIndexerMetadata.get_page_table_64c                 C  r'   )zv
        Return: (batch_size, num_blocks) int32, page table.
                The page size of the table is 1.
        Nr(   r)   r(   r(   r+   get_page_table_1F   r-   z$BaseIndexerMetadata.get_page_table_1c                 C  r'   )z<
        Return: (sum_extend_seq_len,) int32 tensor
        Nr(   r)   r(   r(   r+   get_seqlens_expandedM   r-   z(BaseIndexerMetadata.get_seqlens_expanded!Tuple[torch.Tensor, torch.Tensor]c                 C  r'   )zp
        Return: (tokens, ), (tokens, ) int32, k_start and k_end in kv cache(token,xxx) for each token.
        Nr(   r)   r(   r(   r+   get_indexer_kvcache_rangeS   r-   z-BaseIndexerMetadata.get_indexer_kvcache_rangec                 C  r'   )z2
        Return: seq lens for each batch.
        Nr(   r)   r(   r(   r+   get_indexer_seq_len_cpuX   r-   z+BaseIndexerMetadata.get_indexer_seq_len_cpu	List[int]c                 C  r'   )z9
        Return: extend seq lens for each batch.
        Nr(   r)   r(   r(   r+   get_nsa_extend_len_cpu]   r-   z*BaseIndexerMetadata.get_nsa_extend_len_cpuc                 C  r'   )z3
        Return: batch idx for each token.
        Nr(   r)   r(   r(   r+   get_token_to_batch_idxb   r-   z*BaseIndexerMetadata.get_token_to_batch_idxlogitstopkintc                 C  r'   )a   
        Perform topk selection on the logits and possibly transform the result.

        NOTE that attention backend may override this function to do some
        transformation, which means the result of this topk_transform may not
        be the topk indices of the input logits.

        Return: Anything, since it will be passed to the attention backend
                for further processing on sparse attention computation.
                Don't assume it is the topk indices of the input logits.
        Nr(   )r*   r7   r8   r(   r(   r+   topk_transformg   r-   z"BaseIndexerMetadata.topk_transformN)r%   r&   )r%   r1   )r%   r4   )r7   r&   r8   r9   r%   r&   )__name__
__module____qualname__r   r,   r.   r/   r0   r2   r3   r5   r6   r:   r(   r(   r(   r+   r$   8   s    



r$   xr&   r%   c                 C  s`   | j tjksJ trddlm} nddlm} | d}||d @ dks(J d|| |d dS )Nr   )hadamard_transform   z8Hidden size must be a power of 2 for Hadamard transform.      )scale)dtypetorchbfloat16_is_hipfast_hadamard_transformr?   sglang.jit_kernel.hadamardsize)r>   r?   hidden_sizer(   r(   r+   rotate_activationz   s   

rL   c                      s   e Zd Z						dYdZ fddZejdd  Zes!ej	dd!nd"d# d[d&d'Z
es2ej	dd!nd(d# d\d*d+Zd]d1d2Zd^d3d4Zd_d:d;Zd`dAdBZd_dCdDZ	dadbdGdHZ	dcdddMdNZdedPdQZ	dadfdRdSZ	dcdgdUdVZdWdX Z  ZS )hIndexer   NT rK   r9   index_n_headsindex_head_dimrope_head_dim
index_topkq_lora_rankmax_position_embeddings
rope_thetafloatlayer_id	scale_fmtOptional[str]
block_sizerope_scalingOptional[Dict[str, Any]]is_neox_styleboolprefixstrquant_configOptional[QuantizationConfig]
alt_streamOptional[torch.cuda.Stream]c              	     s`  t    || _|| _|| _|| _|| _|| _|	| _|| _	t
 | _| jr-t | _t | _nd | _d | _trRt | _t| jd d| _t j}|dkoOt j | _nd| _t| j| j| j d|td|d| _t| j| jd|td|d| _t| j| jdtrtj ntj!td|d	| _"t#| jtj!d
| _$t%||||||t j&d| _'|| _(|
| _)| jd | _*d S )N      rA   Fwq_b)biasrb   r`   wkweights_proj)ri   params_dtyper`   rD   )
rotary_dimmax_positionbaser\   r^   devicerB   )+super__init__rK   n_headshead_dimrR   rS   rT   rX   rd   r   nsa_enable_prefill_cpr   cp_sizer   cp_rank_is_cuda	deep_gemmget_num_smssm_countr   half_device_sm_countr"   pp_sizer   is_last_ranklogits_with_pp_recvr   r   rh   rj   rE   rF   float32rk   r   k_normr   rq   
rotary_embr[   rY   softmax_scale)r*   rK   rP   rQ   rR   rS   rT   rU   rV   rX   rY   r[   r\   r^   r`   rb   rd   r~   	__class__r(   r+   rs      sn   



	zIndexer.__init__c                 c  sR    | j r$d}t| j|  d V  W d    d S 1 sw   Y  d S d V  d S )NrA   )r   r   configure_deep_gemm_num_smsr|   )r*   pp_recv_sm_countr(   r(   r+   _with_real_sm_count   s   "
zIndexer._with_real_sm_count)dynamicc                 C     | S Nr(   fr(   r(   r+   <lambda>   r-   zIndexer.<lambda>r>   r&   c                 C  s<   t r
|| jjj}| |\}}| }|| jd  }|S )NrB   )rG   tork   weightrD   rW   rt   )r*   r>   weights_r(   r(   r+   _project_and_scale_head_gates   s   z%Indexer._project_and_scale_head_gatesc                 C  r   r   r(   r   r(   r(   r+   r      r-   q_scalec                 C  sP   t r
|| jjj}| |\}}| }|| jd  }|d| | j }|S )NrB   r@   )	rG   r   rk   r   rD   rW   rt   	unsqueezer   )r*   r>   r   r   r   r(   r(   r+   _get_logits_head_gate   s   zIndexer._get_logits_head_gateq_lora	positionsenable_dual_streamforward_batchr!   c                 C  sZ  |r~t j }| j| t| j( | |\}}t	|d| j
d}t j|| j| j
| j gdd\}	}W d    n1 s>w   Y  t j| j% | |\}
}| |
}
t j|
| j| j
| j gdd\}}W d    n1 srw   Y  || j n=| |\}}t	|d| j
d}t j|| j| j
| j gdd\}	}| |\}
}| |
}
t j|
| j| j
| j gdd\}}| ||	|\}	}|	|dd | jf< ||
dd | jf< |r
t j }| j| t|}t j| j t|
}
W d    n1 sw   Y  || j nt|}t|
}
|jd ur)| jr)t|
 | j|t j }
||
fS )Nzl (h d) -> l h d)dr@   dim.)rE   cudacurrent_streamrd   wait_streamr   r   r}   rh   r   ru   splitrR   streamrj   r   r   rL   nsa_cp_metadatarv   r   
contiguousrw   )r*   r   r>   r   r   r   r   queryr   q_ropekeyk_roper(   r(   r+   _get_q_k_bf16   sn   







zIndexer._get_q_k_bf16c                 C  sj   |  |\}}| |}tj|| j| j| j gdd\}}| |||\}}||dd | jf< t|}|S )Nr@   r   .)rj   r   rE   r   rR   ru   r   rL   )r*   r>   r   r   r   r   r   r(   r(   r+   _get_k_bf16D  s   

zIndexer._get_k_bf16q_fp8r   metadatar$   r%   c                 C  sN  t r
t|jts
J |jj}tr|dksJ d| }n|dks%J d| }|jd | }|jj	|d}	|}
|j
 sE|j
jddrJ| }n| }t|dd }trb|d u rbt||
| j}t|jd	kskJ |d}t|	jd
ksyJ tr}dnd}d}d}tr|	d|||}	n|	|	jd |||}	t|jd	ksJ |d
}t| }trddlm} |j\}}}}tj|| |ftd|jtj d}|||	|||||d|dddd ntj!|d | |	|d | ||||dd}|"|| j#}ts%||jd k r%|jd | }tj||jd fd|j$|jd}tj%||gdd}|S )NrA   only support page size 1@   only support page size 64)rX   T
include_v2paged_mqa_schedule_metadata   rf      r@   r   )deepgemm_fp8_paged_mqa_logitsz-infrq   rD   FrN         )
PreshuffleKVBlockSizeChunkKTotalCuCount	WavePerEUclean_logitsrD   rq   r   )&r   
isinstancetoken_to_kv_poolr#   	page_sizerG   r/   r.   shapeget_index_k_with_scale_bufferforward_modeis_target_verifyis_draft_extendr0   r,   getattrry   rz   get_paged_mqa_logits_metadatar|   lenr   viewsqueezesumr5   aiter.ops.triton.pa_mqa_logitsr   rE   fullrW   rq   r   fp8_paged_mqa_logitsr:   rS   rD   cat)r*   r   rX   r   r   r   r   block_tablesmax_seq_lenkv_cache_fp8	blocksize
seqlens_32schedule_metadatablock_kvnum_heads_kvhead_dim_with_sfq_offsetr   
batch_sizenext_nheadsr   r7   topk_resultpad_lenpaddingr(   r(   r+   _get_topk_pagedW  s   






zIndexer._get_topk_pagednum_qnum_krq   torch.deviceTuple[bool, int]c           	      C  sP   || dk rdS t j|\}}d}|| | }|d |kp#||d k}||fS )z
        Detect whether we need to chunk the MQA logits computation to avoid OOM
        Return: (need_chunk, free_mem)
        i z )Fr      rf   g333333?)rE   r   mem_get_info)	r*   r   r   rq   free_mem	total_membytes_per_elemlogits_bytes
need_chunkr(   r(   r+   _should_chunk_mqa_logits  s   z Indexer._should_chunk_mqa_logitsc           /   
   C  s  t r
t|jts
J |j sJ |jj}tr |dksJ dn|dks(J dt|j	dks1J |
d}g }g }trA| }	n| }	|jd urO|jd usQJ t|	}
|j	\}}}|j}tj|| jfd|tjd}|
dkrq|S | }t||
ks}J t|
D ]%}||  }t|tsJ |j|||	| \}}|| || qtrtj|dd	tj}ntj|dd	tj}tj|dd	tj
d}||f}|  \}}|! }|" }|j	d }|j	d }| #|||\}}|sr|d | j	d dksJ | $ : tr(dd
l%m&} |\}}||d | |||d | ||} nt'j&|d | ||d | ||dd} W d    n	1 sFw   Y  | j	d t|ksWJ | j	d |ksaJ |j(| | j|d}!|!|d |< |S d}"||" }#t)dt|d t)|#d }$t*|$|}$|j+j,}%|j	d |ksJ d|j	d  d| |%d ur|%j	d |ksJ d|%j	d  d| d}&|&|k rmt*|&|$ |}'| $ J trdd
l%m&} |\}}|||&|' ||||&|' ||&|' ||&|' }(nt'j&||&|' |||&|' ||&|' ||&|' dd}(W d    n	1 sw   Y  ||&|' })|%d ur9|%|&|' }*d }+d },nd }*|(j	d }-tj-|-tj|d}+||&|' },|j(|(| j||&|' |+|)|,|*d}.|.||&|'< |'}&|&|k s|S )NrA   r   r   r   r   r@   r   r   r   )fp8_mqa_logitsFr   )ksr   g      ?z#seq_lens_expanded length mismatch: z != ztopk_indices_offset too short: z < r   )r   cu_seqlens_q	ke_offsetbatch_idx_listtopk_indices_offset_override).r   r   r   r#   r   is_extend_without_speculativer   rG   r   r   r   r/   r.   seq_lens_cpuextend_seq_lens_cpurq   rE   r   rS   int32r3   rangeitemr9   get_index_k_scale_bufferappend_is_fp8_fnuzr   r   float8_e4m3fnuzfloat8_e4m3fnr   r2   r0   r6   r   r   aiter.ops.triton.fp8_mqa_logitsr   rz   r:   maxminattn_metadatatopk_indices_offsetones)/r*   r   rX   r   r   r   r   
k_fp8_listk_scale_listr   r   
token_numsr   rq   r   indexer_seq_lens_cpuiseq_lenk_fp8k_scalekv_fp8r   keseq_lens_expandedtoken_to_batch_idxr   k_offsetr   r   r   kvrC   r7   raw_topk_resultr   bytes_per_rowmax_rowsglobal_topk_offsetstartendlogits_chunklengths_chunktopk_offset_chunkcu_seqlens_q_chunkbatch_idx_chunkB_chunkraw_topk_chunkr(   r(   r+   _get_topk_ragged  s  
















	






	
7zIndexer._get_topk_raggedreturn_indicesOptional[torch.Tensor]c	                 C  s   |j  sJ t|tr|d n|}	| |||}
||
| j| j\}}|j s.|j	 |_|j
j||j||d |s=d S | }tj|jd | jtj|	jd}||| jS )Nr   rX   locindex_kindex_k_scaler   )r   r   r   tupler   r[   rY   out_cache_locis_contiguousr   r   set_index_k_scale_bufferr0   rE   zerosr   rS   r   rq   r:   )r*   r>   r   r   rX   	act_quantr   r   r$  x_metar   r  r  r  dummy_logitsr(   r(   r+   _forward_cuda_k_only  s,   
zIndexer._forward_cuda_k_onlykv_lenactual_seq_qcp_indexList[Tuple[int, int, int]]c	              	   C  s  t r
t|jts
J |jj}	|	dksJ dt|jdksJ |d}g }
g }g }g }d}g }g }| }|j	d ur@|j
d usBJ |d ur1|D ]\}}}|j	|  |j
|  }||7 }||7 }|dkrs|dkrs||j
|d  7 }|j|||| }|j|||| }|| }tj|f|tjdd}|
| || || tj|d |d tjdd}|| tj|gtjdd}|| || qItj|
dd	tj}tj|dd	tjd}||f}tj|dd	}tj|dd	}|| }tj|dd	}|   tj|||||d
d}W d    n	1 sw   Y  |j|| j||||d}|S |j	d  |j
d  | }|j|||d }|j|||d }|tj}|tjd}||f}tj|f|tjdd}tj|| d |d tjdd}|| }|   tj|||||d
d}W d    n	1 sw   Y  tj|gtjdjddd}|j|| j|||d}|S )Nr   r   r   r@   r   rA   r   r   r   Fr   )r   r   r   r   rm   T)rq   non_blocking)r   r   r   )r   r   r   r#   r   r   r   r   r.   r   r   r   get_index_k_continuousget_index_k_scale_continuousrE   r   r   r   arangetensorr   r   r  r   r   rz   r   r:   rS   r   )r*   r   rX   r   r   r   r3  r4  r5  r   r  r	  ks_listke_offset_listoffsetactual_seq_q_listr   r   	batch_idxstart_seq_positionend_seq_positionpre_chunk_offsetr  r  extend_seq_lenr   r   r  r  r7   r   r(   r(   r+   _get_topk_ragged_with_cp  s  









	9

	z Indexer._get_topk_ragged_with_cpr8   c                 C  s  t sddlm} |jj}|dksJ dt|jdksJ |d}g }g }	g }
|jj	|j
d d f }tjd|jd |dd}|d d |f | }d}t|jD ]}|j|  }|j rc|j| nd	}|| }||| }|d }||| }|dd }|j|||| }|j|||| }|tjd }|tjdd }|||||}|}|jt||dd
d	 d}t|jd d|jd  }tjj |d|fdd}|
!| |}qPtj"|
dd
}|S )Nr   )	fp8_indexr   r   r   r@   r   )rq   rA   r   i   constant)#_is_npu/sglang.srt.layers.attention.nsa.tilelang_kernelrF  r   r   r   r   r   req_to_token_poolreq_to_tokenreq_pool_indicesrE   r:  r   r   seq_lensr   r   	is_extendr   r   r   r8  r9  r   r  r   r8   r  r   nn
functionalpadr   r   )r*   r   r   r   r8   rX   rF  r   r  r	  topk_indices_listr   strided_indicesq_len_startr  r  q_len	q_len_endq_fp8_partialweights_partialr  r  index_scoreend_postopk_indicesr   r(   r(   r+   forward_indexerF  sp   

zIndexer.forward_indexerc           '   
   C  s  t r	ddlm} ntsddlm} trt|jtsJ t|t	r$|d n|}|j
||}	| jd uoBt oB|jd dkoB|jd tk}
|	d u rId S d}|j ra|jd ura|j  }|| jk}|rr| jsr| ||||||
|	|S |
r|j rtj }| j| | |}| j||||
|d\}}||| j| j \}}tj!| j ||| j| j \}}W d    n1 sw   Y  || j |"d| | j# }n| j||||
|d\}}|
rtj }| j| ||| j| j \}}tj!| j ||| j| j \}}W d    n	1 sw   Y  || j n||| j| j \}}||| j| j \}}t|t	rt$|dv sCJ d|d |d }}|d ur|% d	kr|% d	kr|jd |jd kr|j\}}|jd }|dkr|| dkr|| }|&tj'(|||)|&tj'"d(||&tj*}n|&tj*}n	|&tj*}n|}| +||}|j,- s|j,. |_,|jj/||j,||d
 t0st r{|jd usJ t$|jdkrtj1|jd | jfdtj2|j3dS |j s
|j4 s
|jj5ddr| 6|||||	}|S |j7d urpt8 rp|j7j9}|j7j:}|j7j;}|j7j<} tj=||jd d d	 dd\}!}"tj=||jd d d	 dd\}#}$| >|||!|#|	||}%| >|||"|$|	|| }&tj?|%|&gddS | @|||||	}|S | jA|. ||| j|d}|S )Nr   )r/  F)r   r@   )rf   r   zBFor tuple input, only (x, x_s) or (x, x_s, y) formats are acceptedrA   rf   r&  r   Tr   r   )r8   rX   )BrG   rI  r/  rH  -sglang.srt.layers.attention.nsa.triton_kernelr   r   r   r#   r*  attn_backendget_indexer_metadatard   r    r   DUAL_STREAM_TOKEN_THRESHOLDr   r   r   r  r   rS   rv   r2  is_decode_or_idlerE   r   r   r   r   r   r[   rY   r   r   r   r   r   r   r   r   mul_rF   r   r+  r,  r   r-  ry   r   r9   rq   r   r   r   r   r   kv_len_prevkv_len_nextactual_seq_q_prevactual_seq_q_nextr   rE  r   r#  r\  )'r*   r>   r   r   r   rX   r$  r/  r0  r   r   skip_logits_computation
max_kv_lenr   r   r   r   r   r   r  r  x_qx_smnnggroup
x_for_gater   rc  rd  re  rf  
q_fp8_prev
q_fp8_nextweights_prevweights_nexttopk_result_prevtopk_result_nextr(   r(   r+   forward_cuda  s6  	

















7


	
zIndexer.forward_cudadynamic_scalec                 C  s  |j jjd u r|j jj}n|j jj}|j o(|j  o(|j  o(|j  }| j	j
| }	|	jddd\}
}|
ddddd| j}
|ddddd| j}|jd }| jd ur| jtj  tj| jd |d uru||fn|}| |d }| j }||| j| j}tj|| j| j| j gdd\}}||| jd| j}t||
||| j| j}tj||gdd}|| j | j }W d    n1 sw   Y  nL|d ur||fn|}| |d }||| j| j}tj|| j| j| j gdd\}}||| jd| j}t||
||| j| j}tj||gdd}tj ! rkt" }|tj  tj|% |d| j#}| $|% d &tj'}|| | }W d    n	1 sew   Y  n|d| j#}| $|% d &tj'}| (|d }| )|}tj|| j| j| j gdd\}}|ddd| j}tj*j||
||d| j}tj||+dgdd}|r| j,r|j-d urt.|/ d| j| j0|tj }|j12||j3| |r!| j,r|j-d ur|j-j4|j-j5f|j j_6|j-j7|j-j8f|j j_9|j jj6}|j jj9}nO|j}|j:j;dd}nD|j jj6d u r`|j s;|j s;|j rN|j j<}tj=||| |tj>|j?d}ntj@dd tA|D tj>|j?d}n|j jj6}|j1B|}| jd urytj C| tj ! rtj C| |j jjD}|r| j,r|j-d ur|d |d E  }| F|d| j| j|||||}|S |r|d |G d  n|}tjH|d| j| j|||&tj>|&|j?&tj>|d	d
| jIdd
}|d S )Nrf   r@   r   rA   r   r   c                 S  s   g | ]}d |d   qS )rA   r(   ).0r  r(   r(   r+   
<listcomp>  s    z'Indexer.forward_npu.<locals>.<listcomp>TNDPA_BSNDr   
r   r   r   actual_seq_lengths_queryactual_seq_lengths_keyblock_tablelayout_query
layout_keysparse_countsparse_mode)Jr^  forward_metadataseq_lens_cpu_intrM  r   rN  is_draft_extend_v2r   r   r   cos_sin_cachechunkrepeatr   rR   r   rd   r   rE   npur   r   rh   record_eventrt   ru   r   	torch_npunpu_rotary_mulr   record_streamr   SGLANG_NPU_USE_MULTI_STREAMgetr   rK   rk   rW   r   rF   rj   r   opsr   rv   r   r   r   rw   r   set_index_k_bufferr+  actual_seq_q_prev_tensoractual_seq_q_next_tensoractual_seq_lengths_qkv_len_prev_tensorkv_len_next_tensoractual_seq_lengths_kvextend_seq_lenscumsumspeculative_num_draft_tokensr:  r   rq   r;  r   get_index_k_buffer
wait_eventr   numeldo_npu_cp_balance_indexerrJ   npu_lightning_indexerrS   )r*   r>   r   r   r   rX   rw  r  
is_prefillcos_sincossinbsq
wq_b_eventq_peq_nopeq_rope_eventindexer_weight_streamr   weights_eventk_projkk_pek_noper  num_draft_tokenspast_key_statesr  r[  r(   r(   r+   forward_npuk  sP  	


















zIndexer.forward_npuc                 C  s  t j||dd d dd\}}d\}	}
|d ur@t j||dd d dd\}	}
|	 d|	jd }	|
 d|
jd }
|\}}|\}}tj|||	|j|j	t j
d|j|j	t j
d|dd	| jd
d
}tj|||
|j|j	t j
d|j|j	t j
d|dd	| jd
d
}|d |d fS )Nr   rA   rf   r   )NNr@   r   rz  r{  r   r|  )rE   r   rJ   r   r   r   r  r  r   rq   r   rS   )r*   r  r  indexer_weightsr  r  r  q_prevq_nextrr  rs  actual_seq_lengths_q_prevactual_seq_lengths_q_nextactual_seq_lengths_kv_prevactual_seq_lengths_kv_nexttopk_indices_prevtopk_indices_nextr(   r(   r+   r  3  sV   "	
z!Indexer.do_npu_cp_balance_indexer)rN   NTrO   NN) rK   r9   rP   r9   rQ   r9   rR   r9   rS   r9   rT   r9   rU   r9   rV   rW   rX   r9   rY   rZ   r[   r9   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   )r>   r&   )r>   r&   r   r&   )
r   r&   r>   r&   r   r&   r   r_   r   r!   )r>   r&   r   r&   r   r_   )r   r!   rX   r9   r   r&   r   r&   r   r$   r%   r&   )r   r9   r   r9   rq   r   r%   r   )T)r>   r&   r   r&   r   r!   rX   r9   r   r_   r   r$   r$  r_   r%   r%  r   )r   r!   rX   r9   r   r&   r   r&   r   r$   r3  r9   r4  r9   r5  r6  r%   r&   )r   r&   r   r&   r   r!   r8   r9   rX   r9   r%   r%  )r>   r&   r   r&   r   r&   r   r!   rX   r9   r$  r_   r%   r%  )r>   r&   r   r&   r   r&   r   r!   rX   r9   rw  r&   r%   r&   )r;   r<   r=   rs   
contextlibcontextmanagerr   rG   rE   compiler   r   r   r   r   r   r#  r2  rE  r\  rv  r  r  __classcell__r(   r(   r   r+   rM      sB    O

	
I

q
 63 
Z Z IrM   )r>   r&   r%   r&   )E
__future__r   r  abcr   r   typingr   r   r   r   r	   r
   rE   einopsr   sglang.srt.environr   sglang.srt.layers.layernormr   )sglang.srt.layers.quantization.fp8_kernelr   sglang.srt.layers.utilsr   sglang.srt.utilsr   r   r   r   r   ry   rG   rH  r   rz   ImportErrorer  %sglang.srt.hardware_backend.npu.utilsr   sglang.srt.distributedr   r   %sglang.srt.distributed.parallel_stater   sglang.srt.layersr   %sglang.srt.layers.attention.nsa.utilsr   r   r   sglang.srt.layers.linearr   *sglang.srt.layers.quantization.base_configr   "sglang.srt.layers.rotary_embeddingr   +sglang.srt.model_executor.cuda_graph_runnerr    ,sglang.srt.model_executor.forward_batch_infor!   sglang.srt.server_argsr"    sglang.srt.mem_cache.memory_poolr#   r`  r$   rL   rM   r(   r(   r(   r+   <module>   sR     
B