o
    پi$                    @  s  d dl mZ 	 d dlZd dlmZ d dlmZmZ d dlm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZmZ d d	lmZmZ d d
lmZ e	r_d dlmZ d dlmZ d dlmZ z d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+ W n e,y   e-d Y nw d dl.m/Z/ d dlm0Z0 d dl1m2Z2 d dlm3Z3 e4e5Z6e3dda7e3ddoe Z8da9t7rdnda:G dd deZ;eG dd dZ<dZ=dZ>G dd  d eZ?G d!d" d"Z@G d#d$ d$ZAG d%d& d&ZBdS )'    )annotationsN)	dataclass)Enumauto)TYPE_CHECKINGOptional)AttentionBackend)#create_flashinfer_kv_indices_triton)get_attention_tp_sizeis_dp_attention_enabled)ForwardBatchForwardMode)is_gfx95_supported)RadixAttention)ModelRunner)	SpecInput)	flash_attn_varlen_funcget_mla_metadata_info_v1get_mla_metadata_v1get_ps_metadata_info_v1get_ps_metadata_v1mha_batch_prefill_funcmla_prefill_ps_asm_fwdmla_reduce_v1paged_attention_ragged)mla_decode_fwdmla_prefill_fwdz]aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device.)AttentionArch)pad_sequence_with_mask)	fp8_dtype)get_bool_env_varSGLANG_AITER_MLA_PERSISTTrueSGLANG_AITER_FP8_PREFILL_ATTNFTc                   @  s   e Zd Ze Ze ZdS )WrapperDispatchN)__name__
__module____qualname__r   SLIDING_WINDOWCROSS_ATTENTION r*   r*   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/aiter_backend.pyr$   G   s    
r$   c                   @  s   e Zd ZU ded< ded< ded< ded< ded< ded	< d
Zded< d
Zded< d
Zded< d
Zded< d
Zded< d
Z	ded< d
Z
ded< dZded< d
S )ForwardMetadatatorch.Tensor	kv_indptr
kv_indices	qo_indptrkv_last_page_lenint	max_q_lenzOptional[int]
max_kv_lenNOptional[torch.Tensor]work_metadatawork_info_setwork_indptrreduce_indptrreduce_final_mapreduce_partial_mapnum_kv_splitsTzOptional[bool]	run_graph)r%   r&   r'   __annotations__r6   r7   r8   r9   r:   r;   r<   r=   r*   r*   r*   r+   r,   L   s   
 r,      c                      s   e Zd Z		dEdF fd	d
Zdd Zdd ZdGddZ	dHdId!d"ZdJd%d&Z	dKdLd*d+Z	dMd4d5Z
dNd8d9Zd:d; Z	dHdOdAdBZ	dHdOdCdDZ  ZS )PAiterAttnBackendFNmodel_runnerr   skip_prefillboolkv_indptr_bufr5   c                   s  t    ddlm} |jj| _|jj| _t	j
|| _|j| _|jj| _|jj| _|jj| _|jjt  | _|jj| _|jdjd | _|jt | _|j| _|jj| _|jjtj k| _!|jj"| _#|| _$|jj%}|d u r~t	j&|d ft	j'|jd| _(n|| _(t	j)|ft	j'|jd| _*t	j&|d ft	j'|jd| _+|st,|| | _-| j!rt.|| | _/| j#t0 d t0 | _1t	2t	j3j4d }| j!st	j5|| j | j1 | j | d|| j | j1  d  t	j6| jd| _7t8d	| jd
  | _9t	j:d	gt	j3d;| j | _<| _=d| _>d | _?| j!rQt@ | _At	j&|d ft	j'|jd| _B| jdkr%daCdaD| jdkr7| jtEur7daFdaCdaDtFr<dnd | _G| jd u rKtFrKd| _G| jG| _Hd S d S )Nr   )extend_attention_fwd   dtypedevice               ?g      ?rI   g            TF   @   )Isuper__init__7sglang.srt.layers.attention.triton_ops.extend_attentionrE   model_configrI   input_dtypeserver_args	page_sizetorchcompilerdisablerJ   is_multimodalspeculative_num_draft_tokensnum_draft_tokensspeculative_num_stepsnum_attention_headsr
   num_headhead_dimtoken_to_kv_poolget_value_buffershape
v_head_dimget_num_kv_headsnum_kv_headkv_cache_dtypereq_to_token_poolreq_to_tokenattention_archr   MLAuse_mlacontext_lenmax_context_lenrB   sizezerosint32r.   onesr1   r0   AiterIndicesUpdaterPrefillindices_updater_prefillAiterMlaIndicesUpdaterPrefillmla_indices_updater_prefill_AITER_PARTITION_SIZE_ROCMmax_num_partitionsfinfofloat32bitsemptyuint8workspace_bufferfloatscaletensortok_scalev_scalelogits_soft_capforward_metadatar   enable_dp_attention
qo_indptr_	fast_modeintra_batch_moder   _use_mla_ps_kernelmax_split_per_batchfix_max_split_per_batch)selfrA   rB   rD   rE   max_bsnbyes_per_qo_elem	__class__r*   r+   rT   d   s   









zAiterAttnBackend.__init__c                 C  s   | j }| j}| jr$tj }tj|}|j}t|| d | | j	| _
t|||||dt| j
td	\\}}	\}
}\}}\}}\}}\}}tj||	dd}tj|
|dd}tj||dd}tj||dd}tj||dd}tj||dd}||||||fS )NrG   F)	is_sparser   r<   r   cudarH   )rb   rj   r   rZ   r   current_deviceget_device_propertiesmulti_processor_countminr   r   r   r   r   r   )r   max_seqlen_qo
batch_sizenheadrI   gpudevice_propertiescu_numwork_meta_data_sizework_meta_data_typework_indptr_sizework_indptr_typework_info_set_sizework_info_set_typereduce_indptr_sizereduce_indptr_typereduce_final_map_sizereduce_final_map_typereduce_partial_map_sizereduce_partial_map_typer6   r8   r7   r9   r:   r;   r*   r*   r+    make_mla_decode_meta_data_buffer   sl   
z1AiterAttnBackend.make_mla_decode_meta_data_bufferc                 C  sP   d}| j }| j}t|||| j| |d||||||	t|d|
|
|||||d}d S )NrG   FrQ   )kv_granularityr   uni_seqlen_qor   r   r   dtype_qdtype_kv)rY   rj   r   rb   max)r   r0   r.   r1   r6   r7   r8   r9   r:   r;   r3   r   r   r   nhead_kvrY   rI   metar*   r*   r+   make_mla_meta_data  s2   
z#AiterAttnBackend.make_mla_meta_datar   r2   max_qlenqlen_granularityc                 C  s   t || j||d\\}}\}}\}}	\}
}\}}\}}| j}tj|||d}tj|||d}tj||	|d}tj|
||d}tj|||d}tj|||d}||||||fS )N)r   
num_head_kr   r   rH   )r   ri   rJ   rZ   r   )r   r   r   r   r   r   r   r   work_info_sizework_info_typer   r   r   r   r   r   rJ   work_metadata_ptrsr8   	work_infor9   r:   r;   r*   r*   r+   $make_mla_prefill_ps_meta_data_bufferE  sJ   
z5AiterAttnBackend.make_mla_prefill_ps_meta_data_bufferTr0   r-   r.   seq_lensr6   r8   r   r9   r:   r;   	is_causalc                 C  s   | j | j }| j}d}|}|| }td| j}| j}|jdtjd}|jdtjd}|jdtjd}t|||||||||||	|||||
d d S )Nr?      cpurO   )qhead_granularityr   kvlen_granularity
block_sizer   )rb   ri   r   rY   r   rZ   rt   r   )r   r0   r.   r   r6   r8   r   r9   r:   r;   r   	gqa_rationum_heads_ktile_qr   r   r   r   qo_indptr_cpukv_indptr_cpuseq_lens_cpur*   r*   r+   make_mla_prefill_ps_meta_dataq  s8   
z.AiterAttnBackend.make_mla_prefill_ps_meta_dataforward_batchr   c                 C  s  |j }| j}|j}d}d}d}d}d}	d}
d}d}d}d}|j r|du r\tj|jdd|d|d < |d|d  }tj|j	tj
| jd}t|f | j|j|j|d|| jd n|j|j}}|jd d }| jr| jd|d  }tj| jd| dd|d|d < | jd| }d}tr| ||\}}	}
}}}| j}| j|||||
|	||||t|td t|||||d||
|	||||dd| _dS |j rX| jr3||j|j|j	| j\}}}}trt|j }| ||\}}	}
}}}| j}| j||| jd| ||
|	||||t|td t|||| jd| t|j |j! " ||
|	||||dd| _dS | j#j$|j|j|j	d|j%|jd	 t| j#j| j#jdd| j#j&| j#j'| _dS |j( r"| jr|j)}|j| }|j	||  }|jj}tj*dd| | |tj
|d
}| j}tj|dd|d|d < |d|d  }tj|tj
|d}t|f | j|j||d|| jd tr|}| ||\}}	}
}}}| j}| j||| jd| ||
|	||||t|td t|||| jd| |d||
|	||||dd| _dS | j#j$|j|j|j	d|j%|jd	 t| j#j| j#jdd| j#j&| j#j'| _dS |j+}| j,r,d}nt-|j. }| jr| j/j$|j|j|j	|j0|j0 " |j " dd | j/j&}| j/j1}d}d}	d}
d}d}d}t2rd}|| j3| j4  }| 5|||\}}	}
}}}| j6|||j||	|
|||dd
 t| j/j| j/j|| jd| || j/j'||
|	|||d| _dS | j#j$|j|j|j	||j%dd t| j#j| j#jdd| j#j&| j#j'| _dS )z6Init auxiliary variables for triton attention backend.Nr   dimrG   rH   r   r   r   F)r6   r7   r8   r9   r:   r;   r<   r=   prefix_lensencoder_lens	spec_infosteprI   rJ   )r   r?   T)r   )r6   r7   r8   r9   r:   r;   )r   r   )7r   r.   r   forward_modeis_decode_or_idlerZ   cumsumr   r   seq_lens_sumrt   rJ   r	   rl   req_pool_indicesstrider/   rf   ro   r   r1   r   r   r   r   r   r   r,   r   is_draft_extendgenerate_attn_arg_prefillr   extend_seq_lens_cpur   itemrw   updater   r3   r4   is_target_verifydraft_token_numarangeextend_prefix_lensr]   anyextend_prefix_lens_cpury   extend_seq_lensr0   _use_fp8_prefill_attnrb   ri   r   r   )r   r   bsr.   r   r0   r1   r3   r6   r8   r7   r9   r:   r;   r<   r/   custom_maskr   	draft_numkv_lenskv_lens_sumrJ   r   extend_no_prefixr   r   r*   r*   r+   init_forward_metadata  sj  


$

	







	

z&AiterAttnBackend.init_forward_metadatar   max_num_tokenskv_indices_bufc                 C  s   t j|t jd| _|d u rt j|| j t j| jd| _n|| _| j	s0t j|| j t j
| jd| _| jrStrS| jd u r<dn| j}| ||\| _| _| _| _| _| _d S d | _d | _d | _d | _d | _d | _d S )NrO   rH   rG   )rZ   ru   r2   cuda_graph_kv_last_page_lenrs   rq   rt   rJ   cuda_graph_kv_indicesrB   r   cuda_graph_custom_maskro   r   r_   r   r6   r8   r7   r9   r:   r;   )r   r   r   r   r   r*   r*   r+   init_cuda_graph_state  s>   



z&AiterAttnBackend.init_cuda_graph_stater   
num_tokensr   r   r   r   r   Optional[SpecInput]c                 C  sp  d }d }	d }
d }d }d }d }|  rd }d }d }|d u rJ| j}tj|dd|d|d < |d |d  }| j}t|f | j|||d || jd n|j|j}}| j	r| j
d |d  }tj| jd | dd|d|d < | jd | }d}tr| j}| j|||| j| j| j| j| j| j|t|td | j}	| j}
| j}| j}| j}| j}t||||||d  |	|
|||||d| _d S | r| j	rf| jd |d  }tjdd| | j | jtj| jd|d |d < | jd |d  }tj|dd|d|d < | j}t|f | j|||d || jd | jd | }| j}trN| j}| j|||| j| j| j| j| j| j|t|td | j}	| j}
| j}| j}| j}| j}t||||||d  |	|
|||||d| _d S |   }| j!j"|||d ||d t| j!j| j!jd d | j!j#| j!j$| _d S |% r1| j&d }| jd |d  }tjd|| d |tj| jd|d |d < | jd |d  }tj|dd|d|d < | j}t|f | j|||d || jd | jd | }|}tr| j}| j|||| j| j| j| j| j| j|t|td | j}	| j}
| j}| j}| j}| j}t||||||d  |	|
|||||d| _d S t'd	|)
Nr   r   rG   r   rF   )r6   r7   r8   r9   r:   r;   r<   r   r   zInvalid mode: forward_mode=)(r   r.   rZ   r   r   r	   rl   r   r/   ro   r   r   r   r   r   r6   r7   r8   r9   r:   r;   r   r   r,   r   r   r   r0   r   r_   rt   rJ   sumrw   r   r3   r4   r   r`   
ValueError)r   r   r   r   r   r   r   r   r<   r6   r7   r8   r9   r:   r;   r0   r1   r3   r.   r/   r   num_tokens_per_bsr*   r*   r+   (init_forward_metadata_capture_cuda_graph)  s  




	





	
z9AiterAttnBackend.init_forward_metadata_capture_cuda_graphr   r   c	              
   C  s  |  r[| j}	| j}
|d u rCtj|d | dd|	d|d < |	d |d  }	t|f | j|d | |d | |	d |
| jd d S |j|	d |jjd < |j	|
d |j	jd < d S |
 rt|}| jd |d  }tjdd| | j | jtj| jd|d |d < || j }| jd |d  }	tj|dd|	d|d < | j}
t|f | j|||	d |
| jd d S | r|d | }|jd | }| jd |d  }tj|dd|d|d < | jd |d  }	tj|dd|	d|d < | j}
t|f | j|||	d |
| jd d S td)Nr   r   rG   r   zInvalid forward mode)r   r.   r   rZ   r   r	   rl   r   rf   r/   r   lenr0   r   r_   rt   rJ   r   accept_lengthr   )r   r   r   r   r   r   r   r   r   r.   r/   r0   r   accept_lensr*   r*   r+   'init_forward_metadata_replay_cuda_graph)  st   "








z8AiterAttnBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS NrG   r*   r   r*   r*   r+   !get_cuda_graph_seq_len_fill_valueu  s   z2AiterAttnBackend.get_cuda_graph_seq_len_fill_valueqkvlayerr   c           2      C  s.  |j s|jn|j}|j| _|d ur3|d usJ |r3| jr&|j|||| n|j|||||j|j	 | jr| j
j}| j
j}	| j
j}
| j
j}| j
j}|j|j}|j|j}|jd }|jd | }|jd | }t|jdksuJ t|jdks~J t|jdksJ |j rB|j sB|j sBt|j }|jd dks|rotr_|jd }|j}|j}|jtkr|  !t}|jtkr|  !t}|jtkr|  !t}t"j#dt"j$|j%d}|}t"j&||j%t"j'd}d}| j
j(}| j
j)}| j
j*}t"j+|,d| ||ft"j$|j%d}t"j+|,d| |ft"j$|j%d}t"j+||ft"j$|j%d}|j-|||f| j.d} t/||||||| j
j0| j
j1||j2d	||| ||| t3||||||| | | S t4||||||||j2d	d
	} | S |j5|| krt"6|d|}t"j7|||gdd\}!}"| j8tkr|j}#|!!|#}!|"!|#}"|9|!: d }$|$;d|j<||j }$t"j7|$||jgdd\}%}&t"j=|%t">|"|"jd |j<|"jd fgdd}%|j?j|j@jksJ |%}|&}t4|||||
||	|j2d	d
	}'|'S |j5|jkr
|-|jd |j|j f}'nt"A|}'tB|;d|j|j5|;ddd|j5|';d|j|j||
|| j
jC| j
j|j2|j
 |;d|j<|j5}|'S |j r|j-|jd |j|jf| j.d}'| j
jD}(| j
j0})| j
j1}*| j
j(}| j
j)}| j
j*}| j
jE}+|jdkrtFr| jG| j
j| j
j| j
jC|(|*|)|||| j
jtH|+tId tJ||;ddd|j5|'| j
j| j
j| j
j| j
jC| j
jf|j2|j|(|)|*||||j|jtI|+d |'S |j r| j
jD}(| j
j0})| j
j1}*| j
j(}| j
j)}| j
j*}| j
jE}+|jdkrtFr| jG| j
j| j
j| j
jC|(|*|)|||| j
jtH|+tId | j
jKd	urwtL|;|jd d|d d |j@| j
j\},}-}.|j-|,| j
j |j|jf| j.d}'tJ|-;d|j|j5|;ddd|j5|'| j
j| j
j| j
j| j
jC| j
jf|j2|j|(|)|*||||j|jtI|+d |'|. S |j-|jd |j|jf| j.d}'tJ||;ddd|j5|'| j
j| j
j| j
j| j
jC| j
jf|j2|j|(|)|*||||j|jtI|+d |'S tMd|j|jN|j\}/}0|jOd }1| j8tkr|j}#|/!|#}/|0!|#}0tP|: ;d|j|jQ|/|0| jd |1 | j
jd |1 | j
j| j
j| j
jd	| jd ddd}'|';d|j|jQ S )NrF      r   rN   rH   )rJ   rI   r?   rO   T)softmax_scalecausalr   rL   rG   r   sm_scale	logit_capwork_meta_datar8   r7   r9   r:   r;   q_scalekv_scaler   r<   zAInvalid forward mode for MLA prefill: forward_batch.forward_mode=F)r
  r   alibi_slopes
return_lsereturn_attn_probs)Ris_cross_attentionout_cache_locencoder_out_cache_locr  r   ro   rd   set_kv_bufferr   r   r   r3   r4   r.   r/   r0   get_key_bufferlayer_idre   rf   r   r   	is_extendr   r   r   r   r   tp_q_head_numrg   rI   r   r   r   rZ   r   r}   rJ   r   rt   r9   r:   r;   r   rr   	new_emptyrW   r   r8   r7   scalingr   r   qk_head_dimindex_selectsplitrj   	kv_b_proj
contiguousviewtp_k_head_numcatbroadcast_tor   r   
empty_liker   r1   r6   r<   r   r   r   r   r   r=   r   r   get_kv_bufferr   r   rc   )2r   r  r  r  r  r   save_kv_cache	cache_locr3   r4   r.   r/   r0   K_BufferV_Bufferkv_lora_rankqk_rope_head_dimqk_nope_head_dimr   total_sr   rg   	one_scalekv_indptr_asmkv_indices_asmr   r9   r:   r;   logitsattn_lse	final_lseoutputkvck_perI   kvprefixk_prefixv_prefixor6   r8   r7   r<   r   q_padq_maskk_cachev_cachebs0r*   r*   r+   forward_extendx  s  













	


		



zAiterAttnBackend.forward_extendc                 C  s8  | d|j|j }|j|jkr"|j|jd |j|j f| jd}ntj|| jd}|r6|j	
||j|| | jr|j	|j}| jj}	| jj}
| jj}| jj}| jj}| jj}| jj}|jdkr}tr}| j| jj| jj| jj|	||
|||| jjt|td t| d|j|j| ddd|j| d|j|j| jj| jj| jj!| jj| jjf|j"|j#|	|
|||||j$|j$t|d |S |j#| _%|j	&|j\}}| j't(kr|j)}|*|}|*|}t+| d|j|j| j,| d|j|j| dd|j-|j| dd|j.|j| j/| jj| jj!| jd| j0d dd| j%| j$| j1d t2 |S )	NrF   r   rO   r   rG   r  r   NHD)3reshaper  r  rg   r  rf   rW   rZ   r'  rd   r  r  ro   r  r  r   r6   r8   r7   r9   r:   r;   r<   r   r   r0   r.   r1   r3   r   r   r   r#  r/   r  r  r   r   r(  rj   r   rI   r   r   r   r$  tp_v_head_numr   r{   r   rz   )r   r  r  r  r  r   r)  r=  k_bufferr6   r8   r7   r9   r:   r;   r<   r@  rA  rI   r*   r*   r+   forward_decode  s   

	:


zAiterAttnBackend.forward_decode)FN)rA   r   rB   rC   rD   r5   )r   r2   r   r2   r   r2   )T)r0   r-   r.   r-   r   r-   r6   r-   r8   r-   r   r-   r9   r-   r:   r-   r;   r-   r   rC   r   r   N)r   r2   r   r2   r   r5   )r   r2   r   r2   r   r-   r   r-   r   r5   r   r   r   r   )r   r2   r   r-   r   r-   r   r2   r   r5   r   r   r   r   r   r5   )
r  r-   r  r-   r  r-   r  r   r   r   )r%   r&   r'   rT   r   r   r   r   r   r   r   r   r  rC  rH  __classcell__r*   r*   r   r+   r@   c   s4    v?
,7
,  b
0  
L
   r@   c                   @  *   e Zd ZdddZdddZdddZdS )rv   rA   r   attn_backendr   c                 C  s   |j jt  | _|j t | _|j j| _|j| _|j	| _
|j| _|| _|j| _|j| _|j| _|jj| _| j| _d | _d| _d| _d S Nr   )rV   ra   r
   num_qo_headsrh   num_kv_headsrc   rj   	data_typerI   q_data_typesliding_window_sizerM  r.   r1   r0   rk   rl   update_single_wrapperr   r/   r3   r4   r   rA   rM  r*   r*   r+   rT   {  s$   


z#AiterIndicesUpdaterPrefill.__init__r   r-   r   r   r2   r   r   r5   r   r   c                 C     t  rJ  NotImplementedError)r   r   r   r   r   r   r   r*   r*   r+   r     s   
z!AiterIndicesUpdaterPrefill.updatec              	   C  s"  d }| j }| j}	|}
|}t|}|d u rtj|
dd|d|d < |d |d  }tj|d tj|jd}t|f | j	||
|||| j	j
d  |d }|d ||d < t|
 | _|| }t| | _tj|dd|	d|d < |	d |d  }	d }n|||
|| j	\}}}	}|| _d S )Nr   r   rG   r?   rH   rF   )r.   r0   r   rZ   r   r   rt   rJ   r	   rl   rf   r   r   r4   r3   r   r/   )r   r   r   r   r   r   r   kv_start_idxr.   r0   paged_kernel_lenspaged_kernel_lens_sumr   r/   	token_numextend_lensr   r*   r*   r+   rT    sN   




	z0AiterIndicesUpdaterPrefill.update_single_wrapperNrA   r   rM  r   )r   r-   r   r-   r   r2   r   r-   r   r5   r   r   r%   r&   r'   rT   r   rT  r*   r*   r*   r+   rv   z  s    

rv   c                   @  rL  )rx   rA   r   rM  r   c                 C  s@   || _ |jj| _| j| _d | _d | _d | _d | _d| _	d| _
d S rN  )rM  rk   rl   rT  r   r.   r/   r0   r1   r3   r4   rU  r*   r*   r+   rT     s   

z&AiterMlaIndicesUpdaterPrefill.__init__r   r-   r   r   r2   r]  r3   r4   r   r   c                 C  rV  rJ  rW  )r   r   r   r   r]  r3   r4   r   r*   r*   r+   r     s   z$AiterMlaIndicesUpdaterPrefill.updatec              
   C  s   t |}| jj}	|d u rWtj|dd|	d|d < |	d |d  }	tj|tj|jd}
t|f | j	|||	d |
| j	
d | jj}tj|dd|d|d < |d |d  }n||||| j	\}
}	}}|	| _|
| _|| _|| _|| _d S )Nr   r   rG   rH   )r   rM  r.   rZ   r   r   rt   rJ   r	   rl   r   r0   r   r/   r3   r4   )r   r   r   r   r]  r3   r4   r   r   r.   r/   r0   r   r*   r*   r+   rT     sD   



	
z3AiterMlaIndicesUpdaterPrefill.update_single_wrapperNr^  )r   r-   r   r-   r   r2   r]  r-   r3   r2   r4   r2   r   r   r_  r*   r*   r*   r+   rx     s    

rx   c                   @  sL   e Zd ZdZdddZdddZdddZdddZdddZd ddZ	dS )!AiterMultiStepDraftBackendzk
    Wrap multiple triton attention backends as one for multiple consecutive
    draft decoding steps.
    rA   r   topkr2   r`   c                 C  s   ddl m} || _|| _|| _|jj| j }tj| j|d ftj|j	d| _
g | _t| jd D ]}| jt|d| j
| d q0| jd j| _|jjt  | _|j	| _	|jjjd | _|jj| _d S )Nr   ) generate_draft_decode_kv_indicesrG   rH   T)rB   rD   )!sglang.srt.speculative.spec_utilsrb  ra  r`   rk   rr   rZ   rs   rt   rJ   r.   attn_backendsrangeappendr@   rq   rV   ra   r
   rb   rl   rf   pool_lenrX   rY   )r   rA   ra  r`   rb  r   ir*   r*   r+   rT   ;  s6   z#AiterMultiStepDraftBackend.__init__r   r   kv_indices_bufferr-   call_fnc                 C  s   |j }| j| }|j}| j| j|| jf |j|jj|j|| j	|j
| j|jd | j	jd t|t| jt|| j t| jd D ]'}| j	|d |d f |j_	|| d || j ||d    |j_||| qCd S r  )r   ra  r   rb  r`   r   rk   rl   r   r.   	positionsrg  rf   tritonnext_power_of_2rY   re  r   r/   )r   r   ri  rj  num_seqsr   r   rh  r*   r*   r+   common_templatea  s6   


z*AiterMultiStepDraftBackend.common_templatec                   sF   t j j|j j  j ft j jd} fdd} ||| d S )NrH   c                   s4   |j j |j _|j j |j _ j|  | d S rJ  )r   r.   cloner/   rd  r   rh  r   r  r*   r+   rj    s
   

zAAiterMultiStepDraftBackend.init_forward_metadata.<locals>.call_fn)	rZ   r   r`   r   ra  rq   rt   rJ   ro  )r   r   r/   rj  r*   r  r+   r     s   		z0AiterMultiStepDraftBackend.init_forward_metadatar   r   c                 C  sV   t j| j|| j ft j| jd| _t| jd D ]}| j| j	||| j| d qd S )NrH   rG   )r   )
rZ   rs   r`   rq   rt   rJ   r   re  rd  r   )r   r   r   rh  r*   r*   r+   r     s   
z0AiterMultiStepDraftBackend.init_cuda_graph_statec                   s     fdd}  | j| d S )Nc              	     s4    j |  j|j|j j |j|jd tj|jd d S )N)r   r   r   )	rd  r   r   ra  r   r   r   DECODEr   rq  r  r*   r+   rj    s   


zTAiterMultiStepDraftBackend.init_forward_metadata_capture_cuda_graph.<locals>.call_fnro  r   )r   r   rj  r*   r  r+   r     s   zCAiterMultiStepDraftBackend.init_forward_metadata_capture_cuda_graphr   c                   s"    fdd} |j| d S )Nc              
     s,   j |  j |j|jdd tj|jd d d S )NrF   )r   r   r   r   r   )rd  r   r   r   r   rr  r   rq  r   r   r*   r+   rj    s   

zSAiterMultiStepDraftBackend.init_forward_metadata_replay_cuda_graph.<locals>.call_fnrs  )r   r   r   rj  r*   rt  r+   r     s   zBAiterMultiStepDraftBackend.init_forward_metadata_replay_cuda_graphN)rA   r   ra  r2   r`   r2   )r   r   ri  r-   rj  r2   rI  )r   r2   r   r2   )r   r   r   r2   )
r%   r&   r'   __doc__rT   ro  r   r   r   r   r*   r*   r*   r+   r`  5  s    

&
 

r`  )C
__future__r   loggingdataclassesr   enumr   r   typingr   r   rZ   rl  -sglang.srt.layers.attention.base_attn_backendr   !sglang.srt.layers.attention.utilsr	   sglang.srt.layers.dp_attentionr
   r   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.srt.utilsr   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   aiterr   r   r   r   r   r   r   r   r   	aiter.mlar   r   ImportErrorprintsglang.srt.configs.model_configr   r   )sglang.srt.layers.quantization.fp8_kernelr   r    	getLoggerr%   loggerr   r   r   r   r$   r,   global_workspace_bufferrz   r@   rv   rx   r`  r*   r*   r*   r+   <module>   sp    ,

	            #iR