o
    پiY                     @  s   d dl mZ d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZmZ er\d d
lmZ d dlmZ d dlmZ ee Z!e	j"dddZ#eG dd dZ$G dd deZ%dS )    )annotationsN)	dataclass)TYPE_CHECKINGOptional)AttentionBackend)#create_flashinfer_kv_indices_triton)get_attention_tp_size)ForwardBatchForwardMode)get_bool_env_varget_device_core_count)RadixAttention)ModelRunner)	SpecInputMAX_NUM_SEQtl.constexprc	                 C  sv  t d|}	|	|k }
t j||	 |
dd}t |}t j||	 |
|d}t |}|d |d k r2|}t t |||}t ||}t |t jd }t |t 	t 
|d t j}d|| }}|dkrm|| | }nt ||}|| t || }t t |||}t ||}t 	t ||t ||}|	| }||| k }td|D ]}t j| | | ||d	 qd S )
Nr   )maskother   
   g      P@g      ?      )r   )tlarangeloadmaxminminimumcdivcastfloat32maximumlog2int32rangestore)num_kv_splits_ptrseq_lens_ptrnum_seq	num_groupnum_headnum_kv_headmax_kv_splitsdevice_core_countr   offs_seqmask_seqseq_lensmax_seq_lenmin_seq_lenmax_kv_splits_1kv_chunk_size_1ext_seq_lenext_device_core_countblock_hnum_kv_group
token_gridmax_kv_splits_2kv_chunk_size_2num_kv_splits
offs_token
mask_tokeni r@   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/wave_backend.pyget_num_kv_splits_triton   s>   

rB   c                   @  sV   e Zd ZU ded< ded< ded< ded< ded< ded< ded	< ded
< ded< dS )ForwardMetadatatorch.Tensorattn_logitsattn_lseintmax_extend_lenr<   	kv_indptr
kv_indices	qo_indptrcustom_maskmask_indptrN)__name__
__module____qualname____annotations__r@   r@   r@   rA   rC   M   s   
 rC   c                      s~   e Zd Z		d4d5 fd	d
Zd6ddZd7ddZ	d8d9ddZd:d"d#Zd;d&d'Zd(d) Z		*d<d=d0d1Z
	*d<d=d2d3Z  ZS )>WaveAttnBackendFNmodel_runnerr   skip_prefillboolkv_indptr_bufOptional[torch.Tensor]c           
        sp  ddl m} ddlm} t   dd lm  m  m	} |j
}|d|j  }td|  ||_
|| _|| _|| _|jj}	|d u rStj|	d ftj|jd| _n|| _|jj| _| jsztj|	d ftj|jd| _tj|	d ftj|jd| _|jj| _|jjt   | _!|j"t  | _#t$dd	| _%|jj&| _'|j()dj*d
 | _+d | _,|jj-| _.|j| _t/|j0| _1d S )Nr   )decode_attention_fwd)extend_attention_waveworker_zSetting Wave cache dir: r   dtypedevice*SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITSfalse)25sglang.srt.layers.attention.wave_ops.decode_attentionrX   5sglang.srt.layers.attention.wave_ops.extend_attentionrY   super__init__wave_lang.kernel.wave.cachekernelwavecacheCACHE_BASE_DIRtp_rankloggerinfoextend_attention_fwdrT   req_to_token_poolsizetorchzerosr#   r]   rI   req_to_tokenrK   int64rM   server_argsspeculative_num_draft_tokensnum_draft_tokensmodel_confignum_attention_headsr   r*   get_num_kv_headsr+   r   static_kv_splitstriton_attention_num_kv_splitsr,   token_to_kv_poolget_value_buffershape
v_head_dimforward_metadatacontext_lenmax_context_lenr   gpu_idr-   )
selfrS   rT   rV   rX   rY   rh   base_cache_dirnew_dirmax_bs	__class__r@   rA   rd   [   sP   





zWaveAttnBackend.__init__r<   rD   r0   c                 C  s   |j d |j d }}|| }|| |ks J d| d| d| js(| jdkr0|| j d S |dk r7d}nt|}td ||||| j| j	| j| j|d	 d S )Nr   znum_seq(z), num_token(z), something goes wrong!   )r   )r   )
r~   rz   r-   fill_r,   tritonnext_power_of_2rB   r*   r+   )r   r<   r0   	num_tokenr(   r)   SCHEDULE_SEQr@   r@   rA   get_num_kv_splits   s,   

z!WaveAttnBackend.get_num_kv_splitsforward_batchr	   c              
   C  sl  |j }| j}|j}|j r|du rHtj|jdd|d|d < |d|d  }tj|j	tj
| jd}t|f | j|j|j|d|| jd n|j|j}}|jd d }ddlm} ||| j| j| j\}}tj|tj| jd}	tj|tj| jd}
tj|ftj
| jd}| ||j d}d}d}d}n|j rt|j}tjdd| | j | jtj
| jd}tj|jdd|d|d < |d|d  }tj|d tj
| jd}t|f | j|j|j|d|| jd |j}| j|j| j  }| j}tj|d| dd|d|d < |d|d  }| j}d}d}	d}
n|j rC| |j|jd| j\}}}}d}t!|j"# }d}d}	d}
ndtj|j$dd|d|d < |d|d  }tj|j$% # tj
| jd}t|f | j|j|j$|d|| jd | j&}tj|j'dd|d|d < |d|d  }d}d}d}	d}
t!|j'# }d}t(|	|
|||||||	| _)dS )	z4Init auxiliary variables for wave attention backend.Nr   dimr   r[   +decode_attention_intermediate_arrays_shapesstepr\   r]   r`   )*
batch_sizerI   	spec_infoforward_modeis_decode_or_idlerp   cumsumr0   emptyseq_lens_sumr#   r]   r   rr   req_pool_indicesstriderJ   r~   ra   r   r   r*   r,   r    r   is_target_verifylenr   rv   rL   rM   is_draft_extendgenerate_attn_arg_prefillr   accept_lengthitemextend_prefix_lenssumrK   extend_seq_lensrC   r   )r   r   bsrI   r   rJ   r   attn_logits_shapeattn_logits_max_shaperE   rF   r<   rK   rL   rM   rH   seq_mask_lenr@   r@   rA   init_forward_metadata   s   






"



z%WaveAttnBackend.init_forward_metadatar   rG   max_num_tokenskv_indices_bufc                 C  s   ddl m} ||| j| j| j\}}tj|tj| jd| _	tj|tj| jd| _
tj|f| jtj| jd| _|d u rItj|| j tj| jd| _n|| _| js_tj|| j tj| jd| _d S d S )Nr   r   r[   )ra   r   r   r*   r,   rp   rq   r    r]   cuda_graph_attn_logitscuda_graph_attn_lsefullr#   cuda_graph_num_kv_splitsr   cuda_graph_kv_indicesrT   uint8cuda_graph_custom_mask)r   r   r   r   r   r   r   r@   r@   rA   init_cuda_graph_stateX  s@   
z%WaveAttnBackend.init_cuda_graph_stater   
num_tokensr   encoder_lensr   r
   r   Optional[SpecInput]c              
   C  s  |d u sJ d|  rW|d u r>| j}tj|dd|d|d < |d |d  }| j}	t|f | j|||d |	| jd n|j|j}}	| j	}
| j
}d }| j}d }d }d }n| r| jd |d  }tjdd| | j | jtj| jd|d |d < | jd |d  }tj|dd|d|d < | j}	t|f | j|||d |	| jd | j}| j|| j  }| jd |d  }tj|dd|d|d < | j}d }d }
d }ntd|dt|
|||||	|||	| _d S )NzNot supportedr   r   r   r   #Invalid forward mode: forward_mode=z for CUDA Graph capture.)r   rI   rp   r   r   r   rr   r   rJ   r   r   r   r   rK   r   rv   r#   r]   r   rM   
ValueErrorrC   r   )r   r   r   r   r0   r   r   r   rI   rJ   rE   rF   rH   r<   rK   rL   rM   r   r@   r@   rA   (init_forward_metadata_capture_cuda_graph  s   






z8WaveAttnBackend.init_forward_metadata_capture_cuda_graphr   seq_lens_cpuc	              
   C  s  |  ru| j}	| j}
| j}|d u rGtj|d | dd|	d|d < |	d |d  }	t|f | j|d | |d | |	d |
| jd |}n|j|	d |jj	d < |j
|
d |j
j	d < |jj	d d }| |d | |d |  d S | rt|}| jd |d  }tjdd| | j | jtj| jd|d |d < | jd |d  }	tj|dd|	d|d < | j}
t|f | j|||	d |
| jd | j}|j|d |jj	d < | j|| j  }| jd |d  }tj|dd|d|d < d S td|d)Nr   r   r   r   r   z for CUDA Graph replay.)r   rI   r   r   rp   r   r   rr   r   r~   rJ   r   r   r   rK   r   rv   r#   r]   r   rL   rM   r   )r   r   r   r0   r   r   r   r   r   rI   rJ   r<   r   rK   rL   r   rM   r@   r@   rA   'init_forward_metadata_replay_cuda_graph  sf   "


	 
	
z7WaveAttnBackend.init_forward_metadata_replay_cuda_graphc                 C  s   dS )Nr   r@   )r   r@   r@   rA   !get_cuda_graph_seq_len_fill_value  s   z1WaveAttnBackend.get_cuda_graph_seq_len_fill_valueTqkvlayerr   c           
      C  s
  |j |jkr||jd |j|j f}nt|}|r&|j||j	|| | j
j}t|j}	|	|krEt|jdks=J ||jd< ||_| j|d|j|j | | |j|j|j|j| j
j| j
j| j
j| j
j| j
j| j
j|d|j|jd|j|jd |S )Nr   r   r`   T)	is_causallayer_scaling	logit_cap)qk_head_dimr   	new_emptyr~   tp_q_head_numrp   
empty_liker|   set_kv_bufferout_cache_locr   rH   r   r   r   r0   rm   view
contiguousget_key_bufferlayer_idr}   rK   rI   rJ   rL   rM   scalingr   )
r   r   r   r   r   r   save_kv_cacheorH   computed_max_ext_seq_lenr@   r@   rA   forward_extend  s>   



zWaveAttnBackend.forward_extendc                 C  s   | d|j|j }|j|jkr||jd |j|j f}nt|}|r0|j	||j
|| | |d|j|j|j|j|j|j|d|j|j| jj| jj| jj| jj| jj| j|j|j |S )Nr`   r   )reshaper   r   r   r   r~   rp   r   r|   r   r   rX   r   r   r   r}   r   rI   rJ   rE   rF   r<   r,   r   r   )r   r   r   r   r   r   r   r   r@   r@   rA   forward_decodeM  s.   

zWaveAttnBackend.forward_decode)FN)rS   r   rT   rU   rV   rW   )r<   rD   r0   rD   )r   r	   )N)r   rG   r   rG   r   rW   )r   rG   r   rG   r   rD   r0   rD   r   rW   r   r
   r   r   )r   rG   r   rD   r0   rD   r   rG   r   rW   r   r
   r   r   r   rW   )T)
r   rD   r   rD   r   rD   r   r   r   r	   )rN   rO   rP   rd   r   r   r   r   r   r   r   r   __classcell__r@   r@   r   rA   rR   Z   s     
G
! 
,
TD
5rR   )r   r   )&
__future__r   loggingdataclassesr   typingr   r   rp   r   triton.languagelanguager   -sglang.srt.layers.attention.base_attn_backendr   !sglang.srt.layers.attention.utilsr   sglang.srt.layers.dp_attentionr   ,sglang.srt.model_executor.forward_batch_infor	   r
   sglang.srt.utilsr   r   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr    sglang.srt.speculative.spec_infor   	getLoggerrN   rk   jitrB   rC   rR   r@   r@   r@   rA   <module>   s,    
3