o
    پi#                     @  st   d dl mZ d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 er0d dlmZ d dlmZ G d	d
 d
eZdS )    )annotations)TYPE_CHECKINGN)AttentionBackend)ForwardBatch)get_global_server_args)RadixAttention)ModelRunnerc                      sD   e Zd Zd fddZdddZ		ddddZ		ddddZ  ZS )DoubleSparseAttnBackendmodel_runnerr   c                   s   ddl m}m}m} t   || _|| _|| _|jj	| _
|jj| j
 | _|jj| _|j| _|jj| _d | _d | _d | _d| _t jrHtj| _ntj| _d | _d S )Nr   )extend_attention_fwdflash_decode_attention_fwd!flash_decode_sparse_attention_fwd   )@sglang.srt.layers.attention.triton_ops.double_sparsity_attentionr   r   r   super__init__decode_attention_fwddecode_sparse_attention_fwdmodel_confignum_attention_headsnum_headhidden_sizehead_dimserver_argsds_heavy_token_numheavy_token_numsorted_channelsds_sparse_decode_thresholdsparse_decode_thresholdatt_out_approxmid_outmid_o_logexpsum	BLOCK_SEQr   triton_attention_reduce_in_fp32torchfloat32reduce_dtypefloat16forward_metadata)selfr
   r   r   r   	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/double_sparsity_backend.pyr      s&   




z DoubleSparseAttnBackend.__init__forward_batchr   c                 C  sb  |j  rtj|jtjd}tj|jdd dd|dd< t|j }tj	| j
|f| jdd}t|j }t|j }d}|jj|j }|jjd }	tj	| j
|	|g| jdd}
| j| j d | j }tj	|	| j
|| jgtjdd}tj	|	| j
|gtjdd}|
| _|| _|| _nd } } }}|j}t|j|  }d}||||||f| _dS )	z6Init auxiliary variables for triton attention backend.)dtypeNr   )dim   cuda)r/   device)forward_mode	is_decoder$   
zeros_likeseq_lensint32cumsumsumitememptyr   r&   maxminreq_to_token_poolreq_to_tokenreq_pool_indicesshaper   r"   r   r%   r   r    r!   extend_prefix_lensr(   )r)   r.   	start_loctotal_num_tokensattn_logitsmax_seq_lenmin_seq_lenmax_extend_lends_req_to_tokenbszr   block_seq_numr    r!   prefix_lensr,   r,   r-   init_forward_metadata4   s\   
 

z-DoubleSparseAttnBackend.init_forward_metadataTlayerr   c                 C  s   |j |jkr||jd |j|j f}nt|}t|d| j|j	 
d|jd dd}|r=|j||j||| | j\}	}
}}}}| |d|j|j | | |d|j|j|j|j	|j|j	|jj|j|j|j|j||j|j |S )Nr      r0   )qk_head_dim
v_head_dim	new_emptyrC   tp_q_head_numr$   
empty_likegatherr   layer_id	unsqueezeexpandtoken_to_kv_poolset_kv_bufferout_cache_locr(   r   view
contiguousget_key_bufferget_value_bufferr@   rA   rB   r8   extend_seq_lensextend_start_locscaling	logit_cap)r)   qkvrP   r.   save_kv_cacheok_labelrE   rG   rH   rI   rJ   rK   r,   r,   r-   forward_extendq   sN   


z&DoubleSparseAttnBackend.forward_extendc                 C  s  | d|j|j }|j|jkr||jd |j|j f}nt|}| j\}}	}
}}}t	|d| j
|j d|jd dd}|rP|j||j||| || jk sZ|
| jk r| |d|j|j|j|j|j|j|d|j|j|jj|j||j|	|
|j|j |S t	|d|j|jd| j
|j d|jd dd}| |d|j|j|j|j|j|j|d|j|j||j|j||j|
|j|j| j| j| j | j!| j" |S )Nr0   r   rQ   )#reshaperU   rR   rS   rT   rC   r$   rV   r(   rW   r   rX   rY   rZ   r[   r\   r]   r   r   r   r^   r`   ra   r@   rA   rB   r8   rd   re   r   get_label_bufferr   r    r!   r"   )r)   rf   rg   rh   rP   r.   ri   rj   rE   rG   rH   rI   rJ   rK   rk   q_labelr,   r,   r-   forward_decode   s   




*
z&DoubleSparseAttnBackend.forward_decode)r
   r   )r.   r   )T)rP   r   r.   r   )__name__
__module____qualname__r   rO   rl   rp   __classcell__r,   r,   r*   r-   r	      s    
#D=r	   )
__future__r   typingr   r$   -sglang.srt.layers.attention.base_attn_backendr   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.server_argsr   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr   r	   r,   r,   r,   r-   <module>   s    