o
    پi                     @  sh   d dl mZ d dlmZ d dlZd dlmZ d dlmZ er*d dl	m
Z
 d dlmZ G dd	 d	eZdS )
    )annotations)TYPE_CHECKINGN)AttentionBackend)ForwardBatch)RadixAttention)ModelRunnerc                      sT   e Zd Zd fddZdddZd	d
 Z	ddddZ	ddddZdd Z  Z	S )IntelAMXAttnBackendmodel_runnerr   c                   s   dd l }t   d | _|j| _|jj|j | _d}t	|j
dr(g |j
jd }|j
|jd | _tjj j| _tjj j| _d S )Nr   full_attention_layer_id_mapping)
sgl_kernelsuper__init__forward_metadatadevicemodel_confignum_attention_headstp_sizenum_headhasattrtoken_to_kv_poolr
   get_value_buffershape
v_head_dimtorchopsdecode_attention_cpudecode_attention_fwdextend_attention_cpuextend_attention_fwd)selfr	   r   layer_id	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/intel_amx_backend.pyr      s$   

zIntelAMXAttnBackend.__init__forward_batchr   c                 C  sX   |j }tj|| jd| jd ftj| jd}|j rd}nt	|j
 }||f| _dS )z%Init the metadata for a forward pass.      )dtyper   N)
batch_sizer   zerosr   r   float32r   forward_modeis_decode_or_idlemaxextend_seq_lensitemr   )r    r&   bsattn_logitsmax_extend_lenr$   r$   r%   init_forward_metadata(   s   

z)IntelAMXAttnBackend.init_forward_metadatac                 C     dS )Nr(   r$   r    r$   r$   r%   get_graph_seq_len_fill_value<      z0IntelAMXAttnBackend.get_graph_seq_len_fill_valueTlayerr   c           
      C  s   |j |jkr||jd |j|j f}nt|}|r&|j||j	|| | j
\}}	| |d|j|j |||d|j|j|j|j|j|j|jj|j|j|j|j|	|j|j |S )Nr   r   )qk_head_dimr   	new_emptyr   tp_q_head_numr   
empty_liker   set_kv_bufferout_cache_locr   r   viewget_key_bufferr!   r   req_to_token_poolreq_to_tokenreq_pool_indicesseq_lensr0   extend_start_locscaling	logit_cap)
r    qkvr:   r&   save_kv_cacheo_r4   r$   r$   r%   forward_extend?   s2   	


z"IntelAMXAttnBackend.forward_extendrJ   torch.TensorrK   rL   c           
      C  s   | j \}}|d|j|j }|j|jkr$||jd |j|j f}	nt|}	| 	|
d|j|j|j|j|j|j|	
d|j|j|||j||jj|j|j|j|j |	S )Nr   r   )r   reshaper=   r;   r   r<   r   r   r>   r   rA   r   rB   r!   r   r@   rC   rD   rE   rF   rH   rI   )
r    rJ   rK   rL   r:   r&   rM   r3   rO   rN   r$   r$   r%   forward_decodef   s*   
	
z"IntelAMXAttnBackend.forward_decodec                 C  r6   )NFr$   r7   r$   r$   r%   support_triton   r9   z"IntelAMXAttnBackend.support_triton)r	   r   )r&   r   )T)r:   r   r&   r   )
rJ   rQ   rK   rQ   rL   rQ   r:   r   r&   r   )
__name__
__module____qualname__r   r5   r8   rP   rS   rT   __classcell__r$   r$   r"   r%   r      s    

.$r   )
__future__r   typingr   r   -sglang.srt.layers.attention.base_attn_backendr   ,sglang.srt.model_executor.forward_batch_infor   !sglang.srt.layers.radix_attentionr   &sglang.srt.model_executor.model_runnerr   r   r$   r$   r$   r%   <module>   s    