o
    پi0                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZ G d	d
 d
ZG dd dZe ej  dG dd deZedkr^e   dS dS )    N)AttentionArch)FlashAttentionBackend)TorchNativeAttnBackend)RadixAttention)MLATokenToKVPool)ForwardBatchForwardMode)CustomTestCasec                   @      e Zd Zdd ZdS )MockModelRunnerc              
   C   s   t j}d| _tj| _d| _d}tdd||ddd| _d | _	tddtjd ddd	| _
| j
j| _d
}tdd|tj||tj| jdd| _d| _|| }t|| j| j||d| jdd| _d S )NcudaFi   ModelConfig )context_lenattention_archis_encoder_decoderis_local_attention_model
ServerArgsr   )kv_cache_dtypespeculative_eagle_topkspeculative_num_draft_tokensenable_deterministic_inference   	TokenPooldtypedevice)sizereq_to_token   )r   	page_sizer   kv_lora_rankqk_rope_head_dim	layer_numr   enable_memory_saver)r   MLAr   torchfloat16r   is_hybrid_swatypemodel_configsliding_window_sizeserver_argsr   zerosint32req_to_token_poolr    r   token_to_kv_pool)selfr!   r"   r   r   
batch_sizemax_total_num_tokensr   r   d/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/attention/test_flashattn_mla_backend.py__init__   s^   


zMockModelRunner.__init__N__name__
__module____qualname__r5   r   r   r   r4   r          r   c                   @   r
   )MockReqToTokenPoolc                 C   s(   t j|| |d||t j| _d S )Nr   )r&   arangereshapetor.   r   )r1   r2   seq_lenr   r   r   r4   r5   N   s   zMockReqToTokenPool.__init__Nr6   r   r   r   r4   r;   M   r:   r;   zTest requires CUDAc                   @   sh   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdddZdd Z	dddZ
dd Zdd Zdd ZdS )TestFlashAttentionMLABackendc                 C   s   t j rt j }|d dk r| d|d  d|d   d| _d| _d| _d| _t j	| _
d	| _d
| _d| _| j| j | _| jd | _|   t| j| _t| j| _d| _d S )Nr   	   zRMLA requires Hopper GPU (compute capability >= 9.0), but found compute capability .r      ih  r   i      @   g      )r&   r   is_availableget_device_capabilityskipTestr2   r@   	num_headsr   r'   r   r!   q_lora_rankr"   qk_head_dimscaling_init_model_runnerr   model_runnerbackendr   ref_backendnum_local_heads)r1   compute_capabilityr   r   r4   setUpX   s0   


z"TestFlashAttentionMLABackend.setUpc                 C   s   t | j| jd| _d S )N)r!   r"   )r   r!   r"   rO   r1   r   r   r4   rN   t   s   z/TestFlashAttentionMLABackend._init_model_runnerc              	   C   s,   t | j| j| j | jdd| jdd| _| jS )z#Create attention layer for testing.r   r   attn_mqa)rJ   head_dimrM   num_kv_headslayer_id
v_head_dimprefix)r   rR   r!   r"   rM   rV   rU   r   r   r4   _create_attention_layerz   s   
	z4TestFlashAttentionMLABackend._create_attention_layerc           	      C   s>   |t jkr| j|||||}n
| j|||||}||S )z0Run reference forward pass using native backend.)r   EXTENDrQ   forward_extendforward_decodeview)	r1   modeqkvlayerforward_batchexpected_shapeoutputr   r   r4   _run_reference_forward   s   

z3TestFlashAttentionMLABackend._run_reference_forwardc                 C   s^   |  |j|d| d|j  |  |j| j |  |jjd |  t|  dd dS )z.Verify output tensor shape, dtype, and values.zExpected shape z, got r   r   zOutput contains NaN valuesN)	assertEqualshaper   r   r)   r&   isnansumitem)r1   rh   rg   r   r   r4   _verify_output   s   z+TestFlashAttentionMLABackend._verify_outputNr   c           	      C   s  |p| j }|tjkr|| }|| j }|| j }t| jtjdd| j|f| jdtj||| jd| j| |tj| j| jdtj	|g| j | jdtj	|g| j ddtj	|g| j | jdtj	|g| j ddtj	|g| j | jdtj	|g| j dd| j
d}nP|}| j | }| j| j  }| j| }t| jtjdd| j|f| jdtj||| jd| j| |tj| j| jdtj	|g| j | jdtj	|g| j dd| j
d	}| jj|_| jj|_|S )z=Create a forward batch for testing based on mode and lengths.r   d   r<   cpu)r2   	input_idsout_cache_locseq_lens_sumforward_modereq_pool_indicesseq_lensseq_lens_cpuextend_prefix_lensextend_prefix_lens_cpuextend_seq_lensextend_seq_lens_cpuattn_backend)	r2   rr   rs   rt   ru   rv   rw   rx   r}   )r@   r   r]   r2   r   r&   randintr   r=   tensorrP   rO   r/   r0   )	r1   ra   q_len
prefix_len	total_lenout_cache_startout_cache_endrf   
decode_lenr   r   r4   _create_forward_batch   st   







z2TestFlashAttentionMLABackend._create_forward_batchc                 C   st   |dkrdS t j| j| d| j| j| jd}t j| j| d| j| j| jd}|j|t j	| j| | jd|| dS )z#Set up KV cache with prefix tokens.r   Nr   r   r<   )
r&   onesr2   r!   r   r   r"   r0   set_mla_kv_bufferr=   )r1   rf   re   	cache_lencache_k_nopecache_k_roper   r   r4   _setup_kv_cache   s,   	z,TestFlashAttentionMLABackend._setup_kv_cachec                 C   s2  |   }| |||}| j| | j| jf}| j| | jf}tj|| j| jd}tj|| j| jd}	|	ddd| j	f }
|	dd| j	df }|

d}|
d}tjd| j| jd}| ||| | j| | j| | j| j	 f}|tjkr| jj||||||d}n| jj||||||d}| || |S )a'  
            Run an attention test with the specified parameters.
        Args:
            mode: ForwardMode.EXTEND or ForwardMode.DECODE
            q_len: Length of the query sequence. For decode mode, q_len is 1.
            prefix_len: Length of the prefix sequence for extend mode
        r   Nr   )k_rope)r\   r   r2   rJ   rL   r&   randnr   r   r!   	unsqueezer   rP   init_forward_metadatar   r]   r^   r_   ro   )r1   ra   r   r   re   rf   q_shapekv_shaperb   kv_compressedk_noper   rc   rd   rg   rh   r   r   r4   _run_attention_test  s2   



z0TestFlashAttentionMLABackend._run_attention_testc                 C   s   | j tj| jd dS )z#Test the standard extend operation.r   N)r   r   r]   r@   rU   r   r   r4   test_forward_extend9  s   z0TestFlashAttentionMLABackend.test_forward_extendc                 C   s   | j tjdd dS )z-Test the decode operation with cached tokens.r   r   N)r   r   DECODErU   r   r   r4   test_forward_decode=  s   z0TestFlashAttentionMLABackend.test_forward_decodec                 C   s*   | j d }| j | }| jtj||d dS )z)Test extending from cached prefix tokens.rD   )r   r   N)r@   r   r   r]   )r1   r   
extend_lenr   r   r4   test_forward_extend_with_prefixA  s
   


z<TestFlashAttentionMLABackend.test_forward_extend_with_prefix)Nr   )r   )r7   r8   r9   rT   rN   r\   ri   ro   r   r   r   r   r   r   r   r   r   r4   rA   V   s    

H
5rA   __main__)unittestr&   sglang.srt.configs.model_configr   2sglang.srt.layers.attention.flashattention_backendr   0sglang.srt.layers.attention.torch_native_backendr   !sglang.srt.layers.radix_attentionr    sglang.srt.mem_cache.memory_poolr   ,sglang.srt.model_executor.forward_batch_infor   r   sglang.test.test_utilsr	   r   r;   skipIfr   rG   rA   r7   mainr   r   r   r4   <module>   s"    ?	 t