o
    پiG                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d d	lmZ G d
d dZe ej  dG dd deZG dd deZedkrge   dS dS )    N)AttentionArch)FlashAttentionBackend draft_decode_set_expand_metadata)TorchNativeAttnBackend)RadixAttention)MHATokenToKVPool)ForwardBatchForwardMode)
ServerArgs)CustomTestCasec                   @   s   e Zd Z			dddZdS )MockModelRunner         c              
   C   s   d| _ tj| _tj| _d| _d | _tj}d}d}t	dd|d|ddd | _
d | _t	dd|tj||tj| j d	d
 | _|| _|| }t||| j||d| j dd| _tdd| _d S )NcudaF   i   ModelConfig )context_lenis_multimodalattention_archis_encoder_decoderis_local_attention_model	TokenPooldtypedevice)sizereq_to_tokenr   )r   	page_sizer   head_numhead_dim	layer_numr   enable_memory_saverdummy)
model_path)r   torchfloat16r   kv_cache_dtypeis_hybrid_swaattention_chunk_sizer   MHAtypemodel_configsliding_window_sizezerosint32req_to_token_poolr   r   token_to_kv_poolr
   server_args)selfr   	num_headsr!   r   max_batch_sizemax_context_lenmax_total_num_tokensr   r   `/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/attention/test_flashattn_backend.py__init__   sX   zMockModelRunner.__init__N)r   r   r   )__name__
__module____qualname__r:   r   r   r   r9   r      s
    r   zTest requires CUDAc                   @   s   e Zd Zdd Zd"ddZdd Zdd	 Zd
d Zdd Zd#ddZ	d$ddZ
dd Zd%ddZdd Zdd Zdd Zdd Zd d! ZdS )&TestFlashAttentionBackendc                 C   s*   d| _ d| _d| _d| _d| _tj| _d S )Nr      r   r   )
batch_sizeseq_lenr5   r!   r   r&   r'   r   r4   r   r   r9   setUpQ   s   zTestFlashAttentionBackend.setUpr   c                 C   s<   t || j| jd| _t| j| _t| j| _| j| jj_	d S )N)r   r5   r!   )
r   r5   r!   model_runnerr   backendr   ref_backendr-   num_attention_heads)r4   r   r   r   r9   _init_model_runnerZ   s   z,TestFlashAttentionBackend._init_model_runnerc                 C   sn   t jd|t j| jdd d d f | t jd|t j| jdd d d f  | | _| j| jjjd |d |f< d S )Nr   r   )r&   aranger0   r   r   rD   r1   )r4   r@   rA   r   r   r   r9    _mock_write_to_req_to_token_poold   s     z:TestFlashAttentionBackend._mock_write_to_req_to_token_poolc                 C   s   t | j| jd| jddS )z#Create attention layer for testing.g      ?r   )r5   r!   scalingnum_kv_headslayer_id)r   r5   r!   rB   r   r   r9   _create_attention_layerq   s   z1TestFlashAttentionBackend._create_attention_layerc                 C   sH   || j | jf}tj|| j| jdtj|| j| jdtj|| j| jdfS )z#Create q, k, v tensors for testing.r   )r5   r!   r&   randnr   r   )r4   
tokens_lenshaper   r   r9   _create_qkv_tensors{   s
   z-TestFlashAttentionBackend._create_qkv_tensorsc           	      C   s>   |t jkr| j|||||}n
| j|||||}||S )z0Run reference forward pass using native backend.)r	   EXTENDrF   forward_extendforward_decodeview)	r4   modeqkvlayerforward_batchexpected_shapeoutputr   r   r9   _run_reference_forward   s   

z0TestFlashAttentionBackend._run_reference_forwardNc                 C   s   |  |j|d| d|j  |  |j| j |  |jjd |  t|  dd |durqtj	||ddd	sstj
||ddd	 }| rm| d }td
t|  td|t|   td|t|   tddS dS )z.Verify output tensor shape, dtype, and values.zExpected shape z, got r   r   zOutput contains NaN valuesNg?g        )atolrtolzFirst mismatch at index:zoutput:zoutput_ref:z@Attention output is not close to the torch native backend output)assertEqualrQ   r   r   r,   r&   isnansumitemallcloseiscloseanynonzeroprinttupletolistAssertionError)r4   r^   r]   
output_ref	diff_maskfirst_mismatch_idxr   r   r9   _verify_output   s2   z(TestFlashAttentionBackend._verify_outputr   c           
      C   s  | j |d |p
| j}|tjkr|| }|| j }|| j }t| jtjdd| j|f| jdtj	||| jd| j| |tj	| j| jdtj
|g| j | jdtj
|g| j ddtj
|g| j | jdtj
|g| j ddtj
|g| j | jdtj
|g| j dd| jd}nm|}	| j|	 }|tjkr|dkr| j| j | d | }||	|  }n| j| j }| j| }t| jtjdd| j|	f| jdtj
||g| jd| j| |tj	| j| jdtj
|g| j | jdtj
|g| j dd| jd	}| jj|_| | j|| | jj|_|S )	z=Create a forward batch for testing based on mode and lengths.)r   r   d   r   cpu)r@   	input_idsout_cache_locseq_lens_sumforward_modereq_pool_indicesseq_lensseq_lens_cpuextend_prefix_lensextend_prefix_lens_cpuextend_seq_lensextend_seq_lens_cpuattn_backendr   )	r@   ru   rv   rw   rx   ry   rz   r{   r   )rH   rA   r	   rS   r@   r   r&   randintr   rI   tensorrE   DECODErD   r1   rJ   r2   )
r4   rW   q_len
prefix_lenr   	total_lenout_cache_startout_cache_endr\   
decode_lenr   r   r9   _create_forward_batch   s   








z/TestFlashAttentionBackend._create_forward_batchc                 C   sx   t j| j| | j| j| j| jd}t j| j| | j| j| j| jdd }|j|t j	| j| | jd|||j
|j d S )Nr   r   rs   )r&   onesr@   r5   r!   r   r   r2   set_kv_bufferrI   k_scalev_scale)r4   r\   r[   	cache_lencache_kcache_vr   r   r9   _setup_kv_cache   s2   z)TestFlashAttentionBackend._setup_kv_cachec              	   C   s   |   }| ||||}| | j| \}}}	|tjkr(|dkr'| ||| n| ||| j | j	| |tjkrQ| j| | j
| j f}
| j|||	||}n| j| j
| j f}
| j|||	||}| ||||	|||
}| ||
| |S )aY  
            Run an attention test with the specified parameters.
        Args:
            mode: ForwardMode.EXTEND or ForwardMode.DECODE
            q_len: Length of the query sequence. For decode mode, q_len is 1.
            prefix_len: Length of the prefix sequence for extend mode
            page_size: Page size for the KV cache
        r   )rN   r   rR   r@   r	   rS   r   rA   rE   init_forward_metadatar5   r!   rT   rU   r_   rq   )r4   rW   r   r   r   r[   r\   rX   rY   rZ   r]   r^   rn   r   r   r9   _run_attention_test  s*   	


z-TestFlashAttentionBackend._run_attention_testc                 C   s   | j tj| jd dS )z#Test the standard extend operation.r   Nr   r	   rS   rA   rB   r   r   r9   test_forward_extendK     z-TestFlashAttentionBackend.test_forward_extendc                 C   s   | j tjdd dS )z-Test the decode operation with cached tokens.r   r   Nr   r	   r   rB   r   r   r9   test_forward_decodeO  s   z-TestFlashAttentionBackend.test_forward_decodec                 C   s*   | j d }| j | }| jtj||d dS )z)Test extending from cached prefix tokens.r   )r   r   N)rA   r   r	   rS   )r4   r   
extend_lenr   r   r9   test_forward_extend_with_prefixS  s
   


z9TestFlashAttentionBackend.test_forward_extend_with_prefixc                 C   s   | j tj| jdd dS )zGTest extending from cached prefix tokens with page size greater than 1.@   r   r   Nr   rB   r   r   r9   1test_forward_extend_with_page_size_greater_than_1[  s   zKTestFlashAttentionBackend.test_forward_extend_with_page_size_greater_than_1c                 C   s   | j tjddd dS )z4Test decode operation with page size greater than 1.r   r   r   Nr   rB   r   r   r9   1test_forward_decode_with_page_size_greater_than_1_  r   zKTestFlashAttentionBackend.test_forward_decode_with_page_size_greater_than_1)r   )N)Nr   r   )r   r   )r;   r<   r=   rC   rH   rJ   rN   rR   r_   rq   r   r   r   r   r   r   r   r   r   r   r   r9   r>   O   s     
	

	


S
.r>   c                   @   s    e Zd ZdZdd Zdd ZdS )&TestUpdateDraftDecodeSetExpandMetadataz
    All the test cases examples have 1 additional cache location than the decode length.
    This is to align with the current allocation logic. It does not affect the correctness.
    c              
   C   s  d\}}}t jddgddggt jdt jddgd	d
ggt jddft jg dg dgt jdt jg dg dgt jddfg}t jdgt jd}|D ]B\}}}t j|| t jd}	t j|t jd}
t|	|
|||||d t j|d |d gt jd}| t |	| | t |
| qJd S )N)r   r                   r            r   r   )            r   r   )#   $   %   &   '   (   )r   r   r   r   r   r   )r   	   
   r   r   r      cache_seqlens_int32
page_tablelast_page_lensdecode_length	cache_loctopkr   r&   r   r0   r/   
zeros_liker   
assertTrueequal)r4   bsr   r   casesr   r   expected_page_tabler   r   r   expected_cache_seqlensr   r   r9   %test_draft_decode_set_expand_metadataj  sb   
)
zLTestUpdateDraftDecodeSetExpandMetadata.test_draft_decode_set_expand_metadatac              	   C   s   d\}}}}t jg dg dg dg dg dg dgt jd}t j|| t jd}t jg d	t jd}t j|t jd}t|||||||d
 t jg dt jd}	t jg dg dg dg dg dg dgt jd}
| t ||	 | t ||
 dS )zY
        Ensure expand metadata works when batch size > 1 and last pages differ.
        )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )            )r   r   r   r   )r   r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   Nr   )r4   r   r   r   r   r   r   r   r   r   r   r   r   r9   8test_update_draft_decode_set_expand_metadata_multi_batch  sH   
z_TestUpdateDraftDecodeSetExpandMetadata.test_update_draft_decode_set_expand_metadata_multi_batchN)r;   r<   r=   __doc__r   r   r   r   r   r9   r   d  s    @r   __main__)unittestr&   sglang.srt.configs.model_configr   2sglang.srt.layers.attention.flashattention_backendr   r   0sglang.srt.layers.attention.torch_native_backendr   !sglang.srt.layers.radix_attentionr    sglang.srt.mem_cache.memory_poolr   ,sglang.srt.model_executor.forward_batch_infor   r	   &sglang.srt.model_executor.model_runnerr
   sglang.test.test_utilsr   r   skipIfr   is_availabler>   r   r;   mainr   r   r   r9   <module>   s&    =  v