o
    پi=(                     @   s\  d Z ddlZddlZddlZzddlm  mZ ddl	Z	ddl
mZ ddlmZ dZW n ey8   dZdZY nw z
ddlmZ dZW n eyN   dZY nw dd	 Zejje d
dejje ddejdddgdefddZejjdddejje d
dejje ddejdddgdefddZedkreedg dS dS )z@Tests for CuTe DSL fused sigmoid gating delta rule kernel (GDN).    N)from_dlpack)cutedsl_gdnTF&fused_sigmoid_gating_delta_rule_updatec
           
      C   s$   t | ||dd|||||||	dd dS )N      ?g      4@T)A_logadt_biassoftplus_betasoftplus_thresholdqkvbinitial_state_sourceinitial_state_indicesscaleuse_qk_l2norm_in_kernel
cu_seqlensr   )
r   r	   r   r   r   r   r   initial_stateindicesr    r   \/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/tests/test_cutedsl_gdn.pyrun_triton_kernel   s    r   zCuTe DSL not available)reasonzTriton kernel not availableB      c                 C   sN  t d d\}}}}}|d }t j|t jdd}t j|t jdd}t j| ||t jdd}	t j| ||t jdd}
t j| |||t jdd}t j| |||t jdd}t j| |||t jdd}t j| t jdd}t j| |||t jdd}| d	 }t
j||||||	|
| ||d
}t j  t j| |||t jdd}| d	 }t
j||||||	|
|||d
}t||||||	|
|||
}| |   }|  }|  }|dk   d	 }t | pt | }| d
k rdnd}td|  d| d|dd|dd|dd |rJ d|dk s%J d|dddS )z?Test precision of CuTe DSL GDN kernel against Triton reference.     r   r   r             cudadtypedevice)r   g?d   r!   
SmallBatch
LargeBatch
  B= (z): max_diff=z.2ez, mean_diff=z, fail_rate=.2f%zOutput contains NaN/Infr   z
Fail rate z% >= 1%N)torchmanual_seedrandnfloat32bfloat16arangeint32clonereshape
contiguousr   .cutedsl_fused_sigmoid_gating_delta_rule_updater#   synchronizer   floatabsmaxitemmeanisnananyisinfprint)r   THKVHVr   r   r	   r   r   r   r   r   r   state_cutedslstate_triton_out_cutedsl
out_tritonabs_diffmax_diff	mean_diff	fail_ratehas_nankernel_typer   r   r   test_cutedsl_gdn_precision0   sH   

( rT   z^Skip the performance test because the speedup ratio is highly unstable in the CI environment. r    c           )         s  t d d\}}}}}| }|d d}d\}}	t j|t jdd t j|t jddt j|t jddt j||||t jdd}
|
d t j	|d	 t jdd}t j	d	|||t jdd}g g g g g f\}}}}}g g g g g f\g g g g g f\t
D ]}t d|  t jd	|||t jdd}t jd	|||t jdd}t jd	|||t jdd}t j||t jdd}t j||t jdd}|| || || || || t|d
d t|d
d t|d
d t|d
d t|d
d |dd	  |dd	  |dd	  |d	  |d	  qt d
dtd
d	t|
d
d
td
dt|d
dt|d
dt j }t|jt|||||||dk |t j  t
D ]}t | | | | | 
}qt j  	
fdd} fdd}t j| |  W d   n	1 sw   Y  t j  |  t j  t j }t j }z=t j| |  W d   n	1 sw   Y  t jj||d |  W d   n	1 s/w   Y  t j  W n tyH   d }}Y nw t
|D ]:}|rW|  nt j| |  W d   n	1 slw   Y  t j  |r~|  n|  t j  qMg g }}t
|	D ]x}t jjddt jjdd} }!|   |r|  n|  |!  t j  || |! t jjddt jjdd} }!t j| |   |r|  n|  |!  W d   n	1 sw   Y  t j  || |! qt !| d }"t "| d }#t !| d }$t "| d }%|"|$ }&| dk r9dnd}'t#d|  d|' d|"dd|#dd|$dd|%dd|&dd | dk rad nd!}(|&|(ksvJ d"|&dd#|( d$|  dS )%z7Benchmark CuTe DSL GDN kernel against Triton reference.r   r   r"   T)
   r(   rU   r#   r$   r'   r    r   )assumed_alignr   r!   c                     sD   t D ]} 
|  |  |  |  |   	 qd S N)range)ri)A_log_ta_tensor_listb_tensor_listcompiledcu_t	dt_bias_th0_tidx_tk_tensor_listo_tq_tensor_list	run_itersstreamv_tensor_listr   r   run_cutedsl   s    z1test_cutedsl_gdn_performance.<locals>.run_cutedslc                     s@   t D ]} t |  |  
|  |  |  	
}qd S rW   )rX   r   )rY   rK   )r   a_tritonb_tritonr	   r   k_tritonq_tritonre   r   rJ   v_tritonr   r   
run_triton   s   z0test_cutedsl_gdn_performance.<locals>.run_tritonN)rf   )enable_timingi  r)   r*   r+   r,   z
): Triton=r-      ±u   μs, CuTeDSL=u   μs, speedup=xr   gffffff?zSpeedup zx < zx for B=)$r/   r0   r1   r2   r3   r4   r5   r7   r8   zerosrX   appendr   	transpose	unsqueezer#   Streamcuda_driverCUstreamcuda_streamr   _get_compiled_kernelr:   r   rf   	CUDAGraphgraph	ExceptionreplayEventrecordelapsed_timenpr?   stdrC   ))r   rD   rE   rF   rG   rH   N	is_varlenwarmupbench_itersrI   r   	o_cutedslq_listk_listv_lista_listb_listrY   q_ik_iv_ia_ib_itorch_streamrK   rh   rn   graph_tritongraph_cutedsltriton_timescutedsl_timesstartendtriton_mean
triton_stdcutedsl_meancutedsl_stdspeeduprS   min_speedupr   )r   rZ   r[   ri   r\   rj   r]   r^   r	   r_   r`   ra   r   rb   rk   rc   rd   rl   re   r   rJ   rf   rg   rm   r   test_cutedsl_gdn_performancee   s  
	








& 














8*r   __main__z-v)__doc__numpyr   pytestr/   cuda.bindings.driverbindingsdriverrw   cutlasscutlass.cute.runtimer   sglang.jit_kernelr   CUTEDSL_AVAILABLEImportError>sglang.srt.layers.attention.fla.fused_sigmoid_gating_recurrentr   TRITON_AVAILABLEr   markskipifparametrizeintrT   r   __name__main__file__r   r   r   r   <module>   sJ    2 ?