o
    پiM                     @   sf  d dl Z d dlZd dlZd dlZzd dlZW n ey!   dZY nw d dlmZ	 ddddddej
ded	ed
ededefddZejdg dejdg dejdejejejgdd Zejdg dejdg dejdejejejgejd	ddgejd
d dgejdddgdd Zdd Zedkreeg dS dS )    N)timestep_embeddingF   '  flip_sin_to_cosdownscale_freq_shiftscale
max_period	timestepsdimr   r   r   r	   c          	      C   s  t | jdksJ d| tj} |d }ttj|tj| jd tjd|tj| jd }|||  }t	|}| d d d f 
 |d d d f  }|| }tjt|t|gdd}|rytj|d d |d f |d d d |f gdd}|d dkrtjj|d	}|S )
Nr   zTimesteps should be a 1d-array   )dtypedevicer   )startendr   r   )r   )r   r   r   r   )lenshapetotorchfloat32logtensorr   arangeexpfloatcatsincosnn
functionalpad)	r
   r   r   r   r   r	   half_dimexponentemb r%   c/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/tests/test_timestep_embedding.py get_timestep_embedding_reference   s(   	
$2r'   
batch_size)r   r                        i +   @  )    r*   r+   r,   r-   r.   r/       r   c                 C   sV   d}t jdd| f|d|}t||ddd}t||ddd}t jj||ddd d S )	Ncudar     lowhighsizer   T)r   r   MbP?atolrtolr   randintr   r'   timestep_embedding_cudatestingassert_close)r(   r   r   r   ttorch_outputcuda_outputr%   r%   r&   -test_timestep_embedding_correctness_with_sgld2   s   rE   )	r   r   r)   r*   r+   r,   r-   r.   r0   )r1   r+   r,   r-   r2   Tg{Gz?c           
      C   s^   d}t jdd| f|d|}t|||||dd}t|||||dd}	t jj||	ddd d S )	Nr3   r   r4   r5   r   r   r9   r:   r=   )
r(   r   r   r   r   r   r   rB   rC   rD   r%   r%   r&   2test_timestep_embedding_correctness_with_diffusersC   s&   	rF   c               
   C   s  t jddkrtd td u rtd g d} g d}dtfdd	}d
}g }g }| D ]7}|D ]2}tjdt	d|||d
tj}|t||}	|t||}
|	|
 }||||	|
|d || q1q-td ttj|dddd tdt|d d S )N SGLANG_RUN_JIT_KERNEL_PERF_TESTS1zPerf test disabled by defaultz/Optional dependency 'tabulate' is not installed)	r   r   r)   ?   r+   r,   ie     r-   )r1   @   r*   r+   r,   rJ   r.   r/   	kernel_fnc           	      _   s   d}d}t jjdd}t jjdd}t|D ]	}| |i |}qt j  |  t|D ]	}| |i |}q-|  |  ||| S )N      T)enable_timing)r   r3   Eventrangesynchronizerecordelapsed_time)	rL   argskwargswarmup_timesrepeat_timesr   r   _	output_fnr%   r%   r&   perf_kernel_fnj   s   
z4test_timestep_embedding_perf.<locals>.perf_kernel_fnr3   r   i )stepsr   )z
Batch Size	DimensionzTorch Time (ms)zCUDA Time (ms)zSpeedup (CUDA)z,=== Timestep Embedding Benchmark Results ===keys
fancy_grid).0fr`   .6fra   z.5f)headerstablefmtfloatfmtzAverage Speedup(cuda): z.4f)osenvirongetpytestskiptabulatecallabler   linspacemaxr   r   r'   r?   appendprintnpmean)	NUM_BATCHNUM_DIMr[   r   resultscuda_speedupsBr   rB   
time_torch	time_cudaspeedup_cudar%   r%   r&   test_timestep_embedding_perfa   sL   

	rz   __main__)re   numpyrp   rh   r   rj   	Exception$sglang.jit_kernel.timestep_embeddingr   r?   Tensorintboolr   r'   markparametrizefloat16bfloat16r   rE   rF   rz   __name__main__file__r%   r%   r%   r&   <module>   sX    	
!>