o
    پi                  	   @   s   d dl Z d dlZd dlmZ d dlZd dlZd dlmZ	 de
de
fddZdejdejfd	d
ZdejfddZdejdejfddZdejde
fddZdejdejde
fddZd,de
de
fddZG dd  d ZG d!d" d"Z		#		#d-de
d$ed%ee d&efd'd(Zd)ejfd*d+ZdS ).    N)Optional
local_ranknum_local_ranksc                 C   s   t dd}tt dd}tt dd}tt dd}|d	k r&|dks,|d	ks,J tjd
d| d| || || |  d ttj td tj	
|  t t ttt|| fS )NMASTER_ADDRz	127.0.0.1MASTER_PORT8361
WORLD_SIZE   RANKr      ncclztcp://:)backendinit_method
world_sizerankcuda)osgetenvintdistinit_process_grouptorchset_default_dtypebfloat16set_default_devicer   
set_deviceget_rankget_world_size	new_grouplistrange)r   r   ipport	num_nodes	node_rank r&   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/test_deepep_utils.py	init_dist   s$   

r(   xyc                 C   sN   |   d |  d } }| |  ||   }d| |   | }d|  S )Nr	      )doublesumitem)r)   r*   denominatorsimr&   r&   r'   	calc_diff%   s   r1   c                 C   s   |   dkr| dd dksJ | j\}}| |dd}|  jdd|dd}|d|d  	t
j|||d |dfS )	Nr+   r	      r   )dimg-C6?g      |@)r4   sizeshapeviewabsfloatamaxclamp	unsqueezetor   float8_e4m3fn)r)   mnx_viewx_amaxr&   r&   r'   per_token_cast_to_fp8,   s   "
"rC   x_fp8x_scalesc                 C   sH   |  tj| ddd}|| ddd}|| | j tjS )Nr   r3   r2   r	   )r=   r   float32r7   r5   r6   r   )rD   rE   x_fp32r&   r&   r'   per_token_cast_back6   s   rH   	num_slotsc                 C   s   |   dksJ | dk }| ||}tj| d|d f| j| jd}|d|t| |d d d |f }tj	|ddd\}}|
|dkd tj	|dddj}| d d d d f d t|| d}|d d d |f | d d d |f< d S )	Nr+   r   r	   dtypedevicer3   T)r4   
descending)rM   r4   )r4   masked_fillr   zerosr5   rK   rL   scatter_add_	ones_likesortmasked_fill_valuesfill_min)r)   rI   maskx_padded	bin_countsorted_bin_countsorted_bin_idx	valid_lenr&   r&   r'   inplace_unique<   s   "(r]   scores	group_idx
num_groupsc                 C   sZ   | j \}}| ||d} tj||ftj| jd}|d|dd| }| | ||S )Nr3   rJ   r	   T)	r6   r7   r   rO   boolrL   scatter_r<   	expand_as)r^   r_   r`   
num_tokensnum_expertsrW   r&   r&   r'   create_grouped_scoresK   s
   
rf         num_warmups	num_testsc           
      C   s   t j  t jtdt jdd}t|D ]}|   q|  dd t|D }dd t|D }t|D ]}||   |   ||   |d urL|  q4t j  t	dd t
||D dd  }	t|	t|	t|	fS )	Ng    Ar   rJ   c                 S      g | ]	}t jjd dqS T)enable_timingr   r   Event.0_r&   r&   r'   
<listcomp>b       zbench.<locals>.<listcomp>c                 S   rk   rl   rn   rp   r&   r&   r'   rs   c   rt   c                 S   s   g | ]\}}| |d  qS )     @@)elapsed_time)rq   ser&   r&   r'   rs   n   s    r	   )r   r   synchronizeemptyr   r!   zero_recordnparrayzipaveragerV   max)
fnri   rj   post_fncacherr   start_events
end_eventsitimesr&   r&   r'   benchU   s*   

r   c                   @      e Zd Zdd Zdd ZdS )empty_suppressc                 C   s   | S Nr&   selfr&   r&   r'   	__enter__t      zempty_suppress.__enter__c                 G   s   d S r   r&   r   rr   r&   r&   r'   __exit__w   r   zempty_suppress.__exit__N__name__
__module____qualname__r   r   r&   r&   r&   r'   r   s   s    r   c                   @   r   )suppress_stdout_stderrc                 C   s   t tjd| _t tjd| _tj | _tj	 | _
ttj | _ttj	 | _tj| _tj	| _t| j | j t| j | j
 | jt_| jt_	| S )Nw)openr   devnulloutnull_fileerrnull_filesysstdoutfilenoold_stdout_fileno_undupstderrold_stderr_fileno_undupdupold_stdout_filenoold_stderr_fileno
old_stdout
old_stderrdup2r   r&   r&   r'   r   |   s   z suppress_stdout_stderr.__enter__c                 G   s`   | j t_| jt_t| j| j t| j	| j
 t| j t| j	 | j  | j  d S r   )r   r   r   r   r   r   r   r   r   r   r   closer   r   r   r&   r&   r'   r      s   
zsuppress_stdout_stderr.__exit__Nr   r&   r&   r&   r'   r   {   s    r   Fsuppress_kineto_output
trace_pathbarrier_comm_profilingc              
      s&  |rt nt}| g tjjddddd}tjjtjjjg|dA}tdD ]4}	|rKtj	dtj
dd}
tj	dtj
dd}|
|  ttjdtj
dd t|D ]}|   qO|  q%W d    n1 sdw   Y  W d    n1 ssw   Y  t|tst|tsJ t|t}| jd	d
dd}t|tr|fn|}tdd |D sJ |D ] t fdd|D dksJ d  dq|d ur|| ddd}g }|D ]1 |D ],} |v r| d }| D ]\}}||v r|t
||d|   nq nqq|rt|S |d S )Nr   r	   )waitwarmupactiverepeat)
activitiesscheduler+   )    r   r   rJ   cuda_time_totald   )sort_bymax_name_column_width
c                 S   s   g | ]}t |tqS r&   )
isinstancestr)rq   namer&   r&   r'   rs      s    z bench_kineto.<locals>.<listcomp>c                    s   g | ]} |v qS r&   r&   )rq   liner   r&   r'   rs      s    zErrors of the kernel z in the profiling tableru   g    .A)msus )r   r   r   profilerr   profileProfilerActivityCUDAr!   randnr9   r   
all_reduceonesstepr   r   tuplekey_averagestablesplitallr-   export_chrome_traceitemsappendreplace)r   kernel_namesrj   r   r   r   suppressr   profr   lhsrhsrr   	is_tupled
prof_linesunitskernel_timesr   time_strunitscaler&   r   r'   bench_kineto   sd   	






r   tc                 C   s   |  tj  S r   )r7   r   int64r-   r.   )r   r&   r&   r'   hash_tensor   s   r   )rg   rh   N)rh   FNF)r   r   typingr   numpyr}   r   torch.distributeddistributedr   r   r(   Tensorr1   rC   rH   r]   rf   r   r   r   ra   r   r   r   r&   r&   r&   r'   <module>   sF   


&
<