o
    پi                     @   s   d Z ddlZddlm  mZ ddddejddd	Zdddddejd
ddZdddddejd
ddZdddddejd
ddZ	dddddejd
ddZ
dddejdddddZdddddZdS )z'Useful functions for writing test code.    N
    TFrepeatsdescverboseamp	amp_dtypec                   sX   |rt |d  fdd}tjd|||dt d}	|	|}
|r(t |
 |	|
fS )zCUse Pytorch Benchmark on the forward pass of an arbitrary function.z- Forward passc                     sD   t jd d | i | W d    d S 1 sw   Y  d S )Ncudadevice_typedtypeenabled)torchautocast)inputskwinputsr   r	   fn T/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/benchmark.pyamp_wrapper   s   "z&benchmark_forward.<locals>.amp_wrapperzfn_amp(*inputs, **kwinputs))fn_ampr   r   stmtglobalsnum_threads)print	benchmarkTimerr   get_num_threadstimeit)r   r   r   r   r   r	   r   r   r   tmr   r   r   benchmark_forward   s   


r$   gradr   r   r   r   r	   c                O   s   |rt |d tjd||d | |i |}	t|	tu r!|	d }	W d   n1 s+w   Y  |du r:t|	}n
|j|	jkrDtddd }
tj	d	|
||	|d
t
 d}||}|rbt | ||fS )zDUse Pytorch Benchmark on the backward pass of an arbitrary function.z- Backward passr
   r   r   N&Grad shape does not match output shapec                 W   s.   |D ]}t |tjrd |_q| j|dd d S )NTretain_graph)
isinstancer   Tensorr&   backward)yr&   r   xr   r   r   f6   s
   zbenchmark_backward.<locals>.fzf(*inputs, y=y, grad=grad))r/   r   r-   r&   r   r   r   r   typetuple
randn_likeshapeRuntimeErrorr   r   r    r!   r   r&   r   r   r   r   r	   r   r   r-   r/   r"   r#   r   r   r   benchmark_backward   s,   

r7   c                   s   |rt |d tjd d |i |}	t|	tu r!|	d }	W d   n1 s+w   Y  |du r:t|	}n
|j|	jkrDtd fdd}
tj	d	|
|||d
t
 d}||}|rgt | ||fS )LUse Pytorch Benchmark on the forward+backward pass of an arbitrary function.z- Forward + Backward passr
   r   r   Nr'   c                    s   |D ]}t |tjrd |_qtjd d |i |}t|tu r(|d }W d    n1 s2w   Y  |j| dd d S )Nr
   r   r   Tr(   )r*   r   r+   r&   r   r1   r2   r,   )r&   r   r   r.   r-   r   r   r   r/   `   s   zbenchmark_combined.<locals>.fzf(grad, *inputs, **kwinputs))r/   r   r   r&   r   r   r0   r6   r   r   r   benchmark_combinedH   s,   


r9   c          	   
   O   sF   t | g|R |||||d|t| g|R ||||||d|fS r8   r   r%   )r$   r7   	r   r&   r   r   r   r   r	   r   r   r   r   r   benchmark_fwd_bwdu   s8   
	r<   c          	      O   sh   t | g|R |||||d|t| g|R ||||||d|t| g|R ||||||d|fS r:   )r$   r7   r9   r;   r   r   r   benchmark_all   sT   
		r=   )trace_filenamer,   r   r	   cpur   c             	   O   s  |r0t jd||d | |i |}	t|	tu r|	d }	t |	}
W d   n1 s+w   Y  tdD ]D}|rF|D ]}t|t jrEd|_q:t jd||d | |i |}	t|	tu r`|	d }	W d   n1 sjw   Y  |rx|	j	|
dd q4|rt j
jjgng t j
jjg }t j
j|dddJ}|r|D ]}t|t jrd|_qt jd||d | |i |}	t|	tu r|	d }	W d   n1 sw   Y  |r|	j	|
dd W d   n1 sw   Y  |rt| jd	d
 |dur|| dS dS )zEWrap benchmark functions in Pytorch profiler to see CUDA information.r
   r   r   N   Tr(   )
activitiesrecord_shapes
with_stack2   )	row_limit)r   r   r1   r2   r3   ranger*   r+   r&   r,   profilerProfilerActivityCPUCUDAprofiler   key_averagestableexport_chrome_trace)r   r>   r,   r   r	   r?   r   r   r   outg_r.   rA   profr   r   r   pytorch_profiler   sf   rS   )r   r   c                O   sj   t j  t j  t j  | |i | t j  t j d }|r.t| d| d t j  |S )Ni  >z max memory: GB)r   r
   empty_cachereset_peak_memory_statssynchronizemax_memory_allocatedr   )r   r   r   r   r   memr   r   r   benchmark_memory  s   




rZ   )__doc__r   torch.utils.benchmarkutilsr   float16r$   r7   r9   r<   r=   rS   rZ   r   r   r   r   <module>   sR   -0(38