o
     i3                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlZddlmZ ddlmZ e eZ G dd	 d	Z!G d
d dZ"G dd de"Z#G dd dZ$eG dd dZ%G dd dZ&dS )    N)	dataclass)Path)AnyDictListOptionalSequenceTuple   )get_device_limits)AnalyzedTracec                   @   s4   e Zd ZdZdddZdd	 Zd
d ZdddZdS )NsightProfilera$  Profiler that triggers start of NSight profiler.

    NOTE: you need to ensure that the script running this code actually is running with
    ``nsys profile`` and also has a flag ``--capture-range=cudaProfilerApi`` so the
    capturing is performed by this profiler during certain steps.
    main_profiler	_ProfilerreturnNc                 C   s
   || _ d S N)r   selfr    r   N/home/ubuntu/.local/lib/python3.10/site-packages/xformers/profiler/profiler.py__init__%   s   
zNsightProfiler.__init__c                 C      t jj  d S r   )torchcudaprofilerstartr   r   r   r   	__enter__)      zNsightProfiler.__enter__c                 C   r   r   )r   r   r   stopr   exc_typeexc_valexc_tbr   r   r   __exit__,   r   zNsightProfiler.__exit__c                 C      d S r   r   r   r   r   r   step/      zNsightProfiler.stepr   r   r   Nr   N)__name__
__module____qualname____doc__r   r   r$   r&   r   r   r   r   r      s    
r   c                   @   sx   e Zd ZdZejjjejjjgZ	dddZ
dejjjddfd	d
ZdejjjddfddZdd Zdd ZdddZdS )PyTorchProfilerzProfiler which relies on native Pytorch profiling. Current setting of the profiler
    captures traces, memory footprint and other info that could be read via TensorBoard.
    r   r   r   Nc                 C   s.   || _ d| _tjj| jdddd| jd| _d S )Nr   T)on_trace_readyprofile_memoryrecord_shapes
with_stack
with_flops
activities)r   	num_stepsr   r   profile	_on_trace
ACTIVITIESpytorch_profilerr   r   r   r   r   =   s   zPyTorchProfiler.__init__profc              
   C   s&  d dd | jD }t| jjd| d| jjd }| jj}|dkrNt  dt	
  }t rNt rNt  tt d  }d| d| }t	j|d	d
 | dt  d}|t	j || z| | W d S  ty } z| jjd tjd|d W Y d }~d S d }~ww )N_c                 s   s    | ]}|j V  qd S r   )name).0ar   r   r   	<genexpr>J   s    z,PyTorchProfiler._on_trace.<locals>.<genexpr>profile_06 r
   rankT)exist_ok.z.pt.trace.json.gz)TraceAnalysisErrorz Exception analyzing kineto trace)exc_info)joinr8   strr   
output_dir
done_stepsworker_namesocketgethostnameosgetpiddistis_availableis_initializedget_rankzfilllenget_world_sizemakedirstimetime_nsexport_chrome_tracepath_analyze_trace	Exceptionsummaryappendloggerwarn)r   r:   activities_strdir_namerM   rC   	file_nameexcr   r   r   r7   I   s,   zPyTorchProfiler._on_tracec           
         s  |j d u s|j jd u rd S t|j j  ttd}i }|d ur3|j	 D ]
\}}|d ||< q( 
|} |}t fdd j D }| jj}	|	dt jd | j  f |	d|| jd  df |	d	| jd  df |	d
|df |	d|df d S )Nr   l    J)c                 3   s    | ]}  |V  qd S r   )compute_num_ops)r=   dtyperesultsr   r   r?   j   s
    
z1PyTorchProfiler._analyze_trace.<locals>.<genexpr>zStep time (ms)i  z
TFlop/stepz0.1fTFlopsHFUz0.3fMFU)r   kineto_resultsr   from_profileeventsr   r   devicegemm_tflopsitemscompute_hfucompute_mfusumoperations_per_dtype_fwkeysr   r`   ra   inttotal_time_sr5   )
r   r:   limitshw_flopsri   tflops	total_hfu	total_mfu
total_flopsr   rj   r   r^   _   s*   

zPyTorchProfiler._analyze_tracec                 C   s   t j  | j  d S r   )r   r   synchronizer9   r   r   r   r   r   r   w   s   
zPyTorchProfiler.__enter__c                 C   s   t j  | j||| d S r   )r   r   r   r9   r$   r    r   r   r   r$   {   s   
zPyTorchProfiler.__exit__c                 C   s   | j   |  jd7  _d S )Nr
   )r9   r&   r5   r   r   r   r   r&      s   
zPyTorchProfiler.stepr(   r)   )r*   r+   r,   r-   r   r   ProfilerActivityCPUCUDAr8   r   r6   r7   r^   r   r$   r&   r   r   r   r   r.   3   s    
r.   c                   @   s0   e Zd ZejjjgZdejjjddfddZ	dS )PyTorchProfiler_CUDAOnlyr:   r   Nc                 C   r%   r   r   )r   r:   r   r   r   r^      s   z'PyTorchProfiler_CUDAOnly._analyze_trace)
r*   r+   r,   r   r   r   r   r8   r6   r^   r   r   r   r   r      s    r   c                   @   sF   e Zd ZdZdddZedefdd	Zd
d Zdd Z	dddZ
dS )MemSnapshotsProfilerzdProfiler that captures memory traces for allocation and deallocation of memory for
    tensors.
    r   r   r   Nc                 C   s   || _ d| _d S )NF)r   enabledr   r   r   r   r      s   
zMemSnapshotsProfiler.__init__c                 C   s   t tjjdS )N
trace_plot)hasattrr   r   _memory_vizr   r   r   r   _has_trace_plot   s   z$MemSnapshotsProfiler._has_trace_plotc                 C   s(   | j sd S d| _tjjjdddd d S )NTi )trace_alloc_max_entriestrace_alloc_record_context)r   r   r   r   memory_record_memory_historyr   r   r   r   r      s   
zMemSnapshotsProfiler.__enter__c                 C   s   | j s| jjd d S | jsJ tjj }tjj	d t
dd |d D r2| jjd d S | jd}| jjd|f t|d	}|tjjj|d dd
 W d    d S 1 s_w   Y  d S )N)MemTracez)(not available with your Pytorch version)Fc                 s   s    | ]	}t |d kV  qdS )r   NrW   )r=   tr   r   r   r?          z0MemSnapshotsProfiler.__exit__.<locals>.<genexpr>device_traces)r   z(no allocation recorded)zmemory_trace_plot.htmlr   zw+)rr   plot_segments)r   r   r`   ra   r   r   r   r   	_snapshotr   all_create_output_filenameopenwriter   r   )r   r!   r"   r#   snapshotfilenamefdr   r   r   r$      s(   
"zMemSnapshotsProfiler.__exit__c                 C   r%   r   r   r   r   r   r   r&      r'   zMemSnapshotsProfiler.stepr(   r)   )r*   r+   r,   r-   r   propertyboolr   r   r$   r&   r   r   r   r   r      s    
r   c                   @   s2   e Zd ZU eed< eed< eed< dZeed< dS )_ProfilerStatecls
iter_beginiter_endNobject)r*   r+   r,   r   __annotations__rz   r   r   r   r   r   r      s
   
 r   c                   @   s   e Zd ZdZdedeeeeef  de	e
j ddfddZdd	eddfd
dZdeeeeef  ddfddZd ddZdedefddZdd Zd!ddZdd Zdd Zd ddZdefddZdS )"r   NrK   schedulemoduler   c                 C   s   |  | || _d| _t| | _| jjddd d| _t	 rDt
 rDt  tt d  }d| dt  dt  | _t|d urL|nt | _|   d S )Nr   T)rD   parentsrB   r
   rC   r;   )check_scheduler   rL   r   absoluterK   mkdirrM   rR   rS   rT   rU   rV   rW   rX   rN   rO   rP   rQ   weakrefrefnnModuler   init_schedule)r   rK   r   r   rC   r   r   r   r      s   
 z_Profiler.__init__r   offsetc                    sD   t  fdd| jD dd d| _| jr| jd jnd| _g | _d S )Nc                    s&   g | ]\}}}t ||  |  qS r   )r   )r=   r   beginendr   r   r   
<listcomp>   s    z+_Profiler.init_schedule.<locals>.<listcomp>c                 S   s   | j S r   )r   )xr   r   r   <lambda>   s    z)_Profiler.init_schedule.<locals>.<lambda>)keyr   )sortedr   	profilersr   	last_stepr`   )r   r   r   r   r   r      s   

z_Profiler.init_schedulec                 C   s   t |dkrtd t }|D ]0\}}}|dks!J d| |dks,J d| ||k s:J d| d| |||f qd}|jD ]\}}||ksXJ dd	|  |}qGd S )
Nr   zEYou specified empty schedule for profiling. No data will be captured.z4Begin step of profiler must be non-negative, found: z.End step of profiler must be positive, found: z+Start must be before the end, found: begin=z	 and end=r   zThere is some overlapping in profiler scheduling. Please do not overlap profilers by step as they may affect each other. Schedule: )rW   rb   warningqueuePriorityQueueput)r   r   pqr   r   r   prev_endr   r   r   r      s.   


z_Profiler.check_schedulec                 C   s   | j D ]R}|j| jkr4| j|jk r4|jd u r.|| }td|jj d |	  ||_q|j
  q|jd urU|j}d |_td|jj d |
  |d d d  qd S )Nz	Starting z profiler...zShutting down )r   r   rL   r   r   r   logginginfor*   r   r&   r$   )r   por   r   r   update_profilers_on_step
  s    



z"_Profiler.update_profilers_on_stepr   c                 C   sb   | j dkr%t|}| j|j }|jddd || jdd| j  |j  S | j| jdd|  S )z
        Returns where to write a file with desired filename.
        Handles the case where we are in distributed settings, or when
        we need to output the same file multiple times (eg if a profiler
        runs for several steps)
        rB   T)r   rD   rA   r;   )rM   r   rK   stemr   rL   suffix)r   r   filefolderr   r   r   r     s   
z!_Profiler._create_output_filenamec                 C   s   |    d S r   )r   r   r   r   r   r   ,  s   z_Profiler.startc                 C   s   |  ||| d S r   )r$   r    r   r   r   r   /  s   z_Profiler.stopc                 C   s$   t jd ur	td| t _|   | S )Nz2Only one xformers profiler can be active at a time)r   _CURRENT_PROFILER
ValueErrorr   r   r   r   r   r   2  s
   
z_Profiler.__enter__c                 C   s0   d t _| jD ]}|jd ur|j||| qd S r   )r   r   r   r   r$   )r   r!   r"   r#   r   r   r   r   r$   :  s   

z_Profiler.__exit__c                 C   s   |  j d7  _ | j | jkr|   | j | jkrtd|   d}| j | jkrq| j | dkrsz| jd   | jd| j | d | j	 W n	 t
yP   Y nw | jd| j d }| rutd| j  | j| j d d	 d
S d
S d
S d
S )z>Signals the profiler that the next profiling step has started.r
   zxFormers profiler done. %s
   r   triggerztrigger.09z/xFormers profiler manually triggered at step %dr   N)rL   r   r   rb   r   format_summaryrK   unlink
write_textrM   FileNotFoundErrorexistsr   )r   CHECK_TRIGGER_EVERYstep_triggerr   r   r   r&   A  s4   z_Profiler.stepc                    sD   t | jdkr	dS tdd | jD  dd fdd| jD  S )	Nr   rB   c                 s   s    | ]	\}}t |V  qd S r   r   r=   titlevaluer   r   r   r?   b  r   z+_Profiler.format_summary.<locals>.<genexpr>z	summary:

c                    s&   g | ]\}}d |   d| qS )z  z: )ljustr   
pad_titlesr   r   r   d  s   & z,_Profiler.format_summary.<locals>.<listcomp>)rW   r`   maxrI   r   r   r   r   r   _  s   z_Profiler.format_summary)r   r)   )NNN)r*   r+   r,   r   rJ   r   r	   r   rz   r   r   r   r   r   r   r   r   r   r   r   r   r$   r&   r   r   r   r   r   r      s*    
 


r   )'r   rP   r   rN   rZ   r   dataclassesr   pathlibr   typingr   r   r   r   r   r	   torch.cuda.memoryr   torch.cuda.nvtxtorch.distributeddistributedrR   torch.nnr   torch.profilerdevice_limitsr   profile_analyzerr   	getLoggerr*   rb   r   r.   r   r   r   r   r   r   r   r   <module>   s0    
Q
7