o
    ̳i\7                     @   s  U d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlZ
d dlmZ d dlmZ d dlmZ d dlmZmZ ed	Zd
Ze
jjje
jjje
jjjhZdddddZeed< dddddZ eed< dZ!e"ed< de"fddZ#		d2de
jj$de"de"de%fdd Z&G d!d" d"Z'dddde d# e d$ e d% e d& dddddfd'e(d(e(d)e(d*e(d#e(d$e(d%e(d&e(d+ee% d,ee% d-ee% d.ee% dee" d/e	e
jj$ef fd0d1Z)dS )3    N)partial)Path)OptionalTuple)
DictConfig)_ExperimentalConfig)tensorboard_trace_handler)
get_loggerget_world_size_and_rankINFOprofiler         
wait_stepswarmup_stepsactive_steps
num_cyclesDEFAULT_SCHEDULEFT)profile_memory
with_stackrecord_shapes
with_flopsDEFAULT_TRACE_OPTSprofiler_outputDEFAULT_PROFILE_DIRmsgc                 C   s$   t  \}}|dkrt|  d S d S )Nr   )r
   logwarning)r   _rank r"   P/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/training/_profiler.py_warn2   s   
r$   self_cuda_time_total   prof
output_dirmetric	row_limitc                 C   s  t  \}}dt| j }tj||}tj|s tj|dd |dkr-t	d| j  t
 }tj }	t|d|	j d|	j d|	j d|	j d|	j 
dd}
|
|  |dkrgt	d	t
 | d
d | jrtj r|dkrz| | d| d W n ty } ztd|  W Y d}~nd}~ww tjj| d| d | jr| j| d| d|d | j| j ddj!||d}t"| d| dd}t#||d W d   n1 sw   Y  |dkrt	d|  |dkrtj$%  dS dS )a~  
    Handles export of artifacts from ``torch.profiler.profile``.

    The following artifacts are exported:
    - chrome / tensorboard trace - viewable through tensorboard or perfetto.dev / chrome::/tracing
    - trace event table
    - memory timeline and snapshot.pickle if ``profile_memory``
    - stacks if ``with_stack`` (note that ``profile_memory`` requires ``with_stack`` to be ``True``),
    viewable as a flamegraph see (https://pytorch.org/docs/stable/profiler.html#torch.profiler._KinetoProfile.export_stacks).

    Notes:
    - Each profiling cycle is exported as a sub-directory in output_dir
        - E.g., profiling in 5-step cycle (wait=2, warmup=2, active=1, repeat=0) will result in
        sub-directories iteration_5, iteration_10, etc.
    - If profiling in a distributed setting, each artifact will be prefixed with rank.
    - Memory timeline is only exported for rank 0 (error if exporting from multiple ranks on single node)

    See profiler documentation (https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile) for more details

    Args:
        prof (torch.profiler.profile): instance of torch profiler to use
        output_dir (str):  directory to store artifacts
        metric (str): metric to order trace event table by, see ``torch.profiler.profile.key_averages().table`` for
        row_limit (int): number of rows to display in trace event table

    
iteration_T)exist_okr   zDumping traces at step zr0--)worker_nameuse_gzipzFinished dumping traces in z.2fz secondsz/rankz_memory-timeline.htmlz# Failed to export memory timeline: Nz_memory_snapshot.picklez_stacks.txt)r)   r   )group_by_input_shapegroup_by_stack_n)sort_byr*   z_key_averages.txtw)filezSaving profiling results to r   )&r
   strstep_numospathjoinexistsmakedirsr   infotime	monotonicdatetimenowr   yearmonthdayhourminuter   torchcudais_availableexport_memory_timeline	Exceptionwarnmemory_dump_snapshotr   export_stackskey_averagesr   tableopenprintdistributedbarrier)r'   r(   r)   r*   
world_sizer!   curr_trace_dir_namecurr_trace_dirbeginr@   exporterekey_avgsfr"   r"   r#   trace_handler8   sZ   
 
*r]   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )DummyProfilera{  
    Drop-in replacement for torch.profiler.profile that functions as a nullcontext / object
    with no-op methods for ``start``, ``stop``, and ``step``.

    This is helpful for instrumenting profiling in a recipe without requiring changes to the
    code independent of whether profiling is on / off.

    E.g.,
    ```
        profiler = DummyProfiler()
        #profiler = torch.profiler.profile()

        # Below is same regardless of profiler object type
        with profiler as prof:
            for epoch in epochs:
                for batch in batches:
                    train.step()
                    prof.step()

    c                 C   s   | S Nr"   selfr"   r"   r#   	__enter__      zDummyProfiler.__enter__c                 G      d S r_   r"   )ra   argsr"   r"   r#   __exit__   rc   zDummyProfiler.__exit__c                 C   rd   r_   r"   r`   r"   r"   r#   start   rc   zDummyProfiler.startc                 C   rd   r_   r"   r`   r"   r"   r#   stop   rc   zDummyProfiler.stopc                 C   rd   r_   r"   r`   r"   r"   r#   step   rc   zDummyProfiler.stepN)	__name__
__module____qualname____doc__rb   rf   rg   rh   ri   r"   r"   r"   r#   r^      s    r^   r   r   r   r   enabledcpurG   xpur   r   r   r   returnc                    s  | st d t tddifS g }|r|tjjj |r%|tjjj |r/|tjjj	 t
|dkrAt d t}d } }}t|du|	du|
du|dug }|rjt t dd	 fd
d  D  n8||	|
|d  fdd  D }t
|dkr|D ]}t|  |< qt dd	|d	 fdd|D  tjj d  d  d  d d}|rt d |p|}|p|}|rtddnd}|du rt dt  t}t|}|jddd t|}tt|d}tjj||||||||d}t| ||||||||d	 }||fS )aC  
    Sets up :class:`~torch.profiler.profile` and returns the profiler config with post-setup updates.

    The profiler config can be provided in configs under the ``profiler`` key with the following layout:

    .. code-block:: yaml

        profiler:
          _component_: torchtune.training.setup_torch_profiler
          enabled: bool
          # Output directory of trace artifacts
          output_dir: str

          # torch.profiler.ProfilerActivity types to trace
          cpu: bool
          cuda: bool

          # Trace options
          profile_memory: bool
          with_stack: bool
          record_shapes: bool
          with_flops: bool

          # torch.profiler.schedule args
          wait_steps: int
          warmup_steps: int
          active_steps: int
          num_cycles: int

    The profiler schedule updates with respect to an optimizer step (e.g., if
    ``gradient_accumulation = 2``, then the profiler will step every 2 batches).

    Sensible defaults will be chosen if the config is missing options:

    - If no activities are specified, profiler will default to CPU + CUDA
    - If no schedule is specified, profiler will default to ``DEFAULT_SCHEDULE``
    - Certain options will be overridden (``with_stack`` and ``record_shapes``)     depending on requirements of other options (e.g., ``profile_memory`` requires     ``with_stack`` and ``record_shapes``).


    Note:
        - Enabling the profiler will result in training speed reduction.
        - Setting ``profile_memory: True`` will generate large trace files.
        - The profiler schedule is context dependent. Calling ``profiler.step()``         at each batch iteration but **outside** the gradient accumulation scope will         ``step`` the profiler each forward / backward step. Calling ``profiler.step()``         each batch iteration but **within** the gradient accumulation scope  will ``step``         the profiler each optimizer update step such that each ``step`` contains multiple         forward / backward passes.

    Args:
        enabled (bool): Enable pytorch profiler. Default is False.
        cpu (bool): Enable cpu profiling. Default is True.
        cuda (bool): Enable cuda profiling. Default is True.
        xpu (bool): Enable xpu profiling. Default is True.
        profile_memory (bool): Profile memory usage. Default is False.
        with_stack (bool): Profile stack. Default is False.
        record_shapes (bool): Record shapes. Default is True.
        with_flops (bool): Profile flops. Default is False.
        wait_steps (Optional[int]): Wait time in steps. Maps to ``wait`` kwarg of ``torch.profiler.schedule``.
        warmup_steps (Optional[int]): Warmup time in steps. Maps to ``warmup`` kwarg of ``torch.profiler.schedule``.
        active_steps (Optional[int]): Active time in steps. Maps to ``active`` kwarg of ``torch.profiler.schedule``.
        num_cycles (Optional[int]): Number of profiling cycles. Maps to ``repeat`` kwarg of ``torch.profiler.schedule``.
        output_dir (Optional[str]): Tracing file output path.

    Returns:
        Tuple[torch.profiler.profile, DictConfig]
    z Profiling disabled.rn   Fr   z1No activities specified, defaulting to CPU + CUDATNz. No schedule found in config, defaulting to {}z, c                 3   "    | ]}| d  |  V  qdS z = Nr"   .0kschedule_argsr"   r#   	<genexpr>6       z'setup_torch_profiler.<locals>.<genexpr>r   c                    s   g | ]
} | d u r|qS r_   r"   rt   rw   r"   r#   
<listcomp>@  s    z(setup_torch_profiler.<locals>.<listcomp>z= Missing keys in torch profiler schedule {}: defaulting to {}c                 3   rr   rs   r"   rt   rw   r"   r#   ry   G  rz   r   r   r   r   )waitwarmupactiverepeatzp`profile_memory` requires `with_stack` and `record_shapes`, these will be enabled since `profile_memory` is True)verbosez= No output directory found in profiler config, defaulting to )parentsr,   )r(   )
activitiesr   r   r   r   scheduleexperimental_configon_trace_ready)	rn   r(   ro   rG   rp   r   r   r   r   )r$   r^   r   appendrF   r   ProfilerActivityCPUCUDAXPUlenDEFAULT_PROFILER_ACTIVITIESanyr   formatr9   keysr   r   r   r   mkdirr5   r   r]   profile)rn   ro   rG   rp   r   r   r   r   r   r   r   r   r(   r   use_default_schedulemissing_keysrv   r   r   callbackr   profiler_cfgr"   rw   r#   setup_torch_profiler   s   W	
	
r   )r%   r&   )*r?   r7   r=   	functoolsr   pathlibr   typingr   r   rF   torch.distributed	omegaconfr   torch._C._profilerr   torch.profilerr   torchtune.utilsr	   r
   r   PROFILER_KEYr   r   r   r   r   r   r   dict__annotations__r   r   r5   r$   r   intr]   r^   boolr   r"   r"   r"   r#   <module>   s   
	
['