o
    ©Ì³i\7  ã                   @   s¸  U d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
Z
d dlZ
d dlmZ d dlmZ d dlmZ d dlmZmZ ed	ƒZd
Ze
jjje
jjje
jjjhZdddddœZeed< dddddœZ eed< dZ!e"ed< de"fdd„Z#		d2de
jj$de"de"de%fdd „Z&G d!d"„ d"ƒZ'dddde d# e d$ e d% e d& dddddfd'e(d(e(d)e(d*e(d#e(d$e(d%e(d&e(d+ee% d,ee% d-ee% d.ee% dee" d/e	e
jj$ef fd0d1„Z)dS )3é    N)Úpartial)ÚPath)ÚOptionalÚTuple)Ú
DictConfig)Ú_ExperimentalConfig)Útensorboard_trace_handler)Ú
get_loggerÚget_world_size_and_rankÚINFOÚprofileré   é   é   ©Ú
wait_stepsÚwarmup_stepsÚactive_stepsÚ
num_cyclesÚDEFAULT_SCHEDULEFT)Úprofile_memoryÚ
with_stackÚrecord_shapesÚ
with_flopsÚDEFAULT_TRACE_OPTSÚprofiler_outputÚDEFAULT_PROFILE_DIRÚmsgc                 C   s$   t ƒ \}}|dkrt | ¡ d S d S )Nr   )r
   ÚlogÚwarning)r   Ú_Úrank© r"   úP/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/training/_profiler.pyÚ_warn2   s   
ÿr$   Úself_cuda_time_totalé   ÚprofÚ
output_dirÚmetricÚ	row_limitc                 C   sþ  t ƒ \}}dt| jƒ }tj ||¡}tj |¡s tj|dd |dkr-t 	d| j› ¡ t
 ¡ }tj ¡ }	t|d|	j› d|	j› d|	j› d|	j› d|	j› 
dd}
|
| ƒ |dkrgt 	d	t
 ¡ | d
›d¡ | jr¨tj ¡ r¨|dkr¨z|  |› d|› d¡ W n tyš } zt d|› ¡ W Y d}~nd}~ww tjj |› d|› d¡ | jr¸| j|› d|› d|d | j| j ddj!||d}t"|› d|› ddƒ}t#||d W d  ƒ n1 sáw   Y  |dkròt 	d|› ¡ |dkrýtj$ %¡  dS dS )a~  
    Handles export of artifacts from ``torch.profiler.profile``.

    The following artifacts are exported:
    - chrome / tensorboard trace - viewable through tensorboard or perfetto.dev / chrome::/tracing
    - trace event table
    - memory timeline and snapshot.pickle if ``profile_memory``
    - stacks if ``with_stack`` (note that ``profile_memory`` requires ``with_stack`` to be ``True``),
    viewable as a flamegraph see (https://pytorch.org/docs/stable/profiler.html#torch.profiler._KinetoProfile.export_stacks).

    Notes:
    - Each profiling cycle is exported as a sub-directory in output_dir
        - E.g., profiling in 5-step cycle (wait=2, warmup=2, active=1, repeat=0) will result in
        sub-directories iteration_5, iteration_10, etc.
    - If profiling in a distributed setting, each artifact will be prefixed with rank.
    - Memory timeline is only exported for rank 0 (error if exporting from multiple ranks on single node)

    See profiler documentation (https://pytorch.org/docs/stable/profiler.html#torch.profiler.profile) for more details

    Args:
        prof (torch.profiler.profile): instance of torch profiler to use
        output_dir (str):  directory to store artifacts
        metric (str): metric to order trace event table by, see ``torch.profiler.profile.key_averages().table`` for
        row_limit (int): number of rows to display in trace event table

    Ú
iteration_T)Úexist_okr   zDumping traces at step zr0-ú-)Úworker_nameÚuse_gzipzFinished dumping traces in z.2fz secondsz/rankz_memory-timeline.htmlz# Failed to export memory timeline: Nz_memory_snapshot.picklez_stacks.txt)r)   r   )Úgroup_by_input_shapeÚgroup_by_stack_n)Úsort_byr*   z_key_averages.txtÚw)ÚfilezSaving profiling results to r   )&r
   ÚstrÚstep_numÚosÚpathÚjoinÚexistsÚmakedirsr   ÚinfoÚtimeÚ	monotonicÚdatetimeÚnowr   ÚyearÚmonthÚdayÚhourÚminuter   ÚtorchÚcudaÚis_availableÚexport_memory_timelineÚ	ExceptionÚwarnÚmemoryÚ_dump_snapshotr   Úexport_stacksÚkey_averagesr   ÚtableÚopenÚprintÚdistributedÚbarrier)r'   r(   r)   r*   Ú
world_sizer!   Úcurr_trace_dir_nameÚcurr_trace_dirÚbeginr@   ÚexporterÚeÚkey_avgsÚfr"   r"   r#   Útrace_handler8   sZ   
 
*ýÿ€ÿÿÿþÿÿr]   c                   @   s8   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
d„ ZdS )ÚDummyProfilera{  
    Drop-in replacement for torch.profiler.profile that functions as a nullcontext / object
    with no-op methods for ``start``, ``stop``, and ``step``.

    This is helpful for instrumenting profiling in a recipe without requiring changes to the
    code independent of whether profiling is on / off.

    E.g.,
    ```
        profiler = DummyProfiler()
        #profiler = torch.profiler.profile()

        # Below is same regardless of profiler object type
        with profiler as prof:
            for epoch in epochs:
                for batch in batches:
                    train.step()
                    prof.step()

    c                 C   s   | S ©Nr"   ©Úselfr"   r"   r#   Ú	__enter__©   ó   zDummyProfiler.__enter__c                 G   ó   d S r_   r"   )ra   Úargsr"   r"   r#   Ú__exit__¬   rc   zDummyProfiler.__exit__c                 C   rd   r_   r"   r`   r"   r"   r#   Ústart¯   rc   zDummyProfiler.startc                 C   rd   r_   r"   r`   r"   r"   r#   Ústop²   rc   zDummyProfiler.stopc                 C   rd   r_   r"   r`   r"   r"   r#   Ústepµ   rc   zDummyProfiler.stepN)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__rb   rf   rg   rh   ri   r"   r"   r"   r#   r^   “   s    r^   r   r   r   r   ÚenabledÚcpurG   Úxpur   r   r   r   Úreturnc                    s  | st dƒ tƒ tddiƒfS g }|r| tjjj¡ |r%| tjjj¡ |r/| tjjj	¡ t
|ƒdkrAt dƒ t}d } }}t|du|	du|
du|dugƒ }|rjt‰ t d d	 ‡ fd
d„ˆ  ¡ D ƒ¡¡ƒ n8||	|
|dœ‰ ‡ fdd„ˆ  ¡ D ƒ}t
|ƒdkr¢|D ]}t| ˆ |< q„t d d	 |¡d	 ‡ fdd„|D ƒ¡¡ƒ tjjˆ d ˆ d ˆ d ˆ d d}|rºt dƒ |p½|}|pÁ|}|rÉtddnd}|du rØt dt› ƒ t}t|ƒ}|jddd t|ƒ}tt|d}tjj||||||||d}t| ||||||||dœ	ˆ ¥ƒ}||fS )aC  
    Sets up :class:`~torch.profiler.profile` and returns the profiler config with post-setup updates.

    The profiler config can be provided in configs under the ``profiler`` key with the following layout:

    .. code-block:: yaml

        profiler:
          _component_: torchtune.training.setup_torch_profiler
          enabled: bool
          # Output directory of trace artifacts
          output_dir: str

          # torch.profiler.ProfilerActivity types to trace
          cpu: bool
          cuda: bool

          # Trace options
          profile_memory: bool
          with_stack: bool
          record_shapes: bool
          with_flops: bool

          # torch.profiler.schedule args
          wait_steps: int
          warmup_steps: int
          active_steps: int
          num_cycles: int

    The profiler schedule updates with respect to an optimizer step (e.g., if
    ``gradient_accumulation = 2``, then the profiler will step every 2 batches).

    Sensible defaults will be chosen if the config is missing options:

    - If no activities are specified, profiler will default to CPU + CUDA
    - If no schedule is specified, profiler will default to ``DEFAULT_SCHEDULE``
    - Certain options will be overridden (``with_stack`` and ``record_shapes``)     depending on requirements of other options (e.g., ``profile_memory`` requires     ``with_stack`` and ``record_shapes``).


    Note:
        - Enabling the profiler will result in training speed reduction.
        - Setting ``profile_memory: True`` will generate large trace files.
        - The profiler schedule is context dependent. Calling ``profiler.step()``         at each batch iteration but **outside** the gradient accumulation scope will         ``step`` the profiler each forward / backward step. Calling ``profiler.step()``         each batch iteration but **within** the gradient accumulation scope  will ``step``         the profiler each optimizer update step such that each ``step`` contains multiple         forward / backward passes.

    Args:
        enabled (bool): Enable pytorch profiler. Default is False.
        cpu (bool): Enable cpu profiling. Default is True.
        cuda (bool): Enable cuda profiling. Default is True.
        xpu (bool): Enable xpu profiling. Default is True.
        profile_memory (bool): Profile memory usage. Default is False.
        with_stack (bool): Profile stack. Default is False.
        record_shapes (bool): Record shapes. Default is True.
        with_flops (bool): Profile flops. Default is False.
        wait_steps (Optional[int]): Wait time in steps. Maps to ``wait`` kwarg of ``torch.profiler.schedule``.
        warmup_steps (Optional[int]): Warmup time in steps. Maps to ``warmup`` kwarg of ``torch.profiler.schedule``.
        active_steps (Optional[int]): Active time in steps. Maps to ``active`` kwarg of ``torch.profiler.schedule``.
        num_cycles (Optional[int]): Number of profiling cycles. Maps to ``repeat`` kwarg of ``torch.profiler.schedule``.
        output_dir (Optional[str]): Tracing file output path.

    Returns:
        Tuple[torch.profiler.profile, DictConfig]
    z Profiling disabled.rn   Fr   z1No activities specified, defaulting to CPU + CUDATNz. No schedule found in config, defaulting to {}z, c                 3   ó"    | ]}|› d ˆ | › V  qdS ©z = Nr"   ©Ú.0Úk©Úschedule_argsr"   r#   Ú	<genexpr>6  ó   €  z'setup_torch_profiler.<locals>.<genexpr>r   c                    s   g | ]
}ˆ | d u r|‘qS r_   r"   rt   rw   r"   r#   Ú
<listcomp>@  s    z(setup_torch_profiler.<locals>.<listcomp>z= Missing keys in torch profiler schedule {}: defaulting to {}c                 3   rr   rs   r"   rt   rw   r"   r#   ry   G  rz   r   r   r   r   )ÚwaitÚwarmupÚactiveÚrepeatzp`profile_memory` requires `with_stack` and `record_shapes`, these will be enabled since `profile_memory` is True)Úverbosez= No output directory found in profiler config, defaulting to )Úparentsr,   )r(   )Ú
activitiesr   r   r   r   ÚscheduleÚexperimental_configÚon_trace_ready)	rn   r(   ro   rG   rp   r   r   r   r   )r$   r^   r   ÚappendrF   r   ÚProfilerActivityÚCPUÚCUDAÚXPUÚlenÚDEFAULT_PROFILER_ACTIVITIESÚanyr   Úformatr9   Úkeysrƒ   r   r   r   Úmkdirr5   r   r]   Úprofile)rn   ro   rG   rp   r   r   r   r   r   r   r   r   r(   r‚   Úuse_default_scheduleÚmissing_keysrv   rƒ   r„   Úcallbackr   Úprofiler_cfgr"   rw   r#   Úsetup_torch_profiler¹   s´   W	üÿ
ÿÿüþÿü	ÿÿø÷
öÿr–   )r%   r&   )*r?   r7   r=   Ú	functoolsr   Úpathlibr   Útypingr   r   rF   Útorch.distributedÚ	omegaconfr   Útorch._C._profilerr   Útorch.profilerr   Útorchtune.utilsr	   r
   r   ÚPROFILER_KEYr   r‡   rˆ   r‰   rŠ   rŒ   r   ÚdictÚ__annotations__r   r   r5   r$   r‘   Úintr]   r^   Úboolr–   r"   r"   r"   r#   Ú<module>   s¨   
ýüü	üÿþý
ü['ñÿþýüûúùøõôóòñð