o
    Ti                     @   s   d a i ai add Zdd Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zd add Zdd Zd S )Nc                   C   s   i a i ad S )N)module_namesparam_names r   r   I/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/utils/debug.py"debug_clear_module_and_param_names   s   r   c                 C   s(   dd |   D add |  D ad S )Nc                 S      i | ]\}}||qS r   r   ).0namemoduler   r   r   
<dictcomp>       z8debug_extract_module_and_param_names.<locals>.<dictcomp>c                 S   r   r   r   )r   r   paramr   r   r   r
      r   )named_modulesr   named_parametersr   )modelr   r   r   $debug_extract_module_and_param_names   s   r   c                 C      | t v rt |  S dS Nunknown)r   r	   r   r   r   debug_module2name      r   c                 C      dt |  d| j S Nname= id=)r   idr   r   r   r   debug_module2name_id%      r   c                 C   s   dt |  d| jj S )Nr    )r   	__class____name__r   r   r   r   debug_module2name_class)   s   r!   c                 C   r   r   )r   r   r   r   r   debug_param2name-   r   r#   c                 C   r   r   )r#   ds_idr"   r   r   r   debug_param2name_id4   r   r%   c                 C   s    dt |  d| j d| jj S )Nr   r    shape=)r#   r$   datashaper"   r   r   r   debug_param2name_id_shape8       r)   c                 C   (   dt |  d| j d| jj d| j S )Nr   r   r&   z device=)r#   r$   r'   r(   devicer"   r   r   r    debug_param2name_id_shape_device<      (r-   c                 C   s    dt |  d| j d|   S )Nr   r   z numel=)r#   r$   numelr"   r   r   r   debug_param2name_id_numel@   r*   r0   c                 C   r+   )Nr   r   r&   z status=)r#   r$   r'   r(   	ds_statusr"   r   r   r    debug_param2name_id_shape_statusD   r.   r2   c                  G   sz   t du rddl a ttd&}t |t j zt|   W t |t j nt |t j w W d   dS 1 s6w   Y  dS )a  

    For printing messages for all concurrent gpus w/o getting interleaved text.

    This is useful when debugging issues where multi-gpus don't sync.

    1. Enable the force debug in say partitioning and zero3 files
    2. Override the usual versions with ::

        def print_rank_0(message, debug=False, force=False):
            rank = deepspeed.comm.get_rank()
            printflock(f"[{rank}] {message}")
    3. run the program and you get both logs non-interleaved

    But this makes it very difficult to make sense of the output, so the ``log_rank_file`` helper
    function might be more useful, as it's easier to send each log stream into a separate file and
    then compare those.

    N    r)fcntlopen__file__flockLOCK_EXprintLOCK_UN)msgsfhr   r   r   
printflockH   s   
""r>   c                 G   s@   t du rtd|  dda |D ]
}t | d qt   dS )a+  
    Print to a log file of the given rank

    This is useful for debugging hanging in sync processes. Here is a possible workflow:

    1. Enable the force debug in say partitioning and zero3 files
    2. Override the usual versions of print_rank_0 in those files with ::

        def print_rank_0(message, debug=False, force=False):
            rank = deepspeed.comm.get_rank()
            log_rank_file(rank, message)

    3. run the program
    4. fix up the expected differences, e.g. different cuda numbers ::

        perl -pi -e 's|cuda:1|cuda:0|' log_rank_*

    5. now diff and see where names and ids diverge - you will find where the gpus don't do the same
    work (e.g. when some layers get conditionally skipped on one gpu but not all)

        diff -u log_rank_0.txt log_rank_1.txt | less

    N	log_rank_z.txtw
)r=   r6   writeflush)rankr<   mr   r   r   log_rank_filek   s
   rF   c                    s(    fdd t | dr | j d S d S )Nc                    s   t d|   | jD ]B}|d rLz#t|d d}t |d  t dt| d|j d| d|j  W q
 tyK } z |d  W Y d }~q
d }~ww q
d S )NzBackward tensors in r3   variablezTensor - id: z	, shape: z, data: z, grad: )r:   next_functionsgetattrr   r(   gradAttributeError)grad_fnfuncstensore_print_bwd_tensorsr   r   rQ      s   
,z2print_backward_tensors.<locals>._print_bwd_tensorsrL   )hasattrrL   )rN   r   rP   r   print_backward_tensors   s   
rS   )r5   r   r   r   r   r   r   r!   r#   r%   r)   r-   r0   r2   r>   r=   rF   rS   r   r   r   r   <module>   s$   	  