o
    Ti!+                     @   s   d dl Z d dlmZmZmZ d dlZd dlZd dlmZm	Z	 d dl
mZ zd dlmZmZ d dlmZmZ W n	 ey?   Y nw d dlmZ d dlmZ dd	lmZmZmZ d
d Zdd Zdd Zdd Zdd Z G dd de	Z!G dd de	Z"dS )    N)AnyTupleDict)GraphModuleInterpreter)map_aggregate)tree_alltree_leaves)unset_fake_temporarilyis_fake)get_accelerator   )
is_comm_opis_release_nodeget_deepcompile_handlec                 C   s   t dd | S )Nc                 S   s   t |  p
t|  S N)torch	is_tensorr   x r   ]/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compile/profilers/graph_profile.py<lambda>   s    z%_all_real_if_tensor.<locals>.<lambda>)r   )argsr   r   r   _all_real_if_tensor   s   r   c                 C   s>   t | rt  | |W  d    S 1 sw   Y  | S r   )r   r   r
   to)vdevicer   r   r   _to   s
   
 r   c                 C   s   dt fdd}t| |S )Nreturnc                 S   sL   t | r"|  dkr| j | j |   S | j | j | j S t| S )N   )r   r   numeldtyper   itemshapestrr   r   r   r   _tensor_to_key'   s
   
z$_args_to_key.<locals>._tensor_to_key)r%   r   )r   r'   r   r   r   _args_to_key%   s   
r(   c                 C   s   t dd t| D S )Nc                 S   s&   g | ]}t |r| |  qS r   )r   r   element_sizer!   ).0r   r   r   r   
<listcomp>3   s   & z_node_size.<locals>.<listcomp>)sumr	   )outr   r   r   
_node_size2   s   r.   c                  C   sZ   d} z$dd l }|  t  }||}||}t  }|j| } W | S    Y | S Nr   )pynvmlnvmlInitr   current_devicenvmlDeviceGetHandleByIndexnvmlDeviceGetMemoryInfomemory_allocatedused)adjustr0   current_dev_idhandleinfotorch_allocr   r   r   _get_mem_usage_out_of_torch6   s   



r<   c                       sX   e Zd Zddededef fddZd	ef fd
dZdej	j
d	ef fddZ  ZS )ProfilingInterpreter
      Fgm	iterationwarmupc                    sp   t  | t | _|dksJ |dksJ || _|| _tt 	 | _i | _
t | _i | _|| _d| _d S r/   )super__init__r   nz3rA   rB   r   r   r   r2   cachedistis_initializeddistributedallgather_mem	debug_logmem_usage_out_of_torch)selfr@   rA   rB   rK   	__class__r   r   rD   M   s   

zProfilingInterpreter.__init__r   c              
      s0  zzHt |s
J d| jd t - t  j| jgd t | _	t
 j| }W d   n1 s4w   Y  W d   n1 sCw   Y  W n& tyo } zdt|v rZ|jnt|}td|  W Y d}~nd}~ww W | j  | jd |S W | j  | jd |S | j  | jd w )a  Run the graph with profiling enabled.

        args: inputs to the graph. Tensors in the inpusts must be real tensors, not fake tensors. args can contain ds parameters.
        returns: The output of the graph. Tensor in the output is real tensors.
        Inputs must be real tensorsTdevicesNmsgzProfiling error F)r   rE   enable_profilingr
   r   randomfork_rngr   r<   rL   rC   run	ExceptiondirrS   r%   printclear_all_gathered_params)rM   r   
return_valerS   rN   r   r   rW   ]   s4   


zProfilingInterpreter.runnc                    s  |j dv r&d|jd< d|jd< d|jd< d|jd< t||jd< t |S |\}}t|ts4J t|ts;J d	d
 t	|fdd}t	|fdd}t	|fdd}|j
t|t|f}|jv }tj|rodndgjtjd}jrt|tjj | dk}|rj| \}}}	}
}||jd< ||jd< |	|jd< |
|jd< ||jd< t|}|p|}|rdnj}t   fddt|D } fddt|D }t   t  }t  }|stjD ]}t|j |j
||}qt |rjsJ d|j! d|j
 dt"  t## }t|D ]}|| $  t|j |j
||}|| $  q %  t## | }t |rJt"  t  | j& }	t  | j& }t|}dd t	|fdd}|sJt'(dd t)||D }|| d }t* m tj|||	||gjd}jrt|tjj+ |d  |jd< |d  |jd< t|d  |jd< t|d  |jd< t|d   |jd< |jd |jd |jd |jd |jd fj|< W d    n	1 sw   Y  |rj,-|d d |jd< t. dkrJj/rJt0|j
 d|jd d!d"|jd d!d#|jd d$ d$ d!d%|jd d$ d$ d!d&|jd   |j
tj1j2j3j4kru|d }t5|d's`J |j6slj78|d  |jd j,|j9< |S )(N>   outputplaceholderg        device_time	wall_timer   	alloc_mem
max_memorytensor_sizec                 S   s   t | dr| j| gd | S )Nds_id)
param_list)hasattr
all_gatherr&   r   r   r   rebuild_param_if_necessary   s   
zAProfilingInterpreter.run_node.<locals>.rebuild_param_if_necessaryc                        | S r   r   r   )rj   r   r   r          z/ProfilingInterpreter.run_node.<locals>.<lambda>c                       t |  jS r   r   r   r   rM   r   r   r          c                    rm   r   rn   r   ro   r   r   r      rp   r    )r   r"   max_memc                       g | ]} j d dqS T)enable_timingEventr*   _acceleratorr   r   r+          z1ProfilingInterpreter.run_node.<locals>.<listcomp>c                    rr   rs   ru   rw   ry   r   r   r+      r{   z=Distributed environment is not initialized but comm operator  z	 is used.c                 S   s$   t | dr| js| j| gdd | S )Nrf   F)rg   has_been_updated)rh   
ds_persist	partitionr&   r   r   r   partition_param_if_necessary   s   zCProfilingInterpreter.run_node.<locals>.partition_param_if_necessaryc                    rk   r   r   r   )r   r   r   r      rl   c                 S   s   g | ]	\}}| |qS r   )elapsed_time)r*   sr]   r   r   r   r+      s    i  r   r         .2fzms zms alloc_mem=   zMB max_mem=zMB tensor_size=rf   ):opmetar.   rC   run_nodefetch_args_kwargs_from_env
isinstancetupledictr   targetr(   rF   r   tensorr   intrI   rG   
all_reduceReduceOpSUMr#   r   rA   r   rangereset_peak_memory_statsr5   max_memory_allocatedrB   getattrr   namebarriertimerecordsynchronizerL   
statisticsmeanzipr
   AVGrJ   getget_rankrK   rZ   opsdcallgather_paramdefaultrh   r~   rE   invalidate_gathered_paramrf   )rM   r^   r   kwargs	cache_key	cache_hitcache_hit_flagra   rb   rc   rq   re   is_release_oprun_only_oncerA   start_events
end_eventsalloc_mem_startmax_mem_startir-   startwalltime_sumrd   vals_to_bcastrN   )rz   r   rj   rM   r   r   s   s   














"
\zProfilingInterpreter.run_node)r>   r?   F)__name__
__module____qualname__r   r   rD   r   rW   r   fxNoder   __classcell__r   r   rN   r   r=   K   s    "r=   c                       sX   e Zd Zddef fddZdef fddZdejj	def fd	d
Z
dd Z  ZS )MemoryProfilingInterpreterFr@   c                    sV   t  | t | _tt  | _g | _t 	 | _
d| _t|jj| _|| _d S r/   )rC   rD   r   rE   r   r   r   r2   
mem_recordr5   
last_allocnode_counterlengraphnodesnode_numrK   )rM   r@   rK   rN   r   r   rD      s   
z#MemoryProfilingInterpreter.__init__r   c              
      s   ztzHt |s
J d| jd t | _t ) t  j| j	gd t
 j| }W d    n1 s4w   Y  W d    n1 sCw   Y  W n tyb } ztd|  W Y d }~nd }~ww W | jd |S W | jd |S | jd w )NrP   TrQ   zMemoryProfiling error F)r   rE   rT   r<   rL   r
   r   rU   rV   r   rC   rW   rX   rZ   )rM   r   r\   r]   rN   r   r   rW      s,   zMemoryProfilingInterpreter.runr^   c                    sX  t    |jdv rt |}n& |\}}t| fdd}t| fdd}t |j|j||}~~t  	  j
 }t    j
 }tj||g jd}t|tjj |d  }|d  } j|j|| j |f   jd7  _ jrt dkrtd j d	 j d
|j d|d d dd| j d d dd | _|S )N>   r_   r`   c                    rm   r   rn   r   ro   r   r   r     rp   z5MemoryProfilingInterpreter.run_node.<locals>.<lambda>c                    rm   r   rn   r   ro   r   r   r     rp   r   r   r    zMem prof Node /r|   z memory r   r   z	MB delta MB)r   r   r   rC   r   r   r   r   r   r5   rL   r   r   r   r   rG   r   r   MAXr#   r   appendr   r   r   rK   r   rZ   r   )rM   r^   retr   r   current_alloc	max_allocr   rN   ro   r   r     s,   

Bz#MemoryProfilingInterpreter.run_nodec                 C   s.   dd l }|j| jg dd}|j|dd d S )Nr   )nodememorydeltarq   )columnsF)index)pandas	DataFramer   to_csv)rM   pathpddfr   r   r   dump$  s   zMemoryProfilingInterpreter.dump)F)r   r   r   r   rD   r   rW   r   r   r   r   r   r   r   r   rN   r   r      s
     r   )#r   typingr   r   r   r   r   torch.fxr   r   torch.fx.noder   torch.utils._pytreer   r	   torch._subclasses.fake_tensorr
   r   ImportErrordeepspeed.commcommrG   deepspeed.acceleratorr   utilr   r   r   r   r   r(   r.   r<   r=   r   r   r   r   r   <module>   s.    