o
    Ti                     @   s   d dl Z d dlZzd dlmZ W n	 ey   Y nw d dlZd dlmZ d dl	m
Z
 dd Zdd Zdd	 ZdddZdadd Zedkrzee jd Ze
 e ede  ejdd e ZdZeeZede dedd e  dS dS )    N)unset_fake_temporarily)get_acceleratorc                   C   s   t    t  d S )N)r   synchronizedistbarrier r   r   \/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compile/profilers/comm_profile.pysync_all   s   
r	   c                 C   s   t  }d}d}|dkrtd| dkr&|| }|| |d |  }||fS | dkr@||9 }|| }|| |d |  }||fS | dkrZ|d | }|| d|d  |  }||fS | dksb| d	krl|| }|}||fS td
)Nr   zError. Duration is 0.
all_to_all   
all_gather
all_reduce   pt2pt	broadcastzwrong comm_op specified)r   get_world_size
ValueError)comm_opsizedurationntputbusbwr   r   r   get_bw   s.   
r   c                 C   s   t   t|D ]
}tj|||d qt   |  t|D ]
}tj|||d q|  t   ||d }	|	| }
| |  t  }t	j
|
g| d}t dkr\t|tjj || fS )N)async_opi  )devicer   )r	   ranger   all_gather_into_tensorrecordelapsed_timeelement_sizenelementr   torchtensorr   ReduceOpAVGitem)r   inputoutputstart_event	end_eventwarmuptrialsr   ir   avg_durationr   avg_duration_tenr   r   r   timed_all_gather5   s    r0      
   Fc                 C   sN  t  }t  }t jdd}t jdd}	g }
dd td|D D ]}|| }|dkr1|
| q"dg}t  |
D ]j}t  }z)tj	||| d}t  |
t|d	}~t   tj| | || d}W n) ty } zd
t|v rt  dkrtd t  W Y d }~ |S |d }~ww t  |t| ||||	||| q:|S )NT)enable_timingc                 s   s    | ]}d | V  qdS )r   Nr   ).0pr   r   r   	<genexpr>[   s    z!run_all_gather.<locals>.<genexpr>r   r   )r   r   )dtyper   zout of memoryz0WARNING: Ran out of GPU memory. Exiting comm op.)r   get_rankr   r   Eventr   appendr	   r"   onesmul_floatviewempty_cachezerosr!   RuntimeErrorstrprintr0   )r   r7   maxsizer+   r,   r   global_rank
world_sizer)   r*   M_LISTxmresultsMmatr'   r(   er   r   r   run_all_gatherP   sD   

rO   c                     s   t d u r:t  t  } t| tjda W d    n1 sw   Y  t dkr:t D ]\}}t	d| d|  q+dd t D }dd t D }zddl
m} W n ty[   td	w |||d
dd  fdd}|S )N   r   zsize: z, avg_duration: c                 S      g | ]}|d  qS )r   r   r4   resultr   r   r   
<listcomp>       z$create_predictor.<locals>.<listcomp>c                 S   rQ   )r   r   rR   r   r   r   rT      rU   )interp1dzAPlease install scipy to use communication profiler in DeepCompilelinearextrapolate)kind
fill_valuec                    s   | dkrdS  | S )Nr   r   )r   	predictorr   r   f   s   zcreate_predictor.<locals>.f)profile_resultsr   r   current_devicerO   r"   bfloat16r   r9   rD   scipy.interpolaterV   ImportErrorrB   )r   r   r.   sizes	durationsrV   r]   r   r[   r   create_predictor~   s$   
re   __main__
LOCAL_RANKzlocal_rank=nccl)dist_backendg    eAzPredicted time for size z: z.6fz seconds)r1   r2   F)osr"   torch._subclasses.fake_tensorr   rb   	deepspeeddeepspeed.commcommr   deepspeed.acceleratorr   r	   r   r0   rO   r^   re   __name__intenviron
local_rank
set_devicerD   init_distributedr\   example_sizepredicted_timedestroy_process_groupr   r   r   r   <module>   s6   
+