o
     iv8                     @   sN  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZ d dlmZ d dlZejG dd dZG dd dejZejG d	d
 d
ZdZdZdZdZdededefddZdedefddZdedefddZdedefddZdedefddZeeeedZdejiZ dd Z!d d! Z"e#d"kre"  dS dS )#    N)deque)meanstdev)Callablec                   @   s.   e Zd ZU eed< eed< eed< eed< dS )Scenarionum_samples	outer_dim	inner_dimnum_ag_matricesN)__name__
__module____qualname__int__annotations__ r   r   i/home/ubuntu/.local/lib/python3.10/site-packages/xformers/benchmarks/benchmark_sequence_parallel_fused.pyr      s
   
 r   c                   @   s   e Zd ZdZdZdd ZdS )Stepagrsc                 C   s   | j S N)value)selfr   r   r   __str__%   s   zStep.__str__N)r   r   r   	AllGatherReduceScatterr   r   r   r   r   r   !   s    r   c                   @   s<   e Zd ZU eg df ed< eg df ed< defddZdS )BenchNr   r   stepc                 C   s*   |t ju r| jS |t ju r| jS t| r   )r   r   r   r   r   KeyError)r   r   r   r   r   __getitem__.   s
   


zBench.__getitem__)r   r   r   r   r   r   r   r   r   r   r   r   )   s   
 r   i   i   i    nmreturnc                 C   s   || | d |  S )N   r   )r   r    r   r   r   round_up_to_nearest_multiple>   s   r#   
world_sizec                 C   s   d}t |t tt|  ddS )N      r   r   r	   r
   )r   LLAMA_07B_SLENLLAMA_07B_Dr$   
batch_sizer   r   r   llama_07B_MHAB      r,   c                 C   s.   d}t |t ttddt  d d|  ddS )Nr%         r&      r'   )r   r(   r)   r#   r*   r   r   r   llama_07B_FFNL      r1   c                 C   s   | }t |t tt|  ddS )Nr&   r'   )r   LLAMA_70B_SLENLLAMA_70B_Dr*   r   r   r   llama_70B_MHAW   r-   r5   c                 C   s.   | }t |t ttddt  d d|  ddS )Nr.   r/   r&   r0   r'   )r   r3   r4   r#   r*   r   r   r   llama_70B_FFNa   r2   r6   )r,   r1   r5   r6   bfloat16c           &         s  t d	 d tj	 td	 	 tjd<  tjd< dtjd< dtjd	< tjjd
dd tj	 tj	 tj	 t
| }|tju r[|j|j|j |j}n|tju rk|j|j|j d}t| tj  fd
tj fd fddt|D fddt|D fddt|D fddt|D fddt|D fddt|D fddt|D fdd}fdd}
fdd}fdd}
fdd }fd!d"}
fd#d$}fd%d&}
fd'd(}fd)d*}
fd+d,}fd-d.}t d/ d0  d1| d0 d2  
 |tju r|  |  	d3krt d4 t d5d6d7d8 tD   t d9d6d:d8 tD   n7|tju r|  |  	d3krt d4 t d5d6d;d8 tD   t d9d6d<d8 tD   t||d=t||d=t||d=t||d=t||d=t||d=d>}t	fd?d8tt|D }t }i }td@}|rtj }nt }|x}|j t!||t| |gt| dAD ]W} ||  | }!|rG|" \}"}#n|" \}$}"}#|##  	d3kre|$|$g %|"&|#|  t|D ]}%|!  qi|"'  t|D ]}%|!  qx|#'  |%| |"|#f q5tj#  W d    n	1 sw   Y  |r|(dB	 dC 	d3kr|D ]\} }"}#|$| g %|"&|#|  q|D ]} t |  dDt)||  dEdFt*||  dEdG qd S d S )HNzRANK z startedzcuda:RANK
WORLD_SIZE	localhostMASTER_ADDR29500MASTER_PORTncclzenv://)backendinit_methodr"   dtypedevicec                        g | ]}t j fd qS rA   torchrandn.0_)KNrB   	my_devicer   r   
<listcomp>       z run_one_rank.<locals>.<listcomp>c                    rD   rE   rF   rI   MrM   rB   rN   r   r   rO      rP   c                    $   g | ]}t j  fd qS rE   rF   rI   rR   rM   rB   rN   r$   r   r   rO          c                    rD   rE   rF   rI   rQ   r   r   rO      rP   c                    rD   rE   rF   rI   rQ   r   r   rO      rP   c                    rS   rE   rF   rI   rT   r   r   rO      rU   c                    rS   rE   rF   rI   rT   r   r   rO      rU   c                     s(   t D ]\} }tj | |d qd S Nout)ziprG   matmulwgo)gathered_inputgathered_outputsweightsr   r   run_compute_lower_bound_ag   s   z0run_one_rank.<locals>.run_compute_lower_bound_agc                     sL   t D ]\} }}tj| |d tj|  fd|d qd S )NrW   r   )dimrX   )rY   rG   rZ   sumviewr\   r]   so)rR   rM   r^   r_   scattered_outputsr`   r$   r   r   run_compute_lower_bound_rs   s   "z0run_one_rank.<locals>.run_compute_lower_bound_rsc                      s   t j  d S r   )rG   distributedall_gather_into_tensorr   )r^   scattered_inputr   r   run_comms_lower_bound_ag   s   z.run_one_rank.<locals>.run_comms_lower_bound_agc                     s&   t  D ]\} }tj| | qd S r   )rY   rG   ri   reduce_scatter_tensor)rf   r]   )r_   rg   r   r   run_comms_lower_bound_rs   s   z.run_one_rank.<locals>.run_comms_lower_bound_rsc                     s6   t j  tD ]\} }t j | |d qd S rV   )rG   ri   rj   rY   rZ   r[   )r^   gathered_outputs_nccl_referencerk   r`   r   r   run_nccl_reference_ag   s   z+run_one_rank.<locals>.run_nccl_reference_agc                     s:   t D ]\} }}tj | |d tj|| qd S rV   )rY   rG   rZ   ri   rm   re   )r^   r_    scattered_outputs_nccl_referencer`   r   r   run_nccl_reference_rs   s   z+run_one_rank.<locals>.run_nccl_reference_rsc                     s*   ddl m}  | dd D dd d S )Nr   fused_allgather_and_linearc                 S      g | ]}|  qS r   trJ   r\   r   r   r   rO          z6run_one_rank.<locals>.run_fused_ag.<locals>.<listcomp>
   group	timeout_sxformers.opsrt   rs   )gathered_outputs_fusedrk   subgroupr`   r   r   run_fused_ag      
z"run_one_rank.<locals>.run_fused_agc                     s*   ddl m}  |  dd D ddd S )Nr   fused_linear_and_reducescatterc                 S   ru   r   rv   rx   r   r   r   rO      ry   z6run_one_rank.<locals>.run_fused_rs.<locals>.<listcomp>rz   r{   r   r   r   )r^   scattered_outputs_fusedr   r`   r   r   run_fused_rs   r   z"run_one_rank.<locals>.run_fused_rsc                     s,   ddl m}  | dd D ddd d S )Nr   rs   c                 S   ru   r   rv   rx   r   r   r   rO      ry   z=run_one_rank.<locals>.run_fused_nowait_ag.<locals>.<listcomp>Frz   r|   _waitr}   r~   rs   )r   rk   subgroup_nowaitr`   r   r   run_fused_nowait_ag      
z)run_one_rank.<locals>.run_fused_nowait_agc                     s,   ddl m}  |  dd D dddd S )Nr   r   c                 S   ru   r   rv   rx   r   r   r   rO     ry   z=run_one_rank.<locals>.run_fused_nowait_rs.<locals>.<listcomp>Frz   r   r   r   )r^   r   r   r`   r   r   run_fused_nowait_rs   r   z)run_one_rank.<locals>.run_fused_nowait_rsc                     s.   ddl m}  | dd D dddd d S )Nr   rs   c                 S   ru   r   rv   rx   r   r   r   rO     ry   zFrun_one_rank.<locals>.run_fused_nowait_nomemcpy_ag.<locals>.<listcomp>Frz   r|   r   _memcpyr}   r~   rs   )r   rk   subgroup_nowait_nomemcpyr`   r   r   run_fused_nowait_nomemcpy_ag     
z2run_one_rank.<locals>.run_fused_nowait_nomemcpy_agc                     s.   ddl m}  |  dd D ddddd S )Nr   r   c                 S   ru   r   rv   rx   r   r   r   rO     ry   zFrun_one_rank.<locals>.run_fused_nowait_nomemcpy_rs.<locals>.<listcomp>Frz   r   r   r   )r^   r   r   r`   r   r   run_fused_nowait_nomemcpy_rs  r   z2run_one_rank.<locals>.run_fused_nowait_nomemcpy_rszSizes: (xz)x(z)xr   zfused:zAre equal?  c                 s   $    | ]\}}t t||V  qd S r   strrG   equalrJ   reffusr   r   r   	<genexpr>+  
    
zrun_one_rank.<locals>.<genexpr>zAre allclose? c                 s   r   r   r   rG   allcloser   r   r   r   r   4  r   c                 s   r   r   r   r   r   r   r   r   C  r   c                 s   r   r   r   r   r   r   r   r   L  r   )r   r   )compute_lower_boundcomms_lower_boundnccl_referencefusedfused_nowaitfused_nowait_nomemcpyc                 3   s*    | ]}t  fd dtdD V  qdS )c                 3   s"    | ]}t jj d kdV  qdS )r   )enable_timingN)rG   cudaEventrI   my_rankr   r   r   l  s     z)run_one_rank.<locals>.<genexpr>.<genexpr>r.   N)tuplerange)rJ   fr   r   r   r   k  s
    
*   )kcountsfusion_trace_z.jsonz = gzms (+/- ))+printrG   r   
set_devicerC   osenvironri   init_process_group	new_group	SCENARIOSr   r   r   r	   r   r
   r   DTYPESrH   r   joinrY   r   r   lenrandomRandomprofilerprofile
contextlibnullcontextsamplelistpopleftsynchronize
setdefaultappendelapsed_timerecordexport_chrome_tracer   r   )&r   r$   scenario_namer   	dtype_str
num_roundsnum_warmup_itersnum_bench_itersr   conn_from_prevconn_to_nextscenarionum_matricesra   rh   rl   rn   rp   rr   r   r   r   r   r   r   
all_benchsunused_eventsused_eventstimingsgenr   pmethodfunstart_evend_ev
old_methodrK   r   )rL   rR   rM   rB   r^   r_   r   ro   rN   r   rk   rg   r   rq   r   r   r   r`   r$   r   run_one_rankx   sJ  






*

	



	











(r   c            	      C   s  t  } | jdt d | jdtttd | jdtdd | jdt d	d
 | jdtdd | jdtdd | jdtdd | jddd | 	 }d g|j
 }d g|j
 }t|j
D ]}tdjdd\}}|||< |||d |j
 < q]g }t|j
D ]-}tdjt||j
|j|j|j|j|j|j|j|| || fdd}|  || qtd t|D ]\}}|  td| d|j  qtd d S )Nr   )choicesr   )r   typez--world-sizer%   )r   defaultz--dtyper7   )r   r   z--num-rounds   z--num-warmup-iters   z--num-bench-iters2   z	--profile
store_true)actionspawnT)duplexr"   )targetargsdaemonLAUNCHEDzRank z exited with JOINED)argparseArgumentParseradd_argumentr   keysr   r   r   r   
parse_argsr$   r   multiprocessingget_contextPipeProcessr   r   r   rB   r   r   r   r   startr   r   	enumerater   exitcode)	parserr   conns_from_prevconns_to_nextrankend1end2	processesr   r   r   r   main  sR   
r  __main__)$r   r   dataclassesenumr   r   r   collectionsr   
statisticsr   r   typingr   rG   	dataclassr   Enumr   r   r(   r)   r3   r4   r   r#   r,   r1   r5   r6   r   r7   r   r   r  r   r   r   r   r   <module>   sN   


  02
