o
    Ti=                  
   @   sx  d dl mZ d dlmZmZ d dlmZ d dlmZ d dlZd dl	m
Z
mZ d dlmZ zd dlmZ W n	 ey=   Y nw d	d
lmZmZ d	dlmZ dee fddZdefddZdee fddZdefddZde
fddZdee dee fddZdee dee deeef fddZde
d eeef fd!d"Zde
d e
fd#d$Z dee d%eeee f d&efd'd(Z!dee dee d%eeee f d)ee fd*d+Z"dee dee d%eeee f fd,d-Z#dee dee d%eeee f d&efd.d/Z$de
d0ed1ed2e%d e
f
d3d4Z&de
fd5d6Z'd7edee fd8d9Z(eG d:d; d;Z)de
d0ed1ed2e%d e
f
d<d=Z*dS )>    )defaultdict)ListDict)copy)	dataclassN)GraphNode)map_arg	tree_iter   )get_last_usesis_release_node)get_output_node	scheduledc                    s6   t  }i  | D ]}|| fdd}| |j< q|S )Nc                    
    | j  S Nnamenenv S/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compile/list_schedule.py<lambda>      
 z*make_graph_from_schedule.<locals>.<lambda>)r   	node_copyr   )r   	new_graphnodenew_noder   r   r   make_graph_from_schedule   s   r!   r   c                 C   s>   | j ds| j ds| j ds| j drdS t| jS )Nallgather_ds_paramrelease_ds_paramwait_allgather_ds_paramreduce_ds_paramr   )r   
startswithlenargs)r   r   r   r   get_original_args_num"   s   



r)   r(   c                 C   s   dd t | D S )Nc                 S   s   g | ]	}t |tr|qS r   )
isinstancer   ).0ar   r   r   
<listcomp>-       z&flat_nodes_in_args.<locals>.<listcomp>r
   r(   r   r   r   flat_nodes_in_args,   s   r0   c                 C   s   | j d t|  }t|S r   )r(   r)   r0   )r   r(   r   r   r   filter_args0   s   r1   graphc                 C   s   t | }tt}i }g }g }tt}| jD ]J}t|}t|dkr<|| t|j	 ||< |j	 D ]}	|||	< q4n|| |D ]}
t
|
D ]}t|tr]||| vr]|| | qIqCq||||||fS Nr   )create_mem_tabler   setlistnodesr1   r'   appenduserskeysr   r*   r   )r2   	mem_tableremaining_usersuser_to_producerr   unschedulededgesr   filtered_argsuserr,   elem_ar   r   r   init_schedule5   s0   




rC   r>   c                    s   t    fdd|D S )Nc                    s*   g | ]}t  fd dt|D r|qS )c                 3   s    | ]}| v V  qd S r   r   r+   argr   r   r   	<genexpr>S   s    z0get_runnable_nodes.<locals>.<listcomp>.<genexpr>)allr1   )r+   r   rF   r   r   r-   S   s   * z&get_runnable_nodes.<locals>.<listcomp>)r5   )r   r>   r   rF   r   get_runnable_nodesQ   s   rI   r;   c                    s&   t | |}t| fddd}|d S )Nc                    r   r   r   r   r;   r   r   r   Z   r   z"choose_next_node.<locals>.<lambda>keyr   )rI   sorted)r   r>   r;   runnable_nodesr   rJ   r   choose_next_nodeV   s   
rO   returnc                 C   sh   i }| j D ],}|jdr|jd ||j< q|jds"|jdr,|jd  ||j< qd||j< q|S )Nr"   tensor_sizer#   r%   r   )r7   r   r&   meta)r2   r;   r   r   r   r   r4   ^   s   
r4   c                 C   sN   t | \}}}t|dkr#t|||}|| || t|dkst|S r3   )rC   r'   rO   r8   remover!   )r2   r   r>   r;   	next_noder   r   r   list_schedulek   s   

rU   r?   new_scheduledc                    sD   t g }|  D ]}t fddt|D r|| q
|S )Nc                 3   s     | ]}| kr|v V  qd S r   r   rD   rV   r   r   r   rG   ~   s    z.get_new_runnable_nodes_with.<locals>.<genexpr>)r5   rH   r1   r8   )r   r?   rV   new_runnablesr   r   rW   r   get_new_runnable_nodes_withz   s   
rY   non_ag_runnablec                 C   sZ   t |dkr)| }t| ||}|dd |D 7 }| | || t |dks| |fS )Nr   c                 S      g | ]
}|j d s|qS r"   r   r&   r+   r   r   r   r   r-          z2_do_schedule_without_allgather.<locals>.<listcomp>)r'   poprY   r8   rS   )r   r>   r?   rZ   rT   rX   r   r   r   _do_schedule_without_allgather   s   

	ra   c                 C   s6   t | |}dd |D }t| }t|}t||||S )Nc                 S   r[   r\   r]   r^   r   r   r   r-      r_   z.schedule_without_allgather.<locals>.<listcomp>)rI   r   ra   )r   r>   r?   runnablerZ   tmp_scheduledtmp_unscheduledr   r   r   schedule_without_allgather   s
   
re   c                 C   sL   t | ||}dd |D }t| }t|}|| || t||||S )Nc                 S   r[   r\   r]   r^   r   r   r   r-      r_   z3try_schedule_with_new_allgather.<locals>.<listcomp>)rY   r   r8   rS   ra   )r   r>   r?   rV   rX   rZ   rc   rd   r   r   r   try_schedule_with_new_allgather   s   

rf   available_memoutput_size	debug_logc                 C   sP  t | \}}}}}}	t|||\}
}t|dkrt|
|}g }|D ]%}t|
|||\}}tdd |t|
d d  D }|||||f q!t|dd dd}|d d }|d d	 }|
t|d  }|t|
d d  }|| || |D ]}|| || qzt	|}
t	|}|D ]}|
| || qt|dkst
|
S )
Nr   c                 s       | ]}|j d  V  qdS device_timeNrR   r^   r   r   r   rG          z"simple_prefetch.<locals>.<genexpr>r   c                 S   s   | d S )Nr   r   xr   r   r   r      s    z!simple_prefetch.<locals>.<lambda>T)rL   reverse   )rC   re   r'   rI   rf   sumr8   rM   rS   r   r!   )r2   rg   rh   ri   r   r>   r?   r;   r<   r=   rc   rd   rb   ag_with_unblock_timeag_nodeag_scheduledag_unscheduledunblock_timebest_ag_nodebest_ag_scheduledno_ag_runnablesafter_ag_runnablesr   r   r   r   simple_prefetch   s8   
"



r}   c           	      C   s   t | }tt}i }g }g }tt}| jD ]'}|jdkr7|| t|j ||< |j D ]}|||< q/q|| q||||||fS )Nplaceholder)	r4   r   r5   r6   r7   opr8   r9   r:   )	r2   r;   r<   r=   r   r>   r?   r   rA   r   r   r   init_schedule_with_placeholders   s   



r   target_nodec                    s6   t t  g dtf fdd  |  S )Nr   c                    sd   | v rd S | v rd S  |  g  dtf fdd}t| j|  D ]}| q$|  d S )Nr   c                    s     |  d S r   )r8   r   r/   r   r   register_arg   s   z8get_node_requirements.<locals>.dfs.<locals>.register_arg)addr   r	   r(   r8   )r   r   rE   dfsordered_nodesr   visitedr/   r   r      s   

z"get_node_requirements.<locals>.dfs)r5   r   )r   r   r   r   r   get_node_requirements   s   r   c                   @   sf   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< ee ed	< ee ed
< dS )AllgatherTaskr   allgather_cost	free_costallgathered_memallgather_acc_memfree_acc_memlast_usen_scheduled_agsschedule_until_agschedule_until_freeN)__name__
__module____qualname__r   __annotations__floatintr   r   r   r   r   r     s   
 r   c           4         s  t | \}}| jD ]}d|jvrd|jd< q	t| \}}}	}
}}dd |D }tt}|D ]}t|r=||jd  | q-i }|D ]}|| }t	||}t
dd |D ||< qBdd |D }i }|D ]}t
d	d t	||D ||< qdd
d t| jd D }i }|D ]}t
dd t	||D ||< qt|dkr3dd | D tt
 }g }|D ]  fdd|D }|D ]v}|jd }t	||}|d u rq|| }t	||| }tdd |D } tdd |D }!|jd }"tdd |D }#tdd |D }$|| }%|| D ]}&|&|%vr|%|& qtdd |%D }'t|| |!|"|#|$||'||%
}(||( qt|dkr@ nqt|dksLJ ddd |D })t|)dkrjt|dd d}*|*d }+|+j},nt|dd d}*|*d }+|+j},|,D ]}|| || q{||+j ||+j | D ]\}}-|+j|-v r|-|+j qg }.|D ]!}|+j|| v r|| |+j t|| dkr|.| q|.D ]}t	||}/|/D ]}0||0 ||0 qܐqg }1|D ]!}|+j|| v r|| |+j t|| dkr|1| q|1D ]}t	||}/|/D ]}0||0 ||0 qqt|dkst
|}2| jD ]}||2v rCq:|| || q:t|dks]J d| t|}3|3  |3S )NrQ   r   c                 S   "   g | ]}|j tjjjjkr|qS r   targettorchopsdcallgather_paramdefaultr^   r   r   r   r-        " z&fast_free_schedule.<locals>.<listcomp>rr   c                 s   &    | ]}|j tjjjjkr|V  qd S r   r   r^   r   r   r   rG   *  s   $ z%fast_free_schedule.<locals>.<genexpr>c                 S   r   r   )r   r   r   r   reduce_gradr   r^   r   r   r   r-   ,  r   c                 s   r   r   r   r^   r   r   r   rG   /      c                 S   s,   g | ]}t |tr|jtjjjjkr|qS r   )r*   r   r   r   r   r   r   r   r^   r   r   r   r-   2  s    c                 s   r   r   r   r^   r   r   r   rG   8  r   c                 S   s   i | ]	\}}|t |qS r   )r'   )r+   ru   r7   r   r   r   
<dictcomp>=  r.   z&fast_free_schedule.<locals>.<dictcomp>c                    s   g | ]
}|  kr|qS r   r   r+   agag_countag_nodes_countr   r   r-   C  r_   c                 s   rj   rk   rm   r^   r   r   r   rG   P  rn   c                 s   rj   rk   rm   r^   r   r   r   rG   Q  rn   c                 s   ,    | ]}|j tjjjjkr|jd  V  qdS rQ   Nr   r   r   r   r   r   rR   r^   r   r   r   rG   S      c                 s   r   r   r   r^   r   r   r   rG   U  r   c                 S   r   r   r   r^   r   r   r   r-   ^  r   zNo runnable allgather nodesc                 S   s   g | ]	}|j d kr|qS )r   r   r   r   r   r   r-   o  r.   c                 S      | j S r   )r   ro   r   r   r   r   q      z$fast_free_schedule.<locals>.<lambda>rK   c                 S   r   r   r   ro   r   r   r   r   v  r   zThere are unscheduled nodes: )r   r7   rR   r   r   r6   r   r(   r8   r   r5   r   r'   itemsrM   valuesrs   r   r   r   rS   r   r`   r!   lint)4r2   rg   rh   ri   node_to_last_useuser_to_last_usesr   r   r>   r?   r;   r<   r=   unscheduled_agsrelease_nodesr   ag_nodes_in_pathru   r   required_nodesreduce_nodes ag_nodes_in_path_to_reduce_nodesreduce_nodeoutput_nodes ag_nodes_in_path_to_output_nodesoutput_node
count_listrunnable_agstarget_unscheduled_agsds_idr   diff_required_nodesr   r   r   r   r   r   release_noder   taskags_with_no_additional_ag
sorted_agsnext_agnodes_to_scheduler7   reduces_to_scheduleneed_to_schedulennoutputs_to_schedulescheduled_set	ret_graphr   r   r   fast_free_schedule  s   















i


r   )+collectionsr   typingr   r   r   dataclassesr   r   torch.fxr   r   torch.fx.noder	   torch.utils._pytreer   ImportErrorutilr   r   fxr   r!   r)   r0   r1   rC   rI   strr   rO   r4   rU   rY   ra   re   rf   boolr}   r   r   r   r   r   r   r   r   <module>   sJ   

&&"

*"

)"