o
    ipk                    @  sb  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZmZ erqd dlmZmZ d dlm Z  d dl!Z!d dl"Z"d dl#Z"d dl$m%  m&Z' d dl(m)Z)m*Z* d d	l+m,Z,m-Z- d d
l.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z: d dl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZGmHZHmIZI ddlJmKZKmLZL ddlAmMZMmNZNmOZOmPZP ddlQmRZRmSZS ddlTmUZU ddlBmVZVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl]m^Z^m_Z_ ddl`maZambZb ddlcmdZd ddl%meZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZu ddlvmwZw exeyZze"j{|eydZ}e"j{|eyd Z~e"j{|eyd!Zed" Zd#ed$< ed%Zed&ZejG d'd( d(ZejG d)d* d*eZG d+d" d"Zejdrd.d/Zdsd2d3Zdtd5d6Zdud8d9ZG d:d; d;Zdvd>d?ZG d@dA dAZdwdHdIZG dJdK dKeZG dLdM dMeZG dNdO dOeZdxdRdSZdydXdYZG dZd[ d[eZG d\d] d]eZG d^d_ d_eZ	`dzd{dhdiZejG djdk dkZe Zd|dmdnZG dodU dUZG dpdq dqZdS )}    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)	ParamSpec	TypeAlias)IteratorSequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols
OrderedSet)free_symbol_is_typesymbol_is_typeSymT)
has_triton   )commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)get_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitmaybe_log_cudagraph_partitionsympy_product)Vfusionloop_orderingcompute_dependenciesBaseSchedulerNoder   PartitionType_T_Pc                   @  s   e Zd ZU ded< ded< ded< ejedZded	< ejedZ	d
ed< d(ddZ
d)ddZd(ddZd(ddZd*ddZd+ddZd,ddZd-d d!Zd-d"d#Zd.d%d&Zd'S )/SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr9   
mpi_bufferreturnstrc                 C  s   | j }|d us	J | S N)r\   get_name)selfop rg   V/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/torch/_inductor/scheduler.pydefining_op_namei   s   z SchedulerBuffer.defining_op_nameintc                 C  s   t | jjS rc   )hashrZ   namere   rg   rg   rh   __hash__n      zSchedulerBuffer.__hash__c                 C  s  t  }|  }|| dt| jj  || d| jj  |  r3|| dt|    | 	 rE|| dt| 	   t
| jdkr[|| d| j  | S || d |d | jD ]
}|| d qlW d    n1 sw   Y  |d	 | S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rF   rd   	writelinetyperZ   __name__layoutget_aliasespformatget_mutationslenr_   indentgetrawvalue)re   resultrl   userrg   rg   rh   	debug_strq   s&   

zSchedulerBuffer.debug_strc                 C  
   | j  S rc   rZ   rd   rm   rg   rg   rh   rd         
zSchedulerBuffer.get_nameNonec                 C  s   | j d usJ | j  sd S | j  s!| j  s!t| j  tjr+tj	j
| j  d S ttjdra|  tjjv ratjj|   }|| jjv rO| jj| j }n| jj| j }tj	j
|| j  d S tj	j
| j  d S )Nargs)rZ   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr$   CommBufferLayoutrO   graphwrapper_codecodegen_allocationhasattrkernelrd   inplace_update_buffersrY   name_to_donated_buffername_to_bufcodegen_inplace_reuse)re   input_buffer_nameinput_bufferrg   rg   rh   allocate   s6   

zSchedulerBuffer.allocateboolc                 C  sN   | j d usJ t| j jtjst| j rdS | jD ]}t|j tr$ dS qdS NFT)rZ   r   rv   r$   r7   rJ   r_   
OutputNode)re   userg   rg   rh   can_free   s   
zSchedulerBuffer.can_freec                 C  s\   i }|D ] }t |j|v r||t |j |t |j< q||t |j< qt| | _d S rc   )idrZ   mergelistvaluesr_   )re   r_   r}   r   rg   rg   rh   	set_users   s    zSchedulerBuffer.set_usersSequence[str]c                 C     | j d usJ | j  S rc   )rZ   r   rm   rg   rg   rh   rw         
zSchedulerBuffer.get_aliasesc                 C  r   rc   )rZ   r   rm   rg   rg   rh   ry      r   zSchedulerBuffer.get_mutationsOptional[torch.device]c                 C  s   | j   S rc   )rZ   r   
get_devicerm   rg   rg   rh   r         zSchedulerBuffer.get_deviceNra   rb   ra   rj   ra   r   ra   r   )r_   r^   ra   r   ra   r   ra   r   )ru   
__module____qualname____annotations__dataclassesfieldr   r_   r9   r`   ri   rn   r   rd   r   r   r   rw   ry   r   rg   rg   rg   rh   rW   _   s$   
 





!



rW   c                   @  s   e Zd ZU dZded< dS )SchedulerDonatedBufferNr[   r\   )ru   r   r   r\   r   rg   rg   rg   rh   r      s   
 r   c                   @  s~  e Zd ZU ded< ded< ded< ded< ded	< d
ed< dZded< dddZdddZdddZdddZdddZ	dd!d"Z
dd#d$Zdd%d&Zdd+d,Zdd/d0Zdd3d4Zdd5d6Zdd8d9Zdd=d>Zdd?d@ZddAdBZddCdDZddEdFZddGdHZddKdLZddMdNZddOdPZeddQdRZeddSdTZeddUdVZeddWdXZddZd[Z dd]d^Z!ddadbZ"ddddeZ#ddfdgZ$ddhdiZ%ddjdkZ&ddldmZ'ddndoZ(ddpdqZ)ddrdsZ*ddvdwZ+ddxdyZ,ddzd{Z-	|ddddZ.edddZ/edddZ0edddZ1dddZ2dddZ3edddZ4dddZ5edddZ6dddZ7dddZ8e9dddZ:dS )rS   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writesOrderedSet[Dep]unmet_dependenciesrj   	min_order	max_orderr:   mpi_nodeNOptional[float]override_estimated_runtimerY   rX   ra   r   c                 C  s   || _ dd | _d S )Nc                  _  s   g S rc   rg   )r   kwargsrg   rg   rh   <lambda>   s    z,BaseSchedulerNode.__init__.<locals>.<lambda>)rY   debug_device_strre   rY   rg   rg   rh   __init__   s   zBaseSchedulerNode.__init__rZ   ir.Operationc                   sT   | _ t  _tt   _d _ fdd| D  _dd  jD  _i  _	d S )NFc                   s   g | ]
}t  j| d qS ))rY   rZ   r\   )rW   rY   ).0outputrm   rg   rh   
<listcomp>   s    z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>c                 S     i | ]}|  |qS rg   rd   r   bufrg   rg   rh   
<dictcomp>       
z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>)
rZ   r   	ancestorsrb   
last_usagewrittenget_outputsoutputsoutputs_by_namemutation_renamesre   rZ   rg   rm   rh   _init_from_node   s   

	z!BaseSchedulerNode._init_from_noderb   c                 C  s   t | j d|  dS )Nz(name=)rt   ru   rd   rm   rg   rg   rh   __repr__      zBaseSchedulerNode.__repr__c                 C  s  |   }t }|| dt| j dtt| ddj d| dt| jj d| dt| j	 d| d	t| jj
| j	  d| d
 |  |  D ]	}||  qKW d   n1 s_w   Y  |d z	||   W n ty   tjddd Y nw |  S )#Longer form printout for trace logsrp   (rZ   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rr   Ignoring error in debug_str()Texc_info)rd   rF   splicert   ru   getattrrx   r   writesr   readsr{   r   r   rs   debug_str_extra	Exceptionlogwarningr|   rstrip)re   rl   r   outrg   rg   rh   r     sH   

	
zBaseSchedulerNode.debug_strc                 C     dS )N rg   rm   rg   rg   rh   r        z!BaseSchedulerNode.debug_str_extra	list[str]c                 C  s
   |  | S rc   )r   rm   rg   rg   rh   _debug_str_for_device  r   z'BaseSchedulerNode._debug_str_for_devicec                 C  sz   t | jdd }d}t|tjjjrd|j| gddd }nt|tjjj	r7d|j|
 | gddd }|  | S )Ndatar   z, F)shorten	multiline)r   rZ   r   torch	_inductorr$   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)re   
maybe_datadata_strrg   rg   rh   debug_str_short   s   
z!BaseSchedulerNode.debug_str_shortc                 C  s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rm   rg   rg   rh   log_details/  s   zBaseSchedulerNode.log_detailsself_depr-   	other_depr   c                 C  r   NFrg   )re   r  r  rg   rg   rh   reorder_loops_by_dep_pair7     z+BaseSchedulerNode.reorder_loops_by_dep_pairrenamesdict[str, str]c                   s<    fdddd | j  D D | _| | j | j d S )Nc                      i | ]}| v r| | qS rg   rg   r   rl   r  rg   rh   r   =  
    z:BaseSchedulerNode.update_mutated_names.<locals>.<dictcomp>c                 s      | ]}|j V  qd S rc   rl   r   deprg   rg   rh   	<genexpr>?      z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>)r   reads_and_writesr   set_read_writesrenamere   r  rg   r  rh   update_mutated_names<  s   
z&BaseSchedulerNode.update_mutated_namesr  r,   c                 C  s   |  | j| d S rc   )r  r   	with_readre   r  rg   rg   rh   add_fake_depD     zBaseSchedulerNode.add_fake_depc                 C     t dd |  D S )Nc                 s  s     | ]}|  p| V  qd S rc   )rw   ry   r   rg   rg   rh   r  H  s    
z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>)anyr   rm   rg   rg   rh   has_aliasing_or_mutationG  s   z*BaseSchedulerNode.has_aliasing_or_mutationrwc                 C  s   || _ | j j| _|   d S rc   )r   r   r   
prune_deps)re   r  rg   rg   rh   r  L  s   
z!BaseSchedulerNode.set_read_writesfuture_used_buffersOrderedSet[str]mutation_real_namec                   s,   |   }t fdd|D }|| | _d S )Nc                 3  s    | ]	}  ||V  qd S rc   )get)r   kr#  rg   rh   r  U      z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>)used_or_aliased_buffer_namesr   r   )re   r!  r#  used_buffersrg   r&  rh   set_last_usageQ  s   z BaseSchedulerNode.set_last_usagec                 C  s   | j D ]}|  qd S rc   )r   r   )re   r   rg   rg   rh   mark_runX  s   

zBaseSchedulerNode.mark_runc                 C  s"   t dd t| jj| jjD S )Nc                 s  r  rc   r  r  rg   rg   rh   r  ]  
    
z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>)r   	itertoolschainr   r   r   rm   rg   rg   rh   used_buffer_names\  s   z#BaseSchedulerNode.used_buffer_namesc                   s   t   dd t| jj| jjD }t|dkr@| } | t	j
j|r:| fddt	j
j|  D  t|dks S )Nc                 S     g | ]}|j qS rg   r  r  rg   rg   rh   r   e  s    zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r   c                 3  s    | ]	}| vr|V  qd S rc   rg   )r   alias
used_namesrg   rh   r  m  s    zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>)r   r-  r.  r   r   r   rz   popaddrO   r   name_to_bufferr$  extendr   )re   depsr  rg   r2  rh   r(  b  s    
z.BaseSchedulerNode.used_or_aliased_buffer_namesc                   s   t  fdd jD  _d S )Nc                 3  s"    | ]}|j  jjvr|V  qd S rc   )rl   rY   available_buffer_namesr  rm   rg   rh   r  w      z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>r   r   rm   rg   rm   rh   r   v  s   zBaseSchedulerNode.prune_depsc                   s>   d	 fddt fdd jjD }  j| d S )
Nr  r,   ra   r   c                   s,   t | tsdS  jj| j  }|tjjv S r  )	r   r/   rY   r   rl   ri   rO   r   removed_operations)r  op_namerm   rg   rh   should_prune  s   
z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                 3      | ]	} |r|V  qd S rc   rg   r  r>  rg   rh   r        
z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>r  r,   ra   r   )r   r   r   r  remove_reads)re   	to_removerg   )re   r>  rh   prune_weak_deps}  s
   z!BaseSchedulerNode.prune_weak_depsname_to_fused_nodedict[str, BaseSchedulerNode]c                 C  s   t | || jj d S rc   )_prune_redundant_depsrY   r   )re   rF  rg   rg   rh   prune_redundant_deps  s   z&BaseSchedulerNode.prune_redundant_depsc                 C  r   rc   )rZ   get_operation_namerm   rg   rg   rh   rd     r   zBaseSchedulerNode.get_namec                 C  s   |   S rc   r   rm   rg   rg   rh   get_first_name  s   z BaseSchedulerNode.get_first_namec                 C  r  )Nc                 s      | ]}|  V  qd S rc   r   r   rZ   rg   rg   rh   r        z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>)r   	get_nodesrm   rg   rg   rh   get_operation_names     z%BaseSchedulerNode.get_operation_namesc                 C     t dd | jD S )Nc                 s  rL  rc   r   r   r   rg   rg   rh   r    rN  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>)r   r   rm   rg   rg   rh   get_buffer_names     z"BaseSchedulerNode.get_buffer_namesc                 C  r  )Nc                 s  s&    | ]}t |tot|d dV  qdS )T)disallow_fp32_opsNr   SchedulerNoder&   r   nrg   rg   rh   r    s    


zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>allrO  rm   rg   rg   rh   can_codegen_in_low_precision  s   z.BaseSchedulerNode.can_codegen_in_low_precisionc                 C  r  )Nc                 s  s"    | ]}t |tot|V  qd S rc   rW  rY  rg   rg   rh   r    s
    
z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>r[  rm   rg   rg   rh   r&     s   z-BaseSchedulerNode.can_codegen_without_upcastsSequence[BaseSchedulerNode]c                 C  s   | gS rc   rg   rm   rg   rg   rh   rO       zBaseSchedulerNode.get_nodesSequence[SchedulerBuffer]c                 C     | j S rc   )r   rm   rg   rg   rh   r     r_  zBaseSchedulerNode.get_outputsbuf_namerW   c                 C  s
   | j | S rc   )r   )re   rb  rg   rg   rh   
get_output  r   zBaseSchedulerNode.get_outputr   c                 C  r   rc   )rZ   r   rm   rg   rg   rh   r     r   zBaseSchedulerNode.get_devicec                 C  s   |   }|d uo|jdkS Ncpu)r   rt   re   devicerg   rg   rh   is_cpu     zBaseSchedulerNode.is_cpuc                 C  s   |   }|d uot|jS rc   )r   rI   rt   rf  rg   rg   rh   rI     ri  zBaseSchedulerNode.is_gpuc                 C  r   r  rg   rm   rg   rg   rh   is_reduction  r   zBaseSchedulerNode.is_reductionc                 C  r   r  rg   rm   rg   rg   rh   is_split_scan  r   zBaseSchedulerNode.is_split_scanc                 C  r   r  rg   rm   rg   rg   rh   is_template  r   zBaseSchedulerNode.is_templatec                 C  r   r  rg   rm   rg   rg   rh   	is_extern  r   zBaseSchedulerNode.is_externc                 C  r   r  rg   rm   rg   rg   rh   
is_foreach  r   zBaseSchedulerNode.is_foreachread_depdependencies.Depc                 C  r   r  rg   re   ro  rg   rg   rh   can_inplace  r   zBaseSchedulerNode.can_inplacec                 C  r   r  rg   rm   rg   rg   rh   has_side_effects  r   z"BaseSchedulerNode.has_side_effectsc                   sd  ddl m} ttr1tjr1tj	 t
jr1ttjtjjjjr+ttjdddur1ttjds3dS jtjjB jjB  dfd
d} D ]}|j}|dusTJ | rh| sh| sh| tjjv riqIjj D ]}|j!jj"v r~jj"|j! }njj#$|j!}|r.tjj%&|r.t|j't(s.|j)dusJ  fdd|j)D }t*|dkr.|d j+r.|d ju r.|jdur.t|j, t-j.t-j/t-j0fs.|j'rt|j'jt-j1t-j2frt*|j dks.||j|jr.||r.tjj34| |  ttjtjjjjr"tjj56|  tjj56|  | tjj7| <  nqmqIdS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr   buf_to_be_inplacedrW   ra   r   c                   s   | j }|   t }| jD ]3}|j}t|tsq| | j j	vs+| j ||ur,q| fdd|j
 D O }t|dkrC dS qdS )Nc                 3  s    | ]
}|j  kr|V  qd S rc   r  )r   orb  rg   rh   r  
  s    
z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>r   FT)rY   get_fused_noderd   r   r_   rZ   r   rS   rK  rF  r   r  rz   )rv  
fused_noder8  r~   	user_noderm   rx  rh   single_index_in_fused_node  s*   


zKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodec                   s   g | ]}|j   vr|qS rg   r   r   x)inconsequential_nodesrg   rh   r   ,  s
    z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>r   )rv  rW   ra   r   )8codegen.wrapperrt  r   rX  r!   inplace_buffersrO   r   has_featurer   r'   INPLACE_BUFFERSr   r   r   codegensimd
SIMDKernelr   r   r   r<  rY   completed_operationsr   rZ   r   r   r   rd   removed_buffersr   r   rl   r   r   r$  r   	can_reuser\   NopKernelSchedulerNoder_   rz   rr  r   r$   r7   r6   MutationLayoutSHOULDREMOVEFallbackKernelr5   r   make_inplaceru  r5  r   )re   rt  r|  r   buf_noderead	input_bufremaining_usesrg   )r  re   rh   decide_inplace_update  s   
"



z'BaseSchedulerNode.decide_inplace_updateTbufferrF   	only_oncec           	      C  s(  t jsd S |r| jrd S | jd usJ | j }g }|D ]e}|jdkr$q|d |d d|j d|j }d|jv rG|d|jd   }|| d|jv r|jd  }|j	d	d
dd }|d|
dd
dd
dd
dd  |d |d qt|dkrd S || d| _d S )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplit{z{{}z}}r   \z\\z#pragma CMT END ORIGINr   T)r!   comment_originr   rZ   get_originsrf   appendtargetmetarsplitreplacerz   
writelines)	re   r  r  origins	out_linesrw  op_info_strr  stack_trace_last_linerg   rg   rh   codegen_originating_infoY  sH   









	


z*BaseSchedulerNode.codegen_originating_infoc                 C  s   | j dddS )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrm   rg   rg   rh   get_read_write_buffers_sizes     z.BaseSchedulerNode.get_read_write_buffers_sizesc                 C     | j dddS )NTFr  r  rm   rg   rg   rh   get_read_buffer_sizes  r  z'BaseSchedulerNode.get_read_buffer_sizesc                 C  r  )NFTr  r  rm   rg   rg   rh   get_write_buffer_sizes  r  z(BaseSchedulerNode.get_write_buffer_sizesr  r  c                 C  s   t | j||d ddS )Nr  r   )start)sumget_read_write_buffer_accessesr   )re   r  r  rg   rg   rh   r    s   z3BaseSchedulerNode.get_read_write_buffers_sizes_impldict[str, int]c                   s
  t tri S t trt jtri S t tr+t jtjr+jjtj	j
ju r+i S dddt trHt d t d  ntd	tt}|rbjjD ]
}||j | qW|rsjjD ]
}||j | qh|rtd
d jjD nt }|rtdd jjD nt }dfddt trtfdd|D }|| }|| }i }||B D ]I}	tfdd||	 D  |	tjjv rtjj|	 }
n|	tjjv rtjj|	 }
nqd fdd|
}|	|vr|||	< q||	  |7  < q|S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.Exprra   rj   c                 S  s   t jjj| ddS )Nr   fallback)rO   r   sizevars	size_hint)r  rg   rg   rh   try_size_hint  s   zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hintr   r       eAc                 s  r  rc   r  r  rg   rg   rh   r    r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>c                 s  r  rc   r  r  rg   rg   rh   r    r  r   rb   snodesr^  r   c                   s4    j j|  j}tdd |D }t|t| dkS )Nc                 s  r  rc   rZ   r   r~   rg   rg   rh   r    r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>r   )rY   r   r_   r   rz   )r   r  r_   buf_usesrm   rg   rh   is_materialized  s   zIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedc                 3  s     | ]} |j s|V  qd S rc   r  r  )r  re   rg   rh   r    s    
c                 3  s    | ]} V  qd S rc   rg   r  )
node_numelrg   rh   r    s    <Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]c                   s   | sdS t | tjr|  S t | jtrGjj|   j	}d}|D ]#}t |j
ts+J t |j
j
trB|j
 D ]	}||j
7 }q7q! dS |S t | jtjr[tfdd|  D S t|  }t|  t | S )Nr   c                 3  s     | ]} t j|V  qd S rc   )rO   r   
get_buffer)r   mut_name)get_buf_bytesrg   rh   r    s
    
zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>)r   r$   TorchBindObjectr  rv   r6   rY   r   rd   r_   rZ   rS   r5   r   r7   r  r   rN   r   rC   	get_dtypemin)r   r_   totr~   	sched_buf	buf_elems)buf_accessed_elemsr  re   r  rg   rh   r    s.   zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesN)r  r  ra   rj   )r   rb   r  r^  ra   r   )r   r  ra   rj   )r   r  ExternKernelSchedulerNoderZ   r5   r$   r  op_overloadr   _prims	rng_primsgraphsafe_run_with_rng_staterX  rN   
get_rangesrj   collectionsr   r   r   r   rl   r  r   r   FusedSchedulerNoder  rO   r   r6  graph_inputs)re   r  r  buf_accessesr  r   r   r  buf_byte_accessesrb  r   	buf_bytesrg   )r  r  r  r  re   r  rh   r    st   




#
z0BaseSchedulerNode.get_read_write_buffer_accesses
int | Nonec                 C  sb   | j d u rd S | j  }|d u rd S t|}|d u rd S tjjj|dd}td d  |7  < |S )Nr   r  inductor
flop_count)rZ   get_origin_noder2   rO   r   r  r  r   )re   fx_nodeflopsresolved_flopsrg   rg   rh   estimate_flops0  s   

z BaseSchedulerNode.estimate_flopsfloatc                 C  s   | j d ur| j S |  S rc   )r   _get_estimated_runtimerm   rg   rg   rh   get_estimated_runtime@  s   
z'BaseSchedulerNode.get_estimated_runtimec              
   C  s  |   d  d }|j }tt|sdS t| jrt| jtj	s%J z:t
jrZt| }t }||}|durCt|ts@J |W S t| }|du rPt| j}|j||d |W S t| jW S  tyw } zt| W Y d}~dS d}~w ty } zt| W Y d}~dS d}~ww t| jrdS t| }|dur|S |j }	z!t }
t|	d }|
dkrtd|
 |dkrtd| W n
 ty   Y dS w |  }|dks|du r|  |
 }|d }|S d}|  }|du rdn|}|| | d	 }||
 }t ||}|d }|S )
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!rO  r   rZ   r   rI   r3   rG   r   r$   IRNoder"   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupr  r+   r*   	set_value
ValueErrorr   r   	TypeErrorrL    maybe_estimate_runtime_benchmarkmaybe_get_dtyperD   rB   AssertionErrorr   r  r  max)re   r   rv   	cache_keycache	cache_valmseretdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_timerg   rg   rh   r  F  sz   








z(BaseSchedulerNode._get_estimated_runtimeOptional[ir.TemplateBuffer]c                 C     d S rc   rg   rm   rg   rg   rh   get_template_node  r   z#BaseSchedulerNode.get_template_nodeir.TemplateBufferc                 C  s   |   }|d us
J |S rc   r	  )re   templaterg   rg   rh   get_template_node_or_throw  s   z,BaseSchedulerNode.get_template_node_or_thrownodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                 C  sD   t dd t| D }| d| }| | }| |d d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c                 s  s     | ]\}}|  r|V  qd S rc   rl  r   irZ  rg   rg   rh   r    s    zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>Nr   )next	enumerate)r  template_indexprologuetemplate_nodeepiloguerg   rg   rh   get_prologue_template_epilogue  s
   
z0BaseSchedulerNode.get_prologue_template_epilogue)rY   rX   ra   r   )rZ   r   ra   r   r   )ra   r   r   r  r-   r  r-   ra   r   r  r  ra   r   )r  r,   ra   r   r   )r  r   ra   r   r!  r"  r#  r  ra   r   ra   r"  rF  rG  ra   r   ra   r^  )ra   r`  )rb  rb   ra   rW   r   ro  rp  ra   r   T)r  rF   r  r   ra   r   r   )r  r   r  r   ra   rj   )r  r   r  r   ra   r  ra   r  )ra   r  ra   r  )ra   r
  )r  r  ra   r  );ru   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r*  r+  r/  r(  r   rE  rI  rd   rK  r?   rP  rT  r]  r&   rO  r   rc  r   rh  rI   rj  rk  rl  rm  rn  rr  rs  r  r  r  r  r  r  r  r  r  r  r	  r  staticmethodr  rg   rg   rg   rh   rS      s   
 



































 /


 

W
ra   $torch._inductor.codecache.LocalCachec                   C  s   t jj S rc   )r   r   	codecache
LocalCacherg   rg   rg   rh   r       r  snoderb   c                   s|   t | jdd}| jj}| jg || jj| jj}| jj}t||f\}}d	dd t|ft	 fdd|D  }|S )
Npython_kernel_namer   ra   r   c                 S  s   t | tjot | tj S rc   )r   r$   r  GeneratorStater~  rg   rg   rh   _is_tensor_ir  r   z@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_irc                 3  s(    | ]} |rt | nd V  qd S rc   )tupler   )r   ar.  rg   rh   r    s   & z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>r   )
r   rZ   inputsfill_non_provided_argsconstant_argsr   pytreetree_flattenrb   r/  )r*  r+  r   r   	flat_argsflat_args_pytree_specr  rg   r1  rh   r    s   
r  Optional[Callable[[Any], Any]]c                 C  s`   t | tsd S tjjjtjjjtjjjd}t| j	dd}||vr#d S t | j	t
js,d S || S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr+  r   )r   r  r   opsatenmmbmmaddmmr   rZ   r$   ExternKernel)r*  mms_fnsr+  rg   rg   rh   _get_mm_like_fn  s   
rA  r   c                   s   d d }t jrt}|d u rd S |fdd}nd S t}t }||}|d ur6t|ts4J |S ddlm	 | \ ddl
m} | fdd}|j||d |S )	Nc                     s    S rc   rg   rg   )r*  snode_args_kwargsrg   rh   r         z2maybe_estimate_runtime_benchmark.<locals>.<lambda>r   )rB  r   )do_benchc                     s    i S rc   rg   rg   )r   bench_fnr   rg   rh   r     s    r  )r!   !runtime_estimations_mms_benchmarkrA  r  r  r  r   r  utilsrB  triton.testingrD  r  )r*  args_kwargs_fnmm_fnr  r  r  rD  r  rg   )r   rE  r   r*  rB  rh   r    s*   

r  c                   @  sD   e Zd ZU g dZded< ded< dddZdddZdddZdS )	WhyNoFusename1name2reasonr   rb   rO  ztuple[Any, ...]r   node1rS   node2ra   r   c                 C  s   |  | _|  | _d S rc   )rd   rM  rN  re   rP  rQ  rg   rg   rh   r     s   
zWhyNoFuse.__init__r   c                 G  s   || _ || _t|  d S rc   )rO  r   
fusion_logdebug)re   rO  r   rg   rg   rh   __call__  s   zWhyNoFuse.__call__c                 C  s"   d| j  d| j d| j| j  S )Nzcannot fuse z with rp   rL  rm   rg   rg   rh   __str__  s   
zWhyNoFuse.__str__NrP  rS   rQ  rS   ra   r   )rO  rb   r   r   ra   r   r   )ru   r   r   	__slots__r   r   rU  rV  rg   rg   rg   rh   rK    s   
 

rK  objr   c                 C  sF   t | ttfrt| td} tj| dd}d|v r!dt|d S |S )Nkey   )r{   r       )	r   r   setsortedrb   pprintrx   textwrapr{   )rY  r}   rg   rg   rh   rx     s   rx   c                   @  s8   e Zd ZdddZddd	ZdddZdddZeZdS )r   r  r.   ra   r   c                 C  s   t |g| _d S rc   r;  r  rg   rg   rh   r        zOutputNode.__init__r   c                 C  r   r  rg   rm   rg   rg   rh   rj     r   zOutputNode.is_reductionr   c                 C  r   )Nrg   rg   rm   rg   rg   rh   r   #  r   z'OutputNode.get_inputs_that_alias_outputrb   c                 C  r   )NOUTPUTrg   rm   rg   rg   rh   rd   &  r   zOutputNode.get_nameN)r  r.   ra   r   r   r   r   )ru   r   r   r   rj  r   rd   r   rg   rg   rg   rh   r     s    



r   rZ   rF  rG  r   dict[str, SchedulerBuffer]r   c                   s   t  jD ]}t|ts! |j  }|    d7  < qd fddtfdd	jD }|rKj| _	j
| d
S d
S )am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r  r,   ra   r   c                   sD   t | tr  | j  }|   dk}| k}|p|S dS )Nr   F)r   r/   rl   ri   rd   )r  r=  is_redundantis_self_dep)r   name_to_dep_countrF  rZ   rg   rh   r>  @  s   
z+_prune_redundant_deps.<locals>.should_prunec                 3  r?  rc   rg   r  r@  rg   rh   r  L  rA  z(_prune_redundant_deps.<locals>.<genexpr>NrB  )r  r   r   r   r/   rl   ri   rd   r   r  r   rC  )rZ   rF  r   r  r=  deps_to_prunerg   )r   rg  rF  rZ   r>  rh   rH  ,  s   

rH  c                      s<   e Zd Zd fddZdd
dZdddZdddZ  ZS )r  rY   rX   rZ   r   ra   r   c                   (   t  | | | | |  d S rc   superr   r   r  get_read_writesre   rY   rZ   	__class__rg   rh   r   V     
z"ExternKernelSchedulerNode.__init__rb   c                 C  s   |    dt| jdd  S )Nz.node.kernel = r+  )rd   r   rZ   rm   rg   rg   rh   r   [  s   z)ExternKernelSchedulerNode.debug_str_extrar   c                 C  r   NTrg   rm   rg   rg   rh   rm  ^  r   z#ExternKernelSchedulerNode.is_externc                 C  s$   | j d usJ t| j do| j  S )Nrs  )rZ   r   rs  rm   rg   rg   rh   rs  a  s   z*ExternKernelSchedulerNode.has_side_effectsrY   rX   rZ   r   ra   r   r   r   )ru   r   r   r   r   rm  rs  __classcell__rg   rg   rn  rh   r  U  s
    

r  c                      s   e Zd Zd	 fddZ  ZS )
r  rY   rX   rZ   r   ra   r   c                   ri  rc   rj  rm  rn  rg   rh   r   g  rp  zNopKernelSchedulerNode.__init__rr  )ru   r   r   r   rs  rg   rg   rn  rh   r  f  s    r  c                      s4  e Zd ZU dZded< ded< dW fddZ		dXdYddZ		dXdZddZd[ddZd\dd Z	d]d$d%Z
d^d&d'Zd_d+d,Zd`d.d/Zdad1d2Zdbd3d4Zdbd5d6Zdbd7d8Zdcd:d;Zddd>d?ZdedAdBZdfdCdDZ	EdgdhdHdIZedidJdKZedidLdMZdjdPdQZedkdSdTZedb fdUdVZ  ZS )lrX  zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr8   _bodyrY   rX   rZ   +Union[ir.ComputedBuffer, ir.TemplateBuffer]ra   r   c                   s"   t  | | | |   d S rc   )rk  r   r   _compute_attrsrm  rn  rg   rh   r   v  s   
zSchedulerNode.__init__Nextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[_P, _T]]c                 C  s   t | jtjtjfsJ | jj||d\| _}|| _| j }| j	
|j}||| jf| _tj p7t|j }t | jtjrK| | jj|d d S | tj| jg| jR d|i d S )Nrx  rz  )	normalizer}  )r   rZ   r$   ComputedBufferTemplateBuffersimplify_and_reorderrt  ru  get_device_or_errorrY   get_backendgroup_fnr   r!   loop_ordering_after_fusionrI   rt   r  extract_read_writesr#   )re   rx  rz  bodyrg  r  should_normalizerg   rg   rh   rw    s2   

zSchedulerNode._compute_attrsOptional[Callable[..., Any]]c                 C  s   | j ||d d S )Nr|  )rw  )re   rx  rz  rg   rg   rh   recompute_size_and_body  s   
z%SchedulerNode.recompute_size_and_bodyr}  r   need_clear_tiling_cachec                 C  st   t dd | jjD }| tj| jg| jR d|i|	| j
 | j|  |r8ddlm} |j  d S d S )Nc                 s  s"    | ]}t |ttfr|V  qd S rc   )r   r/   r.   r  rg   rg   rh   r    s    
z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>r}  r   SIMDScheduling)r   r   r   r  r#   r  ru  rt  r  r  r   pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)re   r}  r  	fake_depsr  rg   rg   rh   refresh_dependencies  s&   z"SchedulerNode.refresh_dependencies	new_orderSequence[int]c                 C  s*   | j || _ | j j| _| jddd d S )NFTr}  r  )ru  reorder_iter_loopssizesrt  r  )re   r  rg   rg   rh   apply_new_loop_order  s
   
z"SchedulerNode.apply_new_loop_order	dimensionrj   	new_rangec                 C  sl   t | jtjtjfsJ | j||| _| jj| _| j	 }| j
|j}||| jf| _| jddd d S )NTr  )r   rZ   r$   r~  r  ru  #expand_dimension_for_pointwise_noder  rt  r  rY   r  r  r   r  )re   r  r  rg  r  rg   rg   rh   r    s   

z1SchedulerNode.expand_dimension_for_pointwise_nodec                 C  s(   | j  | _ | j j| _| jddd d S )NTFr  )ru  merge_loopsr  rt  r  rm   rg   rg   rh   r    s   
zSchedulerNode.merge_loopsr  r-   r  c                 C  s~   d }| j d }t||j  kr|jkrn n||}|r5t jd7  _td|  | | 	| dS td|   dS )Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
rt  rz   num_varsdecide_loop_order_to_matchr%   num_loop_reorderingloop_ordering_logrT  rd   r  )re   r  r  r  
self_sizesrg   rg   rh   r    s    
 


z'SchedulerNode.reorder_loops_by_dep_pairrb   c                 C  s   |   }| d| jd  | d| jd  | d| j g}| j D ]#}t|tsG|j}tj	
|}t|tjsG|| dt|j  q$t| jtrc|d| d |t| j d	 | jd usjJ ||   d
|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r]  r   )rd   r   rt  r   r  r   r/   rl   rO   r   r  r$   r  r  rx   rv   ru  r8   ra  r{   r   rZ   r7  r   join)re   rl   linesr  rb  r   rg   rg   rh   r     s$   

zSchedulerNode.debug_str_extraSequence[Sequence[sympy.Expr]]c                 C  ra  rc   )rt  rm   rg   rg   rh   r    r_  zSchedulerNode.get_rangesc                 C  s6   t | jtjtjfsJ dt| jt| j S Nztype(self.node)=)r   rZ   r$   r~  r  rt   r   r   rm   rg   rg   rh   rj    s   zSchedulerNode.is_reductionc                 C  sF   t | jtjtjfsJ dt| jt | jtjo"t | jjtjS r  )r   rZ   r$   r~  r  rt   r   	SplitScanrm   rg   rg   rh   rk  "  s   
zSchedulerNode.is_split_scanc                 C  s   t | jtjS rc   r   rZ   r$   r  rm   rg   rg   rh   rl  *  r   zSchedulerNode.is_templater  c                 C  s   t | jtjr
| jS d S rc   r  rm   rg   rg   rh   r	  -     zSchedulerNode.get_template_node
index_varsSequence[sympy.Expr]c                 G  s   |    |   | | d S rc   )r  r+  r  )re   r  rg   rg   rh   run0  s   zSchedulerNode.rundict[sympy.Expr, sympy.Expr]c                 C  sH   | j }ttt|ttt|ksJ tttj|tj|}|S rc   )	rt  r  maprz   dictzipr-  r.  from_iterable)re   r  r  
var_rangesrg   rg   rh   ranges_from_index_vars5  s    

z$SchedulerNode.ranges_from_index_varsc              	   C  s   |  |}zCttt |. tj|  | j|  W d   n1 s'w   Y  W d   W dS W d   W dS 1 sAw   Y  W dS  tyW   t	
d| j  w )a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r  rO   set_ops_handlerr=   get_ops_handlerr   set_current_noderu  r   r   fatalrZ   )re   r  r  rg   rg   rh   r  B  s   

VzSchedulerNode.codegenT	pointwiser   c                 C  s:   |r| j nt| j \}}tj| j|tjjgt| gdS )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	rt  reversedr#   r  ru  sympySZerorz   )re   r  
keep_sizesignore_sizesrg   rg   rh   "pointwise_or_reduction_read_writesY  s   z0SchedulerNode.pointwise_or_reduction_read_writesc                 C     | j ddS )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  rm   rg   rg   rh   r  d     z#SchedulerNode.pointwise_read_writesc                 C  r  )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  rm   rg   rg   rh   reduction_read_writesk  r  z#SchedulerNode.reduction_read_writesro  rp  c                 C  s   |   rdS tdd |  D rdS t| jjdkrDt|tjrDt	t
| jj}t|tjs8J dt||j|jkoC|j|jkS dS )NFc                 s  rL  rc   )rw   rS  rg   rg   rh   r  u  rN  z,SchedulerNode.can_inplace.<locals>.<genexpr>r   ztype(write_dep)=)rl  r  r   rz   r   r   r   r#   r-   r  iterrt   indexsize)re   ro  	write_deprg   rg   rh   rr  r  s   zSchedulerNode.can_inplacer"  c                 C  s   t  }t| jtrP| j D ]A}|jdkrO|jdkrOd|jv r&|jd dks4t|j	dkrO|j	d dkrO|
d|jv r@|jd nt|j	dkrL|j	d	 nd
 q|S )Ncall_methodstoremode
atomic_add   r\  rl      r   r   )r   r   ru  r8   rO  rf   r  r   rz   r   r5  )re   buffers_store_as_atomic_addrZ   rg   rg   rh   _get_atomic_add_buffers  s   



z%SchedulerNode._get_atomic_add_buffersc                   s$   | j d ur| j drdS t  S )Ndevice_assert_asyncT)ru  has_oprk  rs  rm   rn  rg   rh   rs    s   
zSchedulerNode.has_side_effects)rY   rX   rZ   rv  ra   r   NN)rx  ry  rz  r{  ra   r   )rx  ry  rz  r  ra   r   )r}  r   r  r   ra   r   )r  r  ra   r   )r  rj   r  rj   ra   r   r   r  r   )ra   r  r   r$  )r  r  ra   r   )r  r  ra   r  )r  r  ra   r   r"  )r  r   ra   r   )ra   r   r!  r  )ru   r   r   __doc__r   r   rw  r  r  r  r  r  r  r   r  rj  rk  rl  r	  r  r  r  r  r?   r  r  rr  r  rs  rs  rg   rg   rn  rh   rX  m  sH   
 #















rX  group_snode/Union[FusedSchedulerNode, GroupedSchedulerNode]c                   sV    j } tjdd |D  t fddtjdd |D  D  jj  _	d S )Nc                 S  r0  rg   r   r}  rg   rg   rh   r         z3refresh_group_node_dependencies.<locals>.<listcomp>c                 3  "    | ]}|j   vr|V  qd S rc   rl   rT  r  r  rg   rh   r    r:  z2refresh_group_node_dependencies.<locals>.<genexpr>c                 S  r0  rg   )r   r}  rg   rg   rh   r     r  )
r  r  r#   
ReadWrites
merge_listr   unionr   r   r   )r  r  rg   r  rh   refresh_group_node_dependencies  s   r  rY   rX   r  r  c                 C  s   t | ttfs	J || _|| _d | _tjdd |D  | _t	|  t
dd | jD | _tdd | jD | _dd |  D | _d S )Nc                 S  s   g | ]
}|j d ur|j qS rc   )r   r}  rg   rg   rh   r         z#init_group_node.<locals>.<listcomp>c                 s  r  rc   r   r}  rg   rg   rh   r    r  z"init_group_node.<locals>.<genexpr>c                 s  r  rc   )r   r}  rg   rg   rh   r    r  c                 S  r   rg   r   r   rg   rg   rh   r     r   z#init_group_node.<locals>.<dictcomp>)r   r  GroupedSchedulerNoder  rY   rZ   r   r  r   r  r  r   r  r   r   r   )r  rY   r  rg   rg   rh   init_group_node  s   r  c                      s^  e Zd ZU dZded< edRdd	ZedSddZdTddZ	dU fddZ
edVddZdVddZedWddZdXd!d"ZdVd#d$ZdVd%d&ZdY fd*d+ZedWd,d-ZedWd.d/ZdZd1d2ZdVd3d4Zed[d5d6Zed[d7d8Zed[d9d:Zed\d<d=Zd]d?d@Zed[dAdBZd^dDdEZd_dHdIZd`dLdMZdVdNdOZed[ fdPdQZ   Z!S )ar  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r  rP  rS   rQ  ra   c                 C  s:  |j |j u sJ t|ttfsJ | rt|trt|jts"J t|j	j
dks,J ttt|j	j
ts9J tt|j	j
j}dd | D }t|dksSJ |d }t|j	j
dksaJ tt|j	j
}t|tspJ tt||j|j|j|jg|j	_
n	t|ttfsJ tt| | }| |j |S )Nr   c                 S     g | ]}|  r|qS rg   r  rM  rg   rg   rh   r         z+FusedSchedulerNode.fuse.<locals>.<listcomp>r   )rY   r   rX  r  rl  r  rZ   r5   rz   r   r   r  r  r.   rl   rO  r-   r   r  	var_namesr  r  r   r-  r.  )clsrP  rQ  rl   template_nodesr  writer  rg   rg   rh   fuse  s,   
zFusedSchedulerNode.fuser  c                 C  8   t td dd |  D }t|dkrd S t|}|S )Nc                 s  (    | ]}|  s| r| V  qd S rc   rl  rm  r  rM  rg   rg   rh   r        
z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>r   r   filterrO  rz   r  re   fpsr  rg   rg   rh   r       
z!FusedSchedulerNode.estimate_flopsr  r-   r  r   c                 C  s  |   rdS d}| jD ]%}t|tsJ |dur+t|t|jd kr+td  dS |jd }qd}|dus9J t||j	  krG|j	krNn n|
|}|sZtd|   dS t jd7  _td|  | | jD ]}t|tsvJ || qmt|  dS )	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)rl  r  r   rX  r/  rt  r  rT  rz   r  r  rd   r%   r  r  r  )re   r  r  r  r*  r  rg   rg   rh   r     s<   
 


z,FusedSchedulerNode.reorder_loops_by_dep_pairrY   rX   r   c                   s6   t  | t| || g | _t|dd dj| _d S )Nc                 S  s   t |  S rc   )rj   rj  r-  rg   rg   rh   r   -  s    z-FusedSchedulerNode.__init__.<locals>.<lambda>rZ  )rk  r   r  r_   r  r   )re   rY   r  rn  rg   rh   r   )  s   zFusedSchedulerNode.__init__rb   c                 C     d dd | jD S )N_c                 S     g | ]}|  qS rg   r   r}  rg   rg   rh   r   1      z/FusedSchedulerNode.get_name.<locals>.<listcomp>r  r  rm   rg   rg   rh   rd   /  rQ  zFusedSchedulerNode.get_namec                 C     | j d  S Nr   r  rd   rm   rg   rg   rh   rK  3  r   z!FusedSchedulerNode.get_first_namer"  c                 C     t jdd | jD  S )Nc                 S  r  rg   rT  r}  rg   rg   rh   r   8  r  z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>r   r  r  rm   rg   rg   rh   rT  6  rQ  z#FusedSchedulerNode.get_buffer_nameslist[SchedulerBuffer]c                 C  "   g }| j D ]	}||  q|S rc   r  r7  r   re   r}   rZ   rg   rg   rh   r   :     
zFusedSchedulerNode.get_outputsc                   sP    fddt  jD } jd j}|d ur|   td| dS )Nc                   s,   g | ]\}}    d | d|  qS )z.snodes[z] =
)rd   r   )r   r  rZ   rm   rg   rh   r   A  s    z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>r   r   r]  )	r  r  rZ   r7  r   ra  r{   r  r   )re   r  rZ   rg   rm   rh   r   @  s   
z"FusedSchedulerNode.debug_str_extrac                 C  s   dd | j D }|  d| S )Nc                 S  r  rg   )r   rM  rg   rg   rh   r   L  r  z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>z
, snodes: r  )re   
snodes_strrg   rg   rh   r   K  s   z"FusedSchedulerNode.debug_str_shortr!  r#  r  c                   s@   t  || t }t| jD ]}||| ||j qd S rc   )rk  r*  r   r  r  updater   )re   r!  r#  rZ   rn  rg   rh   r*  O  s   z!FusedSchedulerNode.set_last_usagec                 C  r  )Nc                 S  r  rg   )r/  r}  rg   rg   rh   r   ^  r  z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>r  rm   rg   rg   rh   r/  \  rQ  z$FusedSchedulerNode.used_buffer_namesc                 C  r  )Nc                 S  r  rg   )r(  r}  rg   rg   rh   r   c  r  zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r  rm   rg   rg   rh   r(  `  s   z/FusedSchedulerNode.used_or_aliased_buffer_namesr^  c                 C  ra  rc   r  rm   rg   rg   rh   rO  f  r_  zFusedSchedulerNode.get_nodesc                 C  s   t | j d|   dS )Nz(nodes=r   r   rm   rg   rg   rh   r   i  r   zFusedSchedulerNode.__repr__c                 C  rR  )Nc                 s  rL  rc   )rj  r}  rg   rg   rh   r  n  rN  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>r  r  rm   rg   rg   rh   rj  l  rU  zFusedSchedulerNode.is_reductionc                 C  rR  )Nc                 s  rL  rc   )rk  r}  rg   rg   rh   r  r  rN  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r  rm   rg   rg   rh   rk  p  rU  z FusedSchedulerNode.is_split_scanc                 C  rR  )Nc                 s  rL  rc   r  r}  rg   rg   rh   r  v  rN  z1FusedSchedulerNode.is_template.<locals>.<genexpr>r  rm   rg   rg   rh   rl  t  rU  zFusedSchedulerNode.is_templater  c                 C  s$   | j D ]}| r|   S qd S rc   )r  rl  r	  r   rg   rg   rh   r	  x  s
   
z$FusedSchedulerNode.get_template_nodetorch.devicec                 C  s
   | j d S r  )r   rm   rg   rg   rh   r     r   zFusedSchedulerNode.get_devicec                 C  rR  )Nc                 s  rL  rc   )r  r}  rg   rg   rh   r    rN  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r  rm   rg   rg   rh   r    rU  z+FusedSchedulerNode.has_aliasing_or_mutationr  c                 C     t rc   NotImplementedErrorr  rg   rg   rh   r    r   z'FusedSchedulerNode.update_mutated_namesrl   r,   c                 C  r	  rc   r
  )re   rl   rg   rg   rh   r    r   zFusedSchedulerNode.add_fake_depro  rp  c                 C  r	  rc   r
  rq  rg   rg   rh   rr    r   zFusedSchedulerNode.can_inplacec                 C  s  |   }ddd | jD }t }|| dt| j d| d| dt| jj	 d| d	t| j
 d| d
t| jj| j
  d| d |  |  D ]	}||  qOW d   n1 scw   Y  |d z	||   W n ty   tjddd Y nw |  S )r   rq   c                 s  s    | ]}t |jV  qd S rc   )rt   ru   rY  rg   rg   rh   r        z/FusedSchedulerNode.debug_str.<locals>.<genexpr>rp   r   r   r   r   r   r   z.outputs = [
            Nrr   r   Tr   )rd   r  r  rF   r   rt   ru   rx   r   r   r   r   r{   r   r   rs   r   r   r   r   r|   r   )re   rl   node_typestrr   r   rg   rg   rh   r     sJ   

	
zFusedSchedulerNode.debug_strc                   s(   | j d urtdd | j D S t  S )Nc                 s  rL  rc   )rs  rM  rg   rg   rh   r    rN  z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>)r  r  rk  rs  rm   rn  rg   rh   rs    s   

z#FusedSchedulerNode.has_side_effectsrP  rS   rQ  rS   ra   r  r#  r  )rY   rX   r  r  ra   r   r   r  ra   r   r  r   r   r$  )ra   r  r  )rl   r,   ra   r   r!  )"ru   r   r   r  r   classmethodr  r?   r  r  r   rd   rK  rT  r   r   r   r*  r/  r(  rO  r   rj  rk  rl  r	  r   r  r  r  rr  r   rs  rs  rg   rg   rn  rh   r    sR   
 !
)










r  c                      s   e Zd ZU dZd<ddZd=d	d
Zed>ddZed?ddZ			d@dA fddZ	edBddZ
edCd!d"ZeZd#ed$< edDd&d'ZedCd(d)ZdEd*d+ZdEd,d-ZdFd.d/ZdGd0d1ZdHd3d4ZdId6d7ZdJd:d;Z  ZS )KForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerrS   ra   r[   c                 C  s2   |  D ]}| | jv r| j|    S qd S rc   )r   rd   read_to_node)re   r  r   rg   rg   rh   get_consumer_subnode_for  s
   z3ForeachKernelSchedulerNode.get_consumer_subnode_forconsumerc                 C  sp   t t  }|jjD ] }|j| jjvrq	| jj|j  }|| jv r)|	| j|  q	t
|dkr6tt|S d S Nr   )r   rS   r   r   rl   rY   r   ri   name_to_noder5  rz   r  r  )re   r  	producersrd	node_namerg   rg   rh   get_producer_subnode_for  s   

z3ForeachKernelSchedulerNode.get_producer_subnode_forr   c                   s&  t  |}  r;| r;tt  tt|}t jt|jk}|s)|d |o:t fddt j|jD S | re 	 rI|d dS tt|}|
 }|d ur_|j |S |d dS   r|	 rs|d dS tt   |}|d ur j||S |d dS td	)
Nzforeach do not have same lengthc                 3  s"    | ]\}} j ||V  qd S rc   )rY   can_fuser   lrr  rg   rh   r    s
    
z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)rK  rn  typingcastr  rz   r  r\  r  rj  r  rY   r  r  r  )r  r  r  whyforeach_matchconsumer_subnodeproducer_subnoderg   r   rh   r    sJ   


z#ForeachKernelSchedulerNode.can_fusec                 C  s  |  s
|  s
J |  rtt|}|j}|j}ntt|}|j}|j}d }d }|  rL|  rLtt|}tt|}dd t|j|jD }nj|  rtt|}||}g }|}d }|jD ]}	|	|u rxt	
|	|}
|
}||
 qd||	 qdn7|  rtt|}||}g }|}d }|jD ]}	|	|u rt	
||	}
|
}||
 q||	 qntd| |j|||||dS )Nc                 S  s   g | ]
\}}t ||qS rg   )r  r  r  rg   rg   rh   r     s    
z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rn  r!  r"  r  r'  r*  r  r  r  r  r  r  r  r  rY   )r  r  r  r'  r*  r(  r)  fused_nodesr&  rZ   new_noder%  rg   rg   rh   r    sj   



zForeachKernelSchedulerNode.fuseNFrY   rX   r  r  r'  r(  r)  r*  r   c                   s  i  _ i  _|d u s|d u r4t || |D ]}|jjD ]}| j |j< q| D ]}	| j|	< q*qn| _| _	d  _
g  _ tj|j|jg t fddt|j|jD  jj  _t|j|jg _t|j|jg _| rt|tsJ ||}
}nt|tsJ ||}
}|
j _ j|j |
j _| D ]}	| j|	< qdd  j	D  _| _|d  }|sJ |t !dfff _"tt#j$j%   _&| _'d S )Nc                 3  r  rc   r  r  rm   rg   rh   r  f  s    z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>c                 S  s&   i | ]}|j  D ]\}}||q	qS rg   )r   items)r   r*  r%  vrg   rg   rh   r     s
    

z7ForeachKernelSchedulerNode.__init__.<locals>.<dictcomp>r   combo_kernel)(r  r  rk  r   r   r   rl   rP  rY   r  rZ   r_   r  r#   r  r  r   r  r   r   r  r   r  r   rn  r   r  r   r  r   r'  r   r  Exprr   r   fxNoder  r*  )re   rY   r  r'  r(  r)  r*  rZ   r  rl   foreach_node
other_noderg  rn  rm   rh   r   D  sb   	


z#ForeachKernelSchedulerNode.__init__r  c                   s   dd |D }|rt dt|dd |D  dd |D }dd |D }|r/t dt| dd |D }d	d |D   rHt d
t    fdd|D }|S )Nc                 S     g | ]	}t |tr|qS rg   )r   r  r}  rg   rg   rh   r     s    z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>z/ComboKernels: %d external nodes are filtered %sc                 S  s    g | ]}|j d ur|j  qS rc   rZ   r  rM  rg   rg   rh   r     s     c                 S  s   g | ]}t |ttfs|qS rg   )r   r  r  r}  rg   rg   rh   r     s    c                 S  r5  rg   r   r  r}  rg   rg   rh   r     
    
z+ComboKernels: %d foreach nodes are filteredc                 S  s   g | ]	}t |ts|qS rg   r7  r}  rg   rg   rh   r     r8  c                 S  r  rg   r  r}  rg   rg   rh   r     r  z0ComboKernels: %d template nodes are filtered: %sc                   s   g | ]}| vr|qS rg   rg   r}  r  rg   rh   r     r  )r   rT  rz   )r  r  externfiltered_nodesforeach_nodesrg   r9  rh   combinable_nodes  s6   z+ForeachKernelSchedulerNode.combinable_nodeslist[list[BaseSchedulerNode]]c                   sD   |   }g }d |D ]| fddtdt D  q
|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                   s   g | ]
}||   qS rg   rg   )r   r  max_num_nodesr  rg   rh   r     s    zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>r   )_topological_sort_nodesr7  rangerz   )rY   sorted_nodesgrouped_nodesrg   r@  rh   &_default_group_nodes_for_combo_kernels  s   zAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                 C  s
   | t _d S rc   r  rH  )rI  rg   rg   rh   %set_group_algorithm_for_combo_kernels  s   z@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsc                 C  s
   t | S rc   rJ  rY   rg   rg   rh   group_nodes_for_combo_kernels  s   
z8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsc                 C  r	  rc   r
  rm   rg   rg   rh   r+    r   z#ForeachKernelSchedulerNode.mark_runc                 C  r	  rc   r
  rm   rg   rg   rh   r    r   z"ForeachKernelSchedulerNode.codegenc                 C  r   rq  rg   rm   rg   rg   rh   rn    r   z%ForeachKernelSchedulerNode.is_foreachc                 C  s
   t | jS )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  rm   rg   rg   rh   get_subkernel_nodes  s   
z.ForeachKernelSchedulerNode.get_subkernel_nodesr^  c                 C  s   t tjdd | jD S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c                 s  rL  rc   )rO  r}  rg   rg   rh   r    rN  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>)r   r-  r.  r  r  rm   rg   rg   rh   rO    s   z$ForeachKernelSchedulerNode.get_nodesrb   c                 C  r  r  )r  rK  rm   rg   rg   rh   rK    r   z)ForeachKernelSchedulerNode.get_first_namerF  rG  c                 C  s*   t | || jj | jD ]}|| qd S rc   )rH  rY   r   r  rI  )re   rF  rZ   rg   rg   rh   rI    s   
z/ForeachKernelSchedulerNode.prune_redundant_deps)r  rS   ra   r[   )r  rS   ra   r[   r  rS   r  rS   ra   r   )r  rS   r  rS   ra   r  )NNF)rY   rX   r  r  r'  r   r(  r[   r)  r[   r*  r   ra   r   r  r  ra   r  )rY   rX   ra   r>  )rI  rG  ra   r   r   r   ra   r  r   r   r  )ru   r   r   r  r  r  r  r  r  r   r=  r%  rF  rH  r   rK  rM  r+  r  rn  rN  rO  rK  rI  rs  rg   rg   rn  rh   r    s:   
 

	.EH!






r  c                      s   e Zd ZU dZded< ed+ddZ	d,d- fddZd.ddZd/ddZ	e
d0ddZd0ddZe
d1ddZd2ddZe
d3d!d"Zd4d$d%Zed5d)d*Z  ZS )6r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r  ra   c                   sX   |d j  t fdd|D sJ |  |}|D ]	}| j| < q| j| < |S )Nr   c                 3  s    | ]}|j  u V  qd S rc   rL  rM  rL  rg   rh   r    r  z.GroupedSchedulerNode.create.<locals>.<genexpr>)rY   r\  rF  rd   )r  r  grouped_snoder*  rg   rL  rh   create  s   

zGroupedSchedulerNode.createFrY   rX   temp_groupingr   r   c                   s"   t  | t| || || _d S rc   )rk  r   r  rT  )re   rY   r  rT  rn  rg   rh   r   	  s   
zGroupedSchedulerNode.__init__c                 C  sD   | j r| jS | jD ]
}|| jj| < q	| jj|  = | j| jS )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )rT  r  rY   rF  rd   
fuse_nodes)re   r*  rg   rg   rh   unpack  s   
zGroupedSchedulerNode.unpackfake_depr,   c                 C  s"   |  | j| | j| d S rc   )r  r   r  r   r5  )re   rW  rg   rg   rh   r  %  s   z!GroupedSchedulerNode.add_fake_deprb   c                 C  r  )Nr  c                 S  r  rg   r   r}  rg   rg   rh   r   +  r  z1GroupedSchedulerNode.get_name.<locals>.<listcomp>r  rm   rg   rg   rh   rd   )  rQ  zGroupedSchedulerNode.get_namec                 C  r  r  r  rm   rg   rg   rh   rK  -  r   z#GroupedSchedulerNode.get_first_namer"  c                 C  r  )Nc                 S  r  rg   r  r}  rg   rg   rh   r   2  r  z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>r  rm   rg   rg   rh   rT  0  rQ  z%GroupedSchedulerNode.get_buffer_namesr   c                 C  r  rc   r  r  rg   rg   rh   r   4  r  z GroupedSchedulerNode.get_outputsr  c                 C  r  )Nc                 s  r  rc   r  rM  rg   rg   rh   r  @  r  z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>r   r  r  rg   rg   rh   r  :  r  z#GroupedSchedulerNode.estimate_flopsr^  c                 C  ra  rc   r  rm   rg   rg   rh   rO  L  r_  zGroupedSchedulerNode.get_nodesr  rS   r  c                 C  r   r  rg   )r  r  r  rg   rg   rh   r  O  r  zGroupedSchedulerNode.can_fuse)r  r  ra   r  F)rY   rX   r  r  rT  r   ra   r   rQ  )rW  r,   ra   r   r   r  r  r#  r   rO  )ru   r   r   r  r   r  rS  r   rV  r  r?   rd   rK  rT  r   r  rO  r  rs  rg   rg   rn  rh   r    s(   
 	




r  rg   stride_lengthslist[list[int]]r  r  priority_idxr  	list[int]c                   sb   t jd fdd}ttttd }t|dkr&fdd	|D tjr/|j|d
 |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    r0  rj   bra   c                   s     dks dkrt   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krIdS ||krOdS t  S )	Nr   c                      g | ]}t |  qS rg   absr   sl)r0  rg   rh   r   g  r  z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                   r^  rg   r_  ra  )r]  rg   rh   r   h  r  c                 s  s$    | ]\}}|d kp||k V  qdS r   Nrg   r   sl_asl_brg   rg   rh   r  l      
z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s  s$    | ]\}}|d kp||k V  qdS rc  rg   rd  rg   rg   rh   r  o  rg  r  )r@   r  r  )r0  r]  stride_len_astride_len_ba_firstb_firstr  rY  )r0  r]  rh   	index_cmp_  s   
z"pick_loop_order.<locals>.index_cmpr   c                      g | ]} | qS rg   rg   )r   pi)rY  rg   rh   r   }  r  z#pick_loop_order.<locals>.<listcomp>rZ  N)r0  rj   r]  rj   ra   rj   )		functools
cmp_to_keyr   r  rC  rz   r!   pick_loop_orderssort)rY  r  r[  rm  orderrg   rl  rh   pick_loop_orderU  s   
ru  c                   @  sV   e Zd ZU ded< dZded< dZded< dd	d
ZdddZdddZdddZ	dS )NodeUser$Union[BaseSchedulerNode, OutputNode]rZ   Fr   rr  is_weakra   rj   c                 C  s   t | j | j| jfS rc   )rk   rZ   rd   rr  rx  rm   rg   rg   rh   rn     r  zNodeUser.__hash__otherobjectc                 C  s2   t |to|  | ko| j|jko| j|jkS rc   )r   rv  rd   rr  rx  re   ry  rg   rg   rh   __eq__  s   


zNodeUser.__eq__rb   c                 C  r   rc   r   rm   rg   rg   rh   rd     r   zNodeUser.get_namec                 C  s.   | j |j u sJ t| j | jo|j| jo|jS rc   )rZ   rv  rr  rx  r{  rg   rg   rh   r     s   

zNodeUser.mergeNr   )ry  rz  ra   r   r   )ry  rv  ra   rv  )
ru   r   r   r   rr  rx  rn   r|  rd   r   rg   rg   rg   rh   rv    s   
 


rv  r   c                   C  s   t jS rc   )r!   rF  rg   rg   rg   rh   *used_non_deterministic_runtime_estimations  r_  r}  c                      sF  e Zd ZdZdddZd fdd	ZdddZedddZej	dddZdddZ
dddZdddZdddZddd Zdd!d"Zdd#d$Zdd&d'Zdd)d*Zdd,d-Zdd.d/Zdd0d1Zdd2d3Zdd4d5Zdd8d9Z	:ddd?d@ZddDdEZddFdGZddIdJZddNdOZddPdQZddRdSZdddUdVZ ddWdXZ!ddZd[Z"dd\d]Z#dd^d_Z$ddbdcZ%ddddeZ&ddhdiZ'ddjdkZ(ddldmZ)ddrdsZ*ddudvZ+ddwdxZ,ddydzZ-dd}d~Z.dddZ/d ddZ0dddZ1dddZ2dddZ3dddZ4dddZ5dddZ6dddZ7dddZ8dddZ9dddZ:dddZ;	dd	ddZ<d
ddZ=dddZ>dddZ?dddZ@dddZAdddÄZBdddńZCdddǄZDdddʄZEddd̄ZFddd΄ZGdddфZHdddӄZIdddՄZJdddׄZKdddڄZLddd܄ZMdddZNdddZO  ZPS (  rX   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    r  list[ir.Operation]ra   r   c                 C  s8   t d | | W d    d S 1 sw   Y  d S )NScheduler.__init__)r   _initre   r  rg   rg   rh   r     s   
"r  c           	        s  t     tj_i  _tt _t	
  _t  _tg tjj tjj tjj  _ fdd|D  _d  _    jtjj   jD ]}|  qOd  _   _dd  jD  _dd  jD  _ j  _i  _i  _ t!" j j j _ #   $ j _ %  dd  jD  _ &  t' j(t) j7  _(ddl*m+}m,} | j t) j _- .   $ j _tt/t0t0f    _1t2j3d urt23 j _ 4 j _t2j5d urt25 j _ 6   7  t2j8rt9d	d
d
d  j:d d W d    n	1 sw   Y  t2j;r?ddl<m;} | j j jttjj ttj=  _t2j>rt2j;sTddl<m?} | j j t@ rgtAjBrgddl!mC} | j ddlDmE} |ddd  fddd t!F j _ G  tHjIj2jJrtHjIj2jKjLr M j _ N j _ O  tHjIj2jPjQr R  | j tjST j  U  t  _Vi  _WtXdY fdd d S )Nc                   s   g | ]}  |qS rg   )create_scheduler_noderY  rm   rg   rh   r         z#Scheduler._init.<locals>.<listcomp>c                 S  r   rg   r   rY  rg   rg   rh   r     r   z#Scheduler._init.<locals>.<dictcomp>c                 S  s$   i | ]}|  D ]}| |qqS rg   )r   rd   )r   rZ   r   rg   rg   rh   r     s
    
c                 S  r   rg   r   rY  rg   rg   rh   r     r  r   )log_ir_post_fusionlog_ir_pre_fusion#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodesr   )reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                   S  s
   dddS )N#scheduler_nodes_before_comm_overlapstring)rl   encodingrg   rg   rg   rg   rh   r   4	  s   z!Scheduler._init.<locals>.<lambda>c                     s   d dd t jD S )Nz

c                 S  s2   g | ]\}}d | d|   d|   qS )zsnode[rr   z buffer_names:)r   rT  r  rg   rg   rh   r   9	  s    
z5Scheduler._init.<locals>.<lambda>.<locals>.<listcomp>)r  r  r  rg   rm   rg   rh   r   8	  s
    )metadata_fn
payload_fngraph_statsc                     s    j  jt jdS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrz   r  rg   rm   rg   rh   r   \	  s   )Zrk  r   rO   r   rY   backendsr  _post_grad_graph_counterr  r-  count_graph_partition_counterr   r  r  keys	constantstorchbind_constantsr9  r  current_nodeupdate_zero_dim_cpu_tensorr  r   default_device_contextget_donated_buffersr   r  r   copyrF  r#  r   r    decide_global_ordering_of_commsrR   topological_sort_scheduledead_node_eliminationcompute_ancestorsr%   ir_nodes_pre_fusionrz   torch._inductor.debugr  r  r  create_foreach_nodesr/  rb   logged_slow_fusionr!   _pre_fusion_custom_passrU  _post_fusion_custom_passr  finalize_multi_template_bufferscombo_kernelsr   create_combo_kernel_nodesr  memoryget_output_names reorder_for_compute_comm_overlapr  r}  r"   6runtime_estimations_align_across_all_distributed_ranksr  torch._loggingr  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesr   r   graph_partitiontriton
cudagraphs&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesrT  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)	re   r  rZ   r  r  r  r  r  r  rn  rm   rh   r    s   











	




zScheduler._init!dict[str, SchedulerDonatedBuffer]c                 C  sD   i }t jjD ]}tt jj| tjrt| t jj| d d||< q|S )N)r\   )rO   r   graph_inputs_originalr   r$   DonatedBufferr   )re   name_to_donated_bufrl   rg   rg   rh   r  c	  s   

zScheduler.get_donated_buffersr   c                 C  s   t jjS rc   rO   r   current_devicerm   rg   rg   rh   r  n	  s   zScheduler.current_devicerg  c                 C  s   |t j_d S rc   r  rf  rg   rg   rh   r  r	  r)  c                 C  s4   t jdddkrddlm} || jdd dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr$  rT  r  r  )re   r  rg   rg   rh   r  v	  s   zScheduler.debug_draw_graphlabelrb   c                 C  s4   t tjrt d| | jD ]}|  qd S d S )Nz%s:)r   isEnabledForloggingINFOr   r  r  )re   r  rZ   rg   rg   rh   debug_print_nodes}	  s   

zScheduler.debug_print_nodesrZ   r   rS   c                 C  s`   |  d us
J d| rt| |S t|tjtjfr!t| |S t|tjr,t	| |S t
|)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r   r$   r~  r  rX  r?  r  r  r   rg   rg   rh   r  	  s   


zScheduler.create_scheduler_nodec                   s   t  g }j  tjj D ]9} fdd|D }|sq| fdd|D }tj	dk}t
|d|d}|| |D ]}|j|< qAqfddjD t| _d S )Nc                   s(   g | ]}| v rt j| ts|qS rg   )r   r  r  r
  )kept_node_namesre   rg   rh   r   	  s    z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                   s   g | ]} j | qS rg   r  r
  rm   rg   rh   r   	  r  r   Fr'  r*  c                   s   g | ]
}|   vr|qS rg   r   rM  )removed_node_namesrg   rh   r   	      )r   rF  r  rO   r   listsr   r  r!   combo_kernels_autotuner  r  r  r   )re   fe_nodesnamesr  r*  fe_noderl   rg   )r  r  re   rh   r  	  s6   





zScheduler.create_foreach_nodesc           !        s  G  fdddt t  t jD ]b}| D ][}| }t|jj	t
jr1t| dkr1q| D ]?}|v rc|v rc| }| }|| } D ]}| |u s]| |u ra||< qOq5|v rn| |< q5| |< q5qqd.fdd				d/d0fdd}	i }
tjj D ]3\}}t|tjr|jD ]}d|
|< qqt|t
jrdd | D }|D ]}|jD ]}d|
|< qqqd	}jD ]-}|jdusJ t|j dd d}|D ]}t|tjsJ d}||
vr| |
|< qqˈjD ]%}td|j |rO|jdusJ t|jjdddd d}|D ].}||
v s0J | d|
 |
|  }durMj|  D ]}|t |  q@q t|j!j"dkrmt#t$|j!j" }rmt|t%rm|j&}nd}| D ]d}t|' dksJ |' D ]Q}|}|	|| |t ||d | jD ]6}| | krqt|jt(sJ |j) D ]}|}|t*|| d |	||dd  qqqqs|j!j+D ]}t|t*s|	|j,||-| q|.j/ | D ]'}|' D ]}| j/|< | j/|< j01||j0| < q qqtj2 D ]}td!| |	|t3t | q(|rtjj4D ]?}|jddD ]5}||
v s\J | d|
  |
|  }r~j| ) D ]}td"|| |	|t3t | qjqJqBj/D ],}|tjjv r|	|t3t | tjj56| q|tjj7v r|	|t3t | qd#d$ t8tjj D fd%dtjj5D tj_9jD ]}| D ]}|:|  j qՐqψj;D ]}j;| :| j qt< }|=d&  D ].\}}|>  d'd |jD }|=d(| d)| d* W d   n	1 s+w   Y  q|=d+ |? @ } tAd, tAd-|  dS )1zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                      s8   e Zd ZdZ		ddd	d
ZdddZd fddZdS )z1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nr-  Optional[list[_T]]
membershipOptional[OrderedSet[_T]]ra   r   c                 S  s   |pg | _ |p	t | _d S rc   )r-  r   r  )re   r-  r  rg   rg   rh   r   	  s   
z:Scheduler.compute_dependencies.<locals>.DedupList.__init__	node_userrU   c                 S  s*   || j v rd S | j| | j | d S rc   )r  r-  r  r5  )re   r  rg   rg   rh   r  	  s   
z8Scheduler.compute_dependencies.<locals>.DedupList.appendry  DedupList[_T]c                   s4   t  j|j} j fdd|jD  }||S )Nc                   s   g | ]	}| j vr|qS rg   )r  r}  rm   rg   rh   r   	      zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>)r   r  r  r-  )re   ry  new_membership	new_items	DedupListrm   rh   __add__	  s
   
z9Scheduler.compute_dependencies.<locals>.DedupList.__add__r  )r-  r  r  r  ra   r   )r  rU   ra   r   )ry  r  ra   r  )ru   r   r   r  r   r  r  rg   r  rg   rh   r  	  s    
r  r   rZ  rb   ra   c                   s   | j v r j |  S | S rc   r   rZ  )r  re   rg   rh   r  	  s   
z.Scheduler.compute_dependencies.<locals>.renameFused_by_namer{  rw  rr  r   rx  r   c                   s    |   t||| d S rc   )r  rv  )r  r{  rr  rx  )name_to_usersr  rg   rh   add_user
  s   
z0Scheduler.compute_dependencies.<locals>.add_userNc                 S  s   g | ]
}t |tjr|qS rg   )r   r  r0  r   r  rg   rg   rh   r   
  r  z2Scheduler.compute_dependencies.<locals>.<listcomp>c                 S  ra  rc   r  r-  rg   rg   rh   r   #
      z0Scheduler.compute_dependencies.<locals>.<lambda>rZ  Tzscheduling %s)unbacked_onlyc                 S  ra  rc   r  r-  rg   rg   rh   r   6
  r  z not in )r  )mutating_buf)rx  zscheduling output %sz+scheduling output %s for unbacked symint %sc                 S     i | ]\}}||qS rg   rg   )r   r  rl   rg   rg   rh   r   
  r   z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                   rn  rg   rg   r
  )	inp_namesrg   rh   r   
      r  c                 S  r  rg   r   )r   r.  rg   rg   rh   r   
  r  'z': rq   r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)rZ  rb   ra   rb   )FF)
r  rb   r{  rw  rr  r   rx  r   ra   r   )Br   rU   r  r   r  r   rd   r   rZ   rv   r$   r7   rz   rw   r  rO   r   r  r-  r  r0  r   	TensorBoxr   r_  get_unbacked_symbol_defsSymbolr   rT  get_free_symbol_usesr  r  r.   r   r   r  r  r-   r  ry   rS   rT  r/   r   rl   rr  r  r   r#  r$  r  r   graph_outputsmutated_inputsr5  r  r  mutated_input_idxsr   r   rF   r   r{   r|   r   compute_dependencies_log)!re   rZ   buf1	buf1_name	buf2_namelist1list2combinedr[  r  unbacked_symbol_to_origin_noderl   valfssym_sizer  has_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r  	node_modealt_namer~   
other_namer  rb  r   logbufr  r_   rb   rg   )r  r  r  r  re   rh   rR   	  sD  






	
	








zScheduler.compute_dependenciesc                   sF  ddl m}m}m}m} ttjj	 }| j
|}tjjjs&| j
 j ttj }| j
||\}}	}	dd tt j
D |D ]&}
|
jdkrR|
jdkrRqE|
j }|
j d | |
j d | qEddlm} |  d fdd}g }t j
D ]\}}|| ||||t j
d kd q| _
d S )Nr   )r  compute_memory_timelineFreeableInputBufferget_freeable_input_bufc                 S  s   g | ]}g g fqS rg   rg   )r   r  rg   rg   rh   r   
  r  z7Scheduler.insert_memory_check_nodes.<locals>.<listcomp>r   )register_check_mem_opstep_idxrj   is_final_stepr   ra   r  c                   sn   |  d }|  d }|||g}t jttddtjjjjg |dd d}d j	|  
  |_t |S )	Nr   r   re  )rg  c                 S  s   | |d |d |d dfS )Nr   r   r  )alivedeadr  rg   )tensor_argsr4  rg   rg   rh   r   
  s   zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>)rv   r   r  nontensor_argsunflatten_args
mem_check_)r$   MemoryCheckKernelr7   r   rg  r:  _inductor_debugcheck_memory_stepdefaultr  rd   operation_namer  )r  r  expected_newly_aliveexpected_newly_deadr  rZ   re   step_allocs_deallocsrg   rh   construct_mem_check_node
  s   


zEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node)r  )r  rj   r  r   ra   r  )r  r  r  r  r  r   rO   r   r  r  r  r   r   r!   r  r   r  rC  rz   
size_alloc	size_freer  rd   
start_stepr  end_step#torch._inductor.runtime.debug_utilsr  r  )re   r  r  r  r  r  name_to_freeable_input_bufr  buf_info_listr  buf_inforb  r  r+  	new_nodesr  rZ   rg   r)  rh   r  
  sB   





z#Scheduler.insert_memory_check_nodesc                   s   g }t | jD ]uddd d} D ]$}t fdd	|jD }|r6td
|  tj	j
|  qd}q  o@| }|sI| qtd  tj	j  jjD ]}|j| jv r{| j|j j}fdd|D | j|j _q^qtt || _| jD ]  qdS )z0
        Remove any nodes without users
        r~   rv  ra   r   c                 S  s   | j p
|  tjjv S rc   )rx  rd   rO   r   r<  )r~   rg   rg   rh   can_eliminate_user   r  z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userFc                 3      | ]} |V  qd S rc   rg   r   u)r5  rg   rh   r    rN  z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead buffer: %sTzremoved dead operation: %sc                   s"   g | ]}|j    kr|qS rg   r   r7  r  rg   rh   r     s    z3Scheduler.dead_node_elimination.<locals>.<listcomp>N)r~   rv  ra   r   )r  r  r   r\  r_   r   rT  rd   rO   r   r  r5  rs  r  r<  r   r   rl   r   r   rE  )re   updated_nodesactive_buffersr   can_eliminater  r_   rg   )r5  rZ   rh   r  
  s6   



zScheduler.dead_node_eliminationr  c                   s^   t t  t  g d fdd|D ]}| D ]}| |< qq|D ]}| q&S )	z?
        Ensure nodes is in topologically sorted order
        rZ  rS   ra   r   c                   sV   | vr) |  t| jdd dD ]}|j vrq |j  q|  d S d S )Nc                 S  ra  rc   r  )drg   rg   rh   r   -  r  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>rZ  )r5  r_  r   rl   r  )rZ  r  r  r}   seenvisitrg   rh   r?  *  s   

z2Scheduler.topological_sort_schedule.<locals>.visitN)rZ  rS   ra   r   )r   rS   r  rT  )re   r  rZ   rl   rg   r=  rh   r     s   



z#Scheduler.topological_sort_scheduler*  c                   sr   t  }t|ttttfr|jD ]}||j qn
t	dt
| d fdd|D }tt  fdd|D S )Nz+get_unmet_dep_nodes is not implemented for .c                 3  s    | ]
} j |  V  qd S rc   )r   ri   r  rm   rg   rh   r  L      z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>c                 3  s    | ]} j | V  qd S rc   rF  rY  rm   rg   rh   r  M  r  )r   r   rX  r  r  r  r   r5  rl   RuntimeErrorrt   r   )re   r*  
unmet_depsr  unmet_dep_opsrg   rm   rh   _get_unmet_dep_nodes;  s"   
	zScheduler._get_unmet_dep_nodesr>  c                 C  s   g }t | jd}i }| jD ]!}| |}t|||< |D ]}||g }|| |||< qqdd | D }|rf|| |D ]}	||	g D ]
}
||
  d8  < qJ||	 qBdd | D }|s;|rlJ d|S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                 S     g | ]
\}}|d kr|qS r   rg   r   rZ  r.  rg   rg   rh   r   ^  r  z5Scheduler._topological_sort_nodes.<locals>.<listcomp>r   c                 S  rG  rH  rg   rI  rg   rg   rh   r   e  r  zTopological sort failed!)	r  fromkeysr  rF  rz   r$  r  r-  r4  )re   rt  r  childrenrZ   r8  r  czero_deg_nodesrZ  r~   rg   rg   rh   rB  O  s,   




z!Scheduler._topological_sort_nodesc                 C  s~   i }| j D ]'}t }|jD ]}| j|j  }|| ||| O }q||| < ||_qt	| j D ]
\}}||_
||_q2dS )z.
        Populate each node.ancestors
        N)r  r   r   r   rl   ri   r5  rd   r   r  r   r   )re   name_to_ancestorsrZ   r   r  dep_node_namert  rg   rg   rh   r  i  s   


zScheduler.compute_ancestorsc                 C  sf   t jsd S | jD ](}t|ttfr| st jdkrq| D ]}t|tr*|	 r+q|
  qqd S )Nhalide)r!   r  r  r   rX  r  rI   cpu_backendrO  rl  r  )re   rZ   r*  rg   rg   rh   r  |  s   


zScheduler.merge_loopsc                 C  s   t ddddC tdD ]4}t|}td|d | | |}t|}td|d || ||ks6|dkr@td|d   nq|W  d	   S 1 sMw   Y  d	S )
zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTr  
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   rC  rz   rS  rT  fuse_nodes_once)re   r  r  old_lennew_lenrg   rg   rh   rU    s4   
$zScheduler.fuse_nodesc                 C  s8   g }| j D ]}|t|tr| n|g q|| _ dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r7  r   r  rV  )re   r4  rZ   rg   rg   rh   r    s   

zScheduler.process_grouped_nodesr^  tuple[float, str]c                 C  sh   t |dksJ |d  }|| _| |}tdddd ||W  d   S 1 s-w   Y  dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)r  dynamo_compile_column_usN)rz   r   r  r  r   rX  )re   r  rg  backendrg   rg   rh   rX    s   
$zScheduler.benchmark_fused_nodesNbenchmark_kernelr   hint_overrideOptional[int]c                 C  sh   t |dksJ |d  }|| _| |}td |j|||dW  d   S 1 s-w   Y  dS )rW  r   rX  r]  N)rz   r   r  r  r   generate_kernel_code_from_nodes)re   r  r\  r]  rg  r[  rg   rg   rh   r`    s   


$z)Scheduler.generate_kernel_code_from_nodesmoduler   r  c                 C  sF   || _ | |}td ||W  d   S 1 sw   Y  dS )rW  rX  N)r  r  r   benchmark_codegened_module)re   ra  rg  r[  rg   rg   rh   rb    s
   

$z$Scheduler.benchmark_codegened_modulec                   s  ddd}t | jD ]\}}t|trt|jtjr|j}tjj	s+|
 \}}ntd	d
 | D }t|tjjjrztjrsi }||d< tjD ]!}|j|d}	dd |	 D }
t|
 dd dd }|||< qJ|j| n|j| q
| }|j}t|tjsJ |j}t|tjsJ |j|_||| | |}|| j|< || j| < || j| < i  t|j j!|j"D ]}| j#$|j%d }r|j% |< qd fdd}||j"|_"||j j!|j _!t&|' |' D ]\}}|| j(| < |j)|_)q|j*|_*|j+|_+|j,|_,q
dS )a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        	orig_nodeir.MultiTemplateBufferr,  ir.OperationBufferra   r   c                 S  s   |  }|   }t|trt|tsJ | }|  }t|tr&t|ts(J tjj|= ||_tjj|= ||_	tjj
| }tjj
| |tjj
|< |tjj|< tjj| }tjj| |tjj|< |tjj|< d S rc   )rd   r   rb   rJ  rO   r   r6  rl   
name_to_opr&  buffersr  remove
operations)rc  r,  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigrg   rg   rh   replace_operation_buffer  s$   

zKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_bufferc                 s  s$    | ]}t |tjjjr|V  qd S rc   )r   r   r   select_algorithmExternKernelCaller)r   timingrg   rg   rh   r    s    
z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>Nr_  c                 S  s    i | ]\}}t |tr||qS rg   )r   r   )r   r%  r.  rg   rg   rh   r   2  s    z=Scheduler.finalize_multi_template_buffers.<locals>.<dictcomp>c                 S     | d S r  rg   r-  rg   rg   rh   r   7  rC  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>rZ  r   r8  r   c                   s   t  fdd| D S )Nc                 3  s    | ]}|  V  qd S rc   )r  r  r  rg   rh   r  V  r  zQScheduler.finalize_multi_template_buffers.<locals>.rename_deps.<locals>.<genexpr>r   )r8  r  rg   rh   rename_depsU  r  z>Scheduler.finalize_multi_template_buffers.<locals>.rename_deps)rc  rd  r,  re  ra   r   )r8  r   ra   r   )-r  r  r   rX  rZ   r$   MultiTemplateBufferr!   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr   r   r   multi_kernel_hintsr-  r  finalize_as_triton_callersfinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferrv   r  r  rd   rF  r-  r.  r   r   r   r#  r$  rl   r  r   r   r_   r   r   r   )re   ro  r  rZ   
multi_nodemin_node_unfusedr  callershinttimingstriton_timingschoiceout_tensorboxout_storage
out_buffernew_scheduler_noder  	real_namert  new_outold_outrg   r  rh   r    s   









z)Scheduler.finalize_multi_template_buffers	node_listc                 C  s   t dd |D S )Nc                 s  sB    | ]}t |jd o|jduot |jjdo|jjjdkV  qdS )r   Nscatter_moder  )r   rZ   r   r  rY  rg   rg   rh   r  j  s    
z,Scheduler._any_atomic_add.<locals>.<genexpr>)r  re   r  rg   rg   rh   _any_atomic_addi  s   zScheduler._any_atomic_addrP  rQ  Union[bool, Callable[[], bool]]c                   sN  t dd fD }tjs|sdS  rt tjr& s& r(dS 	 }|d 
 s6J jdkr=dS 	 }tt||}|rPdS ddlm  t|d 
 dusgJ d'fdd	tjj 	d(d)fdd}|rt dd fD r dur n ttjsJ i g tjD ]}|}	t|	 dd dD ]0\}
}t|
tjjjsq|
 |
g|||
j dR  W d   n1 sw   Y  qt!d}d}i }D ]d\}
}}z|dur|"  W n( t#y1 } zt$%t&j'r't$(ds!dndt)| W Y d}~qd}~ww |
 *|\}}|||
< ||k rM|}|
}W d   n	1 sXw   Y  q|j+|< t|t,skJ ||< q }	- \}
r.|n.|\}g d}t|	 t/0d dD ]X\}
}t|
tjjj,sqst1|
d!r|
j2j2krq|
 kr n/|d 7 }|tj3kr n#|
 |
g||R  W d   n	1 sw   Y  qt4dkrd"S d*	
f	d$d%}|S ||||||d* 	fd&d%}|S )+
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c                 s  s(    | ]}|  ot| tjV  qd S rc   )rl  r   r	  r$   ru  rY  rg   rg   rh   r  z  s    
z.Scheduler.speedup_by_fusion.<locals>.<genexpr>Tr   re  CompilationErrorNms_fusedr  ms1ms2ra   r   c              	     st   t tjr8| || k r"t d   t|| |  d d S t d   t| ||  d d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)rS  r  r  DEBUGrT  rT  r;   r<   )r  r  r  )rP  rQ  rg   rh   
log_fusion  s   z/Scheduler.speedup_by_fusion.<locals>.log_fusionr  r^  r]  r^  )tuple[Optional[LambdaFuture], ModuleType]c                   sR   j | d|d}t|}  sd }||fS  jd|d}t|ts%J ||fS )NT)r\  r]  triton_)kernel_namesource_code)r`  r   loaduse_process_poolr  r   r   )r  r]  src_codemodfut)async_compilere   rg   rh   compile_kernel  s   
z3Scheduler.speedup_by_fusion.<locals>.compile_kernelc                 s  s    | ]	}|  d uV  qd S rc   r  rY  rg   rg   rh   r    s    
c                 S  rs  r  rg   r-  rg   rg   rh   r     rC  z-Scheduler.speedup_by_fusion.<locals>.<lambda>rZ  r_  infException in compiling %s: %sr  r  r   allowed_prologue_inpsFr   c            	        s(  t d} d }i }D ]^\}}}z
|d ur|  W n% ty> } zttjr4tds.dndt| W Y d }~q
d }~ww 	| 
| \}}|||< || k rY|} |}W d    n1 scw   Y  q
|  |  k r|d urtjr|d <  n| |jd < dS dS )Nr  r  r  r  TF)r  r}   r   rS  r  r  r  rT  rb   swap_as_triton_callerrb  r!   ry  rz  r{  _choice_timings)	min_ms_fusedms_fused_choicenew_timingsr  future	mod_fusedr  r  path)	rg  epilogue_fusionfuture_choices hint_override_best_fusion_choicer  r  r  r  re   rg   rh   benchmark_when_ready-  sP   
	

z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyc               
     sp  ddl m}  zd 
d 	d fD ]
}|d ur|  qd \ t r3d W dS 
d \trId W dS 	d \tr_d W dS   tdr  krfjvrjf t	d
 fd	d
   k W S  | y   Y dS  y } zdt|v rW Y d }~dS  d }~ww )Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc                	     s       dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratiorg   rg   r  r  r  path1path2
path_fusedrg   rh   r     s   
zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  r}   rb  mathisinfr   r  r5  r   r  rb   )r  r  r  )r  rg  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  re   r#  r  rh   r  c  sZ   


)r  r  r  r  r  r  ra   r   rc   )r  r^  r]  r^  ra   r  r   )5r  r!   benchmark_fusionrl  r   r	  r$   TritonTemplateBufferrn  rO  r   rt   r   r-  r.  r  triton.compiler.errorsr  rK  r   r   r  AsyncCompileru  ry  rx  r_  r-  rp  TritonTemplateCallerr  r  r]  r  r}   r   rS  r  r  r  rT  rb   rb  r  r   rw  rX  operator
itemgetterr   r   max_epilogue_benchmarked_choicesrz   )re   rP  rQ  is_multi_templatenode_list_1node_list_2node_list_fusedr  r]  rx  r  unfused_timer  r  r  r  r  r  r  r  r  r  triton_choicesr  rg   )r  r  rg  r  r  r  r  r  r  r  r  r  r  rP  rQ  re   r#  rh   speedup_by_fusionr  s  













.BzScheduler.speedup_by_fusionc                 C  s   | j |  S )z0Look up the node in Scheduler name_to_fused_node)rF  rK  r   rg   rg   rh   ry    s   zScheduler.get_fused_nodec                   s  t |ttjrtd D ]
}td|  qi dfdd d fd
d}|D ]@\}}||| |}|}	||rt
||st||}t|rl|||f|< |||f|< q4|soq4 || q4t  } D ]/\}}	}
||v rq||| |	|	u sJ |
|
u sJ | r
|	|
s |	|
 q|tdd d}|}| |S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %srP  rS   rQ  ra   c                   s   t d|  |  |  }| |ksJ || | |  |   j	 fdd 
 D   S )Nzfusing %s with %sc                      i | ]}|   qS rg   r   rY  node3rg   rh   r     r  zEScheduler.fuse_nodes_once.<locals>.fuse_two_nodes.<locals>.<dictcomp>)rS  rT  rd   r   r  r  rh  r5  rF  r  rO  )rP  rQ  rg  )r+  re   r  rh   fuse_two_nodes  s   


z1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodesr   c                   s    | v s |v rf |  |d }|d us$J |\}}}|d  |d   ||u s>J  ||u sGJ | rP| |rQq  ||  | v s |v sd S d S rc   )ry  r$  r4  will_fusion_create_cycle)rP  rQ  pending_fusion
is_speedup	node_key1	node_key2)r  pending_fusionsre   rg   rh   resolve_pending_fusions  s"   

z:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusionsc                 S  ra  rc   r  r-  rg   rg   rh   r     r  z+Scheduler.fuse_nodes_once.<locals>.<lambda>rZ  N)rP  rS   rQ  rS   ra   rS   rW  )r   rS  r  r  r  rT  r   get_possible_fusionsry  r  r  r  callabler   r5  r_  r  rI  )re   r  rZ   r  rP  rQ  speedupseen_pair_speedup_fnis_speedup_fnr  r  rg   )r  r+  r  re   rh   rS    sR   










zScheduler.fuse_nodes_oncer  c           	        s<  t | j}d}t| j}td| tt| D ]a\}}t|}t|dk r)q|dur3||kr3 nH| 	|s?td| q|d7 }t
jdk}t|d j|d|d td	t|| |D ]}|| q^|  | j fd
d  D  qt|dd d| _| | j| _td||t| j | | j dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                   r  rg   r   rY  r  rg   rh   r   9  r  z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>c                 S  ra  rc   r  r-  rg   rg   rh   r   ;  r  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>rZ  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  rz   r   rT  r  r  rM  r=  speedup_by_combo_kernelr!   r  rY   r   rh  r5  rF  r  rO  r_  r  rI  )	re   r  r+  r  num_nodes_orignumr  r*  rZ   rg   r  rh   r    sV   





r  c                 C  s   |D ]}| | j qd S rc   )rI  rF  )re   r  rZ   rg   rg   rh   rI  E  s   zScheduler.prune_redundant_deps1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]c           	        s   g  t tttf   d fdd}tt}|D ]}|r#q| D ]	}|| | q'q|	 D ]}|| q6t
jrdtt}|D ]}t|dd}|rX|| | qG|	 D ]}|| q]   jjd	d
 tdt   S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  ra   r   c                   s   t | D ]C\}}| |d |d tj  D ]1}||f}|v r q| ||r1 | q| s9| rF||rF ||f qqd S r  )r  r!   )max_fusion_buffer_group_pairwise_attemptsr5  r  r  rl  rn  )r  node1_indexrP  rQ  r[  possible_fusionsr>  re   rg   rh   check_all_pairsR  s*   
z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr   NT)r[  reversezfound %d possible fusionsr  r  ra   r   )r   r/  rS   r  r   r   unfusable_noder/  r  r   r!   aggressive_fusionr   *get_possible_fusions_with_highest_priorityrs  score_fusion_keyrS  rT  rz   )	re   r  r  buffer_names_groupingrZ   r   node_groupinggroup_groupingr   rg   r  rh   r  I  s6   




zScheduler.get_possible_fusionsc                   s   t t  d fdd| j | j B |jj |jj B   tfdd D }|rAt||d	 |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        rZ   rS   ra   r   c                   s^   t | tr-| vr-|  |   rdS t| j@ p,tfdd| j  D S dS )NFc                 3      | ]
} j | V  qd S rc   rB  rY  
found_pathre   rg   rh   r    
    
zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>)r   r  r5  rP  issubsetr   r   r  r  combined_ancestorscombined_namesr  re   visitedrg   rh   r    s   

z6Scheduler.will_fusion_create_cycle.<locals>.found_pathc                 3  r  rc   rB  rY  r  rg   rh   r    rA  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>zwill create cycleNrZ   rS   ra   r   )r   r  rP  _dictr  r   r  rK  )re   rP  rQ  cyclerg   r  rh   r    s   
z"Scheduler.will_fusion_create_cyclec              	     s   ddl m  dfdd}||}||}t fd	d
|D }t fdd
|D }||}d}	|D ]}
z
|	t|
d 7 }	W q4 tyK   Y  dS w ||}tjj	
|	d| r^dS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   buffer_reuse_keyrZ   rS   ra   list[ir.Buffer]c                   sL   g }| j jD ]} j|j}|r#t|jdkr#|j r#|	|j q|S r  )
r   r   r   r$  rl   rz   r_   rZ   has_tensor_outputr  )rZ   r   r  r   rm   rg   rh   _find_single_user_inputs  s   zKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsc                 3  r6  rc   rg   r   r  rg   rh   r    rN  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>c                 3  r6  rc   rg   r   r  rg   rh   r    rN  r   r  F    TN)rZ   rS   ra   r  )r  r  r   intersectionrj   r  score_fusion_memoryrO   r   r  statically_known_gt)re   rP  rQ  r
  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr[  	bw_savingrg   )r  re   rh   can_fusion_increase_peak_memory  s$   
z)Scheduler.can_fusion_increase_peak_memory	thresholdrj   c                   s:   |j j|j jB |j j|j jB  }t fdd|D |kS )Nc                 3      | ]}  |V  qd S rc   dep_size_hintr  rm   rg   rh   r    r  z:Scheduler.fusion_accumulate_large_reads.<locals>.<genexpr>)r   r   r   r  )re   rP  rQ  r  	all_readsrg   rm   rh   fusion_accumulate_large_reads  s   z'Scheduler.fusion_accumulate_large_readsc                 C  s*   t t|j|j t|j|j }|dkS )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r`  r   r   )re   rP  rQ  proximity_scorerg   rg   rh   are_long_distant_nodes  s
   z Scheduler.are_long_distant_nodescommon_buf_names"Union[tuple[str], OrderedSet[str]]c                 C  sb  i }dd |j  D }dd |j  D }|D ]}tj|}|| }	|| }
t|	tr2t|
tsAdt|	 dt|
 ||< q|	 |
 krXd|	  d|
  ||< qt	|	j
t	|
j
krgd||< q|	 }|
 }||kr~d| d| ||< q|	 |
 krd	|	 d|
 ||< qd
}t|tjsd|j }d|	 d|
 d| ||< qt|S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                 S     i | ]}|j |qS rg   r  r  rg   rg   rh   r     r  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>c                 S  r"  rg   r  r  rg   rg   rh   r     r  znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r  rO   r   r  r   r-   rt   	get_numelrN   r  
get_offsetnormalize_with_stride_orderr$   r  rv   rb   )re   rP  rQ  r   reasonsnode1_name2depnode2_name2deprb  r   lhs_deprhs_deplhs_offrhs_off
layout_strrg   rg   rh   decide_fusion_fail_reason	  sD   
z#Scheduler.decide_fusion_fail_reasonc                 C  s  t jrtdd ||fD rdS | s| rdS |j }|j }||@ }|s,dS dd |j D }dd |j D }g }|D ]#}	||	 }
||	 }|
 | krg|t	j
jj|
 dd|
|f qDt|dkrpdS t|tdd	\}}
}t|
trt|tsdS |
j|jkr|
 | kr| |
S dS d
}| s||
|}n| s|||
}ntd| |  |r| ||S dS )a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c                 s  rL  rc   )rh  rY  rg   rg   rh   r  W  s    
z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>r  c                 S  r"  rg   r  r  rg   rg   rh   r   i  r  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>c                 S  r"  rg   r  r  rg   rg   rh   r   j  r  r   r  rZ  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)r!   r  r  rl  r   buffer_namesr  r&  r  rO   r   r  r  r$  rz   r  r  r  r   r-   r  r}  r  rj  r  r  rT  rd   r  )re   rP  rQ  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr(  r)  
candidatesbuffer_namer*  r+  _numel	reorderedrg   rg   rh   !shared_data_after_reordering_loopG  s^   


z+Scheduler.shared_data_after_reordering_loopc                 C  s$   t |ttfo|  ot|j S )z>
        Is this node unfusable under any conditions.
        )r   r  r  rl  rK   rZ   r   rg   rg   rh   r    s
   
zScheduler.unfusable_nodeprologue_noder  r#  rK  c           	      C  s   |  tjjkr
dS | }| }d}||| kr |d dS tdd | D }|tj	j
jjfkr:|d dS ddd}|| jrP| sP|d dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc                 s  s:    | ]}|j d ur|j  D ]}|jdkr|jV  qqd S )Ncall_function)rZ   r  rf   r  r   rZ  r  rg   rg   rh   r    s    

zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsr  torch.dtypera   r   c                 S  s   | j dko| jS )Nr  )itemsizeis_floating_point)r  rg   rg   rh   low_prec_fp  rb  zGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fpzVprologue fusion that must be upcast to fp32 not profitable for low precision templatesN)r  r=  ra   r   )rP  rO   r   invoke_quant_opsr  r  r/  rO  r   r:  r;  constant_pad_ndr%  r  r  r]  )	re   r9  r  r#  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  r@  rg   rg   rh   (check_prologue_fusion_heuristics_fusable  s4   

z2Scheduler.check_prologue_fusion_heuristics_fusable/Optional[tuple[int, SchedulerNode, sympy.Expr]]c                   s  t |tr
t |tsdS t |jtjrt |jtjsdS | s$| r&dS tjdkr-dS |j|j}}|\}}|\}}|	 sP|	 sP||ksPt
|t
|krRdS t
|jjdksbt
|jjdkrddS  tt|jj}	 tt|jj}
t|	|
tjkrdS d fdd	}||s||rdS g }tt||D ]\}\}}||kr|| qt
|dkrdS |d
 }|| || }}tjj||r|||fS tjj||r|||fS dS )ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        NrP  r   rZ   rS   ra   r   c                   s`   | j jD ])}|j jv r j|j }n j|j}|r-tjj	|| r-t
|jts- dS qdS )NTF)r   r   rl   r   r   r$  rO   r   r   r  r   r\   r  )rZ   r  r  rm   rg   rh   has_reusable_buffer  s   
zIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_bufferr   r  )r   rX  rZ   r$   r~  r  r!   rQ  rt  rj  rz   r   r   r  r  r  r  small_memory_access_thresholdr  r  r  rO   r   r  statically_known_lt)re   rP  rQ  n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryrH  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2rg   rm   rh   "get_expand_dim_for_pointwise_nodes  s`   
 


z,Scheduler.get_expand_dim_for_pointwise_nodesc                   s  ||u rdS t ||}| r| | ||rdS t|ts&t|tr,|d dS t|ttfr=| s=|d dS t|ttfrN| sN|d dS |	 |j
@ r[|d dS | r,tjsi|d dS | sq| rw|d dS | }t|tjs|d	 dS | }td
d |jD | }| |@ r|d dS | s| r|d dS |   dd D ]}| }|D ]}	t fdd|	jD s|d   dS qqt|ts|gndd |jD }
t|
dksJ |
d }t d jdkrt d jd jdkr d jd jd j|u s"|d dS | |||s,dS | rE| s?| s?tj sE|d dS | t!j"j#@ sW| t!j"j#@ r]|d dS | }| }||krr|d|| dS ~| $||}|tj%k rtj&r| '||}|dkr|}tj(r| )|| }r|\}}}|*|| | $||}t+,t-j.rt+/d|0 |0 | t!j12| |||sdS |	 |j
@ r| 3||ot!j13| |||o| |3||S t!j14| |||o| |4||S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc                 s  rL  rc   r   )r   inprg   rg   rh   r  {  rN  z%Scheduler.can_fuse.<locals>.<genexpr>z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c                 3  s    | ]}|j  v V  qd S rc   r  r  prologue_nodesrg   rh   r    r  z7template prologue can only fuse nodes with a single usec                 S  r  rg   r  rY  rg   rg   rh   r     r  z&Scheduler.can_fuse.<locals>.<listcomp>r   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)5rK  rl  r  r   can_fuse_multi_outputs_templater   r  r  r  rP  r   r!   prologue_fusionrj  r  r$   r  get_allowed_prologue_inpsr   r2  rT  r  rO  r   r\  r_   r  r  rz   r   rZ   rF  r  rO   r   no_fuse_buffer_namesr  score_fusion_memory_thresholdr  r8  $expand_dimension_for_pointwise_nodesrZ  r  r  r  r  r  rT  rd   choicesr  can_fuse_verticalcan_fuse_horizontal)re   rP  rQ  r#  r  r  unsupported_prologue_argsrZ   	node_outsr   template_snodestemplate_snoderg  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizerg   r\  rh   r  E  s   





zScheduler.can_fusec                 C  s*  |  }t||}tt}|jD ]}| j|j|j}t|t	r(| 
|||r(q|| | q|jjD ]&}t|ts<q4|| j|j|j}	|	rZ|	D ]}
| |
|rY|	|
 qLq4tdd tj| D }||@ rt|d dS | }|D ]}| j|  }|| j| j@ r|d  dS qzdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c                 s  r  rc   r  r  rg   rg   rh   r    r,  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rT  rK  r   r   r   r   r$  rl   r   r/   fusable_weak_depr  r   r   r-   fusable_read_and_writerh  r   r-  r.  r  r   rP  r   ri   rF  r   )re   rP  rQ  node1_buf_namesr#  remaining_deps_by_namer  rl   cd	remainingr  remaining_depsnode1_op_namesr=  rg   rg   rh   re    sB   




zScheduler.can_fuse_verticalweak_depr/   c                   s   j | vr	dS fdd|jjD }t|dkrdS |d tts'J tjt	j
r0dS | jj   fdd|jjD }tfdd|D S )	NFc                   s   g | ]
}|j  jkr|qS rg   )rl   r  )r   r  )rz  rg   rh   r     
    z.Scheduler.fusable_weak_dep.<locals>.<listcomp>r   r   c                   s   g | ]	}|j  kr|qS rg   r  r   r  )r  rg   rh   r   *  r  c                 3  sB    | ]}t |tot|jtj o|j jko|j jkV  qd S rc   )r   r-   r   r  r   TMPr  r|  )r  rg   rh   r  -  s    



z-Scheduler.fusable_weak_dep.<locals>.<genexpr>)rl   rT  r   r   rz   r   r-   r   r  r   r}  r#  r  r   r\  )re   rz  rP  rQ  mutating_writesrelevant_readsrg   )r  rz  r  rh   rr    s$   

zScheduler.fusable_weak_depr  r,   r  r-   c                 C  s   t |trQ| j|j|j}||jks!t|jtjs!t|jtjr#dS t	j
r4|j|jkr4| }| }|j|jkoPt|jt|jkoP|jd t|j |jkS t |try| j|j|j}| j|j|j}|j|jkry|jd ury||krydS dS r   )r   r-   r   r$  rl   r   r  r   r}  r!   r  r  r}  rz   r  r.   r  )re   r  r  	read_name
write_namerg   rg   rh   rs  9  s0   



z Scheduler.fusable_read_and_writer  c                 C  s   t j|S rc   )rO   r   get_dep_size_hintr  rg   rg   rh   r  [  ro   zScheduler.dep_size_hintc                   s   t |jjt |jj }t |jjt  jj }t||d t||k rH||kr.|} }|  fdd|jj|jjB D }tfdd|D S |jj|jjB  jj jjB @ }tfdd|D S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        r\  c                   s(   g | ]}| j jv s| j jv r|qS rg   )r   r   r   r  )rQ  rg   rh   r   o  s
    z1Scheduler.score_fusion_memory.<locals>.<listcomp>c                 3  r  rc   r  r  rm   rg   rh   r  u  r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>c                 3  r  rc   r  r  rm   rg   rh   r  z  r  )rz   r   r   r   r  r  r  )re   rP  rQ  node1_dep_lennode2_dep_lentmpr8  common_memory_depsrg   )rQ  re   rh   r  ^  s   
zScheduler.score_fusion_memoryr  c                 C  s   t |dkr|S i }|D ]2\}}| | ksJ | }t| |||}||vr5||fg||< q|| ||f qt| t	ddd }t |dksTJ |S )Nr   rZ  r   )
rz   r   rj   r  get_fusion_pair_priorityr  r  r-  r  r  )re   r  "possible_fusions_group_by_priorityrP  rQ  rg  fusion_pair_priority&possible_fusions_with_highest_priorityrg   rg   rh   r  |  s.   
z4Scheduler.get_possible_fusions_with_highest_priority+tuple[BaseSchedulerNode, BaseSchedulerNode]r   c                 C  s   t jj| g|R  S )z-
        Shim for list.sort(key=...)
        )rO   rd  score_fusionr  rg   rg   rh   r    s   zScheduler.score_fusion_keyc                 C  s<   t tj }t| jD ]}||| j ||j	 qdS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rO   r   r  r  r  r*  r#  r  r   )re   r!  rZ   rg   rg   rh   r    s
   zScheduler.compute_last_usagec                 C  s   t | jtjj tjjj D ]Q}|| jv r'| j| }| r&tjj	|j
 q|tjjv r_tjj| }t|tjrAtjj	| qt|tjrHq|j}t|tjrU| sWJ tjj	|j q| j  dS )z*Free any buffers that are no longer neededN)r_  r  rO   r   r  r   freedr   r   codegen_freerZ   r  r   r$   r  r,  r   r}  is_input_bufferclear)re   rl   r   r[  storagerg   rg   rh   free_buffers  s4   


zScheduler.free_buffersc                 C  s$   | j  D ]}|  q|   d S rc   )r  r   flushr  )re   r[  rg   rg   rh   r    s   
zScheduler.flushscheduler_noder  c                 C  s   t |tsJ td d  d7  < ttdd |  |  W d    n1 s,w   Y  |j}t |t	j
sCJ dt||tjj |   d S )Nr  extern_callsr   F)increase_kernel_countztype(node)=)r   r  r   rO   set_kernel_handlerr)   r  r+  rZ   r$   r?  rt   r  r   r   r  )re   r  rZ   rg   rg   rh   codegen_extern_call  s   
zScheduler.codegen_extern_callBaseSchedulingc                 C  s   t |jr|jd usJ | dtj| t|j}|d u r(td|j t sR|jdkrBt	j
| }jdk rBt|t t |jrR|jdksRtt || S )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)rI   rt   r  rO   r   add_device_infor(   rC  r   r   r  get_device_propertiesmajorr0   inspectcurrentframer1   )re   rg  device_schedulingdevice_propsrg   rg   rh   create_backend  s   

zScheduler.create_backendc                 C  s0   |d usJ || j vr| || j |< | j | S rc   )r  r  rf  rg   rg   rh   r    s   

zScheduler.get_backendc                   s`   dfdd  fdd|  D }t| }|r.t|td	d
\}}tjj	| d S d S )NrZ  torch.fx.Nodera   rj   c                   s2   |  j vr j dd t| jjD   j |  S )Nc                 S  r  rg   rg   r  rg   rg   rh   r     r  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)r  r  r  r   r  r  rm   rg   rh   	get_order  s   

z*Scheduler.enter_context.<locals>.get_orderc                   s4   i | ]}|j d ur|j  D ]	} ||fd qqS rc   r6  r<  )r  rg   rh   r     s    
z+Scheduler.enter_context.<locals>.<dictcomp>r   rZ  )rZ  r  ra   rj   )
rO  r   r  r  r  r  rO   r   r   enter_context)re   rZ   r  r  lastrg   )r  re   rh   r    s   
zScheduler.enter_contextrl   fused_node_namesr"  c                   sP   z| j | j}W n
 ty   Y dS w t fdd|D o'|| jvo'|| jvS )NFc                 3  s"    | ]}|j p|  v V  qd S rc   )rx  rd   r  r  rg   rh   r    s     zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>)r   r_   KeyErrorr\  r   r#  )re   rl   r  r_   rg   r  rh   $can_buffer_be_removed_through_fusion
  s   z.Scheduler.can_buffer_be_removed_through_fusionF
should_logc           	        sd  |j }t|tjjjr:|j }r:| }t|tjj	r#| d|j
 n|}|tjv s/|tjv r:t|tjj	s8J dS tjjjjsGtjdu rGdS dd
d}|rPtn|}t|trct fdd|jD S |j dusjJ | sv|d|d dS t|j tjr|d|d dS t|j tjr|d|d dS t|j ddr|d|d dS t|j r|d|d dS dS )zBReturn True if we should partition the inductor graph on this noder@  TNmsgrb   rZ   r[   ra   r   c                 S  r  rc   rg   )r  rZ   rg   rg   rh   noop_log:  r   z,Scheduler.should_partition.<locals>.noop_logc                 3  r  rc   )should_partitionr   r*  rm   rg   rh   r  @  r  z-Scheduler.should_partition.<locals>.<genexpr>znon gpu opsr  zDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opsF)r  rb   rZ   r[   ra   r   )rZ   r   r   r   r$   r  r  rl   _ops
OpOverload_overloadnamer!   custom_should_partition_opsr  r  r>   wrapperrM   r  r  r  rI   
DeviceCopyConditionalr   rH   )	re   rZ   r  ir_noderf   op_overload_packet_nameop_overload_namer  log_partition_reasonrg   rm   rh   r    sL   






zScheduler.should_partition;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]]c                 C  s@   i }| tjj | jD ]}|j D ]	\}}|j||< qq|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rO   r   r  r  r   r-  rZ   )re   r  rZ   rl   scheduler_bufferrg   rg   rh   get_name_to_nodes[  s   
zScheduler.get_name_to_nodes
signatureslist[GraphPartitionSignature]c           
      C  s   dd t tjjD }dd t tj D }g tj_t |D ]7\}}|jr'qg }|jD ]
}||	| q,g }|j
D ]}	||	|	  q<tjjt||||j qdS )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        c                 S  r  rg   rg   r   rT  rl   rg   rg   rh   r   s  r   z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>c                 S  r  rg   rg   r  rg   rg   rh   r   v  r   N)r  rO   r   r  r  partition_mapsskip_cudagraphinput_nodesr  r$  output_nodesrd   rE   constant_names)
re   r  name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingrl   output_mappingrZ   rg   rg   rh   compute_graph_partition_mapsk  s2   


z&Scheduler.compute_graph_partition_maps	partitionrT   r  OrderedSet[sympy.Symbol]c                   s   dfdddfdd	dfdd ddd}t  jfdd|D  }|j fdd| D   ||}t  }|D ]}tjj|}||j q?t t	|t
ddS )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        rZ   	ir.IRNodera   r  c                   sx   t  }|  }t|tjr/|t|jt|jB t|j	B  t|tj
r-| |j |S |d u s:J d| |S )Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r$   Layoutr  r   r  strideoffsetr  r  )rZ   free_symbol_usesrv   get_layout_symintsrg   rh   r    s"   
zGScheduler.get_graph_partition_symbol_inputs.<locals>.get_layout_symintsrS   c                   s`   t | trt jfdd| jD  S | jdusJ | j }|j fdd| j D   |S )z4
            Gets symbols used in node.
            c                 3  r6  rc   rg   r  get_scheduler_node_symbol_usesrg   rh   r    rN  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>Nc                 3  r6  rc   rg   )r   r  r  rg   rh   r    rN  )	r   r  r   r  r  rZ   r   r  r   )rZ   r  )r  r  rg   rh   r    s   

zSScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]c                   s8   t | tjr	t S t | tjr | S tdt|  )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r$   r  r   r  r  rt   r  r  rg   rh   get_input_node_symbols  s
   zKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbolssymbolsc                 S  s   t dd | D S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c                 s  s.    | ]}t |tjtjtjtjfr|V  qd S rc   )r   r   SIZEFLOATUNBACKED_INTUNBACKED_FLOATr  rg   rg   rh   r    s    
zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>r   )r  rg   rg   rh   filter_symbols  s   zCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbolsc                 3  r6  rc   rg   rM  r  rg   rh   r    rN  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>c                 3  s    | ]	\}} |V  qd S rc   rg   )r   r  rZ   )r  rg   rh   r    r'  rl   rZ  N)rZ   r  ra   r  )rZ   rS   ra   r  )rZ   r  ra   r  )r  r  ra   r  )r   r  r-  rO   r   r  simplifyr  r   r_  r  
attrgetter)re   r  r  r  candidate_symbolsresr  symplified_srg   )r  r  r  rh   !get_graph_partition_symbol_inputs  s    
z+Scheduler.get_graph_partition_symbol_inputs
partitionslist[PartitionType]skip_cudagraphs
list[bool]c                   s  g }t tj } dfddtt|t|D ]\}}t  }|D ]
}||j	  q'|
|}	tjdd |D }
t fd	d|
j|
jB D | }t fd
d|D }t   |D ]} |j qcfdd | D }|| fdd|D } fdd|D } fdd|D }|	| t fdd|	D }	fdd|	D }dd |D }||}t||||||}|| |||	 }q|ddd S )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        rb  rb   ra   r   c                   sX   j | d}|du rdS t|jjtr*t|jtjr(j| d }r( |S dS dS )z
            Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
            so graph partition should not take it as inputs or outputs.
            NFT)	r   r$  r   rZ   rv   r7   r$   MutationOutputr#  )rb  r   r  )is_none_layoutre   rg   rh   r    s   z?Scheduler.get_graph_partition_signature.<locals>.is_none_layoutc                 S  r0  rg   r  rM  rg   rg   rh   r   (  r  z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>c                   s   g | ]
} |j s|j qS rg   r  r}  )r  rg   rh   r   /  s    c                 3      | ]
} j ||V  qd S rc   r#  r$  r
  rm   rg   rh   r  8  r  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>c                   s   g | ]}| v r|qS rg   rg   r
  r  rg   rh   r   D  s
    c                   r	  rg   rg   r
  r  rg   rh   r   K  r  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>c                   s&   i | ]}|v r|| v rd ndqS )TFrg   r
  r  r  rg   rh   r   P  s
    c                   s    g | ]}|v r| vr|qS rg   rg   r
  r  rg   rh   r   Z  
    c                 3  r  rc   r  r
  rm   rg   rh   r  b  r  c                   s   g | ]
} |s| qS rg   rg   r
  )r  r  rg   rh   r   g  s    c                 S  s   g | ]
}|t jjv r|qS rg   )rO   r   r  r
  rg   rg   rh   r   m  r  Nr  )rb  rb   ra   r   )r   rO   r   r  r  r  r  r  r   r  r  r#   r  r  r   r   r   r  r4   r  r  )re   r  r  r  unmet_output_namesr  r  output_namesrZ   returned_output_namesr   partition_input_namesextra_input_namesr  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturerg   )r  r  r  re   rh   get_graph_partition_signature  s   








	z'Scheduler.get_graph_partition_signaturer  r4   c                 C  s^   dd |j  D }dd |j D }dd |jD }dd |jD }t|j||||j|S )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        c                 S  "   i | ]\}}|t jjvr||qS rg   rO   r   r  )r   rl   r  rg   rg   rh   r     
    zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>c                 S  r  rg   r  )r   rl   r  rg   rg   rh   r     r   c                 S  s    g | ]}|  tjjvr|qS rg   )maybe_get_namerO   r   r  rM  rg   rg   rh   r     r  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>c                 S  s   g | ]
}|t jjvr|qS rg   r  r
  rg   rg   rh   r     r{  )r  r-  r  r  r  r4   r  r  )re   r  r  r  r  r  rg   rg   rh   .clean_removed_buffer_from_partition_signatures  s(   z8Scheduler.clean_removed_buffer_from_partition_signaturesc                   s  ddl t g  g dd t|D d fd	d
dfdd}|D ]}t|jj|< | dkr=| q)g }d}|t|k rsL rr`\}}|| || sN rt \}}|| ||  sb|d7 }|t|k rsL sL|t|krtd|S )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                 S  r  rg   rg   )r   rT  rZ   rg   rg   rh   r     r  z>Scheduler.reorder_for_minimizing_partition.<locals>.<dictcomp>rZ   rS   ra   r   c                   s6   |  | f} | r| d S  | d S rc   )r  heappush)rZ   node_with_index)cudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesre   rg   rh   insert_pending_nodes  s   
zHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodesc                   sF   | j jD ]}| dksJ |  d8  < | dkr  | qd S )Nr   r   )r   
succ_nodes)rZ   	succ_node)r	  node_to_indegreerg   rh   update_indegree  s   zCScheduler.reorder_for_minimizing_partition.<locals>.update_indegreer   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                rZ   rS   ra   r   )	r  r  r  rz   r   
pred_nodesheappopr  rC  )re   r  r  rZ   schedule	num_itersr  rg   )r  r  r	  r  r  r  re   rh    reorder_for_minimizing_partition  sP   

z*Scheduler.reorder_for_minimizing_partitionc           
      C  sp   ddl m}m} ttj }||| j| jttjj	
 |\}}| |}||||\}}	||d k r6|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_infor:  )r  r  r  r   rO   r   r  r   rF  r  r  r  )
re   r  r  r  r  default_peak_memoryr1  reordered_nodesreorder_peak_memoryr  rg   rg   rh   r    s    
z0Scheduler.maybe_reorder_for_minimizing_partitionc                 C  sv   g }g }g }d	dd}|D ]'}|  |}|r#t|jdkr#|| q|r/||r/|| q|| q|| | S )
a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        rZ   rS   ra   r   c                 S  s2   |   D ]}|jD ]}t|jts  dS q	qdS r   )r   r_   r   rZ   r   )rZ   r   r   rg   rg   rh   only_output_user  s   
zPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userr   Nr  )r  rz   r   r  )re   r  frontmiddlebackr  rZ   r  rg   rg   rh   r  
  s   


z6Scheduler.reorder_for_partition_with_simple_dependency9tuple[list[PartitionType], list[GraphPartitionSignature]]c                 C  s   g }d}g }g }| j D ]"}| j|dd}|r&||kr&|| || g }|}|| q|r:|| || | j||d}| | ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)r  )r  r  )r  r  r  r  r  )re   r  r  cur_partitionr  rZ   r  r  rg   rg   rh   r  *  s(   





zScheduler.graph_partitionc                 C  sL   t d tjjjr|  n| | j	 W  d    S 1 sw   Y  d S )NScheduler.codegen)r   r   r   r!   r  _codegen_partitions_codegenr  rm   rg   rg   rh   r  J  s   


$r  c                 C  s   ddl m} tjj}t| j}tj > tjjdd| ||d | 	| t
tjj|s0J | |}|tjj_tjj  tjjtjj\}}W d   n1 sUw   Y  tjj|j tjj|| tjjjdd |jD  dS )	z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesNc                 S  r  rg   r   rM  rg   rg   rh   r   y  r  z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>)r  r"  rO   r   r   r  r  set_current_wrapper_codeinit_wrapper_coder!  r   r  r'  write_prefixgenerateis_inferencedefine_subgraph_launcher_fnr  codegen_partition_call	allocatedr  r  )re   r  r  r"  r&  graph_partition_idpartition_coder  rg   rg   rh   _codegen_partition_wrapperR  s,   





z$Scheduler._codegen_partition_wrapper'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )Nra   Iterator[None]c                   3  s       jr#tjjr#jjd usJ dtjjjj zd V  W jr7tjjr7tjj	  d _d S jrKtjjrKtjj	  d _w )Ndevice should have an index)
%update_graph_partition_default_devicer  rA   rt   r  rO   r   r   codegen_device_guard_entercodegen_device_guard_exitrg   r  re   r  rg   rh   ctx  s.   
z1Scheduler.use_default_device_context.<locals>.ctx)ra   r4  )
contextlibcontextmanager)re   r  r  r:  rg   r9  rh   use_default_device_context|  s   z$Scheduler.use_default_device_contextc                 C  s   t |dkr|d jsd S ddd}ddd}d }t||D ]\}}|js+||} nq|d u r2d S t||D ]\}}|jrF|||sF d S q7|| _d S )Nr   r   r  rT   ra   r  c                 S  s   | d   }|d usJ |S r  r   )r  partition_devicerg   rg   rh   get_cudagraph_partition_device  s   zWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_devicetarget_devicer   c                 S  s$   | D ]}|  }||kr dS qdS r   r>  )r  rA  rZ   rg  rg   rg   rh   all_on_target_device  s   zMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device)r  rT   ra   r  )r  rT   rA  r  ra   r   )rz   r  r  r  )re   r  r  r@  rB  cudagraph_partition_devicer  r  rg   rg   rh   r6    s&   	

	
z/Scheduler.update_graph_partition_default_devicec                 C  s  |   \}}t|dkrdt| d}t|dd | ||0 t||D ]"\}}t|dks9J dt| |jrB| | q&| || q&W d   n1 sSw   Y  t| j	}t
jj| |dkrt
jjduspJ |tt
jjksJ d	| d
tt
jj dS dS )z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   zcudagraph partition into z partitionsr   )r  prefixz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  rz   rM   r=  r  r  r!  r2  r  r  rO   r   r   set_all_partition_namesr  )re   r  r  r  r  r  num_partitionsrg   rg   rh   r     s.   
zScheduler._codegen_partitionsc              	   C  sX  t jr@dd l}t }t }t|D ]-}|jdkr#|j|j	j
jkr# n|j|jf}||vs:J d|j d|j d|| q| j| _| jrQt jjrQtjj  |D ]5}ttjr{ztd| |  W n tyz   td|  Y nw | | |  }r|| jks|  s|! r| "  || jkr| jrt#| jj$rtjj%  || _t#|j$r|j&d usJ dtjj'|j& || _(| j)*|j+ |! r|,t-|. \}	}
}| /|0|
||	 nc|  rt12t3|}| 4| nS|5 r/t12t6|}| /|}d	d
l7m8} d	dl9m:} t;|||fr |}n	t<dt$| |=| nt;|t>t?fr@| /|@| nt;|tAsHJ |B  t jjCrX| /|D  | jE*|F  | jG*|H  t;|tAs| }|d ur|j$dkr| /|I r| "  qS| j| jkr| jd usJ t#| jj$rtjj%  | "  d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r5  r   )CUDACombinedSchedulingr  ztype(self)=r  )Jr!   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r  rl   filename_dynamoconvert_frame__file__linenor5  r  r  r  autotune_at_compile_timerO   r   r   write_get_raw_stream_headerr   r  r  r  rT  rd   r  r   r  r   rm  rl  r  rA   rt   r8  r  r7  r  r  r  r   r  r   rO  r  codegen_templater!  r"  r  r  rn  r   codegen.cuda_combined_schedulingrI  r  r  r   r  codegen_combo_kernelr  rX  codegen_noder  r+  debug_sync_kernelcodegen_syncr9  rT  r  rP  ready_to_flush)re   r  r   stackr>  framer[  rZ   rg  r  r  r  backend_rI  r  r[  rg   rg   rh   r!    s   













zScheduler._codegen(tuple[float, float, list[Optional[str]]]c                 C  s:   |d   }| tj_|| _|dusJ | |}||S )rW  r   N)r   rO   r   rY   r  r  benchmark_combo_kernel)re   r  rg  r[  rg   rg   rh   r`  [  s   

z Scheduler.benchmark_combo_kernelc                 C  s  t jsdS |}|d  }|du s|jdkrdS ddlm} dg }}t|D ]T\}}| }	| |	r9t	
d z| |	\}
}t|
rPt	
d| W  d	S W n  |yq } zd
t|v rlt	
d W Y d}~ dS  d}~ww ||
7 }|| q'z
| |\}}}W n |y } zd
t|v rt	
d W Y d}~dS  d}~ww || dk p|dk }t	tjr||ks|rt	
dt|| d nt	
dt|| d || |k p|S )r  Tr   Nre  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r!   r`  r   rt   r  r  r  rO  r  rS  rT  rX  r  r  rb   r  r  r  r  r;   r<   )re   r  subkernel_nodesrg  r  r  
path1_listr  r*  r  r  r  r  r  	ms2_clone_path2_listsmall_kernelrg   rg   rh   r  i  sr   


	z!Scheduler.speedup_by_combo_kernelrb  	ir.Layoutc                 C  s"   | j | }|jd usJ |j S rc   )r   rZ   
get_layout)re   rb  r   rg   rg   rh   get_buffer_layout  s   

zScheduler.get_buffer_layoutc                 C  sr   | j D ]3}| r6|jjD ](}tjj|j}|r5t	|dkr5t
|jttfs5| g kr5tjj|j qqd S rd  )r  rI   r   r   rO   r   r6  r$  rl   r3   r   rv   r7   r6   r   zero_dim_cpu_tensor_listr5  )re   rZ   r  r  rg   rg   rh   r    s   

z$Scheduler.update_zero_dim_cpu_tensor)r  r~  ra   r   )ra   r  r   )rg  r   ra   r   r   )r  rb   ra   r   )rZ   r   ra   rS   rP  )r*  rS   ra   r  )ra   r>  r  r^  ra   rV  rc   r  r^  r\  r   r]  r^  ra   rb   )ra  r   rg  r  ra   rV  )r  r^  ra   r   )rP  rS   rQ  rS   ra   r  )rZ   rS   ra   rS   )r  r^  ra   r   r  )r  r  ra   r  rP  rS   rQ  rS   ra   r   )rP  rS   rQ  rS   r  rj   ra   r   )rP  rS   rQ  rS   r   r!  ra   rb   rP  rS   rQ  rS   ra   rj   r  )r9  rS   r  rS   r#  rK  ra   r   )rP  rS   rQ  rS   ra   rG  )rz  r/   rP  rS   rQ  rS   ra   r   )r  r,   r  r-   ra   r   )r  r,   ra   rj   )r  r  ra   r  )r  r  ra   r   )r  r  ra   r   )rg  r  ra   r  )rg  r   ra   r  r  )rl   rb   r  r"  ra   r   rX  )rZ   rS   r  r   ra   r   )ra   r  )r  r  ra   r   )r  rT   r  r  ra   r  )r  r  r  r  ra   r  )r  r4   ra   r4   )ra   r  )r  rT   r  r4   ra   r   )r  r  r  r  ra   r3  )r  r  r  r  ra   r   r  r^  ra   r_  )r  r  ra   r   )rb  rb   ra   rf  )Qru   r   r   r  r   r  r  propertyr  setterr  r  r  r  rR   r  r  r  rF  rB  r  r  rU  r  rX  r`  rb  r  r  r  ry  rS  r  rI  r  r  r  r  r  r/  r8  r  rF  rZ  r  re  rr  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r2  r=  r6  r   r!  r`  r  rh  r  rs  rg   rg   rn  rh   rX     s    
 
0




$ 
v
M
*









y
	  
7
j
0

6
.
9


>
W


;
b 

5
%"
 


D)g 
$
A
 
 *
/
!tKc                      s   e Zd ZdG fddZdHddZdIddZdJddZdJddZdJddZdKddZ	dLddZ
dMd%d&Z	'dNdOd-d.ZdPd1d2ZdHd3d4ZdQd5d6ZdHd7d8ZdRd:d;ZdSd>d?ZdTdAdBZdUdEdFZ  ZS )Vr  rY   Optional[Scheduler]c                   s   t    || _d S rc   )rk  r   rY   r   rn  rg   rh   r     s   

zBaseScheduling.__init__ra   r   c                 C  s   | j r
| j   d S d S rc   )rY   r  rm   rg   rg   rh   free_buffers_in_scheduler  s   z(BaseScheduling.free_buffers_in_schedulerrg  r  OrderedSet[BackendFeature]c                 C  s   t  S )z0Return a set of .codegen.common.BackendFeature()r   rf  rg   rg   rh   get_backend_features  s   z#BaseScheduling.get_backend_featuresrP  rS   rQ  r   c                 C  r	  )zO
        Check whether node1 and node2 can be vertically fused or not.
        r
  rR  rg   rg   rh   re       z BaseScheduling.can_fuse_verticalc                 C  r	  )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r
  rR  rg   rg   rh   rf    ru  z"BaseScheduling.can_fuse_horizontalc                 C  r   )au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Frg   rR  rg   rg   rh   r^    s   
z.BaseScheduling.can_fuse_multi_outputs_templater  c                 C  s(   |  s|  rt||S t||S )z 
        Fuse two nodes
        )rn  r  r  r  rR  rg   rg   rh   r    s   zBaseScheduling.fuser  r  "tuple[tuple[sympy.Expr, ...], ...]c                 C  r	  )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r
  )re   r  rg   rg   rh   r    ru  zBaseScheduling.group_fnr  epilogue_nodesr^  r]  Optional[str]c                 C  r	  )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r
  )re   r  rw  r]  rg   rg   rh   rU    s   zBaseScheduling.codegen_templateNr  r\  r]  r^  rb   c                 C  r	  zD
        Generate a kernel given a list of pre-fused nodes.
        r
  )re   r  r\  r]  rg   rg   rh   r`    s   	z.BaseScheduling.generate_kernel_code_from_nodesrZ   (Union[FusedSchedulerNode, SchedulerNode]c                 C  r	  ry  r
  r   rg   rg   rh   rX       zBaseScheduling.codegen_nodec                 C  r	  )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r
  rm   rg   rg   rh   rZ  $  r{  zBaseScheduling.codegen_syncc                 C  r   )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Frg   rm   rg   rg   rh   r[  *     zBaseScheduling.ready_to_flushc                 C  r	  )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r
  rm   rg   rg   rh   r  1  r{  zBaseScheduling.flushrV  c                 C  r	  )rW  r
  r  rg   rg   rh   rX  7     z$BaseScheduling.benchmark_fused_nodesra  r   c                 C  r	  )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r
  )re   ra  rg   rg   rh   rb  @  r|  z)BaseScheduling.benchmark_codegened_modulerj   c                 C  r   )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   rg   rR  rg   rg   rh   r  G  r}  z'BaseScheduling.get_fusion_pair_priorityr  r_  c                 C  r	  )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r
  r  rg   rg   rh   r`  P  r}  z%BaseScheduling.benchmark_combo_kernel)rY   rq  r   )rg  r  ra   rs  rl  r  )r  r  ra   rv  )r  rS   rw  r^  r]  r^  ra   rx  rc   rk  )rZ   rz  ra   r   r   rj  )ra  r   ra   rV  rm  rn  )ru   r   r   r   rr  rt  re  rf  r^  r  r  rU  r`  rX  rZ  r[  r  rX  rb  r  r`  rs  rg   rg   rn  rh   r    s(    













	
	r  )ra   r&  )r*  rS   ra   rb   )r*  rS   ra   r9  )r*  rS   ra   r   )rY  r   ra   rb   )rZ   rS   rF  rG  r   rd  ra   r   )r  r  ra   r   )r  r  rY   rX   r  r  ra   r   )rg   )rY  rZ  r  r  r[  r  ra   r\  r   )
__future__r   r  r;  r   rp  r  r-  r  r  r  r  r`  ra  rL  r!  r   r   r   r   r   r   r	   r
   r   typing_extensionsr   r   collections.abcr   r   typesr   r  r   torch._inductor.async_compiletorch.utils._pytreerG  _pytreer5  torch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   r   torch.utils._tritonr   r   r    r!   r"   r#   r$   r%   analyze_preserves_zero_maskr&   codegen.commonr'   r(   r)   comm_analysisr*   r+   r,   r-   r.   r/   excr0   r1   fx_utilsr2   r3   r4   r5   r6   r7   	loop_bodyr8   r  r9   r:   runtime.runtime_utilsr;   r<   r  r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   virtualizedrO   	getLoggerru   r   _logginggetArtifactLoggerrS  r  r  r   rT   r   rU   rV   	dataclassrW   r   rS   r  r  r  rA  r  rK  rx   r   rH  r  r  rX  r  r  r  r  r  ru  rv  r  r  r}  rX   r  rg   rg   rg   rh   <module>   s    $ L
k     j





)  
0
 p  Ce.
                          9