o
    i^                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZ d dl m Z  d dl	m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9  m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dl8mDZDmEZE ddgZFd dlGmHZHmIZImJZJmKZK e+rId dlmLZLmMZMmNZN d dl7mOZOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW ddlXmYZY ddlZm[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZbmcZcmdZd ddlemfZf ddlgmhZhmiZi g dZje,dZkejldd!d"Zmd d#lnmoZo d d$lpmqZq d d%lrmsZs d d&ltmuZu d d'lvmwZw d d(lxmyZy d d)lzm{Z{m|Z|m}Z}m~Z~mZ d d*lmZmZ d d+lmZmZ dd,lmZ dd-lmZ ejd.kZeeZe7jed/Ze,d0Zee6je6jf Ze)e-e7jee7jQf  Zd1d2d3Zd4Zd4Zd4Zd5Zd6Zeed @ d kred7ksJ d8dd;d<Zdd@dAZG dBdC dCe6jZejdDdEG dFdG dGZdϐddOdPZ	IdϐddQdRZejlddSdTZddXdYZdd\d]ZddadbZddedfZddjdkZddndoZddsdtZddwdxZdd{d|ZdddZdd fdddZdddZdސdddZ		ddddZ					ddddZdddZdddZdddZdddZdddZe1dZe,ddDdZe$e/e#ef ef ZG dd de*e&eef ZdddZdddńZdddɄZddd΄ZdddӄZÐddd؄Z	dddd߄ZŐdddZƐdddZǐdddZȐdddZɐdddZʐdddZːdddZ̐dddZ͐dddZΐdd dZϐdddZАdddZѐdd
dZd dlZӐdddZg Zde֐d< dddZאdddZej			Dd dddZeZeZeZݐdddZސdd"d#Zed7dd&d'ZG d(d) d)e(ZejG d*d+ d+ZG d,d- d-ZG d.d/ d/eZejِdd0d1ZG d2d3 d3ZG d4d5 d5eZejlddd8d9Zejdd:d;Zejdѐd<d=Zdd>d?Z	dd	dDdEZd
dJdKZddMdNZddOdPZdQdQdDdRddVdWZdQdXdd\d]ZdQdXdd^d_ZddcddZddfdgZe-ee6jf Zdhe֐di< ejlddjdkZejlddldmZejlddndoZejlddpdqZejlddsdtZddudvZddwdxZddydzZdd{d|Zdd}d~Z dddZ	Q	D	Q	ddddZdѐddZG dd dZdddZdddZdddZdddZdddZ	dddZ
dddZejِdddZ	ddddZdddZd ddZd!ddZd!ddZd"ddZd#ddZejِd$ddZdddZejldddÄZejld%dĐdńZejlddƐdǄZddȐdɄZd&dːd̄Zd'd͐d΄ZdѐdϐdЄZdѐdѐd҄Zd(dՐdքZddאd؄ZG dِdڄ dej Z!d)dސd߄Z"d*ddZ#d*ddZ$	dd+ddZ%d,ddZ&d-ddZ'd-ddZ(d.ddZ)d/ddZ*dd fd0ddZ+dd fd0ddZ,d1ddZ-d2ddZ.ejG dd dZ/ejِd3ddZ0d4ddZ1d5ddZ2d6ddZ3d7ddZ4d8d"d#Z5d9d$d%Z6d:d(d)Z7d;d+d,Z8d<d.d/Z9d=d1d2Z:d>d5d6Z;d?d;d<Z<d@d=d>Z=	ddAdEdFZ>dBdHdIZ?dCdKdLZ@dDdOdPZAdѐdQdRZBd4dSdTZCdUdVdWdXdYdZdZd[ZDd\d] eDE D ZFeGd^ZHdEd_d`ZIdFdadbZJdGdedfZKdGdgdhZLejldHdjdkZMejG dldm dmZNi ZOdne֐do< dIdsdtZPeC ZQdue֐dv< dJdwdxZRd̐dydzZSdKd{d|ZTe,d}ZUe,d~ZVG dd deeUeVf ZWe0dDdddDdEdLddZXdMddZYG dd dej ZZejldNddZ[dѐddZ\dOddZ]d̐ddZ^dPddZ_dѐddZ`dQddZadZbdRddZcdRddZddSddZe		dTdUddZfdVddZgdѐddZhdWddZi		dXdYddZjdZddZkejdDdEG dd dZle$de#f Zme$emelgemf ZnG dĐdń dŃZoeo Zpd[dǐdȄZqd\dʐd˄ZrdS (]      )annotationsN)
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)datasheet_tops)DeviceProperties)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTreturnstrc                  C  s>   dd t D } t| dksJ t| dkrd}|S |  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xrI   rI   R/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/torch/_inductor/utils.py
<listcomp>k   s    z get_gpu_type.<locals>.<listcomp>r2   r   rB   )	GPU_TYPESlenpop)
avail_gpusgpu_typerI   rI   rP   get_gpu_typei   s   rW   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rB   rD         @      zmust be power of 2nbytesintc                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr2   )ALIGN_BYTES)rq   rI   rI   rP   _align   s   rt   v
sympy.Exprboolc                 C  s<   t | tjtjfrttt| jS t | tpt	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrs   )ru   rI   rI   rP   r~      s   r~   c                   @  s&   e Zd ZdZdZdZeddd	Zd
S )r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr2   Tvaluerv   rG   Optional[sympy.Expr]c                 C  s,   t |ttjfrtt|S t|r|S d S N)rx   rr   ry   Integerrt   r~   )clsr   rI   rI   rP   eval   s
   z
align.evalN)r   rv   rG   r   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr   rI   rI   rI   rP   r      s    r   Tfrozenc                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    rr   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__rI   rI   rI   rP   r      s   
 r      d   fnCallable[[], Any]warmuprepfloatc              
   C  s   |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|	  |   q)|  t j  |
|d }tdt|| }tdt|| }	t|D ]}|   qYdd	 t|	D }d
d	 t|	D }t jjt jjjgdP}
t j  t|	D ],}|	  ||   t jjd |   W d   n1 sw   Y  ||   qt j  t dd	 t||D }W d   n1 sw   Y  t | }td t|
 jddd tdd	 |
 D }|r|tdd |D d 8 }td| |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArB   dtypedeviceTenable_timing   r2   c                 S     g | ]	}t jjd dqS Tr   rK   rB   EventrN   _rI   rI   rP   rQ          zfp8_bench.<locals>.<listcomp>c                 S  r   r   r   r   rI   rI   rP   rQ      r   
activitiesRunCudaModuleNc                 S  s   g | ]	\}}| |qS rI   )elapsed_time)rN   serI   rI   rP   rQ      r   
raw eventsself_device_time_totalsort_by	row_limitc                 S  s.   g | ]}|j tjkrtd |jdur|qS )zfused_abs_max_\dN)device_typerZ   CUDArematchnamerN   eventrI   rI   rP   rQ      s    c                 s      | ]}|j V  qd S r   device_time_totalr   rI   rI   rP   	<genexpr>	      zfp8_bench.<locals>.<genexpr>     @@profiling results: %s ms)rK   rB   synchronizeemptyrr   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   nvtxtensorzipmeanitemlogdebugkey_averagestabler[   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventsrI   rI   rP   	fp8_bench   sh   	





r   c                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|  |   q)|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]}|   qYt j  t jjt jjjgd}
t|	D ]	}|  |   qtt j  W d	   n1 sw   Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )r   r   rB   r   Tr   r   r2   r   Nr   r   r   r   c                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Sync)r   rZ   r   r   r   rI   rI   rP   rQ   G  s
    z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   rI   )rN   r   r   num_event_per_grouprI   rP   rQ   V  s
    zprofiling time breakdown)r   c                 s  r   r   r   r   rI   rI   rP   r   b  r   z+do_bench_using_profiling.<locals>.<genexpr>r   r   )rK   rB   r   r   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r[   r   rS   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   actual_eventsr   rI   r   rP   do_bench_using_profiling  sj   





r   c               
   C  s   zddl m}  tjdd | d uotttjdd dW S  ty&   Y dS  t	y@ } zdt
|v s5J W Y d }~dS d }~ww )	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rK   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrJ   opsImportErrorr   rH   )r   r   rI   rI   rP   has_torchvision_roi_aligng  s   
r   r   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdvr.| jd u r.t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
rK   r   r   rx   rH   typer  rX   Workercurrent_devicer   device_interfacerI   rI   rP   decode_devicew  s   


r  itIterable[sympy.Expr]c                 C  s   t tj| tjjS r   )	functoolsreduceoperatormulry   SOner	  rI   rI   rP   sympy_product     r  seq1Sequence[sympy.Expr]seq2c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S r   rI   )rN   abrI   rI   rP   r     s    zsympy_dot.<locals>.<genexpr>)rS   ry   expandr   r   )r  r  rI   rI   rP   	sympy_dot  s   r  Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS rI   )r   rM   rI   rI   rP   
<dictcomp>      zunique.<locals>.<dictcomp>)valuesr  rI   rI   rP   unique     r   numberUnion[int, sympy.Expr]denomc              	   C  sr   t | tjst |tjrtt| t|S t | tr!t |ts4J |  dt|  d| dt| t| |S )Nz: , )rx   ry   Exprr^   sympifyrr   r  runtime_ceildiv)r"  r$  rI   rI   rP   ri     s    
ri   keyOptional[torch.dtype]c                 C  s   | d u rdS t | dd }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|d'd( t| D  t| t r`| S d)||  S )*Nz*i8.r   rw   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64c                 S  s   i | ]}||qS rI   rI   )rN   ru   rI   rI   rP   r    s    z_type_of.<locals>.<dictcomp>*)rH   splitupdatelistr  rx   )r)  	dtype_strtysrI   rI   rP   _type_of  sZ   
rV  lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                 C     dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS rI   )ry   r'  rN   r   rI   rI   rP   rQ     r  z-convert_shape_to_inductor.<locals>.<listcomp>rI   rW  rI   rI   rP   convert_shape_to_inductor  s   r]  r   Union[int, torch.SymInt]c                 C  sB   ddl m} t| tr| S t| tjrt| S |jjjj	| ddS )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r2   VN)hint)
virtualizedr`  rx   rr   ry   r   graphsizevars	shape_envcreate_symintnode)r   r`  rI   rI   rP   convert_to_symint  s   
rg   Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                 C  rZ  )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                 S     g | ]}t |qS rI   )rg  r[  rI   rI   rP   rQ         z+convert_shape_to_symint.<locals>.<listcomp>rI   r\  rI   rI   rP   convert_shape_to_symint  s   rl  optorch._ops.OpOverloadc                 C  s   t dd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S r   )
alias_inforN   r  rI   rI   rP   r         zis_view.<locals>.<genexpr>)any_schema	argumentsrm  rI   rI   rP   is_view  s   rv  c                 C     dS NFrI   )r   rI   rI   rP   <lambda>      ry  user1   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   s~   | j dksdS t| jtjjs| jtju sdS ttjj| j}|tju s(t	|r4t
 fdd| jD S tjj|jv p> |S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s    | ]}t | V  qd S r   )is_pointwise_use)rN   ur|  rI   rP   r     rq  z#is_pointwise_use.<locals>.<genexpr>)rm  rx   targetrK   _ops
OpOverloadr  getitemr   rv  r|   usersTag	pointwisetags)r{  r|  r  rI   r  rP   r    s   

r  r  r   r   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c                   s   t j  g d
 fdd} j| gtt j|||fR  }t| jjdkr5t	| jjd j
d	kr5|f} | t ji  }|fS )Nargtorch.TensorrG   r1   c                   s    |   dt S )Nr  )appendplaceholderrS   )r  g
graph_argsrI   rP   add_tensor_arg  s   
z)gen_gm_and_inputs.<locals>.add_tensor_argr2   r   Tensor)r  r  rG   r1   )rK   fxGraphr~  r"   r  rS   rs  returnsrH   r  outputr0   )r  r   r  r  nodegmrI   r  rP   gen_gm_and_inputs  s   

r  rB   Nonec                 C  s,   | dkrd S t | }| r|  d S d S Nr   )rX   rL   r   r  rI   rI   rP   r      s   r   modelCallable[..., Any]example_inputsSequence[Any]r   c                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r   rK   manual_seedtimeperf_counterr   )r  r  r   r   t0r   resultt1rI   rI   rP   timed(  s   

r  rI   
         ?repeatbaselinec                   sH   t  fddt|D }t | }t|| d | S )Nc                   s   g | ]	}t  qS rI   )r  r   r   r  r  r   rI   rP   rQ   C  r   z%print_performance.<locals>.<listcomp>z.6f)rK   r   r   medianprintr   )r  r  r   r  r  r   timingstookrI   r  rP   print_performance:  s   r  objmethodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S r   rI   rI   r  rI   rP   ry  M  rz  z#precompute_method.<locals>.<lambda>N)rJ   setattr)r  r  rI   r  rP   precompute_methodJ  s   r  methodsr   c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)r  )r  r  r  rI   rI   rP   precompute_methodsP  s   r  r  r  c                 C  s   t | |kt | |k  S r   )rr   )r  r  rI   rI   rP   cmpV     r  rO   Union[int, Sequence[int]]sizeSequence[int]c                 C  s:   t | tr
| g| S t| dkrt| | d g| S | S )Nr2   r   )rx   rr   rS   r  )rO   r  rI   rI   rP   pad_listlikeZ  s
   

r  tuple[_T, ...]list[_T]c                 C  s&   t | dkrg S d	dd}t| |dS )
Nr   elemrl   rG   rH   c                 S  s0   t | tr| S ddlm} t | |sJ |  S )Nr2   )r@   )rx   rH   	schedulerr@   get_name)r  r@   rI   rI   rP   	sort_funcg  s
   
ztuple_sorted.<locals>.sort_funcr)  )r  rl   rG   rH   )rS   sorted)rO   r  rI   rI   rP   tuple_sortedc  s   
	r  PRV)	covariantc                   @  s$   e Zd ZedddZdddZdS )CachedMethodr   r   rG   r  c                 C     d S r   rI   )r   rI   rI   rP   clear_cachey     zCachedMethod.clear_cacher   P.argsr  P.kwargsr  c                 O  r  r   rI   selfr   r  rI   rI   rP   __call__|  rz  zCachedMethod.__call__N)r   r   rG   r  )r   r  r  r  rG   r  )r   r   r   staticmethodr  r  rI   rI   rI   rP   r  x  s    r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   sl   | j }d| d d| i}td| d  d  d | t| || d }d fdd}||_|S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfr  r   rG   r  c                      t |  rt|   d S d S r   r   delattrr  r  rI   rP   r       
z"cache_on_self.<locals>.clear_cacher  r   rG   r  r   execlstripr  wrapsr  )r   r   ctxwrapperr  rI   r  rP   cache_on_self  s$   	r  Callable[P, RV]c                 C  s   t | S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r  r   rI   rI   rP   cache_property_on_self  s   r  
class_name*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]]c                   s   d fdd}|S )Nr   FN_TYPE[P, RV]rG   c                   sh   d d| j  d d| i}td  d  d  d | t| |d	 }d fdd}||_|S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerr  r   rG   r  c                   r  r   r  r  r  rI   rP   r    r  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cacher  r  )r   r  r  r  r  r  rP   r    s"   z'cache_on_self_and_args.<locals>.wrapper)r   r  rG   r  rI   )r  r  rI   r  rP   cache_on_self_and_args  s   &r  node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr2   irc                 S  s$   g | ]}t |d r|jr|jjqS )r  )r   r  originsrN   r  rI   rI   rP   rQ     s    z%aggregate_origins.<locals>.<listcomp>) r  rx   rS  r  r  r  or_r    r9   r  )r  r  rI   rI   rP   aggregate_origins  s   
	r  Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                 C  s   t | }|dkrdd |D }tt|}nH|dkrPg }|D ]*}|jdkrHd|jv rH|jd d }t|d tr@||d  q||d j qtt|}n|d	kr\d
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]}|j d krd|jv r|jd dur|jd jjqS )r~  r   N)rm  r  _overloadpacketr   rN   originrI   rI   rP   rQ     s    

z)get_fused_kernel_name.<locals>.<listcomp>rK   r~  source_fn_stackr   r2   inductor_nodec                 S  s   g | ]
}|j d kr|jqS r~  )rm  r   r  rI   rI   rP   rQ     s    r   fused)r  r  r    rm  r  rx   rH   r  r   NotImplementedErrorjoin)r  r  all_originssourcesr  	source_fnrI   rI   rP   get_fused_kernel_name  s.   r  r  r5   tuple[str, str]c                   s~  t | }dd |D }tt}tt}dt|rKtdd |D }t|dkrK|d jtdsAd	d
 tj	D }|_
|jfddd |D ]3}d|jv rk|jd durkt|jd j}	||	 |j d|jv r|jd d j}	||	 |j qMdurdnd}
|j d|
 dd|  dd|  d}|j dg}t| D ]\}}||j d| ddt|  qdurddlm  ||j d t }g }t|  jsddlm} d9 fd#d$}d:d'd(d;fd+d,}| D ]}t|d-r|jdu rqt|jd.r`|jjdur`|jjD ];}|j|v r.q$||j |j|j}|du rBq$|||j\}}||j d/| d0|| d1| d q$t|jd2r|jj dur|jj D ] }|j|j}|du rqr|||j\}}|d3|  qrq|D ]}||j d|j!d4d5  q||j d6d7|  |d8|fS )<aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    c                 S  s   g | ]	}|j d kr|qS r  ru  r  rI   rI   rP   rQ     r   z'get_kernel_metadata.<locals>.<listcomp>Nc                 s  r   r   )rc  )rN   nrI   rI   rP   r   )  r   z&get_kernel_metadata.<locals>.<genexpr>r2   r   )_inductor_kernel_metadata_node_to_idx_mapc                 S     i | ]\}}||qS rI   rI   )rN   idxr  rI   rI   rP   r  .  r  z'get_kernel_metadata.<locals>.<dictcomp>c                   s
    j |  S r   )r  r  )single_graphrI   rP   ry  1  s   
 z%get_kernel_metadata.<locals>.<lambda>r  r   	from_nodezTopologically SortedUnsorted z Source Nodes: [r%  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:r_  buffer2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]rw_namerH   rG   tuple[str, ir.Layout | None]c                   sp   t |  jrt | j jr| jjj}n| j}|d u r|}n|j}z	|  }W ||fS  ty7   d }Y ||fS w r   )rx   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )r  r  r   r   layoutr  rI   rP   get_buffer_infoR  s   
z,get_kernel_metadata.<locals>.get_buffer_infoshapeIterable[int]c                 S  s   dd dd | D  dS )N[r%  c                 S  rj  rI   )rH   rM   rI   rI   rP   rQ   g  rk  z@get_kernel_metadata.<locals>.stringify_shape.<locals>.<listcomp>r  )r	  )r$  rI   rI   rP   stringify_shapef  s   z,get_kernel_metadata.<locals>.stringify_shaper"  ir.Layout | Nonec                   sJ   | d u rdS  | j  } | j }| j }dt| j  | | | dS )Nr  ")r  strider   r   r   )r"  shape_annotationstride_annotationdevice_annotation)r'  rI   rP   stringfy_layouti  s   z,get_kernel_metadata.<locals>.stringfy_layoutread_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r  r  r  rH   rG   r  )r$  r%  rG   rH   )r"  r(  rG   rH   )"r  collectionsdefaultdictrS  rS   r    rc  r   r   nodesr  sortr  rH   r  r  r   commentr	  keysr  itemsr  r  rx   r9   rb  r`  r/  r0  addtry_get_bufferr1  format_node)r  r  r
  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapr  r)  sort_strmetadatadetailed_metadataoriginal_noder8  	all_reads
all_writesr`  r#  r.  r  rr  
input_namer"  woutput_namer   rI   )r  r  r'  rP   get_kernel_metadata  s   










rO  initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)rS  r    rT   r  r=  r  )rP  rR  dominated_setr  userrI   rI   rP   dominated_nodes  s   


	rW  Sequence[IRNode]dict[str, IRNode]c                   sp   ddl m  d fddt|\}}fd	d
|D }t| \}}fdd
|D }ttjg ||R  S )Nr2   r  r  r:   rG   rw   c                   sT   t |  jr| jS t |  jr| jS t |  jo)t |  j j j jf S r   )	rx   r  r  r  r:   ComputedBufferInputsKernelInputBufferTemplateBufferr  r  is_unrealized_noderI   rP   r_    s   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS rI   r  rN   valr_  rI   rP   rQ     r   z"gather_origins.<locals>.<listcomp>c                   r`  rI   ra  rb  rd  rI   rP   rQ     r   )r  r:   rG   rw   )r  r  r!   r    	itertoolschain)r   r  kwargs_flattenr   kwargs_originsargs_flattenargs_originsrI   r^  rP   gather_origins  s   rk  exprc                   s@   ddd d fdd	d fd
ddfdd| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    rl  rv   rG   rw   c                 S  s(   t | tjot| jdko| jd dkS )N   r   r   )rx   ry   MulrS   r   rl  rI   rI   rP   is_neg_lead  s   &zsympy_str.<locals>.is_neg_leadrH   c                   sj   t | tjr1t| jdkr( | jd r(| jd  d| jd jd  S dt| jS | S )Nrm  r2   r   z - z + )rx   ry   rz   rS   r   r	  r}   ro  )rp  sympy_str_mulrI   rP   sympy_str_add  s
   (z sympy_str.<locals>.sympy_str_addc                   sB   t | tjr | rd| jd  S dt| jS | S )N-r2   z * )rx   ry   rn  r   r	  r}   ro  )rp  sympy_str_atomrI   rP   rq    s
   z sympy_str.<locals>.sympy_str_mulc                   sp   t | tjr	| jS t | tjtjfrd |  dS t | tttt	fr4| j
j ddtt| j dS t| S )N()r%  )rx   ry   Symbolr   rz   rn  rb   r_   r`   ra   funcr   r	  r}   	sympy_strr   rH   ro  )rr  rI   rP   rt    s   "z!sympy_str.<locals>.sympy_str_atomN)rl  rv   rG   rw   rl  rv   rG   rH   rI   ro  rI   )rp  rr  rt  rq  rP   ry    s
   

ry  r  ValueRanges[Any]c                 C  s>   ddl m} tjrt|jdd  }r|jdkrt| S t	 S )Nr2   r_  current_node
index_expr)
rb  r`  rh   compute_all_boundsrJ   interpreterr  re   rf   unknown)r  r`  fx_noderI   rI   rP   get_bounds_index_expr  s   
r  prefixc                 C  s   | d dkS )Nr   rK  rI   )r  rI   rI   rP   prefix_is_reduction     r  rd   r  sympy.Symbolc                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rd   SIZErc   )r  r  rI   rI   rP   sympy_index_symbol_with_prefix  s   r  checkc                 C  s   | st jot jS r   )rh   debug_index_assertsassert_indirect_indexing)r  rI   rI   rP   generate_assert     r  r   c                 C  s    | d dksJ t j| dddS )r  r   r   Tr  )ry   rw  r   rI   rI   rP   sympy_index_symbol  s   r  replacementsdict[sympy.Expr, Any]c                   s,   ddd t |  fd	d
| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedrv   replacementUnion[sympy.Expr, str]rG   r  c                 S  s2   t | tjsJ t |trtj|| j| jdS |S )Nr  )rx   ry   r&  rH   rw  r   is_nonnegative)r  r  rI   rI   rP   	to_symbol1  s   
zsympy_subs.<locals>.to_symbolc                   s   i | ]
\}}| ||qS rI   rI   rN   kru   r  rI   rP   r  @      zsympy_subs.<locals>.<dictcomp>N)r  rv   r  r  rG   r  )ry   r'  xreplacer<  )rl  r  rI   r  rP   
sympy_subs+  s   

r  ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                 C  s:   t | tjpt | tjotdd t|  |  D S )Nc                 s      | ]}t |V  qd S r   is_symbolicrM   rI   rI   rP   r   G      zis_symbolic.<locals>.<genexpr>)	rx   rK   r.   r  rr  re  rf  r  r*  )r  rI   rI   rP   r  D  s    r  c                  G     t dd | D S )Nc                 s  r  r   r  rp  rI   rI   rP   r   L  r  z"any_is_symbolic.<locals>.<genexpr>rr  )r   rI   rI   rP   any_is_symbolicK  r!  r  r  torch.fx.GraphModuleOptional[torch.fx.Node]c                 C  s   ddl m} tg d}t r|d | jjD ]9}t|j	|v r&|  S tj
jjs@t|j	tjjr@tjjj|j	jv r@|  S |jd }d urR||rR|  S qd S )Nr   )r&   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outrc  )%torch.fx.experimental.symbolic_shapesr&   r    rK   $are_deterministic_algorithms_enabledrR  rc  r8  rH   r  	_inductorrh   graph_partitionrx   r  r  r   r  cudagraph_unsafer  r  get)r  r&   forbidden_setr  rc  rI   rI   rP   %get_first_incompatible_cudagraph_nodeO  s*   r  c                 C  s&   t tt| jj}|jdksJ |S )z$Get the output node from an FX graphr  )nextiterreversedrc  r8  rm  )r  	last_noderI   rI   rP   output_node  s   r  OrderedSet[torch.device]c                 C  s\   | j jdd}tdd |D }t| jd }t|tr|n|f}tdd |D }||B S )Nr  ru  c                 s  s0    | ]}t |jd tjr|jd  jV  qdS rc  N)rx   r  r  rK   r  r   r  rI   rI   rP   r     s    

z"get_all_devices.<locals>.<genexpr>r   c                 s  s>    | ]}t |tjjrt |jd tjr|jd  jV  qdS r  )rx   rK   r  r1   r  r  r  r   )rN   r  rI   rI   rP   r     s    

)rc  
find_nodesr    r  r   rx   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicesrI   rI   rP   get_all_devices  s   r  c                  C  s   t tj D ]B} | dsqtj|  }|j D ]+}|drDt||}t|tj	j
jjrD|jD ]}t|tj	j
jjrC|jjj  q1qtj| = qdtjv r_tjd }t|jjj`|jj`t  d S )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rS  sysmodulesr;  
startswith__dict__rJ   rx   rK   r  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r  driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  rI   rI   rP   unload_xpu_triton_pyds  s.   








r  _registered_cachesc                 C  s0   t | dr
t| jst|  dt|  | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r  r  rI   rI   rP   clear_on_fresh_cache  s   
r  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r  r  r  rI   rI   rP   clear_caches  s   
r  cache_entriesOptional[dict[str, Any]]dirOptional[str]deleteIterator[None]c              	   #  sh   t   ddlm} |tj|d zztjtj	d iZ t
d  |tj dtjtj	di1 dV  t| trbt| dksKJ d	tjrbt}| fd
d|D  W d   n1 slw   Y  W d   n1 s{w   Y  |rt rtj rt  tj t  fddd W n ty   t
d   w W t   dS t   w )z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)r  TORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)ospathgetsizer	  )rN   f)triton_cache_dirrI   rP   r    s
    zfresh_cache.<locals>.<dictcomp>c                   s   t jd |dS )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)rx  r  r  )inductor_cache_dirrI   rP   ry    s
    zfresh_cache.<locals>.<lambda>)ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictr  environr   r   r  r	  rx   rS   existslistdirrR  
is_windowsrK   rD   rL   r  shutilrmtree	Exceptionr  )r  r  r  r  filesrI   )r  r  rP   fresh_cache  sT   




r  seq	list[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r)  reverse)__getitem__r   rS   rS  r  r  )r  gettera_rrI   rI   rP   argsort  s   r  re  r(   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                   sD   d fdd}dd	 t |D }t|t|d
}dd	 |D }|S )Nr  tuple[int, sympy.Expr]r  rG   rr   c                   sZ   | \}}|\}}d
 fdd}|||k rdS |||krdS ||k r%dS ||kr+dS d	S )Nrl  %Union[bool, torch.SymInt, sympy.Expr]rG   rw   c                   s   t | tr| S  j| ddS )NT)size_oblivious)rx   rw   evaluate_exprro  re  rI   rP   evaluate,  s   
z*argsort_sym.<locals>.cmp.<locals>.evaluater   r2   r   )rl  r  rG   rw   rI   )r  r  a_idxa_valb_idxb_valr  r  rI   rP   r  (  s   zargsort_sym.<locals>.cmpc                 S  s,   g | ]\}}|t |tjr|jjn|fqS rI   )rx   rK   r.   r  rl  )rN   r  r   rI   rI   rP   rQ   @  s    zargsort_sym.<locals>.<listcomp>r  c                 S  s   g | ]\}}|qS rI   rI   )rN   r  r   rI   rI   rP   rQ   E  rk  )r  r  r  r  rG   rr   )r   r  r  
cmp_to_key)re  r  r  exprsr  rI   r  rP   argsort_sym%  s   r  r   torch.dtypec                 C  s    | t jkrdS t jd| d S )Nrp   rI   r   )rK   rO  r   element_sizer  rI   rI   rP   get_dtype_sizeI  s   
r  c                   @  s   e Zd ZU ded< dS )LineContextr   contextNr   r   r   r   rI   rI   rI   rP   r  R  s   
 r  c                   @     e Zd ZU ded< ded< dS )ValueWithLineMaprH   r   zlist[tuple[int, LineContext]]line_mapNr   rI   rI   rI   rP   r"  V     
 r"  c                   @  s   e Zd ZdZdDdEddZejdFddZdGddZdHddZ	dHddZ
dIddZdJddZdHddZdIddZdKd d!ZdLd$d%ZdMdNd)d*ZdMdOd+d,ZdMdOd-d.Z	/dPdQd3d4ZdRd7d8ZdHd9d:ZdSd=d>ZdTdAdBZdCS )UIndentedBuffer   r   initial_indentrr   rG   r  c                 C  s   g | _ || _d S r   )_lines_indent)r  r'  rI   rI   rP   __init___     
zIndentedBuffer.__init__tabwidthr  c                 c  s*    | j }z|| _ d V  W || _ d S || _ w r   )r,  )r  r,  prevrI   rI   rP   set_tabwidthc  s   zIndentedBuffer.set_tabwidthr"  c                 C  s   t  }d}g }| jD ]:}t|tr| }|d u rq
nt|tr(|||jf q
|}t|ts1J || |d |d|	d 7 }q
t
| |S )Nr2   r5  )r
   r(  rx   DeferredLineBaser  r  r  rH   writecountr"  getvalue)r  bufr   linemaplilinerI   rI   rP   getvaluewithlinemapl  s$   




z"IndentedBuffer.getvaluewithlinemaprH   c                 C  s
   |   jS r   )r7  r   r  rI   rI   rP   r2       
zIndentedBuffer.getvaluec                 C  s   t  }| jD ]8}t|tr| }|d u rqnt|trq|}t|ts%J |dr4||d d  q|| |d q| S )N\r   r5  )	r
   r(  rx   r/  r  rH   endswithr0  r2  )r  r3  r5  r6  rI   rI   rP   getrawvalue  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S r   )r(  clearr  rI   rI   rP   r<       zIndentedBuffer.clearrw   c                 C  
   t | jS r   )rw   r(  r  rI   rI   rP   __bool__  r8  zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr  )r)  r,  r  rI   rI   rP   r    r  zIndentedBuffer.prefixc                 C  s   |  d d S )Nr5  	writeliner  rI   rI   rP   newline  r=  zIndentedBuffer.newliner6  )Union[LineContext, DeferredLineBase, str]c                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nr  )rx   r  r(  r  r/  with_prefixr  stripr  r6  rI   rI   rP   rA    s   

zIndentedBuffer.writelinelines3Sequence[Union[LineContext, DeferredLineBase, str]]c                 C  s   |D ]}|  | qd S r   r@  )r  rH  r6  rI   rI   rP   
writelines  s   zIndentedBuffer.writelinesr2   offset'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )NrG   r  c                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w r   r)  rI   rK  r  rI   rP   r    
   "z"IndentedBuffer.indent.<locals>.ctxrG   r  )
contextlibcontextmanager)r  rK  r  rI   rN  rP   indent  s   zIndentedBuffer.indentc                 C  s   |  j |7  _ d S r   rM  r  rK  rI   rI   rP   	do_indent  r!  zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S r   rM  rT  rI   rI   rP   do_unindent  r!  zIndentedBuffer.do_unindentF
other_codeUnion[IndentedBuffer, str]rF  c                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   r5  )rx   r%  r   r(  r  minrS   r  mathisinfr  rA  rr   textwrapdedentrstriprQ  )r  rW  rF  r^  r6  r   rI   rI   rP   splice  s,   





zIndentedBuffer.splicerx  Callable[[Any], Any]c                   s&   t | jd} fdd| jD |_|S )Nr'  c                      g | ]} |qS rI   rI   )rN   r6  rx  rI   rP   rQ     rk  z&IndentedBuffer.map.<locals>.<listcomp>)r%  r)  r(  )r  rx  r   rI   rd  rP   r}     s   zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nru  rv  )r  r2  r  rI   rI   rP   __repr__  r  zIndentedBuffer.__repr__otherr   c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )Nrb  )r)  r%  rJ  r(  )r  rf  r   rI   rI   rP   __add__  s
   zIndentedBuffer.__add__new_line)Union[DeferredLineBase, LineContext, str]c                 C  s
   || j v S r   )r(  )r  rh  rI   rI   rP   contains  r8  zIndentedBuffer.containsNr   )r'  rr   rG   r  )r,  rr   rG   r  )rG   r"  rG   rH   rG   r  rG   rw   )r6  rC  rG   r  )rH  rI  rG   r  r   )rK  rr   rG   rL  )rK  rr   rG   r  )F)rW  rX  rF  rw   rG   r  )rx  ra  rG   r%  )rf  r   rG   r%  )rh  ri  rG   rw   )r   r   r   r,  r*  rQ  rR  r.  r7  r2  r;  r<  r?  r  rB  rA  rJ  rS  rU  rV  r`  r}   re  rg  rj  rI   rI   rI   rP   r%  \  s.    












r%  c                      s(   e Zd Zd
 fddZddd	Z  ZS )FakeIndentedBufferrG   r  c                   s   t    d S r   )superr*  r  	__class__rI   rP   r*    r=  zFakeIndentedBuffer.__init__r   rH   r   c                 C  s$   |dkr
t | |S td| d)Nrq  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   rI   rI   rP   rs    s
   
z#FakeIndentedBuffer.__getattribute__rl  )r   rH   rG   r   )r   r   r   r*  rs  __classcell__rI   rI   rp  rP   rn    s    rn  c               	   c  s<    t jt j} }zd V  W | |t _t _d S | |t _t _w r   )r  stdoutstderr)initial_stdoutinitial_stderrrI   rI   rP   restore_stdout_stderr   rO  ry  c                   @  s`   e Zd ZdZdddZddd	ZdddZd ddZd!ddZd"ddZ	d#ddZ
d$ddZdS )%r/  z.A line that can be 'unwritten' at a later timer6  rH   c                 C  s   |  sd}|| _d S rD  )rF  r6  rG  rI   rI   rP   r*    s   
zDeferredLineBase.__init__rG   Union[str, None]c                 C     t )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r  rI   rI   rP   r       zDeferredLineBase.__call__r   c                 C  r{  )z3Returns a new deferred line with the same conditionr|  rG  rI   rI   rP   	_new_line  r}  zDeferredLineBase._new_liner  c                 C  s   |  | | j S r   r~  r6  )r  r  rI   rI   rP   rE    r  zDeferredLineBase.with_prefixc                 C  s   |  | j S r   )r~  r6  r  r  rI   rI   rP   r    r  zDeferredLineBase.lstripr  Union[int, slice]c                 C  s   |  | j| S r   r  )r  r  rI   rI   rP   r    r  zDeferredLineBase.__getitem__rw   c                 C  r>  r   )rw   r6  r  rI   rI   rP   r?  "  r8  zDeferredLineBase.__bool__rr   c                 C  r>  r   )rS   r6  r  rI   rI   rP   __len__%  r8  zDeferredLineBase.__len__N)r6  rH   )rG   rz  )r6  rH   rG   r   )r  rH   rG   r   )rG   r   )r  r  rG   r   rm  rG   rr   )r   r   r   r   r*  r  r~  rE  r  r  r?  r  rI   rI   rI   rP   r/  	  s    






r/  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r)  rH   value_fnCallable[[], str]r6  c                   s   t  | || _|| _d S r   )ro  r*  r)  r  )r  r)  r  r6  rp  rI   rP   r*  ,  s   
zDelayReplaceLine.__init__rG   c                 C  s   | j | j|  S r   )r6  replacer)  r  r  rI   rI   rP   r  1  r  zDelayReplaceLine.__call__c                 C  s   t | j| j|S r   )r  r)  r  rG  rI   rI   rP   r~  4  r  zDelayReplaceLine._new_line)r)  rH   r  r  r6  rH   rk  )r6  rH   rG   r  )r   r   r   r   r*  r  r~  rt  rI   rI   rp  rP   r  )  s
    
r  index_or_deviceUnion[int, torch.device]c                 C  s   t | tjr	| }ntt | }t|}tjjr3|jd us J |jdk s*|jdkr1t	
d dS dS |jdkr:dnd}|j}||k rOt	j
d	||d
d dS dS )N	   r  z6GPU arch does not support max_autotune_gemm mode usageFTrD   rm   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rx   rK   r   rW   r   createversionhipmajorr   r  r  multi_processor_count)r  r   propr  r  rI   rI   rP   
is_big_gpu8  s&   

r  c                   C  s$   t j rt j jS t jdjS )NrB   )rK   rD   rL   get_device_propertiesgpu_subslice_countrB   r  rI   rI   rI   rP   get_max_num_smsU  s   
r  c                  C  s*   t j sdS t jt j } | jdkS )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rK   rB   rL   r  r  r  )device_propertiesrI   rI   rP   
using_b200\  s   

r  c                  C  s2   t j rt S t j } t | dur|  S d S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rK   rD   rL   r  r   _get_sm_carveout_experimental)carveoutrI   rI   rP   get_num_smsf  s   

r  num_tma_descriptorsnum_programsOptional[int]r3   c                 C  sH   ddl m}m} |du rt }|d}||  t }||||| dS )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r2   )r3   WorkspaceZeroModeNF)r1  	zero_moder   
outer_name)codegen.commonr3   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)r  r   r  r3   r  r  r  rI   rI   rP   get_tma_workspace_argo  s   
r  r"  r;   allowed_layout_dtypeslist[torch.dtype]c                 C  s:   | j |vrtd| j | t| jjo| j |v ot| jS )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r  r  )r"  r  rI   rI   rP   _use_template_for_gpu  s   
r  backendc                 C  "   |   dd tj  dD v S )Nc                 S     g | ]}|  qS rI   rF  rM   rI   rI   rP   rQ         z)_use_autotune_backend.<locals>.<listcomp>r4  )upperrh   max_autotune_gemm_backendsrQ  r  rI   rI   rP   _use_autotune_backend     r  c                 C  r  )Nc                 S  r  rI   r  rM   rI   rI   rP   rQ     r  z._use_conv_autotune_backend.<locals>.<listcomp>r4  )r  rh   max_autotune_conv_backendsrQ  r  rI   rI   rP   _use_conv_autotune_backend  r  r  F)enable_int32enable_float8check_max_autotuner  r  r  c                C  s   ddl m}m} tjtjtjg}|rtjtjtjtjg}|r'|tj	tj
g t| jjo1t| |p<| jjdko<| j|v oPtjpEtjpE| oPtdoP|| j|jS )Nr2   )BackendFeaturehas_backend_featurer   TRITON)r  r  r  rK   r   r;  r=  rE  extendr5  r6  r  r   r  r  r   rh   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r"  r  r  r  r  r  layout_dtypesrI   rI   rP   use_triton_template  s"   	
r  )
add_guardsmatricesr:   r  c                   sf   ddl m} ddlm  d fd	d
d fddd fdd| o2tfdd|D S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer2   r_  
expr_bytesr#  rG   rw   c                   s    j j| tS r   )rc  rd  statically_known_multiple_ofTMA_ALIGNMENT)r  r_  rI   rP   _aligned  r  zcan_use_tma.<locals>._alignedrO   r:   c                   s\  |   }|  }t|}|  }|j}|dk s|dkrdS |tjtjtjfvr)dS | 	  j
jv r3dS rD j
j|} j
j|}n fdd|D } fdd|D }t fdd|D rcdS  fd	dt|D }t|d
krvdS |d }	t|D ]\}
}|
|	krq~|| s dS q~||	 }|| sdS |tjkr j
j|dsdS dS )Nrm  r   Fc                      g | ]	} j j|qS rI   rc  rd  symbolic_hintrN   r   r_  rI   rP   rQ     r   zCcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<listcomp>c                   r  rI   r  rN   str_  rI   rP   rQ     r   c                 3  s"    | ]} j j|d  V  qdS rm  N)rc  rd  statically_known_geqr  r_  rI   rP   r     s     zBcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<genexpr>c                   $   g | ]\}} j j|d r|qS r   rc  rd  statically_known_equalsrN   r   r  r_  rI   rP   rQ         r2   r       T)get_size
get_striderS   	get_dtypeitemsizerK   r   r;  r5  r  rc  unaligned_buffersrd  guard_int_seqrr  r   r  )rO   sizesstridesrankr   r  sizes_i	strides_ir  	inner_idxr   r  	inner_dim)r`  r  r  rI   rP   _is_tma_compatible_default  sL   
z/can_use_tma.<locals>._is_tma_compatible_defaultc                   sD   |   } fdd|D } fddt|D }t|dkr dS dS )Nc                   r  rI   r  r  r_  rI   rP   rQ     r   z?can_use_tma.<locals>._is_tma_compatible_xpu.<locals>.<listcomp>c                   r  r   r  r  r_  rI   rP   rQ     r  r2   FT)r  r   rS   )rO   r  r  r  r_  rI   rP   _is_tma_compatible_xpu  s   
z+can_use_tma.<locals>._is_tma_compatible_xpuc                 3  s:    | ]}|   d u sjdkr |n|V  qd S )NrD   )
get_devicer  rN   r  )r  r  m_devicerI   rP   r     s    
zcan_use_tma.<locals>.<genexpr>N)r  r#  rG   rw   rO   r:   rG   rw   )torch.utils._tritonr  rb  r`  r|   )r  r  r  rI   )r`  r  r  r  r  r  rP   can_use_tma  s   <r  c                 G  s(   t dd |D ot|d| iotjjS )Nc                 s  s     | ]}t | d kV  qdS r  )rS   r  r  rI   rI   rP   r   %      z*use_triton_tma_template.<locals>.<genexpr>r  )r|   r  rh   r  enable_persistent_tma_matmul)r  r  rI   rI   rP   use_triton_tma_template#  s
   r  r  r  r  c           	      C  s   ddl m} |jjj|| | dd}|dks|tjjk rdS ddlm	} t
jjr+dS t
jt
jt
jg}t| |oAtjp=tjoAtd}|rQ| sQtd	tjj dS |S )
Nr2   r_  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rb  r`  rc  rd  	size_hintrh   rB   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rK   r  r  r   r;  rE  r  r  r  r  r   r  cutlass_dir)	r"  r  r  r  r`  	gemm_sizer  r  r   rI   rI   rP   use_cutlass_template+  s*   

r  op_namec                 C  s4   t jj }|dkrdS |  dd |dD v S )z8Check if CUTLASS should be used for the given operation.ALLTc                 S  r  rI   r  rM   rI   rI   rP   rQ   Q  rk  z'_use_cutlass_for_op.<locals>.<listcomp>r4  )rh   rB   cutlass_enabled_opsr  rQ  )r  enabled_opsrI   rI   rP   _use_cutlass_for_opL  s   r  r   _IntLikec              
   C  s`   ddl m} tjj}tjj o/|jj	
tt|||  t||| o/|jj o/|jj S )Nr   r_  )torch._inductor.virtualizedr`  rh   r  decompose_k_thresholdrK   r  r  rc  rd  statically_known_truery   AndGeaot_modecpp_wrapper)r  r  r  r`  r  rI   rI   rP   use_decompose_k_choiceW  s   
r  c              
   C  sb   t jj}ddlm} ttjjo0|j	j
tt|||  t||| o0|j	j o0|j	j S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   r_  )rh   rocmcontiguous_thresholdr  r`  rw   rK   r  r  rc  rd  r  ry   r  r  r  r  )r  r  r  r
  r`  rI   rI   rP   use_contiguousj  s   r  c                   s0  t jj}g d}t|tjr|js|S |dkrg S t| tjr"| jr+t|tjr.|js.d n	t||  ||  dt|} fdd|D }g g g }}}|D ].}	||	 }
|
dk r]qR|
|
d @ dkro|
dkro|	|	 qR|
d	 dkr{|	|	 qR|	|	 qRt j
d
kr|| | S || | }|d | S )N)rm   r  ro   rn      r   r  rm  c                   s    g | ]}| kr|kr|qS rI   rI   )rN   divisormax_k_splitmin_k_splitrI   rP   rQ     s
    z get_k_splits.<locals>.<listcomp>rn   r2   r  
EXHAUSTIVE)rh   r  num_decompose_k_splitsrx   ry   r&  	is_numberrZ  divisorsr  max_autotune_gemm_search_space)r  r  r  k_splits_limitdefault_k_splitsr  pow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitsrI   r  rP   get_k_splits  s@   


r  c                 C  s   t j| jS r   )rK   rB   r  gcnArchNamer   rI   rI   rP   _rocm_native_device_arch_name  s   r!  Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                  C  s|   zdd l } ddlm}m} ddlm} tj| j	}W n t
y7   ddd}ddd	}G d
d d}d }Y nw ||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationrG   r  c                   S     g S r   rI   rI   rI   rI   rP   r#    r  z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  r&  r   rI   rI   rI   rI   rP   r$    r  z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   rI   rI   rI   rP   r%    s    r%  )rG   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr#  r$  ck4inductor.universal_gemm.opr%  r  r  dirname__file__r   )r'  r#  r$  r%  package_dirnamerI   rI   rP   try_import_ck_lib  s   

r-  c                   s   t jst jsdS tjjsdS | jjdksdS t| j}dd t j	j
D p,|dd |i  fdd  t j	j@ D }|s@dS | jtjtjtjfvrMdS t \}}}}|s]td	 dS t  re|t j	_t j	jsptd
 dS |t j	jkr}td dS dS )NFrB   c                 S  s   i | ]
}| d d |qS ):r   )rQ  rN   r  rI   rI   rP   r    r  z#use_ck_template.<locals>.<dictcomp>r.  r   c                   s   g | ]} | qS rI   rI   r/  requested_archsrI   rP   rQ     s    z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)rh   r  r  rK   r  r  r   r  r!  r	  archrQ  r;  ck_supported_archr   r   r;  r=  r-  r   r  	is_fbcodeck_dir)r"  native_archrequested_supported_archsck_package_dirnamer   rI   r0  rP   use_ck_template  s<   




r9  c                 C  :   ddl m} tdot| o|jjj|| | dddkS )Nr2   r_  CKr   r  r   rb  r`  r  r9  rc  rd  r  r"  r  r  r  r`  rI   rI   rP   use_ck_gemm_template     r>  c                 C  r:  )Nr2   r_  CKTILEr   r  r   r<  r=  rI   rI   rP   use_ck_tile_gemm_template  r?  rA  c                 C  s   t dot| S )Nr;  )r  r9  r"  rI   rI   rP   use_ck_conv_template   r  rC  c                 C  s   t jpt jo| jjdkS r  )rh   r  r  r   r  rB  rI   rI   rP   _use_template_for_cpu$  s   

rD  mat1Union[ReinterpretView, Buffer]mat2c                 C  s6   ddl m} t|j|sJ t| ||ddo|j S )Nr2   )r;   F)require_constant_mat2)r  r;   rx   r"  use_cpp_gemm_templateis_contiguous)r"  rE  rG  r;   rI   rI   rP   use_cpp_bmm_template*  s
   rK  mat2_transposedrH  is_woq_int4q_group_sizec                 C  s:  ddl m} ddlm} ddlm}	 ddlm}
 t| r t	ds"dS t
jjs(dS | tjtjfv }tjtjtjtjg}|
|||rD| jnd ||d\}}}} }}t||frXdS t||jrb| }|	| \}}|d	|||| | |t | |d

}ddd}| j|v o|d uo||ot||jo| p| S )Nr2   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyperL  use_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refrN  rO   r:   rG   rw   c                 S  s   |    |  d dkS )Nr   r2   )freeze_layoutr  rO   rI   rI   rP   is_last_dim_stride1j  s   z2use_cpp_gemm_template.<locals>.is_last_dim_stride1r  )r  r  codegen.cpp_micro_gemmrO  codegen.cpp_utilsrP  kernel.mm_commonrQ  rD  r  rh   cppweight_prepackr  rK   rL  rA  r=  r;  halfr   has_free_symbolsrx   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r"  rE  rG  rL  rH  rM  rN  r  rO  rP  rQ  	int8_gemmr  r  r  r  rX  r   rU  r]  rI   rI   rP   rI  7  sX   		


rI  c                   C  s   t jpt j p
tdS )NATEN)rh   r  r  r  rI   rI   rI   rP   use_aten_gemm_kernelsw  s   
rk  c                   @  s>   e Zd ZU edZded< dddZddd	ZdddZ	dS )DebugDirManagerr   rH   prev_debug_namerG   r  c                 C  s   t tj| _d S r   )r  rl  counterr   r  rI   rI   rP   r*    r  zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )N_tmp_)rK   _dynamorh   debug_dir_rootrm  r   new_namer  rI   rI   rP   	__enter__  s   zDebugDirManager.__enter__r   r   c                 G  s   t | j | jtjj_d S r   )r   r  rr  rm  rK   rp  rh   rq  )r  r   rI   rI   rP   __exit__  s   zDebugDirManager.__exit__Nrl  )r   r   rG   r  )
r   r   r   re  r1  rn  r   r*  rs  rt  rI   rI   rI   rP   rl  }  s   
 


rl  Callable[P, _T]r  r  tuple[_T, list[str]]c                   st   ddl m} g  d
 fdd}tj|d	| tj  | |i |}W d    | fS 1 s1w   Y  | fS )Nr2   r6   coderH   rG   r  c                        |  d S r   r  rw  source_codesrI   rP   save_output_code  r=  z*run_and_get_code.<locals>.save_output_coder}  rw  rH   rG   r  rc  r7   r   r  rr  rK   rp  reset)r   r   r  r7   r}  r  rI   r{  rP   run_and_get_code  s   

r  c                 O  sF   t | g|R i |\}}g }|D ]}|td|tj q||fS )Nz	'''.*?''')r  r  r   findallDOTALL)r   r   r  r  r|  kernelsrw  rI   rI   rP   run_and_get_kernels  s
   r  tuple[Any, list[str]]c                   s   d fdd}t |S )NrG   r   c                    s     } |     | S r   )r   backwardr  r  rI   rP   run_with_backward  s   z1run_fw_bw_and_get_code.<locals>.run_with_backward)rG   r   )r  )r   r  rI   r  rP   run_fw_bw_and_get_code  s   r  c              	     s   ddl m} g dfdd d fdd}tj|d|5 tj|d  tj  | |i |}W d   n1 s>w   Y  W d   S W d   S 1 sVw   Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r2   r6   rw  rH   rG   r  c                   rx  r   ry  rz  r{  rI   rP   r}    r=  z"get_code.<locals>.save_output_coder  r7   r   c                   sF   G dd d}| j r|  n|  \}} |j |r  |j | S )Nc                   @  s$   e Zd ZdZdddZdd	d
ZdS )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerG   r  c                 S  r  r   rI   r  rI   rI   rP   r*    r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__r   r   r  c                 _  r  r   rI   r  rI   rI   rP   call  r}  zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callNrl  r   r   r  r   rG   r  )r   r   r   r   r*  r  rI   rI   rI   rP   DummyModule  s    
r  )r  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_code)r}  rI   rP   patched_compile_to_module  s   

z+get_code.<locals>.patched_compile_to_modulecompile_to_moduler}  Nr~  )r  r7   rG   r   r  )r   r   r  r7   r  r   rI   )r}  r|  rP   get_code  s$   
(


r  c                 O  sJ   t | g|R i |}dt|  krdks!n J dt| |d S Nr2   rm  z%expected one or two code outputs got r   )r  rS   )r   r   r  r|  rI   rI   rP   get_triton_code  s
   r  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S r  )r  rS   )r   r   r  r   r|  rI   rI   rP   run_and_get_triton_code  s
   r  tuple[Any, list[GraphLowering]]c                   s   ddl m  ddlm} |jg d fd	d
}tj|d| | |i |}W d    |fS 1 s7w   Y  |fS )Nr   r6   r>   r   r   r  rG   r  c                    s2   | i | | d }t | sJ | d S )Nrm  )rx   r  )r   r  rc  r7   graph_lowerings	real_initrI   rP   	fake_init  s   z-run_and_get_graph_lowering.<locals>.fake_initr*  r  )torch._inductor.graphr7   torch._inductor.output_coder?   r*  r   r  rr  )r   r   r  r?   r  r  rI   r  rP   run_and_get_graph_lowering  s   
r  aten_opoverride_fnc              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr  partial)r  r  r  orig_fnrI   rI   rP   override_lowering	  s   
r  pre_fnpost_fnOptional[Callable[..., Any]]c                   s6   ddl m} |j d fdd}tjj|d	|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r   r8  rG   c                   s&   | |  | |}r| | |S r   rI   )r  r8  outr  r  r  rI   rP   r  $	  s
   


z(add_scheduler_init_hook.<locals>.wrapperr*  N)r  r   r8  r   rG   r   )torch._inductor.schedulerr  r*  unittestr   r  rr  )r  r  r  r  rI   r  rP   add_scheduler_init_hook	  s   r  msgc                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rh   developer_warningsr   r  info)r  rI   rI   rP   developer_warning.	  s   r  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr2   r   rs  z--only=N)r  argvr  rS   
ValueErrorr  )r  r  rI   rI   rP   get_benchmark_name:	  s   

r  r<  c                 C  r  )Nc                 s      | ]}|d kV  qdS r2   NrI   rM   rI   rI   rP   r   \	  r  zis_ones.<locals>.<genexpr>r|   r<  rI   rI   rP   is_ones[	  r!  r  c                 C  r  )Nc                 s  r  )r   NrI   rM   rI   rI   rP   r   `	  r  zis_zeros.<locals>.<genexpr>r  r  rI   rI   rP   is_zeros_	  r!  r  inputsSequence[torch.Tensor]c                 C  r  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )r   N)rx   rK   r  r   )rN   r   rI   rI   rP   r   d	  s    

z is_cpu_device.<locals>.<genexpr>r  )r  rI   rI   rP   is_cpu_devicec	  s   r  rc  c                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rx   ry   r&  r   rK   rG  r?  )rc  rI   rI   rP   get_sympy_Expr_dtypek	  s   r  should_profileIterator[Any]c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S r   )rK   r   r   )r  r   r  r   rI   rI   rP   maybe_profileu	  s   "
r  c                  C  s   t jj} | dk rt } | S Nr2   )rh   ra  threadsrK   get_num_threads)r  rI   rI   rP   rg  ~	  s   rg  c                  C  s,   ddl m}  |  }|dtjjrdS dS )Nr2   )get_backend_options
num_stagesrm     )runtime.triton_helpersr  r  rK   r  r  )r  optionsrI   rI   rP   get_backend_num_stages	  s   r  c                 C  s  t | tjjjjd}|dur|S ddlm}m} tj	 o#tj
 dk}| tjtjtjfv s0J t|jdrcddlm} | }| tjtjfv rQ|rQ|| |S tjjjjr]|tj|S |tj|S | tjtjfv rq|rq|| S tjjjjr||tjS |tjS )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)rp   r   
clock_rate)max_clock_rate)r   rK   backendsrB   matmul
allow_tf32triton.testingr  r  rL   get_device_capabilityr   r;  r=  inspect	signature
parametersr  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clockrI   rI   rP   get_device_tflops	  s&   


r  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)r  r  r  rI   rI   rP   get_gpu_dram_gbps	  s   r  c                  C  s"   ddl m}  | jjdddS )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r  r  rI   rI   rP   get_gpu_shared_memory	  s   r  reduction_typec                 C  s
   |  dS )Nwelford)r  r  rI   rI   rP   is_welford_reduction	  r8  r  c                 C  s   t | rdS | dkrdS dS )Nr  online_softmax_reducerm  r2   )r  r  rI   rI   rP   reduction_num_outputs	  s
   r  c                   C  s   t  dkS )NLinux)platformsystemrI   rI   rI   rP   is_linux	  r  r  c                   C  s
   t jdkS )Nrj   )r  r  rI   rI   rI   rP   r  	  r8  r  itrIterable[Any]c                 C  r  )Nc                 s  s$    | ]}t |tjo|j V  qd S r   )rx   ry   r&  r  rM   rI   rI   rP   r   	     " z#has_free_symbols.<locals>.<genexpr>r  )r  rI   rI   rP   rd  	  r!  rd  c                  G  s~   ddl m} | D ]4}t||j|j|j|j|jfr-t|	 pds)t|
 p'dr, dS qt||js4qtdt| dS )Nr2   r  rI   Tzunexpected type for is_dynamic F)r  r  rx   r  r  re  rZ  r8   rd  maybe_get_sizemaybe_get_strider:   	TypeErrorr  )r   r  trI   rI   rP   
is_dynamic	  s   
r  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  rI   rI   rI   rP   r  	  s    r  rx  r0   inpc              	   C  s4  ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }t|| | |j
 W d    n1 sLw   Y  t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W d    d S 1 sw   Y  d S )Nr2   )stable_topological_sortrM  zutf-8F)modeencodingr  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r  NamedTemporaryFileior
   r]   rY   	propagater  rc  r	   nowr\   lint	recompiler2  r   r  r   )rx  r  r  r  r  r  	before_ioafter_io
start_timetime_elapsedr  rI   rI   rP   pass_execution_and_save	  s>   

"r  	input_buf"Optional[Union[Buffer, Operation]]c                 C  s&   ddl m} t| |jot| j|jS )zB
    Check if input buffer is a multi-outputs template buffer
    r2   r  )r  r  rx   CppTemplateBufferr"  MultiOutputLayoutr  r  rI   rI   rP   is_multi_outputs_template
  s   r  c                 C  s4   ddl m} t| |jot| jdkot| jd S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r2   r  r   )r  r  rx   MultiOutputrS   r  r  r  rI   rI   rP   #is_output_of_multi_outputs_template)
  s   r  r   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                 C  s   | d u rdS ddl m} t| |jo!t| |j o!|d u p!| j|u pXt| |jkoXtt	j
jdo8| jt	j
jjjkpXtt	j
jdoH| jt	j
jjjkpXtt	j
jdoX| jt	j
jjjkS )NFr2   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  rx   _CollectiveKernel_WaitKernelop_overloadr  FallbackKernelr   rK   r   torchrecr  defaultr  r  r  rm  r  rI   rI   rP   is_collective8
  s(   

r!  "Optional[Union[IRNode, Operation]]c                 C  s   ddl m} t| |jkS Nr2   r  )r  r  r  r  r  r  rI   rI   rP   is_wait^
  s   r%  snoder@   c                 C  4   ddl m} t| |rtdd | jD S t| jS )Nr   GroupedSchedulerNodec                 s  r  r   )contains_collectiverM   rI   rI   rP   r   h
  r  z&contains_collective.<locals>.<genexpr>)r  r)  rx   rr  snodesr!  r  r&  r)  rI   rI   rP   r*  d
     

r*  c                 C  r'  )Nr   r(  c                 s  r  r   )contains_waitrM   rI   rI   rP   r   q
  r  z contains_wait.<locals>.<genexpr>)r  r)  rx   rr  r+  r%  r  r,  rI   rI   rP   r.  m
  r-  r.  Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                 C  s6   ddl m} t|tjjr|g}t| |jo| j|v S r#  )r  r  rx   rK   r  r  r  r  r   rI   rI   rP   is_fallback_opv
  s   r1  buf_namename_to_bufname_to_fused_nodec                 C  s   |||  j   S r   )defining_opr  )r2  r3  r4  rI   rI   rP   buf_name_to_fused_snode
  s   r6  c                 C  rw  rx  rI   r&  rI   rI   rP   ry  
  rz  collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                 C  sP   || rd S | |  | jD ]}t|j||}||v rqt|||||d qd S )Nr<  )r=  unmet_dependenciesr6  r   find_recursive_deps_of_node)r&  r8  r3  r4  r<  depdefining_op_for_deprI   rI   rP   r@  
  s"   

r@  c                 C  rw  rx  rI   r7  rI   rI   rP   ry  
  rz  c              	   C  s   || rd S | |  |  D ]4}|jD ].}|jd usJ |j dkr%q|j |vr-q||j  }||v r9qt|||||d qqd S )NOUTPUTr>  )r=  get_outputsr  r  r  find_recursive_users_of_node)r&  r8  r3  r4  r<  orV  user_oprI   rI   rP   rE  
  s,   

rE  dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)rm  r   )rK   
_functorchrh   functionalize_rng_ops)rH  rI  num_rng_seed_offset_inputsrI   rI   rP   num_fw_fixed_arguments
  s   rM  fx_gc                 C  sd   ddd}d}g }| j jD ]}|jdkr!||r|| |d	7 }q|ttt|ks.J t|S )z>
    Infers which inputs are static for a backwards graph
    rO   r1   rG   rw   c                 S  s(   d| j vod| j vod| j vod| j vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  r\  rI   rI   rP   is_saved_tensor
  s   
z'count_tangents.<locals>.is_saved_tensorr   r  r2   N)rO   r1   rG   rw   )rc  r8  rm  r  rS  r   rS   )rN  rS  	arg_countstatic_arg_idxsr  rI   rI   rP   count_tangents
  s   


rV  c                   @  s.   e Zd ZU ded< dddZedd	d
ZdS )	BoxedBoolrw   r   rG   c                 C  s   | j S r   )r   r  rI   rI   rP   r?  
  s   zBoxedBool.__bool__r  r   Union[BoxedBool, bool]c                 C  s   t | tr
d| _| S dS rx  )rx   rW  r   r  rI   rI   rP   disable
  s   
zBoxedBool.disableNrm  )r  r   rG   rX  )r   r   r   r   r?  r  rY  rI   rI   rI   rP   rW  
  s
   
 
rW  kernel_listc                 #  sh    ddl m} |j	 		 dd fdd}tj|d| d V  W d    d S 1 s-w   Y  d S )Nr2   r4   Tr  r5   kernel_namerH   r  rF  r  gpurw   cpp_definitionrG   r   c                   s     | | |||||S r   ry  )r  r[  r  rF  r\  r]  rZ  orig_define_kernelrI   rP   define_kernel
  s   
z.collect_defined_kernels.<locals>.define_kernelr`  )NTN)r  r5   r[  rH   r  rH   rF  r  r\  rw   r]  r  rG   r   )codegen.wrapperr5   r`  r   r  rr  )rZ  r5   r`  rI   r^  rP   collect_defined_kernels
  s   "rb  c                 C  s   | d S )N__original__rI   r  rI   rI   rP    get_cloned_parameter_buffer_name     rd  c                 C  s   | t v S r   )rR   r   rI   rI   rP   r    re  r  c                 C  s   | dkot | S )NrC   )r  r   rI   rI   rP   device_need_guard  r  rf  c                 C  sL   t  r| tjkrtj rtj dkrt jrdS | ttj	tj
tjgv S )N)r  r   F)rh   r4  rK   r;  rB   rL   r  bfloat16_atomic_adds_enabledr    rG  rw   r  rI   rI   rP   ,needs_fallback_due_to_atomic_add_limitations  s   
rh  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                 C  s   | j tjjjtjjjfv r|d u rdS | j tjjjkrdnd}|d |fvp]|o.t|o.t|p]| j tjjjkoM|dkoM|oM|dkoMt	j
joMt	j
jpMt dkp]||koY|tjtjfv p]t S )NFr=  r   r   r2   )overloadpacketrK   r   atenscatter_reduce_scatter_reducescatter_r  rh  rh   ra  fallback_scatter_reduce_sumdynamic_threadsrg  rw   rG  r  )r  r  ri  rj  rk  rl  	reduce_tyrI   rI   rP   use_scatter_fallback#  s8   	ru  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]m\}}td|dd ||u r2td	 q||u r;td
 qt||r|	 }t|rIdnd d |rb|j
dusXJ td|j
jj  td |jjD ]}t| qjtd |jjD ]}t| qyqtdt| dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r.  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdrv  rw  r  rx  r  rS   r   rx   is_reductionr  r  reduction_hintr/  r0  r1  r   r  )r  rv  rw  rx  r  r  is_redrA  rI   rI   rP   dump_node_scheduleJ  s0   




r  r   r  c                 C  s*   ddl m} ||  t| j t dkS )Nr   )r  )r  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  rI   rI   rP   tensor_is_alignedi  s   r  example_inputc                 C  s   t | jjsdS tjpt| S rx  )r  r   r  rh   assume_aligned_inputsr  )r  rI   rI   rP   should_assume_input_alignedw  s   r  rL  c                  C  s>   t jj } | st S | jr| jjst S | jj}| S r   )	rK   _guardsTracingContexttry_getrQ  nullcontextr  re  suppress_guards)tracing_contextre  rI   rI   rP   #maybe_get_suppress_shape_guards_ctx  s   r  tuple[_T, str]c                 O  s   t jjtddJ tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    |	|
fS 1 sVw   Y  |	|
fS )Nr   Tr   )output_code_log)r  r   r  rr  rh   rK   rp  r  r  loggingr
   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr2  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   rI   rI   rP   run_and_get_cpp_code  s$   




r  Sequence[InputType]Optional[ShapeEnv]c                 C  s<   t | }|d ur|jS | D ]}t|tjr|jj  S qd S r   )rY   re  rx   rK   r.   r  )r  r  inputrI   rI   rP   shape_env_from_inputs  s   r  Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                   s&   t  dkrS d fdd}|S )	Nr   
new_inputslist[InputType]rG   r   c                   s0   t |  \}}| }t|rt|| |S r   )copy_misaligned_inputsrS   rK   _foreach_copy_)r  old_tensorsnew_tensorsr  r  r  r  rI   rP   r    s   z)align_inputs_from_check_idxs.<locals>.run)r  r  rG   r   )rS   )r  r  r  r  rI   r  rP   align_inputs_from_check_idxs  s   r  c                 C  s`   d|   v r	d}ntdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s     | ]\}}|d  | V  qdS r  rI   )rN   r$  r*  rI   rI   rP   r     r  z)clone_preserve_strides.<locals>.<genexpr>r2   r   )r  r   r   r*  rK   
as_stridedclone)rO   needed_sizer  rI   rI   rP   clone_preserve_strides  s   "r  r  r  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                 C  s   g }g }|du}|D ]3}| | }t |tjsJ dt| | t r=t|| |< |r=||v r=|| || |  q
||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rx   rK   r  r  data_ptr	ALIGNMENTr  r  )r  r  r  r  r  ret_pair_definedr   _inprI   rI   rP   r    s   

r  static_input_idxsc                 C  sT   g }|D ]}| | }t |tjr| t dkr|| qt|t|kr(|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rx   rK   r  r  r  r  rS   )r  r  aligned_static_input_idxsr  r  rI   rI   rP   remove_unaligned_input_idxs  s   
r  r   c                 C  sv   ddl m} ttjj}|jjj}|jjj	j
}|jj| |kr#dS |jr1|jj| dk r1dS || o:|| |kS )Nr2   r_  Tg@xDF)rb  r`  rK   iinforE  r   rc  rd  r  re  has_hintr  aot_compilation)r   r`  int_maxr  r  rI   rI   rP   expr_fits_within_32bit  s   
r  compiled_graphr?   c                   s   t jj }|d urX|jd urZt|jdksJ t| |jd us#J |jD ]5}|d u r3|jd  q&d t jj  }r@|j d fdd|jt	fd	d
|D  q&d S d S d S )Nr   Fr   r   rG   ,Union[float, int, SymInt, SymFloat, SymBool]c                   s(   d u rt | S  r| S | S r   )rr   deserialize_symexprevaluate_symexpr)r   )fakify_first_callre  rI   rP   map_exprE  s
   

z4set_tracing_context_output_strides.<locals>.map_exprc                 3  s    | ]} |V  qd S r   rI   rN   r   )r  rI   rP   r   M  r  z5set_tracing_context_output_strides.<locals>.<genexpr>)r   r   rG   r  )
rK   r  r  r  output_stridesrS   r  r  r  r  )r  r  r  r  r  rI   )r  r  re  rP   "set_tracing_context_output_strides4  s"   
r  c                  C  s`   t jd urt jS t  sdS tj rdS zddlm}  W n
 ty'   Y dS w | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rh   fx_graph_remote_cacher4  rK   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  rI   rI   rP    should_use_remote_fx_graph_cacheQ  s   

r  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )r   subr  rI   rI   rP   normalize_named  r=  r  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                 C  r  rI   rI   r  rI   rI   rP   r  t  r  r  z^.*[.]c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  rH   _triton_type_mappingr  )r   triton_type_namerI   rI   rP   triton_typez  s   r  c                 C  s6   t | | }|dd}tt|}t|tjsJ |S )Nr  r  )_torch_triton_mappingr  r  rJ   rK   rx   r   )r   adjusted_type	type_namerS  rI   rI   rP   triton_type_to_torch  s
   
r  r  r   c                 C  sh   | j  o3|  | ko3|  | ko3| j|jko3| j|jko3|   |  ko3|  | kS r   )	is_mkldnnr  r*  r   r   untyped_storager  r  r  r   rI   rI   rP   is_same_tensor  s   

r  c                 C  sJ   | j o$|  | ko$| j|jko$| j|jko$tjj| tjj|kS r   )r  r  r   r   rK   r   mkldnnr  r  rI   rI   rP   is_same_mkldnn_tensor  s   

r  tuple[str, ...]c                   C  rw  )N)r\  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorrI   rI   rI   rI   rP   boolean_ops  r}  r  c                   @  r!  )OpDtypeRuler/   type_promotion_kindr*  override_return_dtypeNr   rI   rI   rI   rP   r    r$  r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr  r/   r  c                 C  s   t ||t| < d S r   )r  r  )r   r  r  rI   rI   rP   #register_op_dtype_propagation_rules  s   r  zOrderedSet[str]op_requires_libdevice_fp64c                 C  s   t |  d S r   )r  r=  r  rI   rI   rP   #register_op_requires_libdevice_fp64  r=  r  c                  C  s8   ddl m}  | j j}|dkrtjS |dkrdS tjS )Nr   r_  r   rC   )r  r`  rc  get_current_device_or_throwr  rh   cpu_backendcuda_backend)r`  
device_strrI   rI   rP   get_current_backend  s   r  c                 C  s,   | t jt jfv rtjjrt dkrt jS | S )z"Maybe upcast [b]float16 to float32r  )rK   r   r;  rh   r  codegen_upcast_to_fp32r  r=  r  rI   rI   rP   upcast_compute_type  s   
r  KeyTypeValTypec                   @  sl   e Zd ZdZd#ddZd$d
dZd%ddZd&ddZd'd(ddZd)ddZ	d*ddZ
d+dd Zd,d!d"ZdS )-
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                 C  s   || _ i | _d S r   r  	new_items)r  r  rI   rI   rP   r*    r+  zScopedDict.__init__r)  r  rG   r   c                 C  s   || j v r
| j | S | j| S r   r  r  r  r)  rI   rI   rP   r    s   


zScopedDict.__getitem__r   r  c                 C  s   || j |< d S r   )r  )r  r)  r   rI   rI   rP   __setitem__  r=  zScopedDict.__setitem__rr  rw   c                 C  s   || j v p	|| jv S r   r  r  rI   rI   rP   __contains__  r  zScopedDict.__contains__Nr  Optional[ValType]c                 C  s"   || j v r
| j | S | j||S r   )r  r  r  )r  r)  r  rI   rI   rP   r    s   

zScopedDict.getrr   c                 C  s,   t | j}| jD ]}|| jvr|d7 }q|S r  )rS   r  r  )r  r  r  rI   rI   rP   r    s   


zScopedDict.__len__Iterator[KeyType]c                 c  s.    | j E d H  | jD ]
}|| j vr|V  q
d S r   r  )r  r  rI   rI   rP   __iter__
  s   

zScopedDict.__iter__c                 C  s   t | jp| jS r   )rw   r  r  r  rI   rI   rP   r?    r  zScopedDict.__bool__c                 C  r{  r   r|  r  rI   rI   rP   __delitem__  r  zScopedDict.__delitem__)r  r  )r)  r  rG   r   )r)  r  r   r   rG   r  )r)  rr  rG   rw   r   )r)  r  r  r
  rG   r
  r  )rG   r  rm  )r)  r  rG   r  )r   r   r   r   r*  r  r  r	  r  r  r  r?  r  rI   rI   rI   rP   r    s    






r  )frozen_defaultr   Optional[type[Any]]r   c                 s"   d fdd}| d u r|S || S )Nr   rl   rG   c                   s(   t jdkrtj| d dS tj|  dS )N)r  r  T)kw_onlyr   r   )r  version_infodataclasses	dataclass)r   r   rI   rP   wrap  s   
zir_dataclass.<locals>.wrap)r   rl   rG   rl   rI   )r   r   r  rI   r   rP   ir_dataclass  s   r  Optional[list[int]]c                  C  s&   t jj } | d ur| jr| jjS d S r   )rK   r  r  r  fw_metadatabw_donated_idxs)r  rI   rI   rP   get_donated_idxs&  s   r  c                   @  s    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r2   rm  r  r&  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTrI   rI   rI   rP   r  -  s    r  c                  C  sT   t jdd u rtjS dd l} dd l} t| jj	drtj
S t| j	j	dr'tjS tjS )Nr  r   AttrsDescriptor)	importlibutil	find_specr  r  triton.backends.compilertriton.compiler.compilerr   r  compilerr  r  r  )r  rI   rI   rP   #get_triton_attrs_descriptor_version7  s   r'  c                   C  s   t  tjkS r   )r'  r  r  rI   rI   rI   rP   triton_version_uses_attrs_dictQ  r  r(  r<   c                 C  sF   ddl m} t| |jsdS t| jtjjr!tjj	j
| jjv r!dS dS )zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r2   r  FT)r  r  rx   r  r  rK   r  r  r   r  r  r  r$  rI   rI   rP   is_cudagraph_unsafe_opU  s   r)  c                  C  sX   t jdd} t r*ddlm} | }|r*t j|dd}| r(t j	|| gn|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r  rh   r4  libfb.py.parutilr+  r  r	  pathsep)r  r+  runtime_pathlib_pathrI   rI   rP   get_ld_library_pathh  s   r1  c                 C  s    ddl m} t| |o| jd uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr2  rx   partition_signatures)r  r2  rI   rI   rP   #is_codegen_graph_partition_subgraphu  s   
r5  c                   C  s    t jjjjp
tjd uot jjjS r   )rK   r  rh   r  
cudagraphs&_unstable_customized_partition_wrapperr  r  rI   rI   rI   rP   is_using_cudagraph_partition~  s
   r8  c                 C  s8   ddl m} |jj| dr|jj| drtjS tjS )Nr2   r_  l        i   )	rb  r`  rc  rd  statically_known_ltr  rK   rE  rG  )r  r`  rI   rI   rP   dtype_from_size  s   r:  )r   rD   r   c                 C  $   | dkr
t jj S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r   rD   TF)rK   r   r  _is_mkldnn_bf16_supportedr   rI   rI   rP   is_mkldnn_bf16_supported  
   r>  c                 C  r;  )z;
    Returns True if the device supports MKL-DNN FP16.
    r   rD   TF)rK   r   r  _is_mkldnn_fp16_supportedr=  rI   rI   rP   is_mkldnn_fp16_supported  r?  rA  elementsSequence[Sequence[T]]headersSequence[T]c              	   C  s   dd |D }| D ]"}t |t |ksJ t|D ]\}}t|| t t|||< qq	g }|ddd t||D  t|t |d  t |d  }|d|  | D ]}|dd	d t||D  qWd
|S )Nc                 S  s   g | ]}t t|qS rI   )rS   rH   r  rI   rI   rP   rQ     s    ztabulate_2d.<locals>.<listcomp>|c                 s  $    | ]\}}d || d V  qdS r  NrI   )rN   hrM  rI   rI   rP   r     r  ztabulate_2d.<locals>.<genexpr>rm  r2   rs  c                 s  rG  rH  rI   )rN   r   rM  rI   rI   rP   r     r  r5  )rS   r   r   rH   r  r	  r   r   )rB  rD  widthsrowr   r   rH  total_widthrI   rI   rP   tabulate_2d  s     "
rM  dict1r  dict2
d1_defaultValType | None
d2_defaultEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None]c                 c  s`    t |  t | B }|D ]}| |}||}||dur"|n||dur)|n|fV  qdS )a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r    r;  r  )rN  rO  rP  rR  all_keysr)  value1value2rI   rI   rP   	zip_dicts  s   

rW  config_patchesc                 C  s`   dd	d
}|  dtjj}|  } |r.|| dd || dd || dtjj  || dd | S )a1  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    rX  r  config_namerH   config_valuer   rG   r  c                 S  sP   |  |tt|}|d u r|| |< d S |s$||kr&td| d| dd S d S )NzInvalid config: =z. when aot_inductor.compile_standalone is True.)r  rJ   rh   r   )rX  rY  rZ  r   rI   rI   rP   patch_config  s   z2maybe_aoti_standalone_config.<locals>.patch_configzaot_inductor.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelN)rX  r  rY  rH   rZ  r   rG   r  )r  rh   aot_inductorcompile_standalonecopyrK   r  r  )rX  r\  r_  rI   rI   rP   maybe_aoti_standalone_config  s   
ra  c                  C  sV   ddl m}  | jj}|du rdS t|tstd|dkrdS td|s)tddS )	zL
    Validates if a model name is suitable for use in code generation.

    r   rg   NTz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rh   r^  model_name_for_generated_filesrx   rH   r  r   r   )rh   
model_namerI   rI   rP   is_valid_aoti_model_name  s   
rd  r'   unbacked_onlyOrderedSet[sympy.Symbol]c                 C  s   |rt | S t| S r   )r&   r%   )rO   re  rI   rI   rP   get_free_symbols*  s   rg  cudagraph partition due to Optional[BaseSchedulerNode]c                 C  s`   t jjsdS | |  }|r)|j }r)|  }r)|jdd }r)| d| }t| dS )z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	rh   r  r6  r  get_origin_noder  r  perf_hint_logr  )r  r  r  warning_msgir_noder  rj  rI   rI   rP   maybe_log_cudagraph_partition1  s   	
ro  dict[str, str]c                  C  s@   i t jdt jdt jtji} t rt	
d| d< | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr  
PYTHONHOME)r  r  r  r.  r	  r  r  rh   r4  	sysconfigget_path)envrI   rI   rP   python_subprocess_envJ  s   rw  c                   @  s"   e Zd ZU dZded< ded< dS )CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    rr   num_partitionspartition_indexNr   rI   rI   rI   rP   rx  e  s   
 rx  .c                   @  s   e Zd ZU dZded< dS )CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   rI   rI   rI   rP   r{  |  s   
 r{  CUDAGraphWrapperTypec                 C  s
   | t _d S r   )r7  r  )r  rI   rI   rP   !set_customized_partition_wrappers  r8  r}   tuple[list[Any], dict[str, Any]]c                   s   | j j}| j g || j j| j j}| j j}t||f\}}ddd  fdd|D }ddd	dfddfdd|D }t||\}}||fS )NrG   rw   c                 S  s"   t | tjjjot | tjjj S r   )rx   rK   r  r  r:   GeneratorStater\  rI   rI   rP   _is_tensor_ir  s   
z(snode_args_kwargs.<locals>._is_tensor_irc                   s*   g | ]} |rt jjj|d dn|qS )F)guard_shape)rK   r  r  ir_node_to_tensorrp  )r  rI   rP   rQ     s    z%snode_args_kwargs.<locals>.<listcomp>r  c                 S  s   t j| ||dS )Nr   )rK   r   )r  r   r   rI   rI   rP   _tensor  r  z"snode_args_kwargs.<locals>._tensorr   r   c                   s(   t | tjs| S  |  | j| j}|S r   )rx   rK   r  r  r   r   )r   r  )r  rI   rP   to_real_tensor  s   z)snode_args_kwargs.<locals>.to_real_tensorc                   rc  rI   rI   rp  )r  rI   rP   rQ     rk  rm  )rG   r  )r   r   rG   r   )r  r  fill_non_provided_argsconstant_argsr  pytreer!   tree_unflatten)r&  r   r  	flat_argsflat_args_pytree_specrI   )r  r  r  rP   snode_args_kwargs  s    


r  rk  )rq   rr   rG   rr   )ru   rv   rG   rw   )r   r   )r   r   r   rr   r   rr   rG   r   rm  )r   r   rG   r   )r	  r
  rG   rv   )r  r  r  r  rG   rv   )r	  r  rG   r  )r"  r#  r$  r#  rG   r#  )r)  r*  rG   rH   )rW  rX  rG   rY  )r   r#  rG   r^  )rW  rh  rG   ri  )rm  rn  rG   rw   )r{  r1   r|  r}  rG   rw   )r  r   r   r  r  r  rG   r  )rB   )r   rH   rG   r  )r2   rB   )
r  r  r  r  r   rr   r   rH   rG   r   )rI   r  r  r  rB   )r  r  r  r  r   rr   r  rr   r  r   r   rH   rG   r   )r  r   r  rH   rG   r  )r  r   r  r   rG   r  )r  rr   r  rr   rG   rr   )rO   r  r  rr   rG   r  )rO   r  rG   r  )r   r  rG   r  )r   r  rG   r  )r  rH   rG   r  )r  r  rG   r  )r  r  r  r  rG   rH   )r  r  r  r5   rG   r  r   )rP  rQ  rR  rS  rG   rT  )r   rX  r  rY  rG   rT  rz  )r  rv   rG   r{  )r  rH   rG   rw   )r  rd   r  rr   rG   r  )r  rw   rG   rw   )r   rH   rG   r  )rl  rv   r  r  rG   rv   )r  r   rG   r  )r   r   rG   rw   )r  r  rG   r  )r  r  rG   r1   )r  r  rG   r  rl  )r  r   rG   r   )NNT)r  r  r  r  r  rw   rG   r  )r  r  rG   r  )re  r(   r  r  rG   r  )r   r  rG   rr   rP  r   )r  r  rG   rw   r  )r  rr   r   r   r  r  rG   r3   )r"  r;   r  r  rG   rw   )r  rH   rG   rw   )
r"  r;   r  rw   r  rw   r  rw   rG   rw   )r  r:   r  rw   rG   rw   )
r"  r;   r  rr   r  rr   r  rr   rG   rw   )r  rH   rG   rw   )r  r   r  r   r  r   rG   rw   )r  r   r  r   r  r   rG   r  )r   rH   rG   rH   )rG   r"  )r"  r;   rG   rw   )r"  r;   rE  rF  rG  r:   rG   rw   )FTFN)r"  r;   rE  r:   rG  r:   rL  rw   rH  rw   rM  rw   rN  r  rG   rw   )r   ru  r   r  r  r  rG   rv  )r   r  rG   r  )r   ru  r   r  r  r  rG   r   )r   ru  r   r  r  r  rG   rH   )r   ru  r   r  r  r  rG   r  )r  r  r  r  rG   r  )r  r  r  r  rG   r   )r  rH   rG   r  )rG   r  )r<  r  rG   rw   )r  r  rG   rw   )rc  rv   rG   r  )r  rw   r   r   r  r   rG   r  )r   r  rG   r   )r  rH   rG   rw   )r  rH   rG   rr   )r  r  rG   rw   )
rx  r  r  r0   r  r  r  rH   rG   r  )r  r  rG   rw   )r  r  rm  r  rG   rw   )r  r"  rG   rw   )r&  r@   rG   rw   )r  r/  rm  r0  rG   rw   )r2  rH   r3  r  r4  r  rG   r   )r&  r@   r8  r9  r3  r:  r4  r;  r<  r=  rG   r  )rH  rr   rI  rr   rG   rr   )rN  r  rG   rr   )rZ  r   rG   r  )r   rH   rG   rH   )r   r  rG   rw   )r   rH   rG   rw   )r   r  rG   rw   )r  rn  r  r  ri  r  rj  r  rk  rH   rl  rw   rG   rw   )r  r  rG   r  )r   r  rG   rw   )r  r  rG   rw   )rG   rL  )r   ru  r   r  r  r  rG   r  )r  r  rG   r  )r  r  r  r  r  r  rG   r  )rO   r  rG   r  )r  r  r  r  r  r  rG   r  )r  r  r  r  rG   r  )r   rv   rG   rw   )r  r  r  r?   rG   r  )r   r  rG   rH   )r   rH   rG   r  )r  r  r   r  rG   rw   )rG   r  )r   rH   r  r/   r  r*  rG   r  )r   rH   rG   r  )r   r  rG   r  )r   r  r   rw   rG   r   )rG   r  )rG   r  )r  r<   rG   rw   )r  r5   rG   rw   )r  rr   rG   r  )r   rH   rG   rw   )rB  rC  rD  rE  rG   rH   )NN)
rN  r  rO  r  rP  rQ  rR  rQ  rG   rS  )rX  r  rG   r  )rO   r'   re  rw   rG   rf  )rh  N)r  rH   r  r  r  ri  rG   r  )rG   rp  )r  r|  rG   r  )r&  r@   rG   r~  (s  
__future__r   r6  rQ  r  enumr  r!  r  r  re  r  r[  r  r  r  r   r   r   r  rt  r  r]  r  r  collections.abcr   r   r   r   r   r   r	   r
   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   ry   rK   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   torch.utils._dtype_abbrsr   torch.utils._ordered_setr    r!   r"   OPTIMUS_EXCLUDE_POST_GRADr  r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   torch._prims_commonr/   torch.fxr0   torch.fx.noder1   r  r3   ra  r5   rc  r7   r  r8   r9   r:   r;   r<   r=   output_coder?   r  r@   rA   rR   rF   r   rW   torch._dynamo.device_interfacerX   torch._dynamo.utilsrY   torch.autogradrZ   torch.autograd.profiler_utilr[   (torch.fx.passes.graph_transform_observerr\   torch.fx.passes.shape_propr]   torch.utils._sympy.functionsr^   r_   r`   ra   rb   torch.utils._sympy.symbolrc   rd   torch.utils._sympy.value_rangesre   rf   r  rh   runtime.runtime_utilsri   r(  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerrl  rl   r  r&  	VarRangesr  rr   	InputTypeGPU_KERNEL_BIN_EXTSr  r  r  r  rs   rt   r~   Functionr   r  r   r   r   r   r  r  r  r   rV  r]  rg  rl  rv  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r  r  r  r  r  rO  rW  rk  ry  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  rR  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  	lru_cacher  r  r"  r%  rn  ry  r/  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r!  r-  r9  r>  rA  rC  rD  rK  rI  rk  rl  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rg  r  r  r  r  r  r  r  r  rd  r  Enumr  r  r  r  r!  r%  r*  r.  r1  r6  r@  rE  rM  rV  rW  rb  rd  r  rf  rh  ru  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r<  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r'  r(  r)  r1  r5  r8  r:  SUPPORTED_MKLDNN_DEVICESr>  rA  rM  rW  ra  rd  rg  ro  rw  rx  PartitionFnTyper|  r{  r7  r}  r  rI   rI   rI   rP   <module>   s    4  


$
KV&
		.$ /;=$  		g!8.

@
	+	!
(	
$&		'	#


$
0
	#1	

