o
    ߗi                    @  s
  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+m,Z,m-Z- d dlm.Z. d dl/Z/d dl0Z0d dl1m2Z2 e%rd d	l3m4Z4 d d
l5m6Z6 ddgZ7e8ddd Z9d dl:m;Z; d dl<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZGmHZHmIZImJZJmKZK d dlLmMZMmNZN d dlOmPZPmQZQ ddlRmSZS ddlTmUZV ejdkZWe
XeYZZe&dZ[ee/j\e/j\f Z]e e'e0j^e_e0j`f  ZadddZbd Zcd Zdd!Zeeeeed @ d krJeed"ksNJ d#d$d% Zfdd(d)ZgG d*d+ d+e/jhZiddd2d3Zje8ddd5d6Zkdd:d;Zldd>d?ZmddCdDZnddGdHZoddLdMZUdNdO ZpddSdTZqddWdXZrdd[d\Zs	dǐdd_d`Ztdadb ZudɐddedfZv	dːddkdlZw	d͐ddpdqZxddudvZyddydzZzdd{d|Z{d}d~ Z|dddZ}e,dZ~e&dddZG dd de!ee~ef ZdddZdd Zdd Zdd Z	dǐdddZdd ZdddZdd ZdddZdd ZdddZdddZdddZdddZdddZdddZg Zded< dddZdd ZejdddĄZdddǄZddd˄Ze8d"dd̈́ ZG ddτ deZG ddф dуZG ddӄ deZejddՄ ZG ddׄ d׃ZG ddل deZe8ddddd݄Zddd߄ZdddZdddZdddZdddddZdd Ze8ddd Ze8ddd Zdd Zdd Zdd Zdd Zdd Z	dddZdd  ZG dd dZdddZdd Zdd	 Zd
d Zdd Zdd Zejdd ZdǐddZdd Zdd Zdd Zdd Zdd Zdd d!Zejd"d# Zd$d% Ze8dd&d' Ze8dd(d) Ze8ddѐd*d+ZÐdѐd,d-ZĐdd/d0ZŐdd1d2ZƐdd3d4Zǐdd5d6ZȐdd9d:Zɐdd;d<ZG d=d> d>ej˃Z̐d?d@ Z͐dǐdAdBZΐdCdD ZϐdEdF ZАdGdH ZѐdIdJ ZҐdKdL Z	dǐdMdNZ	dǐdOdPZՐddSdTZ֐ddVdWZejG dXdY dYZejdZd[ Zڐdd\d]Zېdd_d`ZܐdΐdadbZݐdcdd ZސddfdgZߐdhdi ZddldmZddodpZdqdr Zdsdt ZddwdxZdd|d}Zdd~dZdddZdddZdddZdd Zdd ZdddZddddddZdd e D ZedZdddZdddZdddZdddZe8ddd ZejG dd dZi Zded< dddZdddZe+ddddddddZdddZdS (       )annotationsN)datetime)StringIO)AnyCallableDictGenericIterableList
NamedTupleOptionalProtocolSequenceSetTupleTYPE_CHECKINGTypeVarUnion
ValuesView)Concatenatedataclass_transform	ParamSpec	TypeGuard)mock)DeviceProperties)ELEMENTWISE_TYPE_PROMOTION_KIND)tree_map_onlycudaxpuc                  C  s>   dd t D } t| dksJ t| dkrd}|S |  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xr   r   S/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/torch/_inductor/utils.py
<listcomp>B   s    z get_gpu_type.<locals>.<listcomp>   r   r   )	GPU_TYPESlenpop)
avail_gpusgpu_typer   r   r&   get_gpu_type@   s   r.   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesr(   )config)ceildivwin32_Tz.cubinz.spv)r   r      @      zmust be power of 2c                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr(   )ALIGN_BYTES)nbytesr   r   r&   _alignm      rG   v
sympy.Exprc                 C  s<   t | tjtjfrttt| jS t | tpt	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrE   )rI   r   r   r&   rQ   r   s   rQ   c                   @  s&   e Zd ZdZdZdZeddd	Zd
S )rS   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr(   TvaluerJ   returnOptional[sympy.Expr]c                 C  s,   t |ttjfrtt|S t|r|S d S N)rK   intrL   IntegerrG   rQ   )clsrV   r   r   r&   eval   s
   z
align.evalN)rV   rJ   rW   rX   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr]   r   r   r   r&   rS   y   s    rS      d   fnCallable[[], Any]rW   floatc                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|  |   q)|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]}|   qYt jjt jjjgd}
t|	D ]	}|  |   qot j  W d	   n1 sw   Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Ar   )dtypedeviceT)enable_timing   r(   )
activitiesNz
raw eventsself_device_time_total)sort_by	row_limitc                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Sync)device_typer1   CUDAnamer$   eventr   r   r&   r'      s
    z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   r   )r$   irw   num_event_per_groupr   r&   r'      s
    zprofiling time breakdown)rr   c                 s  s    | ]}|j V  qd S rY   )device_time_totalrv   r   r   r&   	<genexpr>   s    z+do_bench_using_profiling.<locals>.<genexpr>g     @@zprofiling results: %s ms)r!   r   synchronizeemptyrZ   Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityrt   logdebugkey_averagestabler2   eventsr*   RuntimeError	enumerate_build_treesum)rg   warmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpry   filtered_eventsactual_eventsresr   rz   r&   do_bench_using_profiling   sh   	




r   boolc               
   C  s   zddl m}  tjdd | d uotttjdd dW S  ty&   Y dS  t	y@ } zdt
|v s5J W Y d }~dS d }~ww )	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   r!   _C%_dispatch_has_kernel_for_dispatch_keyhasattrr    opsImportErrorr   str)r   er   r   r&   has_torchvision_roi_align   s   
r   rk   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdvr.| jd u r.t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
r!   tensorrk   rK   r   typer   r/   Workercurrent_devicerk   device_interfacer   r   r&   decode_device   s   


r   itIterable[sympy.Expr]c                 C  s   t tj| tjjS rY   )	functoolsreduceoperatormulrL   SOner   r   r   r&   sympy_product      r   seq1Sequence[sympy.Expr]seq2c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S rY   r   )r$   abr   r   r&   r}      s    zsympy_dot.<locals>.<genexpr>)r*   rL   expandr   zip)r   r   r   r   r&   	sympy_dot   s   r   Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS r   )idr#   r   r   r&   
<dictcomp>       zunique.<locals>.<dictcomp>)valuesr   r   r   r&   unique      r   numerUnion[int, sympy.Expr]denomc              	   C  sr   t | tjst |tjrtt| t|S t | tr!t |ts4J |  dt|  d| dt| t| |S )Nz: , )rK   rL   Exprr5   sympifyrZ   r   runtime_ceildiv)r   r   r   r   r&   r?     s    
r?   c                 C  s   | d u rdS t | dd }i dddddd	d
ddddddd	ddddddddddddddddd d!d"d#d$d%d&}t| D ]}|||< qOt| t r]| S d'||  S )(Nz*i8.rp   r   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float16fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64uint8u8uint16u16u32u64)uint32uint64*)r   splitlistr   rK   )key	dtype_strtysrI   r   r   r&   _type_of  sX   	

r  lst"Iterable[Union[int, torch.SymInt]]List[sympy.Expr]c                 C  s   dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS r   )rL   r   r$   ry   r   r   r&   r'   :  r   z-convert_shape_to_inductor.<locals>.<listcomp>r   r  r   r   r&   convert_shape_to_inductor2  s   r   Iterable[Union[int, sympy.Expr]]List[Union[int, torch.SymInt]]c                   s   ddl m   fdd| D S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r(   Vc                   sB   g | ]}t |tr|nt |tjrt|n	 jjjj|d dqS )N)hint)rK   rZ   rL   r[   graphsizevars	shape_envcreate_symintnoder	  r  r   r&   r'   F  s    


z+convert_shape_to_symint.<locals>.<listcomp>)virtualizedr  r
  r   r  r&   convert_shape_to_symint=  s   

r  optorch._ops.OpOverloadc                 C  s(   t | tjjs	J tdd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S rY   )
alias_infor$   r   r   r   r&   r}   Y      zis_view.<locals>.<genexpr>)rK   r!   _ops
OpOverloadany_schema	argumentsr  r   r   r&   is_viewT  s   r"  is_pointwise_fn1Optional[Callable[[torch._ops.OpOverload], bool]]c                   s~   | j dksdS t| jtjjs| jtju sdS | jtju s"t| jr.t	 fdd| j
D S tjj| jjv p> duo> | jS )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s    | ]}t | V  qd S rY   )is_pointwise_use)r$   ur#  r   r&   r}   n  r  z#is_pointwise_use.<locals>.<genexpr>N)r  rK   targetr!   r  r  r   getitemr"  rO   usersTag	pointwisetags)user#  r   r(  r&   r&  \  s   
	r&  c                   s   t j  g  fdd} j| gtt j|||fR  }t| jjdkr4t	| jjd j
dkr4|f} | t ji  }|fS )Nc                   s    |   dt S )Narg)appendplaceholderr*   )r0  g
graph_argsr   r&   add_tensor_argy  s   
z)gen_gm_and_inputs.<locals>.add_tensor_argr(   r   Tensor)r!   fxGraphr%  r   r7  r*   r  returnsr   r   outputGraphModule)r)  rR   kwargsr6  nodegmr   r3  r&   gen_gm_and_inputsu  s   

r@  r   Nonec                 C  s,   | dkrd S t | }| r|  d S d S Nr   )r/   r"   r~   r   r   r   r&   r~     s   r~   modelCallable[..., Any]timesrZ   c                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r~   r!   manual_seedtimeperf_counterr   )rC  example_inputsrE  rk   t0r   resultt1r   r   r&   timed  s   

rM  r   
         ?c                   sD   t  fddt|D }t | }t|| d |S )Nc                   s   g | ]	}t  qS r   )rM  )r$   r   rR   rk   rg   rE  r   r&   r'         z%print_performance.<locals>.<listcomp>z.6f)r!   r   r   medianprint)rg   rR   rE  repeatbaselinerk   timingstookr   rP  r&   print_performance  s   "rX  objr   methodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S rY   r   r   rK  r   r&   <lambda>  s    z#precompute_method.<locals>.<lambda>N)r    setattr)rY  rZ  r   r[  r&   precompute_method  s   r^  methods	List[str]c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)r^  )rY  r_  rZ  r   r   r&   precompute_methods  s   ra  c                 C  s   t | |kt | |k  S rY   )rZ   )r   r   r   r   r&   cmp     rb  c                 C  s&   t | dkrt| | d g| S | S )Nr(   r   )r*   r   )r%   sizer   r   r&   pad_listlike  s   re  r%   Tuple[_T, ...]List[_T]c                 C  s$   t | dkrg S dd }t| |dS )Nr   c                 S  s   t | tr| S |  S rY   )rK   r   get_name)elemr   r   r&   	sort_func  s   
ztuple_sorted.<locals>.sort_funcr  )r*   sorted)r%   rj  r   r   r&   tuple_sorted  s   rm  PRVT)	covariantc                   @  s$   e Zd ZedddZdd
dZdS )CachedMethodrW   rA  c                 C     d S rY   r   selfr   r   r&   clear_cache     zCachedMethod.clear_cacherR   P.argsr=  P.kwargsro  c                 O  rr  rY   r   rt  rR   r=  r   r   r&   __call__     zCachedMethod.__call__NrW   rA  )rR   rw  r=  rx  rW   ro  )r^   r_   r`   staticmethodru  rz  r   r   r   r&   rq    s    rq  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   sj   | j }d| d d| i}td| d  d  d | t| || d } fd	d
}||_|S )N___cacherg   z        def zC_cache_on_self(self):
            try:
                return self.zl
            except AttributeError:
                rv = fn(self)
                object.__setattr__(self, "z)", rv)
                return rv
        _cache_on_selfc                   s   t |  rt|   d S d S rY   )r   delattrrs  rk  r   r&   ru    s   
z"cache_on_self.<locals>.clear_cache)r^   execlstripr   wrapsru  )rg   ru   ctxwrapperru  r   rk  r&   cache_on_self  s$   r  c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr(   irc                 S  s$   g | ]}t |d r|jr|jjqS )r>  )r   r>  origins)r$   r>  r   r   r&   r'     s    z%aggregate_origins.<locals>.<listcomp>) r  rK   r  r   r   r   or_setExternKernelr  )node_scheduler  r   r   r&   aggregate_origins  s   
	r  c                 C  s   t | }|dkrdd |D }tt|}nH|dkrPg }|D ]*}|jdkrHd|jv rH|jd d }t|d tr@||d  q||d j qtt|}n|d	kr\d
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]}|j d krd|jv r|jd dur|jd jjqS )r%  r  N)r  r   _overloadpacketr^   r$   originr   r   r&   r'     s    

z)get_fused_kernel_name.<locals>.<listcomp>r!   r%  source_fn_stackrp   r(   inductor_nodec                 S  s   g | ]
}|j d kr|jqS r%  )r  ru   r  r   r   r&   r'   '  s    r   fused)r  rl  r  r  r   rK   r   r1  r^   NotImplementedErrorjoin)r  descriptive_namesall_originssourcesr  	source_fnr   r   r&   get_fused_kernel_name  s.   r  c                   s  t | }dd |D }tt}tt}d  t|rOdd |D }t|dkrO|d j t dsEi }t jD ]\}}	|||	< q9| _	|j
 fdd	d
 |D ]3}
d|
jv ro|
jd d urot|
jd j}|| |
j d|
jv r|
jd d j}|| |
j qQ d urdnd}|j d| dd|  dd|  d}|j dg}t| D ]\}}||j d| ddt|  q d ur||j d |D ]}	||j d|	   q|d|fS )Nc                 S  s   g | ]	}|j d kr|qS r  r!  r  r   r   r&   r'   2  rQ  z'get_kernel_metadata.<locals>.<listcomp>c                 S  s   h | ]}|j qS r   )r  )r$   nr   r   r&   	<setcomp><  s    z&get_kernel_metadata.<locals>.<setcomp>r(   r   )_inductor_kernel_metadata_node_to_idx_mapc                   s
    j |  S rY   )r  r  single_graphr   r&   r\  F  s   
 z%get_kernel_metadata.<locals>.<lambda>rk  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r   z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r  collectionsdefaultdictr  r*   r  r   r   nodesr  sortr   r   r  r1  ru   commentr  keysrl  itemsformat_node)r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapidxr  r>  r  sort_strmetadatadetailed_metadataoriginal_noder  r   r  r&   get_kernel_metadata0  sP   






r  initial_queueIterable[torch.fx.Node]Set[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)r  r  r+   r+  addr1  )r  skip_filterdominated_setr>  userr   r   r&   dominated_nodesh  s   


	r  c                   sb   dd l }ddlm   fddfdd| D }fdd| D }t|jg ||R  S )	Nr   r(   r  c                   sD   t |  jr| jS t |  jr| jS t |  jo!t |  jS rY   )rK   	TensorBoxdata
StorageBoxIRNode	Pointwiser  r  is_unrealized_noder   r&   r    s
   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS r   r  )r$   valr  r   r&   r'     rQ  z"gather_origins.<locals>.<listcomp>c                   r  r   r  )r$   r0  r  r   r&   r'     rQ  )	itertoolsr  r  r   r  chain)rR   r=  r  kwarg_originsarg_originsr   r  r&   gather_origins{  s   r  exprc                 C  s   t | tjr	| jS t | tjrdtt| jS t | tj	r'dtt| jS t | t
tttfrA| jj ddtt| j dS t| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))rK   rL   Symbolru   rM   r  rP   	sympy_strrR   Mulr9   r6   r7   r8   funcr^   r   r  r   r   r&   r    s   "r  c                 C  s>   ddl m} tjrt|jdd  }r|jdkrt| S t	 S )Nr(   r  current_node
index_expr)
r  r  r>   compute_all_boundsr    interpreterr)  r<   r=   unknown)r   r  fx_noder   r   r&   get_bounds_index_expr  s   
r  prefixr;   r  sympy.Symbolc                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)r;   SIZEr:   )r  r  r   r   r&   sympy_index_symbol_with_prefix  s   r  c                 C  s   | st jot jS rY   )r>   debug_index_assertsassert_indirect_indexing)checkr   r   r&   generate_assert     r  ru   c                 C  s    | d dksJ t j| dddS )r  r   sTr  )rL   r  ru   r   r   r&   sympy_index_symbol  s   r  replacementsDict[sympy.Expr, Any]c                   s*   dd  t |  fdd| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                 S  s2   t | tjsJ t |trtj|| j| jdS |S )Nr  )rK   rL   r   r   r  rc   is_nonnegative)replacedreplacementr   r   r&   	to_symbol  s   
zsympy_subs.<locals>.to_symbolc                   s   i | ]
\}}| ||qS r   r   r$   krI   r  r   r&   r         zsympy_subs.<locals>.<dictcomp>)rL   r   xreplacer  )r  r  r   r  r&   
sympy_subs  s   
r  r   ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                 C  s:   t | tjpt | tjotdd t|  |  D S )Nc                 s      | ]}t |V  qd S rY   is_symbolicr#   r   r   r&   r}         zis_symbolic.<locals>.<genexpr>)	rK   r!   SymIntr7  r  r  r  rd  stride)r   r   r   r&   r    s    r  rR   c                  G     t dd | D S )Nc                 s  r  rY   r  r  r   r   r&   r}     r  z"any_is_symbolic.<locals>.<genexpr>r  )rR   r   r   r&   any_is_symbolic  r   r  r?  torch.fx.GraphModuleOptional[torch.fx.Node]c                 C  sv   ddl m} h d}t r|h d | jjD ]}t|j|v r&|  S |j	
d }d ur8||r8|  S qd S )Nr   )free_unbacked_symbols>   aten._assert_scalaraten._local_scalar_densefbgemm.dense_to_jagged.default%fbgemm.jagged_to_padded_dense.default,aten._fused_moving_avg_obs_fq_helper.default7aten._fused_moving_avg_obs_fq_helper_functional.defaultrun_with_rng_staterun_and_save_rng_state>   aten.scatter.srcaten.scatter_add_aten.scatter.reduceaten.index_put.defaultaten.index_put_.defaultaten.scatter_reduce.twoaten.scatter_add.defaultaten.scatter_reduce_.twoaten.scatter.value_reduceaten.scatter_reduce.two_outaten._unsafe_index_put.default0aten._unsafe_masked_index_put_accumulate.defaultr  )%torch.fx.experimental.symbolic_shapesr  r!   $are_deterministic_algorithms_enabledupdater  r  r   r)  r   get)r?  r  forbidden_setr>  r  r   r   r&   %get_first_incompatible_cudagraph_node  s   r)  c                 C  s&   t tt| jj}|jdksJ |S )z$Get the output node from an FX graphr;  )nextiterreversedr  r  r  )r?  	last_noder   r   r&   output_node  s   r.  z	List[Any]_registered_cachesc                 C  s0   t | dr
t| jst|  dt|  | S )zq
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_inductor_cache().
    cache_clearz# does not have a cache_clear method)r   callabler0  AttributeErrorr/  r1  rY  r   r   r&   clear_on_fresh_inductor_cache   s   
r4  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r/  r0  r3  r   r   r&   clear_inductor_caches,  s   
r5  c              	   #  sD   t   tj|d}zzstjtjd|iX t	d| tj
|d tjtjd i1 dV  t| trXt| dksAJ dtj
 rXt }|  fd	d
|D  W d   n1 sbw   Y  W d   n1 sqw   Y  |r}t| W n ty   ts	 td|  Y nw W t   dS W t   dS t   w )z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNr   z!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)ospathgetsizer  )r$   ftriton_cache_dirr   r&   r   L  s
    z(fresh_inductor_cache.<locals>.<dictcomp>z(on error, temporary cache dir kept at %s)r5  tempfilemkdtempr   patchdictr:  environr   r   r;  r  rK   r*   existslistdirr&  shutilrmtree	Exception_IS_WINDOWSwarning)cache_entriesr6  deleteinductor_cache_dirfilesr   r>  r&   fresh_inductor_cache4  sL   





	rP  	List[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r  reverse)__getitem__r   r*   r  r,  rl  )seqgettera_rr   r   r&   argsort`  s   rW  rT  .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                   sB    fdd}dd t |D }t|t|d}dd |D }|S )Nc                   sX   | \}}|\}} fdd}|||k rdS |||krdS ||k r$dS ||kr*dS dS )Nc                   s   t | tr| S  j| ddS )NT)size_oblivious)rK   r   evaluate_exprr  r  r   r&   evaluaten  s   
z*argsort_sym.<locals>.cmp.<locals>.evaluaterp   r(   r   r   )r   r   a_idxa_valb_idxb_valr\  r[  r   r&   rb  j  s   zargsort_sym.<locals>.cmpc                 S  s,   g | ]\}}|t |tjr|jjn|fqS r   )rK   r!   r  r>  r  )r$   r  r  r   r   r&   r'     s    zargsort_sym.<locals>.<listcomp>rk  c                 S  s   g | ]\}}|qS r   r   )r$   r  r   r   r   r&   r'         )r   rl  r   
cmp_to_key)r  rT  rb  exprsrK  r   r[  r&   argsort_symg  s   rd  c                 C  s   t jd| d S )Nr   rj   )r!   r   element_sizere  r   r   r&   get_dtype_size  rH   rg  c                   @  s   e Zd ZU ded< dS )LineContextr   contextNr^   r_   r`   __annotations__r   r   r   r&   rh    s   
 rh  c                   @  s   e Zd ZdZd-ddZd.ddZd/d
dZd/ddZdd Zdd Z	dd Z
dd Zdd Zdd Zd0ddZd0ddZd0dd Zd1d"d#Zd2d&d'Zd(d) Zd*d+ Zd,S )3IndentedBuffer   r   c                 C  s   g | _ || _d S rY   )_lines_indent)rt  initial_indentr   r   r&   __init__  s   
zIndentedBuffer.__init__rW   )tuple[str, list[tuple[int, LineContext]]]c                 C  s   t  }d}g }| jD ]8}t|tr| }|d u rq
nt|tr(|||jf q
t|ts/J || |d |d|	d 7 }q
|
 |fS )Nr(   r  )r   rn  rK   DeferredLineBaserh  r1  ri  r   writecountgetvalue)rt  bufr   linemapliner   r   r&   getvaluewithlinemap  s"   




z"IndentedBuffer.getvaluewithlinemapr   c                 C  s   |   \}}|S rY   )rz  )rt  rI   r   r   r   r&   rv    s   zIndentedBuffer.getvaluec                 C  s   t  }| jD ]6}t|tr| }|d u rqnt|trqt|ts#J |dr2||d d  q|| |d q| S )N\rp   r  )	r   rn  rK   rs  rh  r   endswithrt  rv  )rt  rw  ry  r   r   r&   getrawvalue  s   




zIndentedBuffer.getrawvaluec                 C  s   | j   d S rY   )rn  clearrs  r   r   r&   r~       zIndentedBuffer.clearc                 C  
   t | jS rY   )r   rn  rs  r   r   r&   __bool__     
zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr  )ro  tabwidthrs  r   r   r&   r    r  zIndentedBuffer.prefixc                 C  s   |  d d S )Nr  	writeliners  r   r   r&   newline  r  zIndentedBuffer.newlinec                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nr  )rK   rh  rn  r1  rs  with_prefixr  striprt  ry  r   r   r&   r    s   

zIndentedBuffer.writelinec                 C  s   |D ]}|  | qd S rY   r  )rt  linesry  r   r   r&   
writelines  s   zIndentedBuffer.writelinesr(   c                   s   t j fdd}| S )Nc                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w rY   ro  r   offsetrt  r   r&   r    s
   "z"IndentedBuffer.indent.<locals>.ctx)
contextlibcontextmanager)rt  r  r  r   r  r&   indent  s   zIndentedBuffer.indentc                 C  s   |  j |7  _ d S rY   r  rt  r  r   r   r&   	do_indent  r   zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S rY   r  r  r   r   r&   do_unindent  r   zIndentedBuffer.do_unindentFc                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   r  )rK   rl  ri   rn  rh  minr*   r  mathisinfr1  r  rZ   textwrapdedentrstripr   )rt  
other_coder  r  ry  r   r   r&   splice  s,   





zIndentedBuffer.splicer  Callable[[Any], Any]c                   s&   t | jd} fdd| jD |_|S )Nrp  c                   s   g | ]} |qS r   r   )r$   ry  r  r   r&   r'     ra  z&IndentedBuffer.map.<locals>.<listcomp>)rl  ro  rn  )rt  r  r   r   r  r&   rP     s   zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nr  r  )r   rv  rs  r   r   r&   __repr__
  rc  zIndentedBuffer.__repr__c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )Nr  )ro  rl  r  rn  )rt  otherr   r   r   r&   __add__  s
   zIndentedBuffer.__add__Nrx   )rW   rr  rW   r   rU   )F)r  r  rW   rl  )r^   r_   r`   r  rq  rz  rv  r}  r~  r  r  r  r  r  r  r  r  r  rP   r  r  r   r   r   r&   rl    s&    









rl  c                      s&   e Zd Zd fddZdd Z  ZS )FakeIndentedBufferrW   rA  c                   s   t    d S rY   )superrq  rs  	__class__r   r&   rq    r  zFakeIndentedBuffer.__init__c                 C  s$   |dkr
t | |S td| d)Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )rt  ru   r   r   r&   r    s
   
z#FakeIndentedBuffer.__getattribute__r|  )r^   r_   r`   rq  r  __classcell__r   r   r  r&   r    s    r  c                 c  s*    zd V  W | t _|t _d S | t _|t _w rY   )sysstdoutstderr)initial_stdoutinitial_stderrr   r   r&   restore_stdout_stderr$  s   
r  c                   @  sT   e Zd ZdZdd ZdddZdd
dZdd Zdd Zdd Z	dd Z
dd ZdS )rs  z.A line that can be 'unwritten' at a later timec                 C  s   |  sd}|| _d S r  )r  ry  r  r   r   r&   rq  0  s   
zDeferredLineBase.__init__rW   Optional[str]c                 C     t )zJReturns either self.line or None to indicate the line has been 'unwritten'r  rs  r   r   r&   rz  5  rv  zDeferredLineBase.__call__ry  r   c                 C  r  )z3Returns a new deferred line with the same conditionr  r  r   r   r&   	_new_line9  rv  zDeferredLineBase._new_linec                 C  s   |  | | j S rY   r  ry  )rt  r  r   r   r&   r  =  r   zDeferredLineBase.with_prefixc                 C  s   |  | j S rY   )r  ry  r  rs  r   r   r&   r  @  r  zDeferredLineBase.lstripc                 C  s   |  | j| S rY   r  )rt  r   r   r   r&   rS  C  r  zDeferredLineBase.__getitem__c                 C  r  rY   )r   ry  rs  r   r   r&   r  F  r  zDeferredLineBase.__bool__c                 C  r  rY   )r*   ry  rs  r   r   r&   __len__I  r  zDeferredLineBase.__len__N)rW   r  )ry  r   rW   rs  )r^   r_   r`   ra   rq  rz  r  r  r  rS  r  r  r   r   r   r&   rs  -  s    

rs  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r  r   value_fnCallable[[], str]ry  c                   s   t  | || _|| _d S rY   )r  rq  r  r  )rt  r  r  ry  r  r   r&   rq  P  s   
zDelayReplaceLine.__init__rW   c                 C  s   | j | j|  S rY   )ry  replacer  r  rs  r   r   r&   rz  U  r   zDelayReplaceLine.__call__c                 C  s   t | j| j|S rY   )r  r  r  r  r   r   r&   r  X  r  zDelayReplaceLine._new_line)r  r   r  r  ry  r   r  )ry  r   rW   r  )r^   r_   r`   ra   rq  rz  r  r  r   r   r  r&   r  M  s
    
r  index_or_deviceUnion[int, torch.device]c                 C  s   t | tjr	| }ntd| }t|}tjjr2|jd usJ |jdk s)|jdkr0t	d dS dS d}|j
}||k rGtj	d||d	d
 dS dS )Nr   	   rN  z6GPU arch does not support max_autotune_gemm mode usageFTD   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rK   r!   rk   r   createversionhipmajorr   rK  multi_processor_count)r  rk   propr  r  r   r   r&   
is_big_gpu\  s&   

r  c                   C  s   t jpt jpt jS rY   )r>   max_autotunemax_autotune_gemmsearch_autotune_cacher   r   r   r&   use_max_autotuney  s   r  allowed_layout_dtypesList[torch.dtype]c                 C  s    | j jdko| j|v ot| j S )Nr   )rk   r   rj   r  )layoutr  r   r   r&   _use_template_for_cuda  s
   r  backendc                 C  "   |   dd tj  dD v S )Nc                 S     g | ]}|  qS r   r  r#   r   r   r&   r'         z)_use_autotune_backend.<locals>.<listcomp>,)upperr>   max_autotune_gemm_backendsr   r  r   r   r&   _use_autotune_backend     r  c                 C  r  )Nc                 S  r  r   r  r#   r   r   r&   r'     r  z._use_conv_autotune_backend.<locals>.<listcomp>r  )r  r>   max_autotune_conv_backendsr   r  r   r   r&   _use_conv_autotune_backend  r  r  F)enable_int32enable_float8c                C  s   ddl m}m} tjtjtjg}|rtjtjtjtjg}|r'|tj	tj
g | jjdko1t| |p<| jjdko<| j|v oJt oJtdoJ|| j|jS )Nr(   )BackendFeaturehas_backend_featurer   r   TRITON)codegen.commonr  r  r!   r   r   r   r   extendr   r   rk   r   r  rj   r  r  TRITON_TEMPLATES)r  r  r  r  r  layout_dtypesr   r   r&   use_triton_template  s"   	r  c           	      C  s   ddl m} |jjj|| | dd}|dks|tjjk rdS ddlm	} t
jjr+dS t
jt
jt
jt
jg}t| |o@t o@td}|rM| sMtd	 dS |S )
Nr(   r  rp   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r  r  r  r  	size_hintr>   r   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  r!   r  r  r   r   r   r   r  r  r  r   rK  )	r  mr  r  r  	gemm_sizer  r  r   r   r   r&   use_cutlass_template  s(   
r  c                 C  s   t j| jS rY   )r!   r   get_device_propertiesgcnArchNamerk   r   r   r&   _rocm_native_device_arch_name  s   r  c                  C  sx   zdd l } ddlm}m} ddlm} tj| j	}W n t
y5   dd }dd }G dd	 d	}d }Y nw ||||fS )
Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                   S     g S rY   r   r   r   r   r&   r    r{  z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  r  rY   r   r   r   r   r&   r    r{  z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)r^   r_   r`   r   r   r   r&   r    s    r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr  r:  r;  dirname__file__r   )r  r  r  r  package_dirnamer   r   r&   try_import_ck_lib  s   r  c                   s   t  sdS tjjsdS | jjdksdS t| j}dd tjj	D p)|
dd |i  fdd  tjj@ D }|s=dS | jtjtjtjfvrJdS t \}}}}|sZtd	 dS t rb|tj_tjjsmtd
 dS |tjjkrztd dS dS )NFr   c                 S  s   i | ]
}| d d |qS ):r   )r   r$   r  r   r   r&   r     r   z#use_ck_template.<locals>.<dictcomp>r  r   c                   s   g | ]} | qS r   r   r  requested_archsr   r&   r'     s    z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r  r!   r  r  rk   r   r  r>   rocmarchr   r  ck_supported_archrj   r   r   r   r  r   rK  	is_fbcodeck_dir)r  native_archrequested_supported_archsck_package_dirnamer   r   r  r&   use_ck_template  s<   




r  c                 C  s:   ddl m} tdot| o|jjj|| | dddkS )Nr(   r  CKrp   r  r   )r  r  r  r  r  r  r  )r  r  r  r  r  r   r   r&   use_ck_gemm_template  s   r  c                 C  s   t dot| S )Nr  )r  r  r  r   r   r&   use_ck_conv_template   r  r  c                 C  s   t  o| jjdkS rB  )r  rk   r   r  r   r   r&   _use_template_for_cpu$  r   r  c                 C  s   t | ||ddo|j S )NF)require_constant_mat2)use_cpp_gemm_templater  is_contiguous)r  mat1mat2r   r   r&   use_cpp_bmm_template(  s   r  c              
   C  s*  ddl m} ddlm} ddlm} ddlm} t| r t	ds"dS t
jjs(dS | tjk}	tjtjtjtjg}
||||	rA| jnd |d\}}}} }}t||frTdS t||jr^| }|| \}}|d	|||| | |t d
}dd }| j|
v o|d uo||ot||jo| p| S )Nr(   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposed
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsc                 S  s   |    |  d dkS )Nrp   r(   )freeze_layout
get_strider%   r   r   r&   is_last_dim_stride1X  s   z2use_cpp_gemm_template.<locals>.is_last_dim_stride1)r  r  codegen.cpp_micro_gemmr  codegen.cpp_utilsr  kernel.mm_commonr  r  r  r>   cppweight_prepack	get_dtyper!   r   r   r   halfrj   has_free_symbolsrK   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  r  r  r   r  r  r  r  r  	int8_gemmr  r  r  r  r$  r   r!  r)  r   r   r&   r  /  sR   

r  c                   C  s   t   ptdS )NATEN)r  r  r   r   r   r&   use_aten_gemm_kernelse  r  r8  c                   @  s:   e Zd ZU edZded< dddZdd	 Zd
d Z	dS )DebugDirManagerr   r   prev_debug_namerW   rA  c                 C  s   t tj| _d S rY   )r*  r9  counterr   rs  r   r   r&   rq  m  r  zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )N_tmp_)r!   _dynamor>   debug_dir_rootr:  r   new_namers  r   r   r&   	__enter__p  s   zDebugDirManager.__enter__c                 G  s   t | j | jtjj_d S rY   )rG  rH  r?  r:  r!   r=  r>   r>  )rt  rR   r   r   r&   __exit__u  s   zDebugDirManager.__exit__Nr|  )
r^   r_   r`   r  ru  r;  rk  rq  r@  rA  r   r   r   r&   r9  i  s   
 

r9  Tuple[Any, List[str]]c                   st   ddl m} g  d fdd}tj|d| tj  | |i |}W d    | fS 1 s1w   Y  | fS )	Nr(   GraphLoweringcoder   c                        |  d S rY   r1  rE  source_codesr   r&   save_output_code  r  z*run_and_get_code.<locals>.save_output_coderK  rE  r   r  rD  r   rB  r  r!   r=  reset)rg   rR   r=  rD  rK  rK  r   rI  r&   run_and_get_codez  s   

rO  c                   s    fdd}t |S )Nc                    s     } |     | S rY   )r   backwardr[  rg   r   r&   run_with_backward  s   z1run_fw_bw_and_get_code.<locals>.run_with_backward)rO  )rg   rR  r   rQ  r&   run_fw_bw_and_get_code  s   rS  c              	     s   ddl m} g dfdd d fd	d
}tj|d|5 tj|d  tj  | |i |}W d   n1 s>w   Y  W d   S W d   S 1 sVw   Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r(   rC  rE  r   c                   rF  rY   rG  rH  rI  r   r&   rK    r  z"get_code.<locals>.save_output_codert  rD  c                   s6   G dd d}| j r|  n|  \}} | | S )Nc                   @  s"   e Zd ZdZd	ddZdd ZdS )
z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerW   rA  c                 S  rr  rY   r   rs  r   r   r&   rq    r{  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__c                 _  rr  rY   r   ry  r   r   r&   call  rv  zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callNr|  )r^   r_   r`   ra   rq  rT  r   r   r   r&   DummyModule  s    
rU  )cpp_wrappercodegen_with_cpp_wrappercodegen)rt  rU  rE  r   )rK  r   r&   patched_compile_to_module  s
   z+get_code.<locals>.patched_compile_to_modulecompile_to_modulerK  NrL  )rt  rD  rM  )rg   rR   r=  rD  rY  r   r   )rK  rJ  r&   get_code  s"   
(r[  c                 O  sJ   t | g|R i |}dt|  krdks!n J dt| |d S Nr(      z%expected one or two code outputs got r   )r[  r*   )rg   rR   r=  rJ  r   r   r&   get_triton_code  s
   r^  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S r\  )rO  r*   )rg   rR   r=  r   rJ  r   r   r&   run_and_get_triton_code  s
   r_  c                   s~   ddl m  ddlm} |jg  fdd}tj|d| | |i |}W d    |fS 1 s6w   Y  |fS )Nr   rC  )CompiledFxGraphc                    s2   | i | | d }t | sJ | d S )Nr]  )rK   r1  )rR   r=  r  rD  graph_lowerings	real_initr   r&   	fake_init  s   z-run_and_get_graph_lowering.<locals>.fake_initrq  )torch._inductor.graphrD  torch._inductor.output_coder`  rq  r   rB  r  )rg   rR   r=  r`  rd  rK  r   ra  r&   run_and_get_graph_lowering  s   
rg  c              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorrh  	loweringsr   partial)aten_opoverride_fnrh  orig_fnr   r   r&   override_lowering  s   
ro  c                   s4   ddl m} |j  fdd}tjj|d|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                   s&   | |  | |}r| | |S rY   r   )	schedulerr  outrn  post_fnpre_fnr   r&   r    s
   


z(add_scheduler_init_hook.<locals>.wrapperrq  )torch._inductor.schedulerrp  rq  unittestr   rB  r  )ru  rt  rp  r  r   rs  r&   add_scheduler_init_hook  s   rx  c                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r>   developer_warningsr   rK  info)msgr   r   r&   developer_warning   s   r|  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr(   r   -z--only=N)r  argvr   r*   
ValueError
startswith)r  r0  r   r   r&   get_benchmark_name  s   

r  c                 C  r
  )Nc                 s      | ]}|d kV  qdS r(   Nr   r#   r   r   r&   r}   ,  r  zis_ones.<locals>.<genexpr>rO   r  r   r   r&   is_ones+  r   r  c                 C  r
  )Nc                 s  r  )r   Nr   r#   r   r   r&   r}   0  r  zis_zeros.<locals>.<genexpr>r  r  r   r   r&   is_zeros/  r   r  c                 C  r
  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )r   N)rK   r!   r7  rk   )r$   itemr   r   r&   r}   4  s    

z is_cpu_device.<locals>.<genexpr>r  )inputsr   r   r&   is_cpu_device3  s   r  r  torch.dtypec                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rK   rL   r   rc   r!   r   r   )r  r   r   r&   get_sympy_Expr_dtype;  s   r  c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S rY   )r!   r   r   )should_profilerR   r=  r   r   r   r&   maybe_profileE  s   "
r  c                  C  s   t jj} | dk rt } | S )Nr(   )r>   r-  threadsr!   get_num_threads)r  r   r   r&   r4  N  s   r4  c                  C  s,   ddl m}  |  }|dtjjrdS dS )Nr(   )get_backend_options
num_stagesr]     )runtime.triton_helpersr  r'  r!   r  r  )r  optionsr   r   r&   get_backend_num_stagesU  s   r  c                 C  s   ddl m}m} | tjtjtjfv sJ t|j	
drEddlm} | }| tjtjfv r3|| |S tjjjjr?|tj|S |tj|S | tjtjfv rQ|| S tjjjjr\|tjS |tjS )Nr   )get_max_simd_tflopsget_max_tensorcore_tflops
clock_rate)max_clock_rate)triton.testingr  r  r!   r   r   r   inspect	signature
parametersr'  torch._utils_internalr  backendsr   matmul
allow_tf32)rj   r  r  r  sm_clockr   r   r&   get_device_tflops]  s   


r  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)r  r  r  r   r   r&   get_gpu_dram_gbpsy  s   r  c                  C  s"   ddl m}  | jjdddS )Nr   drivermax_shared_mem)triton.runtimer  activeutilsr  r'  r  r   r   r&   get_gpu_shared_memory  s   r  reduction_typec                 C  s
   |  dS )Nwelford)r  r  r   r   r&   is_welford_reduction  r  r  c                 C  s   t | rdS dS )Nr  r(   )r  r  r   r   r&   reduction_num_outputs  r  r  c                   C  s   t  dkS )NLinux)platformsystemr   r   r   r&   is_linux  s   r  c                   C  s
   t jdkS )Nr@   )r  r  r   r   r   r&   
is_windows  r  r  itrIterable[Any]c                 C  r
  )Nc                 s  s$    | ]}t |tjo|j V  qd S rY   )rK   rL   r   	is_numberr#   r   r   r&   r}     s   " z#has_free_symbols.<locals>.<genexpr>r  )r  r   r   r&   r1    r   r1  c                  G  s~   ddl m} | D ]4}t||j|j|j|j|jfr-t|	 pds)t|
 p'dr, dS qt||js4qtdt| dS )Nr(   r  r   Tzunexpected type for is_dynamic F)r  r  rK   r  r  r2  ComputedBufferBufferr1  maybe_get_sizemaybe_get_strider  	TypeErrorr   )rR   r  tr   r   r&   
is_dynamic  s   
r  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r^   r_   r`   r  r  r   r   r   r&   r    s    r  c              	   C  s4  ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }t|| | |j
 W d    n1 sLw   Y  t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W d    d S 1 sw   Y  d S )Nr(   )stable_topological_sortwzutf-8F)modeencodingrM  )r?  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  r@  NamedTemporaryFileior   r4   r0   	propagaterS  r  r   nowr3   lint	recompilerv  r   rz  ru   )r  r?  inpr{  r  r=  	before_ioafter_io
start_timetime_elapsedr  r   r   r&   pass_execution_and_save  s>   

"r  c                 C  s   ddl m} t| |jko|d u p| j|u pLt| |jkoLttjj	do,| jtjj	j
jkpLttjj	do<| jtjj	jjkpLttjj	doL| jtjj	jjkS )Nr(   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   _CollectiveKernelop_overloadFallbackKernelr   r!   r   torchrecr  defaultr  r  r>  r  r  r   r   r&   is_collective  s   

r  c                 C  s   ddl m} t| |jkS Nr(   r  )r  r  r   _WaitKernel)r>  r  r   r   r&   is_wait  s   r  c                 C  F   ddl m}m} t| |sJ t| |rtdd | jD S t| jS )Nr   BaseSchedulerNodeGroupedSchedulerNodec                 s  r  rY   )contains_collectiver#   r   r   r&   r}     r  z&contains_collective.<locals>.<genexpr>)rv  r  r  rK   r  snodesr  r>  snoder  r  r   r   r&   r    
   

r  c                 C  r  )Nr   r  c                 s  r  rY   )contains_waitr#   r   r   r&   r}     r  z contains_wait.<locals>.<genexpr>)rv  r  r  rK   r  r  r  r>  r  r   r   r&   r    r  r  c                 C  s6   ddl m} t|tjjr|h}t| |jo| j|v S r  )r  r  rK   r!   r  r  r  r  r  r   r   r&   is_fallback_op  s   r  c                 C  s   |||  j   S rY   )defining_oprh  )buf_namename_to_bufname_to_fused_noder   r   r&   buf_name_to_fused_snode  r   r  c                 C  sT   |r|| rd S | |  | jD ]}t|j||}||v rqt|||||d qd S )Ncriteria_cb)r  unmet_dependenciesr  ru   find_recursive_deps_of_node)r  collected_node_setr  r  r  depdefining_op_for_depr   r   r&   r    s"   

r  c              	   C  s   |r|| rd S | |  |  D ]4}|jD ].}|jd usJ |j dkr'q|j |vr/q||j  }||v r;qt|||||d qqd S )NOUTPUTr  )r  get_outputsr+  r>  rh  find_recursive_users_of_node)r  r  r  r  r  or  user_opr   r   r&   r  3  s,   

r  dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s8   t jjjrdnd}t jjjrt jj sdS ||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r]  r   )r!   
_functorchr>   functionalize_rng_opsr=  inline_inbuilt_nn_modulesr  is_parameter_freezing)r  r  num_rng_seed_offset_inputsr   r   r&   num_fw_fixed_argumentsL  s   
r	  fx_gc                 C  sb   dd }d}g }| j jD ]}|jdkr ||r|| |d7 }q|ttt|ks-J t|S )z>
    Infers which inputs are static for a backwards graph
    c                 S  s   d| j vod| j vod| j vS )Ntangentsbwd_seedbwd_base_offsetr  r(  r   r   r&   is_saved_tensorb  s
   
z'count_tangents.<locals>.is_saved_tensorr   r2  r(   )r  r  r  r1  r  r   r*   )r
  r  	arg_countstatic_arg_idxsr  r   r   r&   count_tangents]  s   

r  c                   @  s*   e Zd ZU ded< dd Zedd ZdS )	BoxedBoolr   rV   c                 C  s   | j S rY   )rV   rs  r   r   r&   r  y  s   zBoxedBool.__bool__c                 C  s   t | tr
d| _| S dS NF)rK   r  rV   r3  r   r   r&   disable|  s   
zBoxedBool.disableN)r^   r_   r`   rk  r  r}  r  r   r   r   r&   r  u  s
   
 r  c                 #  s`    ddl m} |j fdd}tjj|d| d V  W d    d S 1 s)w   Y  d S )Nr(   )PythonWrapperCodegenc                   s&     | | |||g|R i |S rY   rG  )r  ru   kernel_coder  rR   r=  kernel_listorig_define_kernelr   r&   new_define_kernel  s   
z2collect_defined_kernels.<locals>.new_define_kerneldefine_kernel)codegen.wrapperr  r  rw  r   rB  r  )r  r  r  r   r  r&   collect_defined_kernels  s   "r  c                 C  s   | d S )N__original__r   r  r   r   r&    get_cloned_parameter_buffer_name  s   r  r  c                 C  s"   t | ts| d u sJ | | tv S rY   )rK   r   r)   r  r   r   r&   is_gpu  s   r   c                 C  s   t | tsJ t| S rY   )rK   r   r   r  r   r   r&   device_need_guard  s   r!  c                 C  s*   t  r| tjkrdS | tjtjtjhv S r  )r>   r
  r!   r   r   r   re  r   r   r&   ,needs_fallback_due_to_atomic_add_limitations  s   r"  r  c                 C  s   | j tjjjtjjjfv r|d u rdS | j tjjjkrdnd}|d |hvp]|o.t|o.t|p]| j tjjjkoM|dkoM|oM|dkoMt	j
joMt	j
jpMt dkp]||koY|tjtjhv p]t S )NFr  r   r   r(   )overloadpacketr!   r   atenscatter_reduce_scatter_reducescatter_r   r"  r>   r-  fallback_scatter_reduce_sumdynamic_threadsr4  r   r   r%  )r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tyr   r   r&   use_scatter_fallback  s8   	r/  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]m\}}td|dd ||u r2td	 q||u r;td
 qt||r|	 }t|rIdnd d |rb|j
dusXJ td|j
jj  td |jjD ]}t| qjtd |jjD ]}t| qyqtdt| dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr0  r1  rv  r2  rS  r*   r   rK   is_reductionr>  r  reduction_hintread_writesreadswritesr   r   )r  r0  r1  r2  r  r>  is_redr  r   r   r&   dump_node_schedule  s0   




r=  r   torch.Tensorc                 C  s*   ddl m} ||  t| j t dkS )Nr   )statically_known_true)r$  r?  storage_offsetrg  rj   GPU_ALIGN_BYTES)r   r?  r   r   r&   tensor_is_aligned  s   rB  example_inputc                 C  s   t | jjsdS tjpt| S r  )r   rk   r   r>   assume_aligned_inputsrB  )rC  r   r   r&   should_assume_input_aligned  s   rE  c                  C  s4   t jj } | st S | jj}|st S | S rY   )	r!   _guardsTracingContexttry_getr  nullcontextr  r  suppress_guards)tracing_contextr  r   r   r&   #maybe_get_suppress_shape_guards_ctx
  s   rL  c                 O  s   t jjtddJ tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    |	|
fS 1 sVw   Y  |	|
fS )Nr   Tr   )output_code_log)rw  r   rB  r  r>   r!   r=  rN  r  loggingr   StreamHandlertorch._inductor.codecacherM  
addHandlerlevelsetLevelDEBUGrv  removeHandler)rg   rR   r=  r  rN  log_capture_stringchrM  
prev_levelrK  r  r   r   r&   run_and_get_cpp_code  s$   




rY  r  Sequence[InputType]c                 C  s@   d }t | }|d ur|jS | D ]}t|tjr|jj  S qd S rY   )r0   r  rK   r!   r  r>  )r  r  r  inputr   r   r&   shape_env_from_inputs2  s   r\   Callable[[List[InputType]], Any]inputs_to_checkSequence[int]c                   s$   t  dkrS d fdd}|S )Nr   
new_inputsList[InputType]c                   s   t |   | S rY   )copy_misaligned_inputs)r`  r^  rC  r   r&   runN  s   
z)align_inputs_from_check_idxs.<locals>.run)r`  ra  )r*   )rC  r^  rd  r   rc  r&   align_inputs_from_check_idxsG  s   re  c                 C  s`   d|   v r	d}ntdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s     | ]\}}|d  | V  qdS r  r   )r$   shaper	  r   r   r&   r}   [  s    z)clone_preserve_strides.<locals>.<genexpr>r(   rU   )rd  r   r   r	  r!   
as_stridedclone)r%   needed_sizebufferr   r   r&   clone_preserve_stridesU  s   "rk  r`  ra  check_inputs_idxsc                 C  s>   |D ]}| | }t |tjsJ | t rt|| |< qd S rY   )rK   r!   r7  data_ptr	ALIGNMENTrk  )r`  rl  ry   _inpr   r   r&   rb  a  s   rb  static_input_idxsc                 C  sT   g }|D ]}| | }t |tjr| t dkr|| qt|t|kr(|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rK   r!   r7  rm  rn  r1  r*   )r  rp  aligned_static_input_idxsr  r[  r   r   r&   remove_unaligned_input_idxsk  s   
rr  r   c                 C  sZ   ddl m} ttjj}|jjj}|jjj	j
}|jj| |kr#dS || o,|| |kS )Nr(   r  T)r  r  r!   iinfor   r   r  r  r  r  has_hintis_expr_static_and_true)r   r  int_maxr  rt  r   r   r&   expr_fits_within_32bit}  s   
rw  c                   s   t jj }|d urP|jd urRt|jdksJ t| |jD ]4}|d u r,|jd  qd t jj  }r9|j  fdd|jt	fdd|D  qd S d S d S )Nr   Fc                   s(   d u rt | S  r| S | S rY   )rZ   deserialize_symexprevaluate_symexpr)r   )fakify_first_callr  r   r&   map_expr  s
   

z4set_tracing_context_output_strides.<locals>.map_exprc                 3  s    | ]} |V  qd S rY   r   )r$   r   )r{  r   r&   r}     r  z5set_tracing_context_output_strides.<locals>.<genexpr>)
r!   rF  rG  rH  output_stridesr*   r\  r1  rz  tuple)rI  compiled_graphri  rc  r  r   )rz  r{  r  r&   "set_tracing_context_output_strides  s   
 r  c                  C  s`   t jd urt jS t  sdS tj rdS zddlm}  W n
 ty'   Y dS w | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
r>   fx_graph_remote_cacher
  r!   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  r   r   r&    should_use_remote_fx_graph_cache  s   

r  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )resubr  r   r   r&   normalize_name  r  r  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzc                 C  s   i | ]\}}||qS r   r   r  r   r   r&   r     r   r   z^.*[.]rj   c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r   _triton_type_mappingr'  )rj   triton_type_namer   r   r&   triton_type  s   r  c                 C  s6   t | | }|dd}tt|}t|tjsJ |S )Nr  r  )_torch_triton_mappingr'  r  r    r!   rK   rj   )rj   adjusted_type	type_namer  r   r   r&   triton_type_to_torch  s
   
r  r  rV   c                 C  sh   | j  o3|  | ko3|  | ko3| j|jko3| j|jko3|   |  ko3|  | kS rY   )	is_mkldnnrd  r	  rj   rk   untyped_storagerm  r@  r  rV   r   r   r&   is_same_tensor  s   

r  c                 C  sJ   | j o$|  | ko$| j|jko$| j|jko$tjj| tjj|kS rY   )r  rd  rj   rk   r!   r   mkldnnrm  r  r   r   r&   is_same_mkldnn_tensor  s   

r  c                   C  s   dS )N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   r   r   r&   boolean_ops  rv  r  c                   @  s   e Zd ZU ded< ded< dS )OpDtypeRuler   type_promotion_kindOptional[torch.dtype]override_return_dtypeNrj  r   r   r   r&   r  	  s   
 r  zDict[str, OpDtypeRule]op_dtype_propagation_rulesr  r   r  r  c                 C  s   t ||t| < d S rY   )r  r  )ru   r  r  r   r   r&   #register_op_dtype_propagation_rules
	  s   r  c                 C  s"   t jjr| tjtjfv rtjS | S )z"Maybe upcast [b]float16 to float32)r>   r8  codegen_upcast_to_fp32r!   r   r   r   re  r   r   r&   upcast_compute_type	  s   r  )frozen_defaultfrozenr  c                 s"   d fdd}| d u r|S || S )Nr\   rA   rW   c                   s(   t jdkrtj| d dS tj|  dS )N)r  rN  T)kw_onlyr  r  )r  version_infodataclasses	dataclass)r\   r  r   r&   wrap	  s   
zir_dataclass.<locals>.wrap)r\   rA   rW   rA   r   )r\   r  r  r   r  r&   ir_dataclass	  s   r  Optional[List[int]]c                  C  s&   t jj } | d ur| jr| jjS d S rY   )r!   rF  rG  rH  fw_metadatabw_donated_idxs)rK  r   r   r&   get_donated_idxs,	  s   r  )rI   rJ   )re   rf   )rg   rh   rW   ri   )rW   r   )rk   r   rW   r   )r   r   rW   rJ   )r   r   r   r   rW   rJ   )r   r   rW   r   )r   r   r   r   rW   r   )r  r  rW   r  )r  r  rW   r  )r  r  rW   r   rY   )r#  r$  rW   r   )r   )rk   r   rW   rA  )r(   r   )rC  rD  rE  rZ   rk   r   rW   ri   )r   rN  rN  rO  r   )rk   r   )rY  r   rZ  r   )rY  r   r_  r`  )rW   rZ   )r%   rf  rW   rg  )rg   r~  rW   r  )r  r  rW   r  )r  rJ   rW   r   )r  r;   r  rZ   rW   r  )ru   r   rW   r  )r  rJ   r  r  rW   rJ   )r   r   rW   r  )rR   r   rW   r   )r?  r  rW   r  )r?  r  )rY  r   )NNT)rW   rQ  )rT  rX  rW   rQ  rx   )r  r  rW   r   )r  r  rW   r   )r  r   rW   r   )FT)rW   rB  )r  rJ   rW   r  )r  r   rW   r   )r  r   rW   rZ   )r  r  rW   r   )r  rZ   r  rZ   )r
  r  )ru   r   )rk   r  )r  r  )r   r>  )rC  r>  )r  rZ  )rC  r]  r^  r_  rW   r]  )r%   r>  )r`  ra  rl  r_  rW   rA  )r  rZ  rp  r_  rW   r_  )r   rJ   )ru   r   rW   r   )rj   r  rW   r   )rj   r   rW   r  )r  r>  rV   r>  )r  r   r  r  )rj   r  rW   r  )r  r   )rW   r  )
__future__r   r  r  r  enumr   r  r  r  rN  r  r   r:  r  r  rG  r  r@  r  rG  rw  r   r   typingr   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   rL   r!   torch._inductor.runtime.hintsr   torch._prims_commonr   torch.utils._pytreer   r)   	lru_cacher.   torch._dynamo.device_interfacer/   torch._dynamo.utilsr0   torch.autogradr1   torch.autograd.profiler_utilr2   (torch.fx.passes.graph_transform_observerr3   torch.fx.passes.shape_propr4   torch.utils._sympy.functionsr5   r6   r7   r8   r9   torch.utils._sympy.symbolr:   r;   torch.utils._sympy.value_rangesr<   r=   r  r>   runtime.runtime_utilsr?   r   rJ  	getLoggerr^   r   rA   r   	VarRangesr7  rZ   r  	InputTypeGPU_KERNEL_BIN_EXTSrA  rn  rE   rG   rQ   FunctionrS   r   r   r   r   r   r   r  r  r  r"  r&  r@  r~   rM  rX  r^  ra  rb  re  rm  rn  ro  rq  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r)  r.  r/  rk  r4  r5  r  rP  rW  rd  rg  rh  rl  r  r  rs  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r8  r9  rO  rS  r[  r^  r_  rg  ro  rx  r|  r  r  r  r  r  r  r4  r  r  r  r  r  r  r  r  r1  r  Enumr  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r   r!  r"  r/  r=  rB  rE  rL  rY  r\  re  rk  rb  rr  rw  r  r  r  r  r  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r&   <module>   s  H



$R#			
!9,+$
 
 

.

6	'		

"


'	



	