o
    "ib                    @   s
  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dlm
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZmZ d dlm  m  m Z! d dl"Z#d dl$Z#d dl%m&  m'Z( d dl)m*Z* d dl+m,Z, d d	l-m.Z. d d
l/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB d dlCmDZDmEZEmFZF d dlGmHZH ddlImJZJmKZK ddlLmMZM ddlKmNZNmOZOmPZPmQZQ ddlRmSZS ddlTmUZU ddlVmWZW ddl&mXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZf ddlgmhZhmiZi er.ddljmkZk elemZnej
ejoddZoe#jhjpZp	 dd Zqdd  Zrd!d" Zsd#d$ Ztd%d& Zug d'Zvg d(Zwd)d* Zxd+eey d,eey fd-d.Zzdd0d1Z{d2d3 Z|d4d5 Z}d6d7 Z~d8d9 ZG d:d; d;ZejG d<d= d=eZd>d? ZG d@dA dAeZejG dBdC dCeZerdDerdEerdFerdGerdHerdIdJZddKdLZejG dMdN dNeZdOdP ZG dQdR dReZejG dSdT dTeZejG dUdV dVeZdWdX ZdYdZ Z	[dd\d]Zej
ed/d^Zd_d` ZejG dadb dbeZejG dcdd ddeZejG dedf dfeZG dgdh dheZejG didj djeZejG dkdl dleZejG dmdn dneZG dodp dpeZG dqdr dreZejG dsdt dteZejG dudv dveZdwdx Zdydz ZejG d{d| d|eZG d}d~ d~eZG dd deZG dd deZG dd deZG dd deZejG dd deZG dd deZG dd deZG dd deZG dd deZejG dd deZG dd deZG dd deZeeyeeeeeeyeeef  f ZG dd dZG dd deZG dd deZG dd deZG dd deZejG dd deZG dd deZG dd deZdd ZejG dd deZejG dd deZG dd deZG dd deZG dd deZdefddZG dd deZG dd deZG dd deZG dd deZG dd deÃZG dd deZG ddÄ deZG ddń deZG ddǄ deZG ddɄ deZG dd˄ deZejG dd̈́ d̓ZepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjhZG ddτ deZejG ddф deރZejG ddӄ deZG ddՄ deZ	[	ddddddddeey deey deey deydedeeey  fddZ						אdddZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZejG dd deZG ddׄ deZG dd  d eZejG dd deZdd ZejG dd deZejG dd deZG d	d
 d
eރZejG dd deZG dd de#jjZG dd dZG dd dZG dd deރZG dd deZdd Z dS (      N)nullcontext)partial)AnyCallableClassVarDictIterableListOptionalSequenceSetTupleTYPE_CHECKINGUnion)patch)ExprInteger)get_interface_for_device)identity)GraphModuleSerializer)can_auto_functionalize)metrics)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_for
StrideType)get_schema_info)CallMethodKeycompute_unbacked_bindingsDivideByKeyfree_unbacked_symbolsrebind_unbackedresolve_unbacked_bindingsSymTypes)CleanDivFloorDivModularIndexing)SymT   )configdependencies)index_prevent_reordering)extract_free_unbacked_symbols#extract_input_node_reduction_rangesextract_read_writesvar_builder)OpCounterCSE)ReductionHint)do_bench)argsortcache_on_selfceildivconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningget_kernel_metadata
is_dynamicis_gpupad_listlike	sympy_dotsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)opsV)GraphLoweringz  prefixc                    s    fdd  |  d S )Nc              	      s   | d u rd S t | ttfr| D ]} | qd S t | tr*|  D ]} | q!d S t | tjjjt	t
ttjjjttfsFJ dt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])
isinstancelisttupledictvaluestorch	_inductorir
ExpandViewDynamicScalarAssertScalar	TensorBoxsympylogicboolalgBooleanr   EffectfulKerneltype)nodesnode_check_tensorbox P/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/torch/_inductor/ir.pyr]      s.   


z%validate_ir.<locals>._check_tensorboxr^   )node_or_nodesr^   r\   r_   validate_ir   s   ra   c                    s   t  tsJ  fdd}|S )Nc                     s   t t | i |S N)getattrrC   argskwargsnamer^   r_   fn      zops_wrapper.<locals>.fn)rH   str)rh   ri   r^   rg   r_   ops_wrapper   s   rl   c                    s&   t t| tt|   fdd}|S )Nc                    0   t  t ks
J  fddtt  D S )Nc                       g | ]} |  qS r^   r^   .0i)index	inv_orderr^   r_   
<listcomp>       z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenrangerr   rs   ry   r_   reindex      z inverse_reorder.<locals>.reindex)rK   ziprx   rw   orderr{   r^   rz   r_   inverse_reorder   s   r   c                        fdd}|S )Nc                    rm   )Nc                    rn   r^   r^   ro   )rr   r   r^   r_   rt      ru   z1same_reorder.<locals>.reindex.<locals>.<listcomp>rv   ry   r   ry   r_   r{      r|   zsame_reorder.<locals>.reindexr^   r~   r^   r   r_   same_reorder      r   c                    s    fdd}|S )Nc                        | S rb   r^   ry   reindex1reindex2r^   r_   r{         z fuse_reindexing.<locals>.reindexr^   )r   r   r{   r^   r   r_   fuse_reindexing   s   r   )   r      r)   )   r   r   r   r)   c                    s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,

    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S      i | ]\}}||qS r^   r^   rp   idxposr^   r^   r_   
<dictcomp>       z+stride_order2fill_order.<locals>.<dictcomp>c                       g | ]} | qS r^   r^   ro   lookupr^   r_   rt          z+stride_order2fill_order.<locals>.<listcomp>)	enumeraterx   rw   )r   
fill_orderr^   r   r_   stride_order2fill_order   s   r   seqreturnc                 C   s<   t | }dd tt| D }t|D ]\}}|||< q|S )z)
    Convert strides to stride order
    c                 S      g | ]}d qS r   r^   rp   _r^   r^   r_   rt          z$get_stride_order.<locals>.<listcomp>)r4   rx   rw   r   )r   
sorted_idxoutrq   elemr^   r^   r_   get_stride_order   s
   
r   Tc                    s   | d u rd S |st jjj nt  fdd|  D }t| r, fdd|  jD }nt	
|}|  }|  }t|}t|}tj||||d }|S )Nc                       g | ]} |qS r^   r^   rp   sshape_fnr^   r_   rt      r   z%ir_node_to_tensor.<locals>.<listcomp>c                    r   r^   r^   r   r   r^   r_   rt      r   )sizestridedtypedevice)rD   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   FlexibleLayoutcontiguous_strides	get_dtype
get_devicer8   rM   empty_stridedzero_)xguard_shaper   r   r   r   tr^   r   r_   ir_node_to_tensor   s&   
r   c                 C   s   t | tr
| s
d gS | S rb   )rH   rI   valuer^   r^   r_   may_convert_to_optional   s   r   c                 C   s.   t | dd rt|  S t| tjr| jS d S )Nr   )rc   get_device_typer   rH   rM   r   rY   r   r^   r^   r_   r     s
   r   c                 C      t t| S rb   )r<   r   r   r^   r^   r_   	is_triton
  r   r   c                 C   s   t | dkS Ncpu)r   r   r^   r^   r_   is_cpu  r   r   c                   @   s  e Zd ZU e Zeee  ed< e	e
jdeejj fddZdd Zdd Zd	d
 Zdd Zdd Zedd Zdd Zdd Zdd Zdd Zdd Zdd Zd,ddZeg ejf ed < ej ed!< eg e!f ed"< eg ef ed#< eg ef ed$< eg ef ed%< eg e"f ed&< eg eegef f ed'< eg eegef f ed(< ee#gdf ed)< eg df ed*< eg ee$j% f ed+< dS )-IRNode_current_originsoriginsc                 c   s.    t j}|| B t _z	d V  W |t _d S |t _w rb   )r   r   )r   oldr^   r^   r_   current_origins  s   
zIRNode.current_originsc                 C   s*   t | j| _tjrt | _d S d | _d S rb   )setr   r   r*   debug_ir_traceback	tracebackformat_stackselfr^   r^   r_   __post_init__  s   zIRNode.__post_init__c                 C      | j S rb   )r   r   r^   r^   r_   get_traceback#     zIRNode.get_tracebackc                 C   s6   dt | dd }t|dkr|d d  d}|gS )Nzorigins=r    @   =   z...)rc   rw   )r   r   r^   r^   r_   common_repr&  s   zIRNode.common_reprc                 C   s6   ||    }tdtt|}t| j d| dS )Nz,
z(
z
))r   indentjoinmaprk   rY   __name__r   linesr^   r^   r_   
str_helper-  s   zIRNode.str_helperc                 C   s   ||   v S rb   )get_read_namesr   rh   r^   r^   r_   
is_user_of2  r   zIRNode.is_user_ofc                 C   s   dd |   D S )Nc                 S   s   h | ]}|j qS r^   rg   )rp   depr^   r^   r_   	<setcomp>7  s    z(IRNode.get_read_names.<locals>.<setcomp>)	get_readsr   r^   r^   r_   r   5     zIRNode.get_read_namesc                 C   r   rb   r   r   r^   r^   r_   r   9  r   zIRNode.get_dtypec                 C      t dt|  d)Nz#get_layout() is not implemented by !NotImplementedErrorrY   r   r^   r^   r_   r   <  rj   zIRNode.get_layoutc                 C   r   )Nz!get_size() is not implemented by r   r   r   r^   r^   r_   r   ?  rj   zIRNode.get_sizec                 C   s   t |  S rb   )rA   r   r   r^   r^   r_   	get_numelB  r   zIRNode.get_numelc                 C      t jjt|  dS Nr   rD   r   r   is_expr_static_and_truerT   Eqr   r   r^   r^   r_   is_zero_elementsE     zIRNode.is_zero_elementsc                 C      t dt|  )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on r   r   r^   r^   r_   realizeH  s   zIRNode.realizeNc                 C   r   )Nzcodegen_reference NYI on r   r   writerr^   r^   r_   codegen_referenceZ     zIRNode.codegen_referencer   r   get_namer   
get_strideget_storage_numelhas_exceeded_max_readsmake_loadermake_indexer
mark_reuserealize_hintget_unbacked_symbol_usesrb   )&r   
__module____qualname__r   r   r   r   r   __annotations__staticmethod
contextlibcontextmanagerrM   fxNoder   r   r   r   r   r   r5   r   r   r   r   r   r   r   r   r   r   r   rk   boolintrT   Symbolr^   r^   r^   r_   r     s>   
 


r   c                       s  e Zd ZU ejed< ejed< edef ed< e	e
 ed< deej fddZd.d
dZ fddZeZdd Zdd Zdd Zdd Zdd Zedd ZeejfddZedd Zdd Zd d! Z d"d# Z!d$d% Z"d&d' Z#d(d) Z$d*d+ Z%d,d- Z&  Z'S )/Loopsr   r   .inner_fnrangesr   c                 C   s&   t  jg dd | jD |  R  S )Nc                 s       | ]}t |V  qd S rb   r!   rp   er^   r^   r_   	<genexpr>w      z1Loops.get_unbacked_symbol_uses.<locals>.<genexpr>)r   unionr  inner_fn_free_unbacked_symbolsr   r^   r^   r_   r  u  s
   zLoops.get_unbacked_symbol_usesr  c                    sF     d jj dt j  g fdd|D  d jg S )N'c                    s    g | ]}| d t  | qS =)rc   )rp   rh   r   r^   r_   rt          z!Loops.__str__.<locals>.<listcomp>origin_node=)r   r   rY   rk   r   inner_fn_strorigin_node)r   namesr^   r   r_   __str__{  s   zLoops.__str__c                       t    d | _d S rb   superr   r#  r   	__class__r^   r_   r        

zLoops.__post_init__c                 C   r   rb   r   r   r^   r^   r_   r     r   zLoops.get_devicec                 C   r   rb   r#  r   r^   r^   r_   get_origin_node  r   zLoops.get_origin_nodec                 C   r   rb   r  r   r^   r^   r_   r     r   zLoops.get_sizec                 C   r   rb   r  r   r^   r^   r_   get_pointwise_size  r   zLoops.get_pointwise_sizec                 C      dS NFr^   r   r^   r^   r_   	is_extern     zLoops.is_externc                 O   sN   | dd }| dd }| |i |}||_tjr|pt nd |_t|S )Nr#  r   )popr#  r*   r   r   r   rS   create)clsre   rf   r#  tbrr^   r^   r_   r5    s    
zLoops.createc                    s    fddt | D S )Nc                    s,   g | ]\}}|d krt dnt |qS )r)   r   )rT   r   r@   )rp   nr   rF   r^   r_   rt     s    z Loops._index.<locals>.<listcomp>)r   )r  rG   r^   rF   r_   _index  s   
zLoops._indexc              	   C   s   t t }t|1 ttdd | j|    |j	W  d    W  d    S 1 s/w   Y  W d    d S 1 s?w   Y  d S Nallow_indexingT)
r1   rD   MockHandlerset_ops_handlerr   objectr   r  inner_fn_argsop_count)r   	opcounterr^   r^   r_   inner_fn_opcount  s   RzLoops.inner_fn_opcountc                 C   s   |  | jfS rb   )r:  r  r   r^   r^   r_   r@       zLoops.inner_fn_argsc                 C   s   t jj| jg|  R  S rb   )rD   KernelFormatterHandlerir_to_stringr  r@  r   r^   r^   r_   r"    s
   zLoops.inner_fn_strc                 C   s   |   tjkS rb   )rC  r*   realize_opcount_thresholdr   r^   r^   r_   has_large_inner_fn  rD  zLoops.has_large_inner_fnc                 C   s   |  | j}t| j|S rb   )r:  r  r-   r  )r   rr   r^   r^   r_   r    s   z$Loops.inner_fn_free_unbacked_symbolsc                 C   sv   t tdd* |  r t|  |  |  jW  d    S t|  |  jW  d    S 1 s4w   Y  d S r;  )	r   r?  r   get_reduction_typer/   r  r   get_reduction_sizereadsr   r^   r^   r_   r     s   $zLoops.get_readsc                 C   r   )Nz+get_reduction_size() is not implemented by r   r   r   r^   r^   r_   rJ       zLoops.get_reduction_sizec                 C   r   )Nz+get_reduction_type() is not implemented by r   r   r   r^   r^   r_   rI    rL  zLoops.get_reduction_typec                 C   r   )Nz+constant_to_device() is not implemented by r   r   r   r   r^   r^   r_   constant_to_device  rL  zLoops.constant_to_device)r  )(r   r  r  rM   r   r  r   r   r   r	   r   r   rT   r  r  r%  r   __repr__r   r.  r   r/  r2  classmethodr5  r	  r(   INDEXr:  r5   rC  r@  r"  rH  r  r   rJ  rI  rN  __classcell__r^   r^   r)  r_   r  n  s8   
 





	r  c                C   s"   |j rttd|S td|S )Nnanr   )is_floating_pointrC   constantfloat)r   r   r^   r^   r_   nop_loader_fn  s   rW  c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )	Pointwisec                 C   s   |   rtt| jdS | jS )Nr   )r   r   rW  r   r  r   r^   r^   r_   r    s   zPointwise.make_loaderc                 C      g S rb   r^   r   r^   r^   r_   rJ    r3  zPointwise.get_reduction_sizec                 C      d S rb   r^   r   r^   r^   r_   rI    r3  zPointwise.get_reduction_typec                 C   s   |   }t|||||S rb   )r  rC   storer   output_nameindexervarsloaderr^   r^   r_   store_output  s   zPointwise.store_outputc                 C   s,   |   }ttd||}t|| j|| jS FMove this to a given device. Requires that all reads are to constants.override_device)r  r   r?  ConstantBufferrX  r   r  r   r   r`  r^   r^   r_   rN    s   zPointwise.constant_to_deviceN)r   r  r  r  rJ  rI  ra  rN  r^   r^   r^   r_   rX    s    rX  c                   @   sD   e Zd ZU eee gef ed< dZee	 ed< dd Z
dd ZdS )Scatteroutput_indexerNscatter_modec                 C   s4   |   }ttd||}t|| j|| j| j| jS rb  )	r  r   r?  re  rg  r   r  rh  ri  rf  r^   r^   r_   rN  	  s   zScatter.constant_to_devicec                 C   s*   |   }tj||| |||| jdS )N)mode)r  rC   r[  rh  ri  r\  r^   r^   r_   ra    s   zScatter.store_output)r   r  r  r   r	   r   r  ri  r
   rk   rN  ra  r^   r^   r^   r_   rg    s
   
 rg  
logical_ormaximumminimummuladdbitwise_xor)anymaxminprodsumxor_sumc                    sR   t v r
t  }|S dv r fdd}|S dkr"dd }|S td )N   argmaxargminc                    s   | \}}|\}}dkrt ||}nt ||}t ||}trCt ||}t ||}	t |t ||	}t |t ||	} rKt ||nt ||}
t |t ||
}t |||t |||fS )Nry  )	rC   ltgteqr   nerk  logical_andwhere)aba_valuea_indexb_valueb_indexmaskequala_isnanb_isnantiearg_break_ties_leftr   reduction_typer^   r_   
combine_fn/  s&   
z,get_reduction_combine_fn.<locals>.combine_fnwelford_combinec                 S   sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS rb   r^   )r  r  a_meana_m2a_weightb_meanb_m2b_weightdelta
new_weight	w2_over_wr^   r^   r_   r  L  s   


zunknown reduction_type=)REDUCTION_COMBINE_FNr   )r  r   r  r  r^   r  r_   get_reduction_combine_fn*  s   0-r  c                       sT  e Zd ZU ee ed< eed< ejed< e	ed< dd Z
dd Zd	eej f fd
dZdd Zdd Zdd Zdd Zdd Zdd Zdd Ze	d>dee fddZedd Zee	jdfd ejd!ejdejd"ed#e f d$ee dee dede	dee fd%d&Z!ed'd( Z"ed)d* Z#ed+e$d,e$de	d	e	fd-d.Z%ed/d0 Z&ed1d2 Z'ed ejd!ejdejd3ed#e f d4ee d5ee d6ee d7ee ded+e$de	fd8d9Z(ed ejd!ejdejd"ed#e f d$ee dee ded+e$de	fd:d;Z)ed ejd!ejdejd"ed#e f d4ee d5ee d6ee d7ee dede	fd<d=Z*  Z+S )?	Reductionreduction_rangesr  	src_dtypereduction_hintc                 C   s   t j| ddS )N)r  r  r  )r$  )r  r%  r   r^   r^   r_   r%  g  s   zReduction.__str__c                 C      |   S rb   )r%  r   r^   r^   r_   rO  l     zReduction.__repr__r   c                    s"   t   t jdd | jD  B S )Nc                 s   r  rb   r  r  r^   r^   r_   r  q  r  z5Reduction.get_unbacked_symbol_uses.<locals>.<genexpr>)r(  r  r   r  r  r   r)  r^   r_   r  o  s   z"Reduction.get_unbacked_symbol_usesc                 C   r   rb   )r  r   r^   r^   r_   rJ  t  r   zReduction.get_reduction_sizec                 C   r   rb   r  r   r^   r^   r_   rI  w  r   zReduction.get_reduction_typec              	   C   s0   t | j| j| j| ||}t ||||S rb   )rC   	reductionr   r  r  r  store_reduction)r   r]  r^  r_  reduction_varsr   r^   r^   r_   r  z  s   
zReduction.store_reductionc                 C      t | jt | j S rb   )rw   r  r  r   r^   r^   r_   index_length  rj   zReduction.index_lengthc                 C   s$   |  | j}|  | jtj}||fS rb   )r:  r  r  r(   RINDEXr   rr   rindexr^   r^   r_   r@    s   zReduction.inner_fn_argsc                 C   s*   |  | j}|  | jtj}t| j||S rb   )r:  r  r  r(   r  r-   r  r  r^   r^   r_   r    s   z(Reduction.inner_fn_free_unbacked_symbolsc              	   C   s<   |   }ttd||}t|| j|| j| j| j| j	t
jS rb  )r  r   r?  re  r  r   r  r  r  r  r2   DEFAULTrf  r^   r^   r_   rN    s   zReduction.constant_to_deviceN
input_nodec	               	      s  dd }	t jj|}
t jjt|}tt| o(|dvo(tjo(|	|
o(|	|}|s0t	j
dfS tt| }|j| }t| dkrF|jn|jddd      fd	d
} fdd}|dkr||
|}|dkrt	j|fS |d urt|trt|\}}|d ur|d urt jjt|| }|
|krtd||||| t	jdfS t	j|fS |
ks|d d krt	j
dfS t| ||||||t	j
}dd }||\}}|r||\}}t|dkrt	j
dfS t| | \\}}}d}d}|D ],}t jj||}t jj||| }tdd |D }|r2|d7 }q|d7 }q||krEt	j||
|fS t	j ||
|fS )Nc                 S   s   t | ttjfS rb   rH   r  rT   r   r   r^   r^   r_   
_is_static     z(Reduction.num_splits.<locals>._is_staticrw  r)   xpu       i   c           	         s  d}d| }|d krdS | dkrdS | | kr}n^| | k r_ d|  }|| d | }| ||  d ||   t | }t| fddd}t|  d	k r\t|}n }nt | }t|fd
dd}t| dk ry|}n}| ||  d ||  S )N   r  r   r)   i    c                       t |   S rb   absr   tmp_split_sizer^   r_   <lambda>      zFReduction.num_splits.<locals>.inner_reduction_splits.<locals>.<lambda>key   c                    r  rb   r  r   max_elements_per_threadr^   r_   r    r  2   rT   divisorsrs  r  rr  )	reduction_numel_hint
numel_hint	num_warpsnum_threads
split_sizetarget_blocksblocks_per_outputr  closestmax_elements_per_devicer  min_elements_per_devicemin_elements_per_threadnum_smthreads_per_smr  r_   inner_reduction_splits  s6   

z4Reduction.num_splits.<locals>.inner_reduction_splitsc                    s  d}|d }d}d}|| d | }| | k r}n\| | k r[ | }|| d | }| ||  d ||   t | }	t|	 fddd}
t |
 d	k rXt|
}n }nt | }	t|	fd
dd}
t|
 dk ru|
}n}| ||  d ||  S )Nr  r  r      r)   c                    r  rb   r  r   r  r^   r_   r    r  zFReduction.num_splits.<locals>.outer_reduction_splits.<locals>.<lambda>r     c                    r  rb   r  r   r  r^   r_   r  
  r  r  r  )r  r  r  r  rvals_per_threadxvals_per_blockxblocksr  r  r  r  r  r  r_   outer_reduction_splits  s4   

z4Reduction.num_splits.<locals>.outer_reduction_splitszUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %sr   c                    s   t d t|  |  |  d| d}| }dd |jD }g }d}t|jdd dD ]1 t	 fd	d
|D r\|
 j  jtjjv r\tjj j }|jj}|  |jj|kr\d}q+||fS )Nr   r   r   rh   layoutdatac                 S   s(   g | ]}t |tjrt |tjs|qS r^   )rH   rT   r   Numberrp   r8  r^   r^   r_   rt   Q  s    

zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S   r   rb   rg   r   r^   r^   r_   r  X  s    z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>r  c                 3   s    | ]	}| j jv V  qd S rb   )rr   free_symbolsr  mdr^   r_   r  Y      zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>T)ComputedBufferr   r   r   r   get_read_writes
range_varssortedrK  allappendrr   rh   rD   r   name_to_bufferr  r   decide_layout)r8  cbread_writesr  indiceschangedbuforiginal_strider^   r  r_   get_read_indicesC  s4   	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s   s    | ]}|d kV  qdS r)   Nr^   r   r^   r^   r_   r  s  r  z'Reduction.num_splits.<locals>.<genexpr>)!rD   r   r   symbolic_hintrA   r<   r   r*   split_reductionsr2   r  r   Workerget_device_propertiesgpu_subslice_countmulti_processor_countINNERrH   rS   r.   logdebugr  rw   r+   index_vars_squeezer   rJ  simplify_with_rangesstride_hintskeysr  OUTER) r   	dst_dtyper  r  r  r  r  reduction_numelr  r  r  r  should_splitdevice_interfacedevice_propertiesr  r  split
new_rangesnew_reduction_rangesextracted_numel_hintr8  r  r  r  r   r  	num_outer	num_innerrq   stridesouterr^   r  r_   
num_splits  s   	
$$





 

zReduction.num_splitsc                    sj   dd D t ||  fdd|dv r1tddt fddfd	d
S S )z1Convert inner_fn from a reduction to an pointwisec                 S      g | ]	}t jj|qS r^   )rD   r   r   evaluate_static_shaperp   r   r^   r^   r_   rt     s    z2Reduction._unroll_reduction_fn.<locals>.<listcomp>c                    s,   t  fddtjdd D  D S )Nc                 3   s    | ]} |V  qd S rb   r^   )rp   r  )rr   value_fnr^   r_   r    s
    
z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S   s   g | ]}t |qS r^   )rx   r  r^   r^   r_   rt     r   z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)	functoolsreduce	itertoolsproductry   )r  r  r  ry   r_   ri     s   z*Reduction._unroll_reduction_fn.<locals>.fnry  rx  Nc                    s*   dd |D }| |t  |tjfS )Nc                 S      g | ]}t |qS r^   )rT   expandro   r^   r^   r_   rt     r   zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)rC   
index_exprrM   int64rr   r  )flatten_indexr  r^   r_   r    s   z0Reduction._unroll_reduction_fn.<locals>.value_fnc                    s    | d S Nr)   r^   ry   )ri   r^   r_   r    r  z0Reduction._unroll_reduction_fn.<locals>.<lambda>)r  FixedLayoutr   r   r  )r  r  r  r  r^   )r  r  ri   r  r  r  r_   _unroll_reduction_fn  s$   
zReduction._unroll_reduction_fnr   r   r  .r  c
                    s  t jjt}
|
dkrB fdd}|d|d|d|dd v s/J  d fdd}tj|||t|d	S |
dkr`d
v rQ fdd}nfdd}t| ||S t	|
t
jrt jj|
tjk rt|dkrt| | ||S | | |||
|		\}}|tjkr|}|dkr|	d usJ t|	\}}|d usJ |d usJ | | |||||
S |dkr| | ||||	S tt| |||S )Nr   c                    s(    t jkr	t| S  jrt| S t| S rb   )rM   r  rT  rV  r  valr   r^   r_   py_cnst  s   
z!Reduction.create.<locals>.py_cnstr)   )ru  rv  rt  rq  z* not supported for zero-dimension tensors!c                    s   t   S rb   rC   rU  ry   )r   r  rtypes_to_initsr^   r_   const_fn  r  z"Reduction.create.<locals>.const_fnr   r   r  r  r  c                    s   t d S r   r$  ry   r"  r^   r_   ri     r   zReduction.create.<locals>.fnc                       dd D } | |S )Nc                 S      g | ]}t d qS r   rT   r   r   r^   r^   r_   rt     r   z0Reduction.create.<locals>.fn.<locals>.<listcomp>r^   rr   reduction_index)r  r  r^   r_   ri        
r  )rD   r   r   simplifyrA   r  rX  r5  rI   rH   rT   r   r   r*   unroll_reductions_thresholdr  r  r2   r  r.   !create_multilayer_existing_rangescreate_multilayerrS   r  )r6  r   r   r  r  r  r  r  r  r  r  r#  r&  ri   hintr  r  r  r^   )r   r  r  r  r%  r_   r5    s   
	


zReduction.createc                 C   sv   | dv rt |rtdS t|rdS t|jS | dv r0t |r$tdS t|r*dS t|jS ddddddd|  S )	N>   rr  rx  z-infr   >   rs  ry  infr)   r   r   r   )ru  rt  rv  rq  welford_reducer  )r   rV  r   rM   iinfors  rr  r  r   r^   r^   r_   default_accumulator8  s*   zReduction.default_accumulatorc                 C   s   | dkrdS t | |S )Nr5  r   )r  r8  r7  r^   r^   r_   default_valueR  s   zReduction.default_valuer  r  c                 C   sP   | dkr|S | dkr|dkr|t jkrt jS | dkr&|dkr&|t jkr&t jS |S )Nr  r        )r2   r  
OUTER_TINY)r  r  r  r^   r^   r_   _multilayer_second_step_hintX  s   
z&Reduction._multilayer_second_step_hintc                    sD   t |gtjjt| d  fdd}|S )Nr   c                    sf   |\}| ^ }| |   fdd}r0t t  tjt tj}t ||S | S )Nc                      s    gS rb   r^   r^   )r  r`  	new_indexr{   r^   r_   body}  r  zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)rC   rz  r  rM   int32masked)rr   r,  reduction_blockr?  r  
block_sizedefaultr`  	need_maskr  r{   )r  r>  r_   
wrapper_fnx  s   
z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)Viewdynamic_reshape_indexerrD   r   r   r   rT   r   )r6  r`  r  r  r  rD  rE  rG  r^   rC  r_   _multilayer_wrap_loaderi  s   
z!Reduction._multilayer_wrap_loaderc                    sL   t dd D sJ dt|t|t|  fdd}|S )Nc                 s   s    | ]}|d kV  qdS r  r^   r  r^   r^   r_   r        
zDReduction._multilayer_wrap_loader_existing_ranges.<locals>.<genexpr>z8Only enabled for numel_hint == 1, found original_ranges=c                    s:   | d t  }| t d  } |t|t| S rb   )rw   rJ   )merged_indexnew_reduction_indexoriginal_idxr>  r`  original_rangesr{   r^   r_   rG    s   zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fn)r  rH  rI  rJ   )r6  r`  rP  original_reduction_rangesr  r  rE  rG  r^   rO  r_   '_multilayer_wrap_loader_existing_ranges  s   

z1Reduction._multilayer_wrap_loader_existing_rangesrG  rP  rQ  r  r  c                    s   |t jt jfvr
|nt j}t|||||||	|}|  |   fdd}tj	j
t|}| |
||}||dt| ksCJ tt|||||t|d |	||S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        c                    s    g | |S rb   r^   r+  intermediate_loaderr^   r_   intermediate_fn  r  z;Reduction.create_multilayer_helper.<locals>.intermediate_fnN)rM   float16bfloat16rV  r  r5  r   r  rD   r   r   r   rA   r=  rw   rS   )r6  r   r   r  rG  rP  rQ  r  r  r  r  r  intermediate_dtypeintermediaterV  r  r^   rT  r_   create_multilayer_helper  sD   
z"Reduction.create_multilayer_helperc
                 C   sb   t |}
t|
|d  |}| ||}| |||
|||}| ||||||g |||g|||	S )rS  r)   )rA   r&   r9  rJ  r[  )r6  r   r   r  r  r  r  r  r  r  r  rD  rE  rG  r^   r^   r_   r1    s&   
zReduction.create_multilayerc                 C   sF   |  |	|}| ||||||}| ||||||g ||||	d|
S )rS  r  )r9  rR  r[  )r6  r   r   r  r  rP  rQ  r  r  r  r  rE  rG  r^   r^   r_   r0    s,   
z+Reduction.create_multilayer_existing_rangesrb   ),r   r  r  r	   r   r  rk   rM   r   r2   r%  rO  r   rT   r  r  rJ  rI  r  r  r@  r  rN  r	  r
   r   r  r  rP  r  r   r   r   r5  r8  r9  r  r=  rJ  rR  r[  r1  r0  rR  r^   r^   r)  r_   r  _  s  
 
	
	 b
'

	
 


!

	
=
	
&
	
r  c                 C   s   d| v rdS dS )Nwelfordr   r)   r^   r  r^   r^   r_   num_reduction_outputs5  r  r]  c                       s   e Zd ZU eed<  fddZdd Zeej	fde
jde
jdeed	ef  d
ee dee dedefddZedd Zede
jde
jdeed	ef  d
ee dee dededefddZ  ZS )WelfordReductionoutput_indexc	           
   
      sF   t  dkr d }	n fdd}	t |||	||||| || _d S )Nr)   r   c                    s   t  fddD S )Nc                 3   s    | ]}| V  qd S rb   r^   rp   ri   r   reduction_idxr^   r_   r  L      z<WelfordReduction.__init__.<locals>.loader.<locals>.<genexpr>)rJ   ra  	inner_fnsra  r_   r`  K  s   z)WelfordReduction.__init__.<locals>.loader)rw   r(  __init__r_  )
r   r   r   re  r  r  r  r  r_  r`  r)  rd  r_   rf  <  s   


zWelfordReduction.__init__c              	   C   s:   t | j| j| j| ||}|| j }t ||||S rb   )rC   r  r   r  r  r  r_  r  )r   r]  r^  r_  r  rL   r   r^   r^   r_   r  Z  s   

z WelfordReduction.store_reductionr   r   re  .r  r  r  r  c              
      s2  dv sJ t jjt}fdd}	|dkr,|	d}
|	d}|	d}|
||fS |dkrUfdd dkrJ d |	d|	dfS t fd	d
D S tjd |d\}}tj	krm||dkr}| 
|S fddtdD }|D ]}|  q|S )N>   r5  r  c                    s$    fdd}t j|tdS )Nc                    s   t  S rb   r$  r   )r   r!  r^   r_   r  t  s   z8WelfordReduction.create.<locals>.const.<locals>.inner_fnr'  rX  r5  rI   )r!  r  )r   r   r  r   r_   consts  s   z&WelfordReduction.create.<locals>.constr   r)   c                    s$    fdd}t j|tdS )Nc                    r(  )Nc                 S   r)  r   r*  r   r^   r^   r_   rt     r   zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>r^   )r   r,  )r`  r  r^   r_   r    r-  z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnr'  rh  )r`  r  )r   r   r  r  r`  r_   copy  s   z%WelfordReduction.create.<locals>.copyr5  c                 3       | ]} |V  qd S rb   r^   r`  )rk  r^   r_   r    r  z*WelfordReduction.create.<locals>.<genexpr>)r  r  c                    s(   g | ]}t t |qS r^   )rS   r5  r^  )rp   
output_idx)r   r   re  r  r  r  r  r^   r_   rt     s    z+WelfordReduction.create.<locals>.<listcomp>r   )rD   r   r   r.  rA   rJ   r  r  r2   r  r1  rx   r   )r6  r   r   re  r  r  r  r  r  ri  meanm2weightr2  r  resultsr   r^   )rk  r   r   re  r  r  r  r  r_   r5  d  sT   



zWelfordReduction.createc                 C   r0  )Nr4  r^   r7  r^   r^   r_   r9       zWelfordReduction.default_valuer  c	              
      s.  t tjjt d }	|	r8|dkr8fdd}
j||d t|
ddt|
ddf|d|dS t	d   t
|t fdd	|D g | g||}|D ]}|  q`d
d |D }dd tjjt |}||}t
|tfdd	|D |gd|S )rS  r   r  c                    s   t | S rb   r$  )r   rb  r   r   r^   r_   rU    r   z4WelfordReduction.create_multilayer.<locals>.constantr   r)   )r   r   re  r  r  r  r  r  c              	   3   s&    | ]}j | d dV  qdS )r   )rE  N)rJ  )rp   r`  )rD  r6  r  r  r  r^   r_   r    s    	
z5WelfordReduction.create_multilayer.<locals>.<genexpr>c                 S      g | ]}|  qS r^   )r  ro   r^   r^   r_   rt      r   z6WelfordReduction.create_multilayer.<locals>.<listcomp>c                 S   s   |g | |S rb   r^   )rr   r,  r`  r^   r^   r_   intermediate_loader_fn"  r  zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3   s     | ]}t  | d V  qdS )rj  N)r   r  ro   )rt  r^   r_   r  ,  
    
)rA   rD   r   r   r   rT   r   r1  r   r&   r^  r5  rJ   r   r   r=  )r6  r   r   re  r  r  r  r  r  rF  rU  intermediatesrq   	i_loadersr  r^   )rD  r6  r   rt  r  r  r  r_   r1    sd   

	

z"WelfordReduction.create_multilayer)r   r  r  r  r  rf  r  rP  r2   r  rM   r   r   r   r   r   r	   r   rk   r5  r	  r9  r1  rR  r^   r^   r)  r_   r^  9  sR   
 
	u
	r^  c                       s  e Zd ZU ee ed< ee ed< eeedf eedf geedf f ed< eee ee gee f ed< e	ed< e
ed< eejdf ed< eedef df ed	< d
eej f fddZ fddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zee	jfdejdeejdf d	eeee gef df dee d e
deeedf eedf geedf f de	d
eed!  fd"d#Zedejd$ejd%eee gef d e
d&ee dee deeedf eedf geedf f d'efd(d)Z  Z S )*Scanscan_rangesr   .r  r{   r  r_  dtypesre  r   c                    s:   t   t jdd | jD  B t jdd | jD  B S )Nc                 s   r  rb   r  r  r^   r^   r_   r  L  r  z0Scan.get_unbacked_symbol_uses.<locals>.<genexpr>c                 s   r  rb   r  r  r^   r^   r_   r  M  r  )r(  r  r   r  ry  r   r   r)  r^   r_   r  F  s   zScan.get_unbacked_symbol_usesc                    s0   t | jt | j t | jksJ t   d S rb   )rw   r  ry  r   r(  r   r   r)  r^   r_   r   P  s   "zScan.__post_init__c                    sJ   |  ||  fdd| jD }t| j| j|}t|| || j S )Nc                    s   g | ]}| qS r^   r^   )rp   r  rg  r^   r_   rt   V  r   z(Scan.store_reduction.<locals>.<listcomp>)r{   re  rC   scanrz  r  r[  r_  )r   r]  r^  r_  	scan_varsrL   resultr^   rg  r_   r  T  s   zScan.store_reductionc                 C   r0  )Ncustomr^   r   r^   r^   r_   rI  Z  rr  zScan.get_reduction_typec                 C   r   rb   )ry  r   r^   r^   r_   rJ  ^  r   zScan.get_reduction_sizec                 C   r   rb   r   r   r^   r^   r_   r   a  r   zScan.get_sizec                 C   r   rb   r  r   r^   r^   r_   r/  d  r   zScan.get_pointwise_sizec                 C   r  rb   )rw   r  ry  r   r^   r^   r_   r  g  rj   zScan.index_lengthc                 C   s.   |  | j}|  | jtj}| ||}|fS rb   )r:  r  ry  r(   r  r{   r   rr   r  r   r^   r^   r_   r@  j  s   zScan.inner_fn_argsc                 C   s4   |  | j}|  | jtj}| ||}t| j|S rb   )r:  r  ry  r(   r  r{   r-   r  r  r^   r^   r_   r  p  s   z#Scan.inner_fn_free_unbacked_symbolsr   axisrS   c                    s  g d    d d    g	t js!d gt S tjjd ur4tdkr4d gt S tjj}	|		t
	}
ttksIJ |	t|
drbfddttD S | jd d  	|
d\}|dkr{tnt
|dkrtjjd urd gt S |dkrtdkrd gt S  	fdd	
fddttD }|D ]}|  q|S )	Nr)   c                    s&   g | ]}t j | | d qS )r'  )rX  r5  rp   r_  )r   rz  re  r   r^   r_   rt     s    zScan.create.<locals>.<listcomp>r   )r   r   r  r  pointwise_rangesry  r  
scan_numelc                    sH   t |t ks
J t | t ksJ g | d   ||  d  S rb   rw   )rr   
scan_index)r  r  ry  r^   r_   r{     s    zScan.create.<locals>.reindexc                    sB   g | ]}t 	d| | 
 |d qS ))r   r   rz  r  re  r   r  ry  r  r{   r  r_  r^   )rS   r5  r  )r  r   rz  re  rf   r  r  r{   ry  	scan_typer   r^   r_   rt     s*    )r<   rY   rw   rM   versionhiprD   r   r   r.  rA   r   rT   Lerx   r  rx  	SplitScanr   )r6  r   rz  re  r   r  r  r  rf   r   r  r  rq  r}  r^   )r  r  r   rz  re  rf   r  r  r{   ry  r  r   r_   r5  v  sF    







zScan.creater   r  r  r  c	           
   
      s(    fdd}	t j||||	||d|dS )Nc                    s$   g | d   ||  d  S rb   r^   ra  r  r  r^   r_   rG    s   $z#Scan.num_splits.<locals>.wrapper_fnru  )r   r   r  r  r  r  r  r  )r  r  )
r6  r   r   r  r  r  ry  r  r  rG  r^   r  r_   r    s   zScan.num_splits)!r   r  r  r	   r   r  r   r   r   r2   r  rM   r   r   rT   r  r  r   r  rI  rJ  r   r/  r  r@  r  rP  r  r   r
   r5  r  rR  r^   r^   r)  r_   rx  8  sp   
 , 
	&

Z&	rx  c                   @      e Zd ZdS )r  Nr   r  r  r^   r^   r^   r_   r    s    r  c                 C   s(   z	t | dd W dS  ty   Y dS w )NFfreezeT)as_storage_and_layoutr   r   r^   r^   r_   r     s   r   c                 C   s@   zt | dd\}}| r|  | W S  ty   Y dS w NFr  )r  should_pad_stridespad_stridesis_contiguousr   )r   bufferr  r^   r^   r_    is_contiguous_storage_and_layout  s   
r  Fc                 C   s   t | trt| j||||dS t | trDt | jtrD|r>|r,| j  | jj s+J n|dur9| jj	||d n| j
  | | jjfS t | trWt| j|d\}}|| jfS t)z
    Try to simplify x into a StorageBox and a Layout.

    allow_padding only affect how we apply stride_order. When allow_padding
    is True, we have the freedom to add padding when applying the stride_order.
    r  want_contiguousstride_orderallow_paddingNr  r  )rH   rS   r  r  
StorageBoxBufferfreeze_layoutr  r  freeze_layout_with_stride_orderr  ReinterpretViewr   )r   r  r  r  r  r  r   r^   r^   r_   r    s4   
	




r  )r  c                 C   s2   zt | dd\}}||W S  ty   Y dS w r  )r  is_stride_orderedr   )r   r  r  r  r^   r^   r_   "is_stride_order_storage_and_layout4  s   r  c                   @   s   e Zd ZU eed< dd Zdd Zdd Zdd	 Ze	d
d Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*S )+BaseViewr  c                 C   
   | j  S rb   r  r  r   r^   r^   r_   r  @     
z!BaseView.get_unbacked_symbol_usesc                 C   s   t d|  )Nzmake_reindexer NYI on r   r   r^   r^   r_   make_reindexerC  rD  zBaseView.make_reindexerc                    $   | j   |   fdd}|S )Nc                    r   rb   r^   rg  innerr{   r^   r_   r^  J  r   z&BaseView.make_indexer.<locals>.indexer)r  r  r  r   r^  r^   r  r_   r  F     
zBaseView.make_indexerc                    r  )Nc                    r   rb   r^   rg  r  r^   r_   r`  S  r   z$BaseView.make_loader.<locals>.loader)r  r  r  r   r`  r^   r  r_   r  O  r  zBaseView.make_loaderc                 C      | j jS rb   r  r   r   r^   r^   r_   r   X     zBaseView.dtypec                 C   r  rb   r  r   r   r^   r^   r_   r   \  r  zBaseView.get_layoutc                 C   r  rb   )r  r   r   r^   r^   r_   r   _  r  zBaseView.get_devicec                 C   rZ  rb   r^   r   r^   r^   r_   r.  b  r3  zBaseView.get_origin_nodec                 C   r  rb   r  r   r   r^   r^   r_   r   e  r  zBaseView.get_namec                 C   r  rb   r   r   r^   r^   r_   r/  h  r  zBaseView.get_pointwise_sizec                 C      | j |S rb   )r  r  r   usersr^   r^   r_   r  k  r   zBaseView.mark_reusec                 C   r  rb   )r  r   r   r^   r^   r_   r   n  r  zBaseView.has_exceeded_max_readsc                 C   r  rb   r  r   r   r^   r^   r_   r   q  r  zBaseView.realizec                 C   r  rb   )r  r  r   r^   r^   r_   r  t  r  zBaseView.realize_hintc                 C   r  rb   )r  r   r   r^   r^   r_   r   w  r  zBaseView.get_storage_numelc                 C   r  rb   )r  r2  r   r^   r^   r_   r2  z  r  zBaseView.is_externc                 C   r  rb   )r  is_module_bufferr   r^   r^   r_   r  }  r  zBaseView.is_module_bufferc                 C   sF   t tdd t|  |  jW  d    S 1 sw   Y  d S r;  )r   r?  r   r/   r  r   rK  r   r^   r^   r_   r     s   $zBaseView.get_readsc                 C   s"   | }t |tr|j}t |ts|S rb   )rH   r  r  r   r   r^   r^   r_   unwrap_view  s
   

zBaseView.unwrap_viewc                 C   s0   |   }ttd||}t||  ||  S rb  )r  r   r?  re  rX  r   r   rf  r^   r^   r_   rN    s   zBaseView.constant_to_deviceN)r   r  r  r   r  r  r  r  r  propertyr   r   r   r.  r   r/  r  r   r   r  r   r2  r  r   r  rN  r^   r^   r^   r_   r  <  s.   
 		
r  c                   @   sB   e Zd ZU ee ed< edd Zedd Z	dd Z
dd	 Zd
S )rP   r   c                 C   s   t jj}tttj|}|  }dgt|t|  t| }t|t|ks)J t	t|D ]7}|| dkrF|| dus?J || ||< q/|| du sR|| dkrSq/|j
|| ||  dddksfJ dq/|S )zReplace `-1` with correct sizesNr  r)   r   fallbackzKBroadcast failed in ExpandView({x.get_size()}, {new_size}) on dimension {i})rD   r   r   rI   r   rT   r  r   rw   rx   r   )r   new_sizer   old_sizerq   r^   r^   r_   _normalize_size  s    zExpandView._normalize_sizec           
      C   s   |  ||}t|rSt|\}}t|t|j }|dksJ tdg| }t|j|jD ]\}}|	|dkr:|ntd q.t
|j|jt|||j}	t||	S t||S Nr   r)   )r  r   r  rw   r   rT   r   r}   r   r  r  r   r   rI   offsetr  rP   )
r6  r   r  storage
old_layoutskip
new_strider   r   
new_layoutr^   r^   r_   r5    s"   

zExpandView.createc                 C   r   rb   r  r   r^   r^   r_   r     r   zExpandView.get_sizec                    s4   |   }| j   t|t   fdd}|S )Nc                    sT   t | d  } t| t ksJ tt D ]} | dkr'td| |< q| S Nr)   r   )rI   rw   rx   rT   r   )rr   rq   actualr  r^   r_   r{     s   z*ExpandView.make_reindexer.<locals>.reindex)r   r  rw   )r   targetr{   r^   r  r_   r    s
   
	zExpandView.make_reindexerN)r   r  r  r	   r   r  r	  r  rP  r5  r   r  r^   r^   r^   r_   rP     s   
 

rP   c                   @   sB   e Zd ZU ee ed< edd Zedd Zdd Z	dd	 Z
d
S )PermuteViewdimsc                    s   |  |}t|ttt|ksJ t|r;t|\} t j j fdd|D  fdd|D  j	}t
||S t||S )Nc                       g | ]} j | qS r^   r  ro   r  r^   r_   rt     r   z&PermuteView.create.<locals>.<listcomp>c                    r  r^   r   ro   r  r^   r_   rt     r   )_map_neg_dimsr   rx   rw   r   r  r  r   r   r  r  r  )r6  r   r  r  r  r^   r  r_   r5    s   


zPermuteView.createc                    s    fdd D S )Nc                    s$   g | ]}|d kr
|nt  | qS r   r  )rp   dimr  r^   r_   rt     s   $ z-PermuteView._map_neg_dims.<locals>.<listcomp>r^   )r6  r  r^   r  r_   r    r   zPermuteView._map_neg_dimsc                    sD   t | | jt tt| jksJ | j   fdd| jD S )Nc                    r   r^   r^   ro   r  r^   r_   rt     r   z(PermuteView.get_size.<locals>.<listcomp>)r   r  r  rx   rw   r  r   r   r^   r  r_   r     s   &
zPermuteView.get_sizec                    s^   dd t | jD   fddtt| jD  t ttt| jks'J  fdd}|S )Nc                 S   r   r^   r^   )rp   rq   jr^   r^   r_   r     r   z.PermuteView.make_reindexer.<locals>.<dictcomp>c                    r   r^   r^   ro   invr^   r_   rt     r   z.PermuteView.make_reindexer.<locals>.<listcomp>c                    s    fddD S )Nc                    r   r^   r^   ro   ry   r^   r_   rt      r   z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>r^   ry   r  ry   r_   r{     r   z+PermuteView.make_reindexer.<locals>.reindex)r   r  rx   rw   r   )r   r{   r^   r  r_   r    s
   zPermuteView.make_reindexerN)r   r  r  r	   r   r  rP  r5  r  r   r  r^   r^   r^   r_   r    s   
 

r  c                   @   sB   e Zd ZeddddZedeejdf fddZ	d	d
 Z
dS )SqueezeViewNr  c                   s<  t |rrt|\}}g }g } d ur(t tsJ dd kr& t|jk s(J tt|j|jD ]0\}\}}	 d u rJ|dkrI|	| |	|	 q1| krY|	| |	|	 q1|dksaJ dq1t
|j|j|||j}
t||
S  d u rt|dd | D S |   dksJ t| fddt| D S )Nzexpected integer dim argumentr   r)   zexpected squeezed size to be 1c                 S      g | ]}|d kr|qS r)   r^   r   r^   r^   r_   rt   '  ru   z&SqueezeView.create.<locals>.<listcomp>c                    s   g | ]
\}}| kr|qS r^   r^   rp   rq   r   r  r^   r_   rt   *      )r   r  rH   r  rw   r   r   r}   r   r  r  r   r   r  r  rH  r5  r   )r6  r   r  r  r  r  r  rq   r   r   r  r^   r  r_   r5    s:   



"zSqueezeView.creater   .c                    sX   dd | D }dd t | D t|  dttj dttjdf f fdd}||fS )	Nc                 S   r  r  r^   r   r^   r^   r_   rt   .  ru   z(SqueezeView.squeezer.<locals>.<listcomp>c                 S   s   g | ]
\}}|d kr|qS r  r^   r  r^   r^   r_   rt   /  r  rr   r   .c                    sV   t | t ksJ |  d tdg  }t| D ]\}}|||< qt|S )N r   )rw   rT   r   r}   rJ   )rr   r>  r   r   lengthnot_oner^   r_   r{   2  s
   "
z%SqueezeView.squeezer.<locals>.reindex)r   rw   r	   rT   r   r   )r   r  r{   r^   r  r_   squeezer,  s
   (zSqueezeView.squeezerc                 C   s   t d)Nzuse SqueezeView.create())AssertionError)r   r  r^   r^   r_   rf  ;  r  zSqueezeView.__init__)r   r  r  rP  r5  r	  r   rT   r   r  rf  r^   r^   r^   r_   r    s    %r  c                   @   sZ   e Zd ZU ee ed< edef ed< dd Zdd Z	dd	 Z
e
Zed
d Zdd ZdS )GenericViewr   .r{   c                 C   r   rb   )r{   r   r^   r^   r_   r  D  r   zGenericView.make_reindexerc                 C   sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S      g | ]}t tj|qS r^   )r@   r(   rQ  )rp   r9  r^   r^   r_   rt   H      z+GenericView.reindex_str.<locals>.<listcomp>zlambda , : )rx   rw   r   rI   r{   r   r   rk   )r   	index_old	index_newr^   r^   r_   reindex_strG  s
   zGenericView.reindex_strc                 C   s$   |  | jd| j d|   gS )Nsize=zreindex=)r   r  r   r  r   r^   r^   r_   r%  N  s   zGenericView.__str__c                 C   s   | |t ||S rb   )rI   )r6  r   r  r{   r^   r^   r_   r5  U     zGenericView.createc                 C   r   rb   r  r   r^   r^   r_   r   Y  r   zGenericView.get_sizeN)r   r  r  r	   r   r  r   r   r  r  r%  rO  rP  r5  r   r^   r^   r^   r_   r  ?  s   
 
r  c                   @   sH   e Zd Zedd Zedd Zedd Zedd Zed	d
 Z	dS )rH  c                 C   s<   t | } t |}tjjjj}|t | dr| | } | S r   )rT   r  rD   r   r   	shape_envevaluate_exprLt)r   r   r  r^   r^   r_   handle_negative_index_  s   

zView.handle_negative_indexc           	         s   t |ttfs	J | | |\ }tjj |r|S d}t	t
 dks/t	t
|dkr1d}d|v rC fdd}| |t||S t|sI|rm|rTt|sTt|}t|\}}t|j|j|t||j}t||S |  |}| |t||S )NFr   Tc                    s   t dgt  S r   )rJ   rw   ry   r  r^   r_   fake_reindexz  r   z!View.create.<locals>.fake_reindex)rH   rJ   rI   resolve_negative_sizer   rD   r   r   statically_known_list_equalsrw   r!   r  ExternKernelrealize_input as_contiguous_storage_and_layoutr  r   r   r   r   r  r  rI  )	r6  r   r  unbacked_symbols_in_sizesr  r  r  r  r{   r^   r  r_   r5  h  s2   

zView.createc                 C   s   dd |D }dd | D } t |}tt|D ]}|| dkr4td||< tt| t|||<  nqtjj	
t| t| | |fS )Nc                 S   r  r^   rD   r   r   r.  r  r^   r^   r_   rt         z.View.resolve_negative_size.<locals>.<listcomp>c                 S   r  r^   r  r  r^   r^   r_   rt     r  r  r)   )rI   rx   rw   rT   r   r%   rA   rD   r   r   guard_equals)r  r  rq   r^   r^   r_   r    s   zView.resolve_negative_sizec              	   C   sX   z	|  ||}W |S  ttfy+   t|g}|  ||}|  ||}t||}Y |S w rb   )_dynamic_reshape_indexerr  
IndexErrorrA   r   )r6  r  r  r{   flatr   r   r^   r^   r_   rI    s   
zView.dynamic_reshape_indexerc                    sR  t jjj}dd tt|D  tt |}t| }g |r|r| }| \}}|dkr?	t
d |	||f n|dkrI|	| n||||kr_	| t jj|| n||||k r||||k r| \}}	|| | }||	 }||||k so	| t jj|| nL||||krt
d}
|}	t||
| |
| }
||||kr| }	t||
| |
| }
|| }||||kst jj|| nt|r|s!|r| }t jj|d 	t
d |s|r| \}}t jj|d |s   tt| ks J  fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S   r  r^   )r@   r(   VIEWro   r^   r^   r_   rt     r  z1View._dynamic_reshape_indexer.<locals>.<listcomp>r)   r   c                    sH   t | t ksJ t | t ftt|  t fddD S )Nc                 3   s    | ]}t | V  qd S rb   rB   r  replacementsr^   r_   r    rc  zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)rw   rK   r}   rJ   ry   r_  	view_exprr  r_   r{     s   $z.View._dynamic_reshape_indexer.<locals>.reindex)rD   r   r   r   rx   rw   rI   r}   r4  r  rT   r   r  r'   r  reverse)r  r  r   	stack_new	stack_oldsize_oldvarsize_newvar2	size_new2divisormodulusr{   r^   r  r_   r    sj   




 zView._dynamic_reshape_indexerN)
r   r  r  r	  r  rP  r5  r  rI  r  r^   r^   r^   r_   rH  ]  s    

)

rH  c                       s   e Zd ZU dZded<  fddZdd ZeZdd	 Zd
d Z	dd Z
edd Zdd Zdd Zdd Zdd Zdd Zdd Zdeej fddZd"d d!Z  ZS )#r  z*Pretend our storage has a different layoutLayoutr  c                    s*   t    t| jtr| j | _d S d S rb   )r(  r   rH   r  r  r  r   r)  r^   r_   r     s   
zReinterpretView.__post_init__c                 C   s   |  | j| jgS rb   )r   r  r  r   r^   r^   r_   r%    s
   zReinterpretView.__str__c                 C   r  rb   r  r   r^   r^   r_   r   	  r  zReinterpretView.get_namec                 C   r  rb   r  r   r   r^   r^   r_   r   		  r  zReinterpretView.get_devicec                 C   rZ  rb   r^   r   r^   r^   r_   r.  	  r3  zReinterpretView.get_origin_nodec                 C   r  rb   )r  r   r   r^   r^   r_   r   	  r  zReinterpretView.dtypec                 C      t | jjS rb   rI   r  r   r   r^   r^   r_   r   	  r   zReinterpretView.get_sizec                 C   r  rb   rI   r  r   r   r^   r^   r_   r   	  r   zReinterpretView.get_stridec                    r   )Nc                    s    j  }t  || S rb   )r  r  rC   loadr   rr   r^  r   r^   r_   r`  	  s   
z+ReinterpretView.make_loader.<locals>.loaderr^   r  r^   r   r_   r  	  r   zReinterpretView.make_loaderc                 C   r  rb   r  r  r   r^   r^   r_   r   	  r  zReinterpretView.make_indexerc                 C   r   rb   r  r   r^   r^   r_   r   #	  r   zReinterpretView.get_layoutc                 C   rZ  rb   r^   r   r^   r^   r_   r  &	  r3  zReinterpretView.freeze_layoutr   c                 C   s$   t | jjt | jjB t | jjB S rb   )r!   r  r   r   r  r   r^   r^   r_   r  )	  s   


z(ReinterpretView.get_unbacked_symbol_usesNc                 C   s$   t jj| j| jj| jj| jj|S rb   )	rD   r   wrapper_codecodegen_reinterpret_viewr  r  r   r   r  r   r^   r^   r_   r   0	  s   z!ReinterpretView.codegen_referencerb   )r   r  r  __doc__r  r   r%  rO  r   r   r.  r  r   r   r   r  r  r   r  r   rT   r  r  r   rR  r^   r^   r)  r_   r    s&   
 
r  c                   @   s&   e Zd Zedd ZedddZdS )		SliceViewc                    sv   t jj| | tdd ||fD rdd  nfdd  fdd}||dd}|||}||fS )	zz
        Normalize start and end such that both are in the range
        [0, x.get_size()[dim]] and start <= end.
        c                 s   r  rb   r  r  r^   r^   r_   r  G	  r  z0SliceView.normalize_start_end.<locals>.<genexpr>c                 S   s   t t | ||S rb   )rT   MinMaxr   lowerupperr^   r^   r_   clampI	  rj   z,SliceView.normalize_start_end.<locals>.clampc                    s      | ||S rb   )evaluate_minevaluate_maxr   r   r^   r_   r#  N	  rj   c                    s$   | d u r|S  | }  | ||S rb   )r  )r!  r!  r"  rE  )r#  r6  dim_sizer^   r_   
clamp_wrapQ	  s   z1SliceView.normalize_start_end.<locals>.clamp_wrapr   )rD   r   r   r   rq  )r6  r   r  startendr(  r^   )r#  r6  r'  r   r_   normalize_start_end>	  s   
zSliceView.normalize_start_endr)   Tc                    s  t dksJ zdkr|dkrdkr|W S W n	 ty%   Y nw tjj}t| |r<| | |\}t	| d   < t
|rwt|\}}	t|	j}
|
   |
 < t|	j|	j|
|	j|	j    }t||S  fdd}t||dS )Nr   l    r)   c                    sD   t | t ksJ d|  d t| } |     |  < | S )Nzwrong ndim r  )rw   rI   ry   r  r  r)  stepr^   r_   r{   ~	  s   $z!SliceView.create.<locals>.reindex)r   r{   )rT   r  	TypeErrorrD   r   r   rI   r   r+  r&   r   r  r   r  r   r   r  r  r  )r6  r   r  r)  r*  r-  r#  r   r  r  r  r  r{   r^   r,  r_   r5  [	  s8   


zSliceView.createN)r)   T)r   r  r  rP  r+  r5  r^   r^   r^   r_   r  =	  s
    
r  c                   @   sZ   e Zd ZU ejed< ejed< dd Zdd Zdd Z	d	d
 Z
dd Zdd Zdd ZdS )BaseConstantr   r   c                 C   r0  Nr^   r^   r   r^   r^   r_   r   	  r3  zBaseConstant.get_sizec                 C   r   rb   r,  r   r^   r^   r_   r   	  r   zBaseConstant.get_devicec                 C   rZ  rb   r^   r   r^   r^   r_   r.  	  r3  zBaseConstant.get_origin_nodec                 C   rZ  rb   r^   r  r^   r^   r_   r  	  r3  zBaseConstant.mark_reusec                 C   r0  r1  r^   r   r^   r^   r_   r   	  r3  z#BaseConstant.has_exceeded_max_readsc                 C   r0  r0  r^   r   r^   r^   r_   r   	  r3  zBaseConstant.get_readsc                 C   r0  r1  r^   r   r^   r^   r_   r2  	  r3  zBaseConstant.is_externN)r   r  r  rM   r   r  r   r   r   r.  r  r   r   r2  r^   r^   r^   r_   r/  	  s   
 

r/  c                   @   sB   e Zd ZU eed< ejed< ejed< dd Zdd Z	dd	 Z
d
S )Constantr   r   r   c                    r   )Nc                       t  j jS rb   )rC   rU  r   r   ry   r   r^   r_   r`  	  r  z$Constant.make_loader.<locals>.loaderr^   r  r^   r   r_   r  	     zConstant.make_loaderc                 C   rZ  rb   r^   r   r^   r^   r_   r   	  r3  zConstant.realizec                 C      t | j| j|S rb   )r1  r   r   rM  r^   r^   r_   rN  	  r  zConstant.constant_to_deviceN)r   r  r  r   r  rM   r   r   r  r   rN  r^   r^   r^   r_   r1  	  s   
 

r1  c                   @   s:   e Zd ZU eed< ejed< ejed< dd Zdd Z	dS )	IndexingConstantrr   r   r   c                    r   )Nc                    r2  rb   )rC   r  rr   r   ry   r   r^   r_   r`  	  r  z,IndexingConstant.make_loader.<locals>.loaderr^   r  r^   r   r_   r  	  r3  zIndexingConstant.make_loaderc                 C   r4  rb   )r5  rr   r   rM  r^   r^   r_   rN  	  r  z#IndexingConstant.constant_to_deviceN)
r   r  r  r   r  rM   r   r   r  rN  r^   r^   r^   r_   r5  	  s   
 

r5  c                 C   s    t dd t| t||D S )Nc                 s   s&    | ]\}}}|d kp||kV  qdS r  r^   )rp   leftrightr   r^   r^   r_   r  	  s
    
z2is_contiguous_strides_for_shape.<locals>.<genexpr>)r  r}   r   r   )r   shaper^   r^   r_   is_contiguous_strides_for_shape	  s
   r9  c                 C   s
   d| j  S )z
    CUDA max memory transaction size is 128 bytes for a warp.
    We pick `128 // dtype.itemsize` as alighment so GPU can do coalesced
    memory access.
    r  )itemsizer   r^   r^   r_   get_align_for_dtype	  s   
r;  c                   @   s   e Zd Zedfdejdejdee de	e
eeef   def
ddZed	d
 Zdd ZeZdd Zedd Zdd Zdd Zdd Zedd Zdd Zdd Zdd Zdd  Zd!efd"d#Zd!ejfd$d%Z d&S )'r  r   r   r   r   r   r  c                 C   sd   |d u st |t |ksJ d| d| || _|| _tdd |D s'J || _|| _|| _d S )Nr  	, stride=c                 s   s    | ]
}t |ttfV  qd S rb   )rH   r   r  r   r^   r^   r_   r  	  s    z"Layout.__init__.<locals>.<genexpr>)rw   r   r   r  r   _strider  r   r   r   r   r   r  r^   r^   r_   rf  	  s   
zLayout.__init__c                 C   r   rb   )r=  r   r^   r^   r_   r   	  s   zLayout.stridec                 C   sP   d}| j dkrd| j  }t| j d| jj d| j d| j d| j | dS )	Nr   r   z	, offset=z('z', z, size=r<  ))r  rY   r   r   r   r   r   )r   r  r^   r^   r_   r%  	  s   
zLayout.__str__c                 C   s   t | j| jS rb   )r9  r   r   r   r^   r^   r_   r  	  rD  zLayout.is_contiguousc                 C   sV   t | }|dvs| d dkrdS t|t| | D ]\}}}|dkr(||kr( dS qdS )N)r      r)   FT)rw   r}   r   )r8  r  ndimr6  r7  r   r^   r^   r_   is_channels_last_contiguous	  s   
z"Layout.is_channels_last_contiguousc                 C   sB   t | jtt| j| jD ]\}}}|dkr||kr dS qdS )Nr)   FT)r}   r   reversedr   r   r   )r   r6  r7  r   r^   r^   r_   is_transposed	
  s   zLayout.is_transposedc                    s   t jt  ksJ dd tjD }fdd|D } fdd|D  dd }|  dgt   }tt  D ]}tjj|| | | < q<tt  d D ]}|| ||d  krc d	S qTd
S )Nc                 S   s*   g | ]\}}t jjj|d ddkr|qS )r   r  r)   rD   r   r   r   )rp   rq   r  r^   r^   r_   rt   
  s
    z,Layout.is_stride_ordered.<locals>.<listcomp>c                    r  r^   r  ro   r   r^   r_   rt   
  r   c                    r   r^   r^   ro   r   r^   r_   rt   
  r   c                    s   t |   fdd| D S )Nc                       g | ]}  |qS r^   ry   )rp   element
sorted_arrr^   r_   rt   "
  r   zDLayout.is_stride_ordered.<locals>.sorted_indices.<locals>.<listcomp>)r  )arrr^   rH  r_   sorted_indices 
  s   z0Layout.is_stride_ordered.<locals>.sorted_indicesr  r)   FT)	rw   r   r   r   rx   rD   r   r   r   )r   r   non_1_indicesr   rK  stride_orderedrq   r^   )r   r   r_   r  
  s    zLayout.is_stride_orderedc                 C   s:   dgt ttdt| jd  }t|g| }| |S r  )rI   rC  rx   rw   r   r  r   r   r^   r^   r_   is_channels_last_stride_ordered1
  s   "
z&Layout.is_channels_last_stride_orderedc                 C   s,  t |}t| dkr| S tjst|| r| S t }t|dr)|j	
ddr)| S tdd t| |D s8| S t| }t|}dd tt| D }d	||d < d
}d}	t|d	d d	dD ])\}
}||
d	  }|| ||  }||kr|| dkrt||| }d}	|||< q_|	s| S t jd	7  _|S )z
        The padding does not change stride order but makes sure all strides larger
        than the threshold are multiple of align.
        r   metadislike_paddingFc                 s   s     | ]}t |ttjfV  qd S rb   r  r   r^   r^   r_   r  Q
  ru  z&Layout._pad_strides.<locals>.<genexpr>c                 S   r   r   r^   r   r^   r^   r_   rt   Z
  r   z'Layout._pad_strides.<locals>.<listcomp>r)   r:  N)r)  T)r;  rw   r*   pad_channels_lastr  rB  rD   get_current_nodehasattrrP  getr  r  chainr   r   rx   r   r6   r   num_comprehensive_padding)
in_stridesr   r   aligncurrent_fx_noder  r   new_stridesalign_stride_thresholdpaddedrankr   prev_idxr   r^   r^   r_   _pad_strides7
  sB   


zLayout._pad_stridesc                 C   s6   t | tsJ | jd usJ | | j| j| j| _d S rb   )rH   r   r=  r`  r   r   r   r^   r^   r_   r  
  s   zLayout.pad_stridesc                 C   s   t jot| tS rb   )r*   comprehensive_paddingrH   r   r   r^   r^   r_   r  
  r  zLayout.should_pad_stridesc                 C   s8   t | tr| S |  r|   t| j| j| j| j| jS rb   )	rH   r  r  r  r   r   r   r   r  r   r^   r^   r_   as_fixed
  s   
zLayout.as_fixedc                 C   s(   t jsJ dt| j d|   S )Nzconvert z to FixedLayout first)r   r<  rY   r   rb  r  r   r^   r^   r_   r  
  s
   zLayout.make_indexerr   c                 C   s<   | j |j ko| j|jko| j|jko| j|jko| j|jkS rb   r   r   r   r   r  )r   otherr^   r^   r_   __eq__
  s   



zLayout.__eq__c                 C   s   t | j| j| jS rb   )r   r   r   r  r   r^   r^   r_   storage_size
  r   zLayout.storage_sizeN)!r   r  r  r   rM   r   r   r	   r   r
   r   r   r  rf  r  r   r%  rO  r  r	  rB  rD  r  rO  r`  r  r  rb  r  r  re  rT   rf  r^   r^   r^   r_   r  	  s>    

	


K	r  c                       st   e Zd ZdZdedfdejdejdee	e
 e	e f deeee
ef   dee
ef f
 fd	d
Zdd Z  ZS )r  z A Tensor layout we cannot changeNr   r   r   r   r   r  c                    s*   |d u r	t |}t ||||| d S rb   )r   r   r(  rf  r>  r)  r^   r_   rf  
  s   
zFixedLayout.__init__c                    r   )z1A closure containing math to read a given elementc                    sf   t | t  jksJ t | t  jksJ  j}t|  j jD ]\}}}|dkr0|||  }q!|S r  )rw   r   r   r  r}   )rr   r}  r   r   szr   r^   r_   r^  
  s   z)FixedLayout.make_indexer.<locals>.indexerr^   r  r^   r   r_   r  
  s   	zFixedLayout.make_indexer)r   r  r  r  r   rM   r   r   r   r	   r   r  r
   r   rf  r  rR  r^   r^   r)  r_   r  
  s     
r  c                       s|   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	edd Z
dddZdd Zdd Zd fdd	Z  ZS )r   z(A Tensor layout we are allowed to changeFc                 C   sP   t | dkrg S tdg}t| dd  D ]}|||d   qtt|S )Nr   r)   r  )rw   rT   r   rC  r  rI   )sizesreversed_stridesr   r^   r^   r_   r   
  s   z!FlexibleLayout.contiguous_stridesc                 C   sV   t tt| t |ksJ td}dgt| }|D ]}|||< || |  }q|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        r)   N)r   rx   rw   rT   r   )rh  r   next_strider  rq   r^   r^   r_   fill_ordered
  s   
zFlexibleLayout.fill_orderedc                 C   s0   t tt| t |ksJ t|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r   rx   rw   r   r   rk  )rh  r   r   r^   r^   r_   rM  
  s   zFlexibleLayout.stride_orderedc                 C   sP   |t jkrt| tS |t jkrt| tS |t jkr t| S t	
d| t)aq  
        Create a stride based on a memory format.

        Memory format is translasted into a stride order,
        so channels_last is the same as:
            FlexibleLayout.stride_ordered(sizes, [3, 0, 2, 1])

        This interface does not support memory_format `torch.preserve_format`
        which should be used to deduce a format from another source
        z>stride_ordered_for_memory_format, unsuppored memory_format: %s)rM   channels_lastr   rM  NHWC_STRIDE_ORDERchannels_last_3dNHWDC_STRIDE_ORDERcontiguous_formatr   r  r  r   )rh  memory_formatr^   r^   r_    stride_ordered_for_memory_format
  s   



z/FlexibleLayout.stride_ordered_for_memory_formatc                 C   sD   t | t |ks
J dd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S   r  r^   rE  r  r^   r^   r_   rt     r  z/FlexibleLayout.same_ordered.<locals>.<listcomp>r  )rw   r  rx   __getitem__r   rk  )rh  r   r   r^   r^   r_   same_ordered  s   zFlexibleLayout.same_orderedc                 C   sD   |  | j|}|  r|r| || j| j}t| j| j| j|| jS rb   )rM  r   r  r`  r   r  r   r  )r   r   r  r  r^   r^   r_   as_stride_order"  s   zFlexibleLayout.as_stride_orderc                 C   @   |  | j|}|  r| || j| j}t| j| j| j|| jS rb   )rk  r   r  r`  r   r  r   r  )r   r   r  r^   r^   r_   as_fill_order/     zFlexibleLayout.as_fill_orderc                 C   rv  rb   )rt  r   r  r`  r   r  r   r  )r   r   r  r^   r^   r_   as_same_order;  rx  zFlexibleLayout.as_same_orderNc                    s2   |r	t ||}nt |}t |||| d S rb   )r   rk  r   r(  rf  )r   r   r   r   r  r  r)  r^   r_   rf  G  s   
zFlexibleLayout.__init__Frb   )r   r  r  r  r<  r	  r   rk  rM  rr  rt  ru  rw  ry  rf  rR  r^   r^   r)  r_   r   
  s"    





r   c                       s>   e Zd ZdZdeedf f fddZdd Zdd	 Z  Z	S )
NonOwningLayoutz,Is a view into the storage of another tensorviewrS   c                    s,   |  }t |j|j|j|j || _d S rb   )r   r(  rf  r   r   r   r   r|  )r   r|  r  r)  r^   r_   rf  R  s   
zNonOwningLayout.__init__c                 C      |    S rb   )rb  r  r   r^   r^   r_   r  \  r   zNonOwningLayout.make_indexerc                 C   s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr)   )	ALIGNMENT)	r|  r   r  
compile_fxr~  rD   r   r   statically_known_multiple_of)r   r  r~  r^   r^   r_   maybe_guard_aligned_  s
   z#NonOwningLayout.maybe_guard_aligned)
r   r  r  r  r   r  rf  r  r  rR  r^   r^   r)  r_   r{  O  s
    
r{  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )
NoneLayoutc                 C   s   || _ dg| _dg| _d S r   )r   r   r   rM  r^   r^   r_   rf  q  s   zNoneLayout.__init__c                 C   r0  r   r^   r   r^   r^   r_   rf  v  r3  zNoneLayout.storage_sizec                 C      | S rb   r^   r   r^   r^   r_   rb  y  r3  zNoneLayout.as_fixedN)r   r  r  rf  rf  rb  r^   r^   r^   r_   r  h  s    	r  c                       sr   e Zd Zdef fddZejjdd Zdej	fddZ
dd
dZdd ZedddZdd Zdd Z  ZS )MutationLayoutSHOULDREMOVEr  c                    s@   t  | | | d  || _|   }tj	
| d S rb   )r(  rf  r   r   r   r  
get_bufferr   rD   r   mark_buffer_mutated)r   r  rh   r)  r^   r_   rf  ~  s   z#MutationLayoutSHOULDREMOVE.__init__c                 C   
   |   jS rb   )real_layoutr   r   r^   r^   r_   r        
z!MutationLayoutSHOULDREMOVE.strider   c                 C   r}  rb   )r  rf  r   r^   r^   r_   rf    r   z'MutationLayoutSHOULDREMOVE.storage_sizer  c                    s,    fdd  | j }t|tsJ d|S )Nc                    sB   t | tr
 | jS t | tr |  S t | tr | jS | S rb   )rH   r  r  r  r  
MutableBoxr  )r  unwrap_viewsr^   r_   r    s   




z;MutationLayoutSHOULDREMOVE.get_buffer.<locals>.unwrap_viewsz1MutationLayoutSHOULDREMOVE must refer to a buffer)r  rH   r  )r   r}  r^   r  r_   r    s   
	z%MutationLayoutSHOULDREMOVE.get_bufferc                 C   r  rb   )r  r  r   r^   r^   r_   r    r  z&MutationLayoutSHOULDREMOVE.real_layoutFc              	   C   s   |   tj|  t|tr|j}|  |s6t	j
| | | dd t| | D dj}|   t|jjtsCJ t||j_|jS )Nc                 S       g | ]\}}t jj||qS r^   rD   r   r   r  rp   r  r  r^   r^   r_   rt         z;MutationLayoutSHOULDREMOVE.realize_into.<locals>.<listcomp>r'  )r   rD   r   r  r   rH   rS   r  r  rX  r5  r   r   r  r}   r   r  r   r  )r6  srcdstunsafe_aliasr^   r^   r_   realize_into  s(   

z'MutationLayoutSHOULDREMOVE.realize_intoc                 C   r  rb   r^   r   r^   r^   r_   rb    r3  z#MutationLayoutSHOULDREMOVE.as_fixedc                 C   r  rb   )r  r  r   r^   r^   r_   r    r  z'MutationLayoutSHOULDREMOVE.make_indexer)r   r  rz  )r   r  r  r   rf  r  r   getterrT   r   rf  r  r  rP  r  rb  r  rR  r^   r^   r)  r_   r  }  s    

"r  c                       s@  e Zd ZU ee ed< eed<  fddZdd Zdefdd	Z	d
d Z
dd Zedd Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZdBdd Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* ZdCd,d-Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Z de!e"j# fd8d9Z$de!e"j# fd:d;Z%d<d= Z&d>d? Z'd@dA Z(  Z)S )Dr  rh   r  c                    r&  rb   r'  r   r)  r^   r_   r     r+  zBuffer.__post_init__c                 C   r  rb   r  r   r^   r^   r_   r    r  zBuffer.make_indexerr   c                 C   s   | j sJ | | j S rb   rg   r   r^   r^   r_   r     s   zBuffer.get_namec                 C   r  rb   r  r   r^   r^   r_   r     r  zBuffer.get_devicec                 C   r   rb   r-  r   r^   r^   r_   r.    r   zBuffer.get_origin_nodec                 C   s   t | jdd S )Nr   )rc   r  r   r^   r^   r_   r        zBuffer.dtypec                 C   r  rb   r  r   r^   r^   r_   r     r   zBuffer.get_sizec                 C   r  rb   r  r   r^   r^   r_   r     r   zBuffer.get_stridec                 C   r  rb   )r  r  r   r^   r^   r_   
get_offset  r  zBuffer.get_offsetc                 C   r   rb   r  r   r^   r^   r_   r     r   zBuffer.get_layoutc                 C   r  rb   )r   r   r^   r^   r_   r     r  zBuffer.get_storage_numelc                 C   r0  r1  r^   r   r^   r^   r_   r2    r3  zBuffer.is_externc                 C   s$   t | jttfs| j | _d S d S rb   )rH   r  MultiOutputLayoutr{  rb  r   r^   r^   r_   r    s   zBuffer.freeze_layoutFc                 C   s&   t | jtsJ | jj||d| _d S )Nr  )rH   r  r   ru  )r   r   r  r^   r^   r_   r    s   z&Buffer.freeze_layout_with_stride_orderc                 C   "   t | jtsJ | j|| _d S rb   )rH   r  r   rw  rN  r^   r^   r_   freeze_layout_with_fill_order     z$Buffer.freeze_layout_with_fill_orderc                 C   r  rb   )rH   r  r   ry  )r   r   r^   r^   r_   freeze_layout_with_same_order
  r  z$Buffer.freeze_layout_with_same_orderc                 C   r   r   r   r   r^   r^   r_   r     r   zBuffer.is_zero_elementsc                    s(      rtt  dS  fdd}|S )Nr   c                    s    j  }t j|| S rb   )r  r  rC   r  rh   r  r   r^   r_   r`    s   
z"Buffer.make_loader.<locals>.loader)r   r   rW  r   r  r^   r   r_   r    s   zBuffer.make_loaderc                 C   r0  r1  r^   r   r^   r^   r_   is_no_op  r3  zBuffer.is_no_opNc                 C   r  rb   r   r   r^   r^   r_   r     r  zBuffer.codegen_referencec                 C   rZ  rb   r^   r   r^   r^   r_   r  "  r3  zBuffer.decide_layoutc                 C      t | jtr| jj gS dS r0  )rH   r  r{  r|  r   r   r^   r^   r_   get_inputs_that_alias_output%     z#Buffer.get_inputs_that_alias_outputc                 C   r  r0  )rH   r  r  r  r   r   r^   r^   r_   get_mutation_names*  r  zBuffer.get_mutation_namesc                 C   sD   t tdd t|  |  W  d    S 1 sw   Y  d S r;  )r   r?  r   r/   r  r   r   r^   r^   r_   r  /  s   $zBuffer.get_read_writesc                 C   r  rb   )r  rK  r   r^   r^   r_   r   6  r  zBuffer.get_readsc                 C      t  S rb   r   r   r^   r^   r_   get_unbacked_symbol_defs9  r   zBuffer.get_unbacked_symbol_defsc                 C   r  )a  
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.
        r  r   r^   r^   r_   r  <  s   zBuffer.get_unbacked_symbol_usesc                 C   rZ  rb   r^   r   r^   r^   r_   r   M  r3  zBuffer.realizec                 C   r0  )z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   r^   r   r^   r^   r_   get_workspace_sizeP  s   zBuffer.get_workspace_sizec                 C   r0  r1  r^   r   r^   r^   r_   should_allocateW  rr  zBuffer.should_allocaterz  rb   )*r   r  r  r
   rk   r  r  r   r  r   r   r.  r  r   r   r   r  r   r   r2  r  r  r  r  r   r  r  r   r  r  r  r  r   r   rT   r  r  r  r   r  r  rR  r^   r^   r)  r_   r    sD   
 


r  c                   @   r  )InputBufferNr  r^   r^   r^   r_   r  \  s    r  c                   @   s0   e Zd ZU dZeej ed< dd Zdd Z	dS )re  Nrd  c                    r   )Nc                    s*    j  }ttj   j|| S rb   )	r  r  rC   r  rD   r   constant_namer   rd  r  r   r^   r_   r`  d  s
   
z*ConstantBuffer.make_loader.<locals>.loaderr^   r  r^   r   r_   r  c  s   zConstantBuffer.make_loaderc                 C   s   t tj|  || jS rb   )re  rD   r   r  r   r  rM  r^   r^   r_   rN  m  s   z!ConstantBuffer.constant_to_device)
r   r  r  rd  r
   rM   r   r  r  rN  r^   r^   r^   r_   re  `  s   
 
re  c                   @   s*   e Zd Zdeej fddZdddZdS )NoneAsConstantBufferr   c                 C   r  rb   r  r   r^   r^   r_   r  t  r   z-NoneAsConstantBuffer.get_unbacked_symbol_usesNc                 C   s
   t jjjS rb   )rD   r   r  none_strr   r^   r^   r_   r   w  r  z&NoneAsConstantBuffer.codegen_referencerb   )r   r  r  r   rT   r  r  r   r^   r^   r^   r_   r  s  s    r  c                       s:   e Zd Z fddZdeej fddZd	ddZ  Z	S )
ShapeAsConstantBufferc                    s   t    || _d S rb   )r(  rf  r8  )r   r8  r)  r^   r_   rf  |  r+  zShapeAsConstantBuffer.__init__r   c                 C   
   t | jS rb   )r!   r8  r   r^   r^   r_   r    r  z.ShapeAsConstantBuffer.get_unbacked_symbol_usesNc                 C   s   t jjt jj| jS rb   )rD   r   r  expr_printerr   r.  r8  r   r^   r^   r_   r     r   z'ShapeAsConstantBuffer.codegen_referencerb   )
r   r  r  rf  r   rT   r  r  r   rR  r^   r^   r)  r_   r  {  s    r  c                       s   e Zd ZU eed< dd Zedd Zdd Zde	e
j fd	d
Z fddZdd Zdd Zdd Zedd Z	d%deeeeef ee f  fddZe	d%ddZdd Zdd Zdd  Zd!d" Zd#d$ Z  ZS )&r  r  c                 C   s(   | j dur| j S t| jdr| jj S dS )z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nrh   )rh   rT  r  r   r^   r^   r_   get_computed_buffer_name  s
   
z'ComputedBuffer.get_computed_buffer_namec                 C   s   t |  jS rb   )rw   r  rK  r   r^   r^   r_   	num_reads  r  zComputedBuffer.num_readsc                 C   sz   t tdd, | j r"t|  | j | j W  d    S t|  | j	 W  d    S 1 s6w   Y  d S r;  )
r   r?  r   r  rI  r/   get_store_functionr/  rJ  r   r   r^   r^   r_   r    s   
$zComputedBuffer.get_read_writesr   c                 C   s.   t |  t |  B t |  B | j B S rb   )r!   r   r   r  r  r  r   r^   r^   r_   r    s   


z'ComputedBuffer.get_unbacked_symbol_usesc                    s:   t | jdr| jtjjvr|  dkr| j S t  S )Nr  r   )	rT  r  rh   rD   r   mutated_buffersr  r  r(  r   r)  r^   r_   r    s   


zComputedBuffer.make_loaderc                 C   sR   | j   }t| jttfrt| jj| j	|S t| jt
s J t| jj| j	|S rb   )r  rb  r  rH   r  r  rx  r   r  rh   rX  ra  r  r^   r^   r_   r    s
   z!ComputedBuffer.get_store_functionc                    s   t | jtr^t| j | j \\}}|  j	}dd |D }t
dd |D s-J fdd|D }|r^t | jtrF| j| n|  fdd|D }ddlm} |||  S d	S )
al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 S   s0   g | ]}|j tjj v rtjj|j  nd qS rb   )rh   rD   r   r  r  r  r^   r^   r_   rt     s    z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                 s   s"    | ]}t |tjtjfV  qd S rb   )rH   r+   StarDep	MemoryDepr  r^   r^   r_   r    s
    
z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                    s.   g | ]}t |tjrt|jd d  D qS )c                 S   s    i | ]}|d kr|t d qS r   r*  rp   vr^   r^   r_   r     r   z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)rH   r+   r  rB   rr   r  )r  r^   r_   rt     s    
c                    s   g | ]
}t jj| qS r^   rD   r   r   r  rp   expr)r  r^   r_   rt     s    r)   pick_loop_orderN)rH   r  r   r+   r  r  r/  rJ  r  rK  r  rx  r{   	schedulerr  r   )r   
index_varsr   rK  
reads_bufsstride_lengthsr  r^   )r  r  r_   get_fill_order  s0   


zComputedBuffer.get_fill_orderc                 C   s6   t | jtr|  }|r| | d S |   d S d S rb   )rH   r  r   r  r  r  rN  r^   r^   r_   r    s   zComputedBuffer.decide_layoutc           
      C   s   t j| j | j dd\}}ttd|   t	| 
 |  r$|n|d d |}W d    n1 s6w   Y  g }g }g }g }| D ]+\}}	||d v r`|rUJ || ||	 qG||d v shJ || ||	 qG||f|||ffS )NqrF   rd  r)   r   )r+   r  r  r/  rJ  r   r?  re  r   LoopBodyr  rI  itemsr  )
r   re   
var_rangesr?  r  reduce_vars
index_sizereduce_sizer  r   r^   r^   r_   get_default_sizes_body  s.   


z%ComputedBuffer.get_default_sizes_bodyNextra_indexing_constraintsc                    sX    \\}}}\}}g |j  |durZt|tr!t|dks#J |\}}t|ts.J t|ts5J tdd |D s@J |j	}	|	|ksMJ |	|f fdd|D } |7  dd |j
 D }
g |j
 |j  fdd	}|| }||||\}}}||||\}}}tj||d
d\\}}}t|||||g|}||f|fS )a  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders

        Optional argument extra_indexing_constraints can be used to append additional
        indexing expressions to existing ones derived from buffer's body. This can be useful
        to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...)
        on CPU by preventing indexing simplifications and obtaining index/reduce ranges for
        the scheduler node compatible with other nodes.
        Nr   c                 s       | ]}t |tV  qd S rb   )rH   r   )rp   fr^   r^   r_   r  G  rc  z6ComputedBuffer.simplify_and_reorder.<locals>.<genexpr>c                    s   g | ]}| vr|qS r^   r^   r  )index_formulasr^   r_   rt   O      z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>c                 S   s,   g | ]}|t jj v rt jj| nd qS rb   )rD   r   r  r  )rp   
reads_namer^   r^   r_   rt   T  s    c                    sZ    | ||\}}}|| } tjj| |t | |\}}}|| } t||}|||fS rb   )_apply_loop_reorderingrD   r   r   _simplify_loopsr,   r   )x_varssupport_varsrh  reindex0r   r   pruner{   r  memory_addrsr   r^   r_   simplify_and_reorder_  s   




zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderzrF   )r  indexing_exprsrL   rH   rJ   rw   rK   rI   r  r  reads_name2exprr  writes_name2exprr+   index_vars_no_squeezer  )r   r  r  r  r?  r  r  extra_indexing_rangesextra_indexing_exprexpected_var_rangesr  r  r  iter_rangesiter_reindexr   reduce_rangesreduce_reindex	iter_varsr  r^   r  r_   r  %  s`   



z#ComputedBuffer.simplify_and_reorderc              
      s   ddl m} |du rg }z* fdd|D }t|t|kr)t|d t ks+J tt|||}W n  tyV   tjrLt	dt
t | ttt}Y nw fdd|D t|t|fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r)   r  Nc                    s   g | ]}t jj| qS r^   r  r  )r  r  r^   r_   rt         z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                    r   r^   r^   ro   )rh  r^   r_   rt     r   )r  r  rw   rI   rC  	Exceptionr*   r  r  warningrK   r}   rx   r   r   )r  r  rh  r  priority_idxr  r  r   r^   )r  rh  r  r_   r    s,   
z%ComputedBuffer._apply_loop_reorderingc                 C   r  rb   )r  rJ  r   r^   r^   r_   rJ    r  z!ComputedBuffer.get_reduction_sizec                 C   r  rb   )r  rI  r   r^   r^   r_   rI    r  z!ComputedBuffer.get_reduction_typec                 C   r  rb   )r  r   r   r^   r^   r_   r    r  zComputedBuffer.is_no_opc                 C   r0  NTr^   r   r^   r^   r_   r    r3  zComputedBuffer.should_allocatec                 C   r  )rc  )r  rN  rM  r^   r^   r_   rN       z!ComputedBuffer.constant_to_devicerb   ) r   r  r  r  r  r  r5   r  r  r   rT   r  r  r  r  r  r  r  r
   r   r   r   r	   r  r	  r  rJ  rI  r  r  rN  rR  r^   r^   r)  r_   r    s2   
 
/

_#r  c                       sz   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
	ddeeeeef ee f  fddZ  ZS )TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    c                    s4   t  jd |d t|| _|| _tj| | _	d S )N)rh   r  )
r(  rf  InputsKernelunwrap_storageinputsmake_kernel_renderrD   r   register_bufferrh   )r   r  r  r  r)  r^   r_   rf    s   zTemplateBuffer.__init__c                 C   r  rb   )normalized_read_writesr   r^   r^   r_   r    r  zTemplateBuffer.get_read_writesc                    sL   |   | j   fdd}tj||  ddd}dd | jD |_|S )Nc                    s"   t |dksJ t | dS )Nr   fake)rw   rC   r[  r  r^  rh   r^   r_   dummy  r  z4TemplateBuffer.normalized_read_writes.<locals>.dummyr^   T)	normalizec                 S   s   h | ]	}t | qS r^   r+   r  r   r  r^   r^   r_   r     r  z8TemplateBuffer.normalized_read_writes.<locals>.<setcomp>)r   r  r  r+   r/   r   r  rK  )r   r  depsr^   r  r_   r    s   
z%TemplateBuffer.normalized_read_writesc                 C   r0  r  r^   r   r^   r^   r_   rJ    r3  z!TemplateBuffer.get_reduction_sizec                 C   rZ  rb   r^   r   r^   r^   r_   rI    r3  z!TemplateBuffer.get_reduction_typec                 C   r0  r1  r^   r   r^   r^   r_   r    r3  zTemplateBuffer.is_no_opc                 C   r0  r  r^   r   r^   r^   r_   r    r3  zTemplateBuffer.should_allocateNr  c                 C   s   |   dfd fS r0  r  )r   r  r^   r^   r_   r    s
   z#TemplateBuffer.simplify_and_reorderrb   )r   r  r  r  rf  r  r  rJ  rI  r  r  r
   r   r   r   r	   r  rR  r^   r^   r)  r_   r    s    r  c                       s8   e Zd Z		ddeee  f fddZdd Z  ZS )TritonTemplateBufferNmutated_inputsc                    sv   t  ||| || _|| _|dur9tjjjtjjjh}t	j
jj}||v s/J d| d| t| g|R   dS dS )a  
        NOTE:[TritonTemplates with multiple outputs]
        We want the ability for TritonTemplates to output multiple tensors. Triton
        kernels have no notion of outputs and this is done by creating tensors that
        are then mutated by the kernel. Currenlty our STORE_OUTPUT codegen doesn't
        support creating multinode outputs for triton templates.
        We work around this by creating an extra input buffer during the lowering
        and we mark them as mutated inputs.
        Nz$Mutated inputs are only allowed for z	 but got )r(  rf  debug_extrar  rM   rC   higher_orderflex_attentionflex_attention_backwardrD   r   current_noder  mark_node_as_mutating)r   r  r  r  r  r  allowed_setr  r)  r^   r_   rf    s   

zTritonTemplateBuffer.__init__c                 C   s   d| j  d| j d}|S )NzTritonTemplateBuffer(layout=r  r?  )r  r  )r   r   r^   r^   r_   r%    s   zTritonTemplateBuffer.__str__)NN)	r   r  r  r
   r   r   rf  r%  rR  r^   r^   r)  r_   r    s    
 r  c                       s~   e Zd ZdZ fddZdefddZdefddZd	d
 Z	defddZ
dddZdeeeeee f f fddZ  ZS )ChoiceCallera.  
    Represents a possible choice used in autotune_process.py.
    During autotuning, self.benchmark() is first called to get benchmark result,
    and if this choice is selected, self.output_node() is called to get the output_node.

    Children classes: TritonTemplateCaller, CUDATemplateCaller.
    c                    s    t    || _|| _|| _d S rb   )r(  rf  rh   r  input_nodes)r   rh   r  r  r)  r^   r_   rf  !  s   

zChoiceCaller.__init__r   c                G   s   |   }t||d|iS )Nr   )to_callabler3   )r   r   re   algor^   r^   r_   	benchmark'  s   zChoiceCaller.benchmarkc                 C      t rb   r  r   r^   r^   r_   	call_name+  r3  zChoiceCaller.call_namec                 C   r  rb   r  r   r^   r^   r_   r  .  r3  zChoiceCaller.to_callablec                 C   r  rb   r  r   r^   r^   r_   hash_key1  r3  zChoiceCaller.hash_keyrS   c                 C   r  rb   r  r   r^   r^   r_   output_node4  r3  zChoiceCaller.output_nodec                 C   s   i S )zRInformation returned here is logged to the autotune log file when that is enabled.r^   r   r^   r^   r_   	info_dict7  rr  zChoiceCaller.info_dict)r   rS   )r   r  r  r  rf  rV  r  rk   r  r  r  r	  r   r   PrimitiveInfoTyper	   r
  rR  r^   r^   r)  r_   r    s    
*r  c                   @   s   e Zd ZdefddZdS )TritonTemplateCallerBaser   c                 C   r  rb   r  r   r^   r^   r_   get_make_kernel_render=  r3  z/TritonTemplateCallerBase.get_make_kernel_renderN)r   r  r  r   r  r^   r^   r^   r_   r  <  s    r  c                
       s   e Zd ZdZdedee deg ee	e
f f f fddZedee	e
f fdd	Zejd
efddZd
efddZdee	e
f fddZ  ZS )MultiTemplateBufferaG  
    Represents a Buffer with multiple backing implementation choices.

    Choices can be TritonTemplates or ExternKernels. During scheduling if there is a potential
    epilogue we will benchmark each of the choices with the epilogue to determine an implementation.
    Otherwise, the fastest base choice will be chosen.
    r  r  choice_timingsc                    s(   t  j||d d || _d | _|| _d S )N)r  r  r  )r(  rf  _choice_timings_fn_choice_timingsoriginal_inputs)r   r  r  r  r)  r^   r_   rf  J  s   
zMultiTemplateBuffer.__init__r   c                 C   s   | j d u r
|  | _ | j S rb   )r  r  r   r^   r^   r_   r  U  s   

z"MultiTemplateBuffer.choice_timingscallerc                 c   sR    t |tjjjsJ | j|jksJ | j}| | _z	d V  W || _d S || _w rb   )rH   rM   rN   select_algorithmTritonTemplateCallerr  r  r  )r   r  renderr^   r^   r_   swap_as_triton_caller[  s   
z)MultiTemplateBuffer.swap_as_triton_callerc                 C   sJ   t |tjjjs
J | jj|jjksJ | jj|jjksJ | | _	d S rb   )
rH   rM   rN   r  r  r  r   r   r  r  )r   r  r^   r^   r_   finalize_as_triton_callerg  s   z-MultiTemplateBuffer.finalize_as_triton_callerc                 C   s    t | j| jjd}|| j| fS )Nr  )rs  r  rU  )r   
min_choicer^   r^   r_   get_min_choicem  s   z"MultiTemplateBuffer.get_min_choice)r   r  r  r  r  r	   r   r   r   r  rV  rf  r  r  r
  r  r  r  r  r   r  rR  r^   r^   r)  r_   r  A  s    r  c                       s.   e Zd Zdeddf fddZdd Z  ZS )CUDATemplateBufferworkspace_sizetemplateCUDATemplatec                        t  ||| || _|| _d S rb   )r(  rf  r  r  )r   r  r  r  r  r  r)  r^   r_   rf  s  s   
zCUDATemplateBuffer.__init__c                 C   s   | j d ur| j S dS r   )r  r   r^   r^   r_   r    rj   z%CUDATemplateBuffer.get_workspace_size)r   r  r  r  rf  r  rR  r^   r^   r)  r_   r  r  s    r  c                          e Zd Z fddZ  ZS )CppTemplateBufferc                    r  rb   )r(  rf  r  choice)r   r  r  r  r  r"  r)  r^   r_   rf    s   
zCppTemplateBuffer.__init__r   r  r  rf  rR  r^   r^   r)  r_   r!        r!  c                   @   sJ   e Zd ZU ee ed< dd Zdd Zedd Z	e
dd	 Zd
d ZdS )r  r  c                 C   s   t | S rb   r  r  r^   r^   r_   get_read_writes_input  rD  z"InputsKernel.get_read_writes_inputc                    sp   g } j D ]}t|tr| fdd|D  q| | qtjt|t	 
 ht g d t dS )Nc                    rF  r^   )r%  r  r   r^   r_   rt     r   z0InputsKernel.get_read_writes.<locals>.<listcomp>)	op_counts)r  rH   rI   extendr  r%  r+   
ReadWritesr   r  r   collectionsCounter)r   star_depinputr^   r   r_   r    s   

zInputsKernel.get_read_writesc                 C   sz   t |tr|j}t |tr|j}t |trt |tst|}t |tr)| |S t |t	r0|S t |t
tfs;J ||S rb   )rH   rS   r  r  r  r  r  r  unwrap_storage_for_inputTorchBindObjectr  r6  r   r^   r^   r_   r-    s   





z%InputsKernel.unwrap_storage_for_inputc                 C   s@   g }| D ]}t |trdd |D }nt|}|| q|S )Nc                 S   r  r^   )r  r-  ro   r^   r^   r_   rt     r   z/InputsKernel.unwrap_storage.<locals>.<listcomp>)rH   rI   r  r-  r  )r  
inputs_newr   r^   r^   r_   r    s   

zInputsKernel.unwrap_storagec                 C   r0  r  r^   r   r^   r^   r_   r2    r3  zInputsKernel.is_externN)r   r  r  r	   r  r  r%  r  rP  r-  r	  r  r2  r^   r^   r^   r_   r    s   
 


r  c                   @   s   e Zd Zdd ZdS )	NopKernelc                 C   r0  r  r^   r   r^   r^   r_   r    r3  zNopKernel.is_no_opN)r   r  r  r  r^   r^   r^   r_   r1    s    r1  c                   @   s<   e Zd ZdZedd Zedd Zedd Zdd	 Zd
S )ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C   s  |d   }|d  }t|d  }dg}|| g}d|  kr)t|k s,J  J tdt|D ]Z}||  }	|||  t|	t|ksLJ ||  |ksVJ ||   |ks`J tt|D ]}
|
|krw||
 |	|
  ||
< qftjj	
||
 |	|
 ||
< qf|||  q3t|}tt|D ]!}|| }t|r| }t|trt|j|jrt|} nqtdd |D }tjjjd }t|tsJ |du rtdd |D rt|}td t||||dg d}t|}g }tt|D ]M}| || tj|||| || dd	}|j| t|| jt r)|| j! }n|| j}|" rIt#||   j$rIt%|sI||&  qt|dkrWtj'| tj(||_)| *|j|_|S )
Nr   r)   c                 s   r  rb   )r   r  r^   r^   r_   r    r  z&ConcatKernel.create.<locals>.<genexpr>Fc                 s   sB    | ]}d |j v o|j d  jtjdp|j d  jtjdV  qdS )r!  rq  N)rP  r  rM   rl  rn  rp   argr^   r^   r_   r    s    

)r   r   r   r   rh   r  r  )r#  )+r   r   rI   r   rw   rx   r  rD   r   r   r  r   r   r   r   rH   r  r  rB  r   r   r   rq  r  re   r2  r  r  r  r5  r  r  r  r  is_input_bufferr<   rY   r;   r   register_listr  rh   r  )r6  r  r  r   r   r  offsets_startoffsets_endrq   
input_sizer  output_strider   r  any_input_is_storage_and_layoutfx_node_argsconcat_kernelkernelbuffer_namesinput_bufferinput_unwrappedr^   r^   r_   r5    s   
 



zConcatKernel.createc                 C   s2   t |tr| |jS t |jjtot |jt S rb   )rH   rS   can_realize_into_without_copyr  r  r   ExternKernelAlloc)r6  r  r^   r^   r_   rD  *  s
   
z*ConcatKernel.can_realize_into_without_copyc              	   C   s   t |tst|rt|\}}t||}t |tsJ |t |tr)| |j|S t |trH|  t	|jds:J | 
|rHt||j_|jS tj| | | dd t| | D d}| ||S )Nr  c                 S   r  r^   r  r  r^   r^   r_   rt   M  r  z-ConcatKernel.realize_into.<locals>.<listcomp>r'  )rH   r  r   r  rS   r  r  r  r   rT  rD  r{  r  rX  r5  r   r   r  r}   r   )r6  r  r  r  r  pwr^   r^   r_   r  4  s,   




	zConcatKernel.realize_intoc                 C   r0  r  r^   r   r^   r^   r_   r  T  r3  zConcatKernel.should_allocateN)	r   r  r  r  rP  r5  rD  r  r  r^   r^   r^   r_   r2    s    
Y
	
r2  c                 C   sP   t | tjjr| jdkrd S | jdkr| jdd n| jdd}d| dS )NatenrE  .r   r   z
at::_ops::z::call)	rH   rM   _ops
OpOverload	namespace_overloadnamer   r  replace)r@  opnamer^   r^   r_   get_aten_cpp_kernel_nameX  s   
rO  c                       sp  e Zd ZU dZeedf ed< eje	dZ
eeef ed< dZee ed< dZee ed< dZee ed	< ejedZee ed
< dZeeejjejjf  ed< dZeeeeef   ed< dZeeeeeef f  ed< eje	dZeej e!j"f ed< 							dG fdd	Z#de$ej  fddZ%dd Z&dHddZ'dd Z(dd Z)dd Z*dd  Z+e,d!d" Z-e.deeee ee e/eegef eeej e!j"f  f fd#d$Z0e.d%d& Z1e.d'd( Z2e.d)d* Z3e.dHd+d,Z4e.d-d. Z5e.d/d0 Z6e.d1d2 Z7d3d4 Z8d5d6 Z9d7d8 Z:d9d: Z;dHd;d<Z<d=d> Z=d?d@ Z>dAdB Z?de$ej  fdCdDZ@dEdF ZAeAZB  ZCS )Ir  r^   .constant_args)default_factoryrf   Noutput_viewpython_kernel_namecpp_kernel_nameordered_kwargs_for_cpp_kernelop_overloadarg_propertieskwarg_propertiesunbacked_bindingsc                    sf   t  ||| || _|r|ni | _|| _|| _|pt|
| _|	| _|
| _	| 
  i | _tjj| _d S rb   )r(  rf  rP  rf   rR  rS  rO  rT  rU  rV  collect_arg_kwarg_propertiesrY  rD   r   r  fx_node)r   rh   r  r  rP  rf   rR  rS  rT  rU  rV  r)  r^   r_   rf  |  s   zExternKernel.__init__r   c                 C   r  rb   r  r   r^   r^   r_   r    r   z%ExternKernel.get_unbacked_symbol_defsc                 C   s   t | jtjjrdd | jjjD ndd tt| j	D | _
t | jtjjr1dd | jjjD ni | _t | jtjjrL| jsNdd | jjjD | _d S d S d S )Nc                 S   s$   g | ]}|j s|j|j|jd qS ))rh   rY   r9  )
kwarg_onlyrh   	real_typer9  r  r^   r^   r_   rt     s    z=ExternKernel.collect_arg_kwarg_properties.<locals>.<listcomp>c                 S   s   g | ]}i qS r^   r^   ro   r^   r^   r_   rt     r   c                 S   s   i | ]}|j |j|jd qS ))rY   r9  )rh   r]  r9  r  r^   r^   r_   r     r  z=ExternKernel.collect_arg_kwarg_properties.<locals>.<dictcomp>c                 S      g | ]}|j r|jqS r^   r\  rh   r  r^   r^   r_   rt     
    )rH   rV  rM   rI  rJ  _schema	argumentsrx   rw   r  rW  allarg_propertiesrU  r   r^   r^   r_   rZ    s(   
z)ExternKernel.collect_arg_kwarg_propertiesFc                 C   s   t |ttfs	J t |trt|}| jsJ dt|}t| j}||k rQtd| j||  t||D ]}| j| d }|	||v rH|| n| j| d  q5|S )Nz/ExternKernel.arg_properties should not be emptyzv%s has %d unprovided positional arguments. Will check if they are in the keyword arguments or will use default values.rh   r9  )
rH   rI   rJ   rW  rw   r  r  rV  rx   r  )r   re   rf   convert_val_to_strn_args
n_pos_argsrq   arg_namer^   r^   r_   fill_non_provided_args  s(   	

z#ExternKernel.fill_non_provided_argsc                 C   s$   t | jtr|   |   d S d S rb   )rH   r  r   apply_constraintr  r   r^   r^   r_   r    s   zExternKernel.decide_layoutc                 C   s$   t | |\}}|r|| d S d S rb   )r:   	writeline)r   wrapper
origin_strdetailed_origin_strr^   r^   r_   codegen_comment  s   zExternKernel.codegen_commentc                 C   r  rb   r  r   rk  r^   r^   r_   codegen  r3  zExternKernel.codegenc                 C   s*   t jjrtjrt jj| jS | jS | jS rb   )	rD   r   cpp_wrapperr*   abi_compatibler  get_c_shim_func_namerT  rS  r   r^   r^   r_   get_kernel_name  s   zExternKernel.get_kernel_namec                 C   s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r  r  r#  r   )	rX  r5  r   r   r  r   r.  r   r   )r   rF  r^   r^   r_   
copy_input  s   zExternKernel.copy_inputc                    s  ||d}t |\} g g }g }|D ])}t|t d r(|| qt|tjr8tjj	j
j|d d}|| q fdd}	fdd|D }|D ]}
t|
r\t|
dd	 qPg }|D ]3}
|
 tjjv rw|tjj|
   qa|
 tjjv r|tjj|
   qa|t|
dd
 qa|	||\}}||i |}d }tjj
 }rt|tj| t||tjjd}t|ttfs|gn|}|D ]#}t|tjr|jrd}tjjjdd  }r| d| }|tj_q||||	|fS )Nrd   r  )r2  c                    sd   g }t | }t |}D ]}|r|t| q|t| qt| }|dg |di fS )Nre   rf   )iterr  nextpytreetree_unflattenrU  )new_tensor_argsnew_non_tensor_argsr}  
it_tensorsit_non_tensors	is_tensorr8  )	args_specis_arg_tensorr^   r_   unflatten_args   s   z3ExternKernel.process_kernel.<locals>.unflatten_argsc                    rF  r^   r  r  r6  r^   r_   rt   ,  r   z/ExternKernel.process_kernel.<locals>.<listcomp>Tr  r   r!  zEsparsity not handled. Please file issue for sparse inference weights.stack_tracez Found from : 
 )rx  tree_flattenr  rH   r   rT   r   rD   r   r   r  create_symintnoder   r  r   	constantstorchbind_constantsr   	fake_moder"   r  r   rP  rU  rI   rJ   rM   Tensor	is_sparsedisable_cudagraphs_reason)r6  r@  re   rf   binded_args	args_flattensor_argsnon_tensor_argsr5  r  r   example_argsnew_args
new_kwargsexample_outputrY  r  example_out_lir   msgr  r^   )r  r6  r  r_   process_kernel  sd   

zExternKernel.process_kernelc              	   C   sF  t |tsJ t |tr|S | }tj|  }|durId|j	v rIt |j
trI|j	d jtjds?|j	d jtjdrI|t|  n|  tj| dd\}}|d }| |}tjj||}tjj||}tjj||}	t|||	 }
||
krtd||	| tt|jt |! |" | ||	dd	S )
z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        Nr!  r3  r8  rF   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%src  )r  r  )#rH   r  r  r  rD   r   r  r   r.  rP  r  r   r  rM   rl  rn  r  r   r   r  r+   r  r  r   r  stride_vars
offset_varr>   r  r  r   r  r  r   r   )r6  r   x_unwrap_viewx_unwrap_view_fx_node
index_argsr  r  rr   r  r  expectedr^   r^   r_   convert_to_reinterpret_viewb  sh   






z(ExternKernel.convert_to_reinterpret_viewc                 C   s
  |d u rt  S t|tjtjjjtfrt|S t|t	r-t
jtj|j| | dS t|tr4|S t|tr?| |jS t|trOt| |j| S t|trn|  t| rnz| |W S  tym   Y nw t|try|  |S t|tr|S |  |S )N)r   r   )!r  rH   rT   r   rU   rV   rW   r  r  r1  rD   r   add_tensor_constantrM   tensorr   r   r   re  rS   r  r  r  r   r  r   r   r  r  r   r  r.  ru  r/  r^   r^   r_   r    s8   







zExternKernel.realize_inputc                 C   sD   t |rt| dkr|S | D ]
}|dkr|  S q| |S r  )r   rw   r   ru  )r6  r   r   r^   r^   r_   require_stride1  s   
zExternKernel.require_stride1c              	   C   s  |  dkr|S t|rzt| tr| j}t| tst| trBt|ddt||r;t	t
jj| jn||d |S t| trR| |rR|S t| trzt|  trftdt|  trz|  |rz|S t|tr| |r|S t|trt|jtrt|jtst| rt| jtsz| |j|_| j|||dW S  ty   Y nw | |}t|dd||d t||sJ |S )Nr   TFr  zHthe MutationLayoutSHOULDREMOVE's real layout shouldn't be FlexibleLayoutr  )r   r   rH   r   r{  r|  r   r  r  r   rD   r   r   
size_hintsr   r  r  r  r  r  r  rS   r  r  r  r  rE  r  require_stride_orderr   ru  )r6  r   r   r  r^   r^   r_   r    s   




z!ExternKernel.require_stride_orderc                 C      |  |tS rb   )r  rm  r/  r^   r^   r_   require_channels_last  r  z"ExternKernel.require_channels_lastc                 C   r  rb   )r  ro  r/  r^   r^   r_   require_channels_last_3d  r  z%ExternKernel.require_channels_last_3dc              	   C   s    |  |tttt| S rb   )r  rI   rC  rx   rw   r   r/  r^   r^   r_   require_contiguous  s    zExternKernel.require_contiguousc                 C   rZ  rb   r^   r   r^   r^   r_   ri  #  r3  zExternKernel.apply_constraintc                 C   s   t jjr8g }t| jD ]*\}}t| j| }| jr(|t| jk r(| j| dnd }|	t jj
|| q|S tt jj
j| jS )NrY   )rD   r   rq  r   rP  rw   r  rW  rU  r  r  val_to_arg_strr   )r   r}  rq   r   r   type_r^   r^   r_   codegen_const_args&  s   zExternKernel.codegen_const_argsc                 C   s   g }t | jD ]L\}}t|tr&dd |D }dd| d}|| qtjjrL| j	r4|t
| j	k s8J d| j	| d}|tjj|| q||  q||   |S )Nc                 S   rs  r^   r   ro   r^   r^   r_   rt   ;  r   z-ExternKernel.codegen_args.<locals>.<listcomp>[r  ]z-Invalid access to ExternKernel.arg_propertiesrY   )r   r  rH   rI   r   r  rD   r   rq  rW  rw   rU  r  r  r   r'  r  )r   re   rq   r   r$  r   r  r^   r^   r_   codegen_args7  s*   

zExternKernel.codegen_argsc                 C   sH   || j v r| j |S | jr| j|r| j|dS t| d)Nr9  z not in self.allarg_properties)rf   rU  rc  r  )r   rg  r^   r^   r_   get_kwargs_valueN  s
   
zExternKernel.get_kwargs_valuec                 C   s   t jjrDg }| jD ]8}|r|dkrq	| |}t|tjr#|| q	| j	r4|| j	v r4| j	
|
dnd }|t jj|| q	|S dd | j D }|S )Nr   rY   c                 S   s(   g | ]\}}| d t jj| qS r  rD   r   r  r  rp   kr  r^   r^   r_   rt   m  s    z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)rD   r   rq  rU  r  rH   rT   r   r  rc  rU  r  r  rf   r  )r   skip_outrf   rg  r  r  r^   r^   r_   codegen_kwargsV  s0   



zExternKernel.codegen_kwargsc              	   C   st   t jr6tjjs8t|  dkrd S tjj|  }tjj| 	 }|
d|   d| d| d d S d S d S )Nr   zassert_size_stride(r  r?  )r*   size_assertsrD   r   rq  rA   r   r  codegen_shape_tupler   rj  r   )r   rk  r   r   r^   r^   r_   codegen_size_assertss  s   z!ExternKernel.codegen_size_assertsc                 C   s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r   )r   _sizer=  r^   r^   r_   get_group_stride~  s   zExternKernel.get_group_stridec                    s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                    rF  r^   )r   r  r&  r^   r_   rt     r   z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S   s   g | ]	}t d | qS )d)r?   ro   r^   r^   r_   rt     r  T)r  r  c                 S   r   r^   r^   r   r^   r^   r_   r     r   z-ExternKernel.canonicalize.<locals>.<dictcomp>c                    r   r^   r^   ro   r   r^   r_   rt     r   c                    r   r^   r^   ro   )r  r^   r_   rt     r   cc                    r   r^   r^   r  )add_varr^   r_   rt     r   )rD   r   r   r   r   rx   rw   r  rs  r   r  r  r0   rK   r}   rB   rT   r  rJ   )r   rh  r  index_orderr   r^  rr   	new_sizesr{   r  r   replacementr^   )r  r  r   r   r_   canonicalize  s$   
 zExternKernel.canonicalizec                 C   s>   t  }| jD ]}|t|O }q| j D ]}|t|O }q|S rb   )r   rP  maybe_free_unbacked_symbolsrf   rL   )r   r8  r5  r^   r^   r_   r    s   
z%ExternKernel.get_unbacked_symbol_usesc                    sP   t  dd }d|g}| fddt D 7 }|d j  |S )NrS  zpython_kernel_name=c                    s$   g | ]}|j  d t |j  qS r  )rh   rc   )rp   fieldr   r^   r_   rt     s    z(ExternKernel.__str__.<locals>.<listcomp>r!  )rc   dataclassesfieldsr  r#  r   )r   kernel_namer   r^   r   r_   r%    s   
zExternKernel.__str__r^   NNNNr^   Nrz  )Dr   r  r  rP  r   r   r  r  r  rK   rf   r   rk   rR  r
   r  rS  rT  rI   rU  r   rV  r   rM   rI  rJ  HigherOrderOperatorrW  r	   rX  rY  rT   r  rx  KeyPathrf  r   r  rZ  rh  r  rn  rp  rt  r	  ru  rP  r   r  r  r  r  r  r  r  r  ri  r  r  r  r  r  r  r  r  r%  rO  rR  r^   r^   r)  r_   r  g  s   
  	
""
[
C

	G



	
r  c                       s<   e Zd Zdd Z							d	 fdd	Zdd Z  ZS )
ExternKernelOutc                 C   sN   |  | g |  | jdd}||  |  | jr!| j nd | d S )NT)r  )rn  r  r  generate_extern_kernel_outrt  r   rR  r   rk  re   r^   r^   r_   rp    s   
zExternKernelOut.codegenr^   Nc
           
         s:   t  d || |||pi d ||||	
 tj| | _d S rb   r(  rf  r  rD   r   r  rh   )
r   r  r  rP  rf   rR  rS  rT  rU  rV  r)  r^   r_   rf    s   zExternKernelOut.__init__c                 C   r0  r  r^   r   r^   r^   r_   r    r3  zExternKernelOut.should_allocater  )r   r  r  rp  rf  r  rR  r^   r^   r)  r_   r    s    r  c                       s(   e Zd Zdedejf fddZ  ZS )RandomSeedscountr   c                    sP   t t j}t jt|t j|gdg |j|j|ggdtj	rdndt
jjd d S )Nr  zaten.randint.low_outzat::_ops::randint_low_out::callzat::randint_out)r  r  rP  rS  rT  rV  )rM   r6  r  r(  rf  r  rs  rr  r*   rr  rG  randintlow_out)r   r  r   limitsr)  r^   r_   rf    s   
zRandomSeeds.__init__)r   r  r  r  rM   r   rf  rR  r^   r^   r)  r_   r    s     r  c                       sB   e Zd Zdd Z						d fdd	Zdd Zd	d
 Z  ZS )rE  c                 C   sL   |  | g |  |  }tjj| | t| jt	r$| 
| d S d S rb   )rn  r  r  rD   r   r  generate_extern_kernel_allocrH   r  r  r  r  r^   r^   r_   rp     s   
zExternKernelAlloc.codegenr^   Nc	           	         s:   t  d || |||pi d ||||
 tj| | _d S rb   r  )	r   r  r  rP  rf   rS  rT  rU  rV  r)  r^   r_   rf    s   zExternKernelAlloc.__init__c                 C   r0  r1  r^   r   r^   r^   r_   r     r3  z!ExternKernelAlloc.should_allocatec                 C   r  rb   r  r   r^   r^   r_   ri  #  r3  z"ExternKernelAlloc.apply_constraint)r^   NNNr^   N)r   r  r  rp  rf  r  ri  rR  r^   r^   r)  r_   rE    s    rE  c                       sx   e Zd Zdd Zdd Zdd Zdd Zd	eej	 f fd
dZ
d	eej	 fddZdd Z fddZdd Z  ZS )UserDefinedTritonKernelc                 C   sF   ddl m} ddlm} || j}g }t||r|j}|j}||fS )Nr   )	Autotuner)kernel_side_table)	triton.runtime.autotunerr  *torch._higher_order_ops.triton_kernel_wrapr  
get_kernel
kernel_idxrH   configsri   )r   r  r  r@  r  r^   r^   r_   get_kernel_and_configs(  s   
z.UserDefinedTritonKernel.get_kernel_and_configsc           	         s   |   \ }| || j\}}|  }g }tjjrK fddt|D }| jD ]}| 	|}|
t|dr:| nt| q( fddt|D }| | ||| j|||| d S )Nc                       g | ]\}}| j vr|qS r^   
constexprs)rp   rq   r5  r@  r^   r_   rt   B      z3UserDefinedTritonKernel.codegen.<locals>.<listcomp>r   c                    r  r^   r  )rp   rq   r   r  r^   r_   rt   I  s    )r  !define_user_defined_triton_kernelrf   r  rD   r   rq  r   rU  r  r  rT  r   rY   rn  #generate_user_defined_triton_kernelgrid)	r   rk  r  new_nametriton_metare   	arg_typesrg  r!  r^   r  r_   rp  4  s(   



zUserDefinedTritonKernel.codegenc                 C   r0  r1  r^   r   r^   r^   r_   r  S  r3  z'UserDefinedTritonKernel.should_allocatec                 C   r0  r  r^   r   r^   r^   r_   has_side_effectsV  s   z(UserDefinedTritonKernel.has_side_effectsr   c                    s   t   t| jB S rb   )r(  r  r!   r  r   r)  r^   r_   r  [  s   z0UserDefinedTritonKernel.get_unbacked_symbol_usesc                 C   r  rb   r  r   r^   r^   r_   r  `  r   z0UserDefinedTritonKernel.get_unbacked_symbol_defsc                 C   rY  rb   r^   r   r^   r^   r_   r  c  s   z*UserDefinedTritonKernel.get_mutation_namesc                   s2  g }t  }g }  D ]$\}}t|tr&t| |}	||	 |	||< q|| |||< qt|dks8J |d 	 }
t
 d t|
|t|| tj| | _|| _|| _|  \}} fdd|jD | _ddlm} t|dkr{|d jni } fdd||i  |D | _t| g| jR   d S )Nr   c                    s   g | ]}| v r|qS r^   r^   r4  kernel_argsr^   r_   rt     r  z4UserDefinedTritonKernel.__init__.<locals>.<listcomp>)identify_mutated_tensorsc                    r   r^   r^   rp   r  r  r^   r_   rt     s    )rK   r  rH   rS   r  r-  r  r  rw   r   r(  rf  r  rJ   rD   r   r  rh   r  r  r  	arg_namesrU  r  r  rf   mutable_argsr  )r   r  r  r  r  rf   rP  r  r  r   r   r@  r  r  autotuned_kwargsr)  r  r_   rf  i  sD   






z UserDefinedTritonKernel.__init__c                 C      dd | j D S )Nc                 S   rs  r^   r  ro   r^   r^   r_   rt     r   zHUserDefinedTritonKernel.get_inputs_that_alias_output.<locals>.<listcomp>)r  r   r^   r^   r_   r    r  z4UserDefinedTritonKernel.get_inputs_that_alias_output)r   r  r  r  rp  r  r  r   rT   r  r  r  r  rf  r  rR  r^   r^   r)  r_   r  '  s    ,r  mutated_nodesc                 G   sP   |D ]#}t |tsJ | dt| dtj|  t| ||  qdS )z
    Allows ops in mutated_nodes to be marked as being mutated as well as
    indicates to the scheduler that these ops depend on cur_buffer.

    NB: Use this instead of directly constructing MutationOutput
    z node is type z and is not an IRNodeN)	rH   r   rY   rD   r   r  r   MutationOutputr   )
cur_bufferr  r[   r^   r^   r_   r    s   r  c                       sD   e Zd Zdd Z fddZdd Zdd Zd	d
 Zdd Z  Z	S )r  c                 C      | j d  gS r   r  r   r   r^   r^   r_   r    r  z!MutationOutput.get_mutation_namesc                    s.   t  d |||gd || _tj| | _d S r0  )r(  rf  node_doing_mutatingrD   r   r  rh   )r   r  mutated_noder  r)  r^   r_   rf    s   zMutationOutput.__init__c                 C   r0  r1  r^   r   r^   r^   r_   r    r3  zMutationOutput.should_allocatec                 C   r0  r  r^   r   r^   r^   r_   r    r3  zMutationOutput.is_no_opc                 C   r0  r  r^   r   r^   r^   r_   r    r3  zMutationOutput.has_side_effectsc                 C   r  r   r  r   r^   r^   r_   r    r  z+MutationOutput.get_inputs_that_alias_output)
r   r  r  r  rf  r  r  r  r  rR  r^   r^   r)  r_   r    s    r  c                       L   e Zd ZdZdd Zdd Zdd Zdeej	 fd	d
Z
 fddZ  ZS )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    c                 C   s   dd | j D \}tjjr,tjr,||   d| ddt	t
| j d|j  d S ||   d| ddt	t
| j d|j  d S )Nc                 s       | ]}|  V  qd S rb   r  rp   r   r^   r^   r_   r    r  z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>(r  z, NULL)r?  )r  rD   r   rq  r*   rr  rj  rt  r   r   reprrP  ending)r   rk  r   r^   r^   r_   rp    s   ,,z InplaceBernoulliFallback.codegenc                 C   r0  r1  r^   r   r^   r^   r_   r    r3  z(InplaceBernoulliFallback.should_allocatec                 C   r  r   r  r   r^   r^   r_   r    r  z+InplaceBernoulliFallback.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r    r   z1InplaceBernoulliFallback.get_unbacked_symbol_defsc                    sT   t  jd t| | |g||d tj| | _d| _	t
js#d| _t| | d S )NrV  zaten.bernoulli_zat::native::bernoulli_)r(  rf  r  r   r  rD   r   r  rh   rS  r*   rr  rT  r  )r   rV  r   rP  r)  r^   r_   rf    s   

z!InplaceBernoulliFallback.__init__r   r  r  r  rp  r  r  r   rT   r  r  rf  rR  r^   r^   r)  r_   r        r  c                       s`   e Zd ZdZdd Zdd Zdd Zdeej	 fd	d
Z
 fddZeddefddZ  ZS )InplaceCopyFallbackr  c                 C   s>   |   \}}}||   d| d| d| d|j 	 d S )Nr  r  r?  )r  rj  rt  r  )r   rk  r  r  non_blockingr^   r^   r_   rp    s   $zInplaceCopyFallback.codegenc                 C   r0  r1  r^   r   r^   r^   r_   r    r3  z#InplaceCopyFallback.should_allocatec                 C   r  r   r  r   r^   r^   r_   r    r  z&InplaceCopyFallback.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r    r   z,InplaceCopyFallback.get_unbacked_symbol_defsc                    s4   t  jd |||dtjrdndd tj| | _d S )Nz
aten.copy_aoti_torch_copy_zat::_ops::copy_::callrS  rT  )r(  rf  r*   rr  rD   r   r  rh   r   r  r  rP  r)  r^   r_   rf     s   
zInplaceCopyFallback.__init__Fr  c                    s>    fdd||fD }|f}t t| ||}t|| |S )Nc                    rF  r^   r  r  r  r^   r_   rt     r   z.InplaceCopyFallback.create.<locals>.<listcomp>)r  r  r   r  )r6  r  r  r  r  rP  r}  r^   r  r_   r5    s   

zInplaceCopyFallback.createrz  )r   r  r  r  rp  r  r  r   rT   r  r  rf  rP  r  r5  rR  r^   r^   r)  r_   r    s    r  c                   @   sD   e Zd ZdZdd Zdd Zdd Zdeej	 fd	d
Z
dd ZdS )MutatingFirstArgExternKernelr  c                 C   sJ   g dd | j D tt| j}||   dd| d|j  d S )Nc                 s   r  rb   r  r  r^   r^   r_   r  &  r  z7MutatingFirstArgExternKernel.codegen.<locals>.<genexpr>r  r  r?  )r  r   r  rP  rj  rt  r   r  )r   rk  argrefsr^   r^   r_   rp  $  s   
z$MutatingFirstArgExternKernel.codegenc                 C   r0  r1  r^   r   r^   r^   r_   r  -  r3  z,MutatingFirstArgExternKernel.should_allocatec                 C   r  r   r  r   r^   r^   r_   r  0  r  z/MutatingFirstArgExternKernel.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r  3  r   z5MutatingFirstArgExternKernel.get_unbacked_symbol_defsc                 C   r0  r  r^   r   r^   r^   r_   r  6  r3  z-MutatingFirstArgExternKernel.has_side_effectsN)r   r  r  r  rp  r  r  r   rT   r  r  r  r^   r^   r^   r_   r    s    	r  c                       r   )ResizeStorageBytesc                    s   t |ts	J dt jd t| | |g|fd tj	|
  tj| | _d| _d| _tjj|j
  t| | d S )NzTODO: dynamic shapes)rP  z"inductor_ops.resize_storage_bytes_z&torch::inductor::resize_storage_bytes_)rH   r  r(  rf  r  r   r  rD   r   r  r   r  rh   rS  rT  never_reuse_buffersro  r  r  )r   variabler  r)  r^   r_   rf  ;  s   

zResizeStorageBytes.__init__r#  r^   r^   r)  r_   r
  :  r$  r
  c                       s4   e Zd Z fddZdd Zdd Zdd Z  ZS )	SetSourceTensorKernelc                    sj   |   t j| ||gdd tjj|j	  tjj|	  tjj| 	  t
| || d S )Nz!torch.ops.aten.set_.source_Tensor)rS  )r  r(  rf  r   rD   r   r  ro  r  r   r  )r   self_tensorstorage_tensorr)  r^   r_   rf  L  s   zSetSourceTensorKernel.__init__c                 C   s   | j d  | j d  gS r  r  r   r^   r^   r_   r  X  s   z2SetSourceTensorKernel.get_inputs_that_alias_outputc                 C   r  r  r  r   r^   r^   r_   r  [  r  z(SetSourceTensorKernel.get_mutation_namesc                 C   r0  r  r^   r   r^   r^   r_   r  ^  r3  z&SetSourceTensorKernel.has_side_effects)r   r  r  rf  r  r  r  rR  r^   r^   r)  r_   r  K  s
    r  c                       sf   e Zd ZdZdd Zdd Zdd Zdeej	 fd	d
Z
ddddedee def fddZ  ZS )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    c              
   C   s   | j d }tjjrddd}||v r|| }| jr%dd | jD \}}}ndd | jD \}}| jd }|||| jd	 ||g| j| j	| j|| 
  d S )
Nr  ru  rt  )ro  multiplyc                 s   r  rb   r  r  r^   r^   r_   r  r  r  z*ScatterFallback.codegen.<locals>.<genexpr>c                 s   r  rb   r  r  r^   r^   r_   r  t  r  r)   r   )rf   rD   r   rq  src_is_tensorr  rP  generate_scatter_fallbackrT  rS  r  )r   rk  r  get_operator_enumr   rr   r  r^   r^   r_   rp  i  s$   


zScatterFallback.codegenc                 C   r0  r1  r^   r   r^   r^   r_   r    r3  zScatterFallback.should_allocatec                 C   r  r   r  r   r^   r^   r_   r    r  z"ScatterFallback.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r    r   z(ScatterFallback.get_unbacked_symbol_defsNTr  include_selfr  r  r  c          
   
      s   t |t _ jr fdd|||fD }|f}	n fdd||fD }||f}	t jd t|  ||	||dt|ddg|d t	| _
tj  _t | d S )Nc                    rF  r^   r  r  r   r^   r_   rt     r   z,ScatterFallback.__init__.<locals>.<listcomp>c                    rF  r^   r  r  r   r^   r_   rt     r   r  r  r  )rS  rU  rV  )rH   rS   r  r(  rf  r  r   r  rk   rO  rT  rD   r   r  rh   r  )
r   rV  r   r  rr   r  r  r  tensorsrP  r)  r   r_   rf    s&   


zScatterFallback.__init__)r   r  r  r  rp  r  r  r   rT   r  r  r  r
   rk   r  rf  rR  r^   r^   r)  r_   r  b  s    	r  c                       r  )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    c           	      C   s   dd | j D ^}}}g }t|}t| jD ]\}}| j| d ur)|t| q|tjjj	 q|j
|  |||g|  R   d S )Nc                 s   r  rb   r  r  r^   r^   r_   r    r  z+IndexPutFallback.codegen.<locals>.<genexpr>)r  rv  r   r  r  rw  rD   r   r  r  generate_index_put_fallbackrt  r  )	r   rk  r   rL   valid_indicesr  iter_valid_indicesrq   r   r^   r^   r_   rp    s   zIndexPutFallback.codegenc                 C   r0  r1  r^   r   r^   r^   r_   r    r3  z IndexPutFallback.should_allocatec                 C   r  r   r  r   r^   r^   r_   r    r  z#IndexPutFallback.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r    r   z)IndexPutFallback.get_unbacked_symbol_defsc           	   	      s   | _ dd |D } fdd||g|D }tjrdnd}t jd t|  ||fd||d tj	
  _t | d S )Nc                 S   s   g | ]}|d ur|qS rb   r^   ro   r^   r^   r_   rt     ru   z-IndexPutFallback.__init__.<locals>.<listcomp>c                    rF  r^   r  r  r   r^   r_   rt     r   aoti_torch_index_put_outzat::index_put_outzaten.index_put_)rS  rT  rV  )r  r*   rr  r(  rf  r  r   r  rD   r   r  rh   r  )	r   rV  r   r  rL   
accumulater  r  rT  r)  r   r_   rf    s    
	zIndexPutFallback.__init__r  r^   r^   r)  r_   r    r  r  c                   @   s    e Zd Zedd Zdd ZdS )
DeviceCopyc                 C   sx   |  stdd | D rtjjs||S tj	| tj	|
  td tt|| | d| |gS )Nc                 s   s*    | ]}|j tjjv ot|tjV  qd S rb   )rh   rD   r   r  rH   r+   r  r  r^   r^   r_   r    s
    
z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programr  )r2  r  r   r*   aot_inductoruse_runtime_constant_foldingrN  rD   r   add_device_infor   r9   r  r   r   r   r  )r6  r   r   r^   r^   r_   r5    s(   

zDeviceCopy.createc                 C   sP   |   }t|dksJ | jr||d | j  d S ||d |   d S r  )r  rw   rR  codegen_device_copyr   r  r^   r^   r_   rp    s
   zDeviceCopy.codegenN)r   r  r  rP  r5  rp  r^   r^   r^   r_   r    s    
r  c                       sL   e Zd ZdZdd Zdd Z fddZdeej	 fd	d
Z
dd Z  ZS )rQ   z;
    The result of a call to aten._local_scalar_dense.
    c                 C   r0  r0  r^   r   r^   r^   r_   r     r3  zDynamicScalar.get_readsc                 C   r0  r1  r^   r   r^   r^   r_   r    r3  zDynamicScalar.should_allocatec                    s:   |   t d ttd| |g || _|| _d S r   )	r   r(  rf  r  rM   r   r  symkeypath)r   r#  r$  r  r)  r^   r_   rf    s   "
zDynamicScalar.__init__r   c                 C   s   | j hS rb   )r#  r   r^   r^   r_   r    r  z&DynamicScalar.get_unbacked_symbol_defsc                 C      | |  d S rb   )codegen_dynamic_scalarro  r^   r^   r_   rp    rD  zDynamicScalar.codegen)r   r  r  r  r   r  rf  r   rT   r  r  rp  rR  r^   r^   r)  r_   rQ      s    rQ   c                       sH   e Zd ZdZdd Zdd Z fddZdd	 Zd
d Zdd Z	  Z
S )rR   z5
    The result of a call to aten._assert_scalar
    c                 C   r0  r0  r^   r   r^   r^   r_   r     r3  zAssertScalar.get_readsc                 C   r0  r1  r^   r   r^   r^   r_   r     r3  zAssertScalar.should_allocatec                    s*   t  d ttdg  || _|| _d S r   )r(  rf  r  rM   r   scalarr  )r   r'  r  r)  r^   r_   rf  #  s   
zAssertScalar.__init__c                 C   r0  r  r^   r   r^   r^   r_   r  .  r3  zAssertScalar.has_side_effectsc                 C   r  rb   )r!   r'  r   r^   r^   r_   r  1  r  z%AssertScalar.get_unbacked_symbol_usesc                 C   s^   t jjrd S |dt jjj| jdd d |dt| j d || 	  d d S )Nzif not F)r.  :z    raise RuntimeError(r?  z = None)
rD   r   rq  rj  r  codegen_python_sizevarr'  r  r  r   ro  r^   r^   r_   rp  4  s   	zAssertScalar.codegen)r   r  r  r  r   r  rf  r  r  rp  rR  r^   r^   r)  r_   rR     s    rR   c                   @   s    e Zd ZU eed< ejed< dS )ExternKernelNoderh   r[   N)r   r  r  rk   r  export_schemar  r^   r^   r^   r_   r*  H  s   
 r*  c                       s   e Zd Z	d!dd fddZdd Zdeej fdd	Zd
d Z	dd Z
edd Zdd Zdd Zdd Zdd Zdd ZedejfddZedd Z fdd Z  ZS )"FallbackKernelNrY  c                   s  |t jjkrt|dkrt|dkrt jj}t j|t|t||d g  _d _	| _
t|tjjtjjfsEJ d| dt| d| _| _|d u rQi n| _tj j g  _g  _t jtjjrkd S d j v rtd S  jj}tjj jr j|d    d S |j!rt"|st#d	| |j$}	  j% j&\}
} fd
d}tjj'||
|D ]	\}}||| qd S )Nr)   r   Fz#Fails to create FallbackKernel for r  z not supported_c10d_functionalr   z'NYI: Can't generate FallbackKernel for c                    s   t | jtjrt |ttfsJ t | jtjot | j tj}|s)t | jtjr2t |ttfr2J |d u r8d S | j	d u r?d S t | jtjsJ|sJJ  j
|  | j	jr]t | d S d S rb   )rH   rY   rM   ListTyperI   rJ   OptionalTypegetElementType
TensorType
alias_infoalias_namesr  r   is_writer  )infor5  is_optional_tensorr   r^   r_   handle_aliasing_and_mutation  s$   
z=FallbackKernel.__init__.<locals>.handle_aliasing_and_mutation)(rG  rn  r  rw   Scalarr(  rf  rJ   outputsuse_runtime_dispatchrY  rH   rM   rI  rJ  r  rY   rV  r  rf   rD   r   warn_fallbackrS  r4  mutation_namesrh   ra  _libraryutilsmutates_and_returns_first_argr  r   
is_mutabler   r   rb  r  rP  
zip_schema)r   r  r@  r  nontensor_argsr  rf   rY  schemaschema_argsre   r8  r6  r5  r)  r   r_   rf  `  sZ   
zFallbackKernel.__init__c                    s|   t dsd S ttjjjj}|sd S | D ]#\} fdd  fdd}||	| d|  |j
  qd S )NrY  c                    s  |dkr| S t |dkr3t|d tr3t|d tjr3 |  d|d j d|d j d|dd  S t|d trL |  d|d j d|dd  S t|d tjrf |  d	|d j d
|dd  S t|d tr |  d|d j d|dd  S t	d| )Nr^   r   r   r)   rH  r  r?  z()r  r  z.__floordiv__(zunrecognized keypath )
rw   rH   r   rx  SequenceKeyrh   r   r    r  r  )r  r$  )gor^   r_   rG    s"   *$$$z7FallbackKernel.codegen_unbacked_symbol_defs.<locals>.goc                      st   t jjr3tjr3tjdkr jd  S td t	j
s"J  jd j  dd  S   S r  )rD   r   rq  r*   rr  rw   r:  r   rH   rx  rF  r   r^   rG  r$  r   r^   r_   go_outer  s   "z=FallbackKernel.codegen_unbacked_symbol_defs.<locals>.go_outer = )rT  r#   rD   r   r   r  rY  r  rj  codegen_unbacked_symbol_declr  )r   rk  rY  r   rI  r^   rH  r_   codegen_unbacked_symbol_defs  s   
z+FallbackKernel.codegen_unbacked_symbol_defsr   c                 C   s*   t | dd  }rttjjj| S t S )NrY  )rc   r#   rD   r   r   r  r  r   )r   rY  r^   r^   r_   r    s   
z'FallbackKernel.get_unbacked_symbol_defsc                    s   ddl m} |jjrJ d|j ddd  t fdd|jjD s,J |j d	t fd
d|jjD sAJ |j d|jj| _	|jj
| _| j	dd d| j | _||| _d S )Nr)   get_cpp_op_schemazmutable z" is not supported with cpp_wrapperc                 S   s   | j d u p	| j j S rb   )r3  r5  )r5  r^   r^   r_   is_not_write  rj   z3FallbackKernel.set_cpp_kernel.<locals>.is_not_writec                 3   rl  rb   r^   r  rO  r^   r_   r     rK  z0FallbackKernel.set_cpp_kernel.<locals>.<genexpr>z< with alias_info arguments is not supported with cpp_wrapperc                 3   rl  rb   r^   r  rP  r^   r_   r  #  rK  z: with alias_info returns is not supported with cpp_wrapper::r   )codegen.wrapperrN  ra  rA  r   r  rb  returnsrh   rT  overload_namecpp_kernel_overload_namerM  cpp_kernel_keycpp_op_schemar   r@  rN  r^   rP  r_   set_cpp_kernel  s(   





zFallbackKernel.set_cpp_kernelc                    s   t jG dd d  fdd| jD }| || j\}}tjjr=t| j	t
jjr=| ||}dd t| j	jj|D }ndd |D }| j| |S )Nc                   @   s   e Zd ZU eed< dd ZdS )z)FallbackKernel.codegen_args.<locals>.Shimrefc                 S   r   rb   )rZ  r   r^   r^   r_   rO  2  r   z2FallbackKernel.codegen_args.<locals>.Shim.__repr__N)r   r  r  r   r  rO  r^   r^   r^   r_   Shim.  s   
 r[  c                    s   g | ]} |  qS r^   r  r  r[  r^   r_   rt   5  ru   z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S   s"   g | ]\}}t jj||jqS r^   )rD   r   r  r  r]  )rp   paramr   r^   r^   r_   rt   9  s    c                 S   r  r^   r  r  r^   r^   r_   rt   >  r  )r  	dataclassr  r  rP  rD   r   rq  rH   rV  rM   rI  rJ  rh  r}   ra  rb  rf   update)r   r  re   rf   r^   r\  r_   r  -  s   zFallbackKernel.codegen_argsc                 C   s   | rdd | D }|d S t |tjr|jS t |ttfrGdd |D }dd |D }t|dkr5|d S |D ]}t|jrB|  S q7|d S d S )Nc                 S   s   g | ]
}|  r|  qS r^   )r   r4  r^   r^   r_   rt   G  r  z.FallbackKernel.find_device.<locals>.<listcomp>r   c                 S   s   h | ]}t d |qS rb   )r,  find_devicer  r^   r^   r_   r   L  ru   z-FallbackKernel.find_device.<locals>.<setcomp>c                 S   s   g | ]}|r|qS r^   r^   )rp   r   r^   r^   r_   rt   N  r   r)   )	rH   rM   r  r   rI   rJ   rw   r<   rY   )r  r  devices
device_setr   r^   r^   r_   r`  D  s    
zFallbackKernel.find_devicec                 C   s"   t | jtjjr
dS t| j S r1  )rH   rV  rM   rI  r  r   rA  r   r^   r^   r_   r  W  s   zFallbackKernel.has_side_effectsc                 C   r   rb   )r4  r   r^   r^   r_   r  \  r   z+FallbackKernel.get_inputs_that_alias_outputc                 C   s   t | jdks	J | jS r  )rw   r=  r   r^   r^   r_   r  _  s   z!FallbackKernel.get_mutation_namesc           
         s*  t | tsJ | | j| j\}| |}fdd| jD }tjj	s+g ||S t
d d }|| j|}dd  | j}|jj}t|dkrV|d j} || jg}n t | jts^J t|t| jksiJ  fddt|| jD }t|  tj| j ||i dd	}	tjj|	 g ||S )
Nc                    s   g | ]}  |d qS rb   )rU  r  )rf   r^   r_   rt   m  r  z<FallbackKernel.export_extern_kernel_node.<locals>.<listcomp>c                 S   s   t | tjr(|}t |ttfrt|dksJ |d }tjjtj	|
 ddS t | tjrBt |  tjrBtjjdd |D dS tdt|  )	Nr)   r   rg   )	as_tensorc                 S   s   g | ]
}t j| d qS )rg   )r+  TensorArgumentr   )rp   r   r^   r^   r_   rt     s    zZFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output.<locals>.<listcomp>)
as_tensorszUnsupported return type )rH   rM   r2  rI   rJ   rw   r+  Argumentr5  rd  r   r/  r1  RuntimeErrorrY   )return_typeoutputr   r^   r^   r_   handle_single_outputx  s"   
zFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_outputr)   r   c                    s   g | ]
\}} |j |qS r^   )r]  )rp   return_schemari  )rj  r^   r_   rt         
)r  r  r:  metadata)rh   r[   )rH   r,  r  r  rP  rh  rU  rD   r   aot_moder   serialize_inputsrV  ra  rS  rw   r]  r:  rJ   r}   r*  r   r+  r  rh   extern_kernel_nodesr  )
r   re   ordered_kwargs
serializernamed_argumentsr  rS  rh  output_argumentsr[   r^   )rj  rf   r_   export_extern_kernel_nodei  s@   





z(FallbackKernel.export_extern_kernel_nodec                 C   s  | j }|jdkr7t|tjjsJ tjjr1t	
 r0|tvr0t	jdkr0td| d| _| | nSt|| _nM|jdkr[t|tjjsEJ tjjrU| | t	jsTd| _n/t|| _n)t|tjjrjd|j | _n|jdd d	|j | _tjjrd| _| | | jr| | d }d }t	jr|  }n
g |  |  }||  | j| j|| j| j | j!| j || j"
 n"| | g |  |  }tjj#$| | t| j%t&r| '| | (| d S )
NrG  1zG%s is missing a c-shim implementation, using proxy executor as fallbackT
_quantizedztorch.ops.higher_order.z._ops.z.ops.rH  ))rV  rK  rH   rM   rI  rJ  rD   r   rq  r*   	is_fbcode
has_c_shimc_shim_versionr  r  r;  rY  rk   rS  rr  r  r   r  rM  rn  ru  r  r  6generate_extern_kernel_alloc_and_find_schema_if_neededr   rT  rW  rV  rU  r:  r  generate_fallback_kernelr  r  r  rL  )r   rk  r@  exported_argsre   r^   r^   r_   rp    sn   









zFallbackKernel.codegenri  c                 C   s"   t | j| jt|  t|  S rb   )r  r   r   r7   r   r   )ri  r^   r^   r_   tensor_to_layout  s   

zFallbackKernel.tensor_to_layoutc                    s   t jf}||vrtjjnt }|  j|g|R i |\}}}}	}
W d    n1 s-w   Y   ||}|d u rI t|||||	|
dn|sOJ d t	|||||	|
d fdd|g }t
|tttfru|_|S |g_|S )Nr-  z"Not sure where to find device infoc                    s   t ttfrt fddttD S t tr, fdd D S t tj	r;t
 S t trBS t tjrLjjS d u sZJ dt dd S )Nc                 3   s,    | ]} | t |fg V  qd S rb   rY   ro   generate_outputr  ri  r^   r_   r  !  s
    
zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>c                    s*   i | ]\}}| |t |fg qS r^   r  )rp   r  r!  r  r^   r_   r   &  s    zBFallbackKernel.create.<locals>.generate_output.<locals>.<dictcomp>zFallbackKernel output type z is not supported)rH   rI   rJ   rY   rx   rw   rK   r  rM   r  MultiOutputr~  r  SymIntr[   r  )ri  r  r6  r  packed)r  ri  r_   r    s,   



z.FallbackKernel.create.<locals>.generate_output)rG  *_fused_moving_avg_obs_fq_helper_functionalrD   r   r  r   r  r`  r  r  rH   rI   rJ   rK   r:  )r6  r@  re   rf   fake_incorrect_kernelscontextr  r  r  r  rY  r   r:  r^   r  r_   r5    sL   	
	
zFallbackKernel.createc                    s
   t   S rb   )r(  ri  r   r)  r^   r_   ri  A  r  zFallbackKernel.apply_constraintrb   )r   r  r  rf  rL  r   rT   r  r  rY  r  r	  r`  r  r  r  ru  rp  rM   r  r~  rP  r5  ri  rR  r^   r^   r)  r_   r,  _  s*    	u6

BF
Fr,  c                       s6   e Zd ZdZdd Zdd Zdd fdd	
Z  ZS )
ComplexViewz9View a complex number as two dtyped numbers or vice versac                 C   r0  r1  r^   r   r^   r^   r_   r  I  r3  zComplexView.should_allocatec                 C   r  r   r  r   r^   r^   r_   r  L  r  z(ComplexView.get_inputs_that_alias_outputNr-  c                   s   t  j||||||d d S )Nr-  r(  rf  )r   r  r@  r  rC  r  rY  r)  r^   r_   rf  P  s   

zComplexView.__init__)r   r  r  r  r  r  rf  rR  r^   r^   r)  r_   r  E  s    r  c                   @   s   e Zd ZU ejed< dS )r  r   N)r   r  r  rM   r   r  r^   r^   r^   r_   r  d  s   
 r  c                       sb   e Zd Zdd Zdd Zdeeedf  f fddZd	e	e
j fd
dZdd Zdd Z  ZS )r  c                 C   s   t |dkrW|d \}}t|tr!| | d| d|dd  S t|tr=tjj|| 	 t
|}| ||dd  S t|trR| | d| d|dd  S td||S )Nr   r  r  r)   z['z']znon supported index type: )rw   
issubclassrI   codegen_list_tuple_accessrJ   rD   r   r  codegen_tuple_accessr   rk   rK   r  )r   basenamer  ityperq   tuple_accessr^   r^   r_   r  m  s   
 

 
z%MultiOutput.codegen_list_tuple_accessc                 C   s(   | |  | | jd  | j d S r   )codegen_multi_outputr   r  r  r  ro  r^   r^   r_   rp    s   zMultiOutput.codegenr  .c                    s,   t  d ||gd tj| | _|| _d S r0  )r(  rf  rD   r   r  rh   r  )r   r  r,  r  r)  r^   r_   rf    s   
zMultiOutput.__init__r   c                 C   s   | j d  S r   )r  r  r   r^   r^   r_   r    rD  z$MultiOutput.get_unbacked_symbol_usesc                 C   r0  r1  r^   r   r^   r^   r_   r    r3  zMultiOutput.should_allocatec                 C   r  )Nc                 S   s.   g | ]}t |trt| d kr| qS r   )rH   r,  rw   r  r   )rp   inpr^   r^   r_   rt     s    z<MultiOutput.get_inputs_that_alias_output.<locals>.<listcomp>)r  r   r^   r^   r_   r    s   z(MultiOutput.get_inputs_that_alias_output)r   r  r  r  rp  r	   r   r   rf  r   rT   r  r  r  r  rR  r^   r^   r)  r_   r  i  s    r  r   rS   rp  biaspaddingr   dilationgroups
transposedoutput_paddingc
                 C   s  dd }
dd }|   |   |dur|   tjj t|dd}t|dd}t| d }d	t|  k r>|ksAJ  J d	t|  k rN|ksQJ  J d	t|  k r^|ksaJ  J t||}t||}t||}|	du r{td	g|}	nd	t|	  k r|ksJ  J t|	|}	t|t	sJ |r|||}| }|
||||	|||}n|durt|ddn|}t
jj||||||||	|	}| }d	gtttd
t|d
  }t|g| }W d   n1 sw   Y  | ||}tdd |D  }|rt|rt|}nt|}| jdkr$| jdks&J ||g}t| | t|t|}||||g}|rH|d
|	 |durS|| n|d	| ||||fS )au  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU device since conv post-op fusion kernel is only
    supported on CPU right now.
    c                 S   s   t | t |ksJ dt | }|dksJ dd}d}	g }
|
| |  |
||	 |  td|D ]1}|| d ||d   d }| | d ||d   ||d  d  | ||d   }|
| q3ttt|
S )NzExpect input dim == weight dimr   zExpect input dim > 2r   r)   )rw   r  rx   rI   r   r  )output_sizeweight_sizer  r  r   r  r  r  	BATCH_DIMWEIGHT_INPUT_CHANNELS_DIMr;  r  r@  input_size_dr^   r^   r_   _conv_input_size  s(   
z<_prepare_convolution_fusion_create.<locals>._conv_input_sizec                 S   s   |   }t|}|dksJ d|dkr9g }||d |  ||d |  td|D ]	}|||  q-|S | dd  }|S )Nr   zExpect weight dim > 2r)   r   )r   rw   r  rx   	transpose)prepacked_weightr  prepacked_weight_sizer  r  r  r^   r^   r_   _original_deconv_weight_size  s   zH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_sizeNTr  r   r   r)   c                 s   r  rb   )rH   r  ro   r^   r^   r_   r    rc  z5_prepare_convolution_fusion_create.<locals>.<genexpr>r   )r   rD   r   r  r   rw   r   r=   rH   r  rM   rC   rG  convolutionrI   rC  rx   r  r  r  r   r   r   r   rY   r  r   r7   insertr  )r6  r   rp  r  r  r   r  r  r  r  r  r  x_fakeweight_faker  r  r;  r  	bias_fakeri  req_stride_orderdynamic_shapesr<  r  kernel_layoutrP  r^   r^   r_   "_prepare_convolution_fusion_create  s   
   


 

 4	$
r  c                 C   s   |   |   |dur|   | ^ }}| \}}t||g }tttt| }| ||}| jdkrD| jdksFJ ||g}	t	
|}
t| | ||
}g }|durf|	| n|d| |	|||fS )z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    Nr   r   )r   r   rI   rC  rx   rw   r  r   rY   r   r   r  r   r  r  )r6  r   rp  r  mr   ocr  r  r  r<  r  rP  r^   r^   r_   _prepare_linear_fusion_create5  s.    
r  c                       sj   e Zd Z	d fdd	Zdd Zeddddd	dd
ee dee dee dedeee	  fddZ
  ZS )ConvolutionUnaryr^   c                    (   t  j|||d ddd d| _d| _d S )Nz'torch.ops.mkldnn._convolution_pointwisemkldnn::_convolution_pointwiser  convolution_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r(  rf  rV  rW  r  r)  r^   r_   rf  b     
zConvolutionUnary.__init__c                 C   sB   | |  | j| j|  | j| j t| jt	r| 
| d S d S rb   )r{  r   rS  rT  r  rW  rV  rH   r  r  r  ro  r^   r^   r_   rp  ~  s   zConvolutionUnary.codegenr   rS   rp  r  padding_stride_	dilation_r  scalarsc              	   C   s>   t | |||||||\}}}}||t|	|
g }t|||dS )Nr  r  rP  )r  r   r  )r6  r   rp  r  r  r  r  r  attrr  	algorithmr  rP  r  r   r^   r^   r_   r5    s   zConvolutionUnary.creater^   r   r  r  rf  rp  rP  r	   r  r
   r   r5  rR  r^   r^   r)  r_   r  a  s,    

r  c                       s   e Zd Z		d fdd	Zdd Zeddddd	dd
ddee dee dee dedede	e
 de	e de	ee  de	e fddZ  ZS )ConvolutionBinaryr^   c                    s4   t  j|||d ddd d| _d| _d| _|| _d S )Nz.torch.ops.mkldnn._convolution_pointwise.binaryr  r  binaryconvolution_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm))r(  rf  rU  rV  rW  cpp_constant_args)r   r  r  rP  r  r)  r^   r_   rf    s   
zConvolutionBinary.__init__c              	   C   sF   | |  | j| j|  | j| j| j t| j	t
r!| | d S d S rb   )r{  r   rS  rT  r  rW  rV  rU  rH   r  r  r  ro  r^   r^   r_   rp    s   	zConvolutionBinary.codegenr   rS   rd  rp  r  r  r  r  r  binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc              	   C   sZ   t | |||||||\}}}}| ||}|d| ||	|
|t||g }t|||dS )Nr)   r  )r  r  r  r   r  )r6  r   rd  rp  r  r  r  r  r  r  r  r  r  r  r  rP  r  r  r^   r^   r_   r5    s,   zConvolutionBinary.create)r^   r^   )r   r  r  rf  rp  rP  r	   r  rk   r
   rV  r   r5  rR  r^   r^   r)  r_   r    sB    "	

r  c                       s   e Zd Z	d fdd	Zdd Zdd Zdeej fd	d
Z	e
dddddddddee dee dee dededee dee deee  dee fddZ  ZS )ConvolutionBinaryInplacer^   c                    sJ   |d |d g|dd   }t  j|||d ddd d| _d| _d	| _d S )
Nr)   r   r   z/torch.ops.mkldnn._convolution_pointwise_.binaryzmkldnn::_convolution_pointwise_r  r  convolution_pointwise_binary_a  
            at::Tensor&(
                at::Tensor& other_t,
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm)r(  rf  rU  rV  rW  )r   r  r  rP  reordered_inputsr)  r^   r_   rf    s   
z!ConvolutionBinaryInplace.__init__c              	   C   ,   | |  | j| j|  | j| j| j d S rb   r{  r   rS  rT  r  rW  rV  rU  ro  r^   r^   r_   rp  %     z ConvolutionBinaryInplace.codegenc                 C   r  r   r  r   r^   r^   r_   r  0  r  z+ConvolutionBinaryInplace.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r  3  r   z1ConvolutionBinaryInplace.get_unbacked_symbol_defsr   rS   rd  rp  r  r  r  r  r  r  r  r  r  r  c              	   C   s~   t | |||||||\}}}}| ||}|d| ||	|
|t||g }tt|d  ||d}t||d  |jd S )Nr)   )r  r  rP  r   )	r  r  r  r   r  r  r   r  r  )r6  r   rd  rp  r  r  r  r  r  r  r  r  r  r  r  rP  r   r  r  r^   r^   r_   r5  6  s0   
zConvolutionBinaryInplace.creater  )r   r  r  rf  rp  r  r   rT   r  r  rP  r	   r  rk   r
   rV  r   r5  rR  r^   r^   r)  r_   r     sD    $	

r  c                       s4   e Zd Z	d fdd	Zdd Zedd Z  ZS )	MKLPackedLinearr^   c                    r  )Nztorch.ops.mkl._mkl_linearzmkl::_mkl_linearr  
mkl_lineara  
            at::Tensor(
                const at::Tensor& self,
                const at::Tensor& mkl_weight_t,
                const at::Tensor& origin_weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                const int64_t prepack_batch_size)r  r  r)  r^   r_   rf  e  r  zMKLPackedLinear.__init__c                 C   (   | |  | j| j|  | j| j d S rb   r{  r   rS  rT  r  rW  rV  ro  r^   r^   r_   rp  |     zMKLPackedLinear.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}}t||g }	t|	}
|||g}|g}|d ur;||g7 }n|dd  tt|	 |
 |	|
||dS )Nr   r  )r  r  r   rI   r   r   r  r  r  r   r   )r6  r   packed_worig_wB
batch_sizer  r   r  r  r<  r  rP  r^   r^   r_   r5    s$   

zMKLPackedLinear.creater  )r   r  r  rf  rp  rP  r5  rR  r^   r^   r)  r_   r  d  s    
r  c                       s<   e Zd Z	d
 fdd	Zdd Zedd Zdd	 Z  ZS )LinearUnaryr^   c                    r  )Nz"torch.ops.mkldnn._linear_pointwisemkldnn::_linear_pointwiser  linear_pointwiseaL  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r  r  r)  r^   r_   rf    r  zLinearUnary.__init__c                 C   r  rb   r  ro  r^   r^   r_   rp    r  zLinearUnary.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}	}||g}
||r&|ndg|g}|d ur=|  | |}|
| n|dd  tt| | t	||	g d|
|dS )Nr  r   r  r  )
r  r  r   r  r  r  r   r   r   rI   )r6  r   wr  r  r  r  r  icr  r  rP  r^   r^   r_   r5    s&   zLinearUnary.createc                 C   rZ  rb   r^   r   r^   r^   r_   ri    r3  zLinearUnary.apply_constraintr  )	r   r  r  rf  rp  rP  r5  ri  rR  r^   r^   r)  r_   r    s    

r  c                       s@   e Zd ZdZ	d fdd	Zdd Zedd Zd	d
 Z  Z	S )LinearBinary)torch.ops.mkldnn._linear_pointwise.binaryr^   c                    s.   t  j|||d ddd d| _d| _d| _d S )Nr  r  r  r  linear_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr)
        r  r  r)  r^   r_   rf    s   
zLinearBinary.__init__c              	   C   r  rb   r  ro  r^   r^   r_   rp    r  zLinearBinary.codegenc                 C   s   |  | |}|  | |}|  | |}| ^ }}| \}}|||g}	|g}
|d ur?|  | |}|	| n|
d| tt| | t	||g d|	|
dS )Nr   r  r  )
r  r  r   r  r  r  r   r   r   rI   )r6  r   yr  r  r  r  r  r  r  rP  r^   r^   r_   r5    s(   
zLinearBinary.createc                 C   rZ  rb   r^   r   r^   r^   r_   ri     r3  zLinearBinary.apply_constraintr  )
r   r  r  r@  rf  rp  rP  r5  ri  rR  r^   r^   r)  r_   r    s    
r  c                       sr   e Zd Z	d fdd	Zdd Zeddddd	dd
ee dee dee dee dedeee	  fddZ
  ZS )ConvolutionTransposeUnaryr^   c                    r  )Nz1torch.ops.mkldnn._convolution_transpose_pointwisez(mkldnn::_convolution_transpose_pointwiser  convolution_transpose_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef output_padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r  r  r)  r^   r_   rf  %  r  z"ConvolutionTransposeUnary.__init__c                 C   r  rb   r  ro  r^   r^   r_   rp  B  r  z!ConvolutionTransposeUnary.codegenr   rS   rp  r  r  output_padding_r  r  groups_r  c                 C   sF   d}t | |||||||||
\}}}}||	t|
|g }t|||dS )NTr  )r  r   r  )r6  r   rp  r  r  r  r  r  r  r  r  r  r  r  rP  r  r   r^   r^   r_   r5  L  s8   z ConvolutionTransposeUnary.creater  r  r^   r^   r)  r_   r  $  s0    
	
r  c                !       sr   e Zd Z	d fdd	Zeddddddddd	dd
ddddedee dededededededef ddZ  Z	S )MkldnnRnnLayerr^   c                    s   t  j|||d ddd d S )Nzaten.mkldnn_rnn_layerzat::mkldnn_rnn_layerr  r  r  r)  r^   r_   rf  z  s   
zMkldnnRnnLayer.__init__r   rS   w0w1w2w3hxcxr  batch_sizesrj  hidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc              	      sZ  |  |   |  | |}|  | |}|  | |}|  | |}|  | |}|  |  | |}|   }t|dksRJ d|\}}}|||g}| }| }g }||||||g}||	|
||||||g	}tt ||d dd }|||g}|||t	|t	|g} fddt
t||D }|S )Nr   zExpect lstm input to be 3D)r  rP  c                 S   s   t | dks
J dt| S )Nr   zExpect output_shape to be 3D)rw   r   r   )output_shaper  r^   r^   r_   get_strides_of_lstm_output  s   
z9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_outputc                    s8   g | ]\}\}}t t  || t|fgqS r^   )r  r  r   r   rJ   )rp   rq   r  r<  r  r   r^   r_   rt     s    
z)MkldnnRnnLayer.create.<locals>.<listcomp>)r  r  r  r   rw   r  r  r   r   r   r   r}   )r6  r   r  r  r  r  r  r  r  r  rj  r  r  r  r  r  r  r;  
seq_length
mini_batchr  hy_shapecy_shaperesr  rP  r  output_sizesoutput_strides	output_irr^   r  r_   r5    sZ   



zMkldnnRnnLayer.creater  )
r   r  r  rf  rP  r  r	   r  r5  rR  r^   r^   r)  r_   r  y  sJ    	
r  c                       sv   e Zd Z	d fdd	Zdd Zeddded	ed
ddddddddee dee dee dededefddZ	  Z
S )QConvPointWisePT2Er^   c                    s6   t |dk| _t j|||dddd d| _d| _dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        r@  Nz"torch.ops.onednn.qconv2d_pointwiseonednn::qconv2d_pointwiser  qconv2d_pointwisea  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                torch::List<int64_t> stride,
                torch::List<int64_t> padding,
                torch::List<int64_t> dilation,
                int64_t groups,
                double output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm))rw   has_biasr(  rf  rV  rW  r  r)  r^   r_   rf    s   
zQConvPointWisePT2E.__init__c                 C   s   dd | j D }g }||   |d }|d }| jr |d n|d }|d |d }}|dd  \}	}
}}}}}}}}}}||||||||	|
||||||||f}||  | j| j|| j| j	 t
| jtro| | d S d S )	Nc                 S   rs  r^   r  r  r^   r^   r_   rt     r   z.QConvPointWisePT2E.codegen.<locals>.<listcomp>r   r)   r   r  )r  r'  r  r   r{  r   rS  rT  rW  rV  rH   r  r  r  )r   rk  re   
const_argsr   packed_weightr  w_scalew_zpr   r  r  r  x_scalex_zpo_inv_scaleo_zpoutput_dtyper  r  r  r  r^   r^   r_   rp    sd   
zQConvPointWisePT2E.codegenr   rS   r  r  rp  r  r  r  r  r  r  r  r	  output_zero_pointc                 C   s   d}d }t | ||||	||
|||
\}}}}|d u r'|d |d |d< |d< n|d |d |d< |d< |  |  |||g }|||||||t||g }|d ura|tjtjfv s^J ||_t|||dS )NFr   r)   r   r  )r  r   r   rM   float32rX  r   r  )r6  r   r  r  rp  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r  rP  r  r   r^   r^   r_   r5  L  sL   zQConvPointWisePT2E.creater  )r   r  r  rf  rp  rP  rV  r  r	   r5  rR  r^   r^   r)  r_   r    s@    .7	
r  c                       s   e Zd Z	d fdd	Zdd Zdd Zdeej fd	d
Z	e
dddddddddee dee dee deddddfddZ  ZS )QConvPointWiseBinaryPT2Er^   c                    sL   t |dk| _| jrdnd| _t j|||dddd d| _d	| _d
| _dS )a~  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, w, b, accum, w_scale, w_zp]
            - const_args = [stride, padding, dilation, groups, x_scale, x_zp, accum_scale, accum_zp, o_inv_scale, o_zp,
            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, accum, w_scale, w_zp]
            - const_args = const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, accum_scale,
            accum_zp, o_inv_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
           r   r   Nz)torch.ops.onednn.qconv2d_pointwise.binaryr  r  r  qconv2d_pointwise_binarya  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor accum,
                double accum_scale,
                int64_t accum_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                torch::List<int64_t> stride,
                torch::List<int64_t> padding,
                torch::List<int64_t> dilation,
                int64_t groups,
                double output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm))rw   r   idx_for_inplace_sumr(  rf  rU  rV  rW  r  r)  r^   r_   rf    s   
z!QConvPointWiseBinaryPT2E.__init__c                 C   s  dd | j D }g }||   |d }|d }| jr |d n|d }|d |d |d }}}	|d	d  \}
}}}}}}}}}}}}}}}|||||||||	||
|||||||||||f}||  | j| j|| j| j	| j
 t| jtr| | d S d S )
Nc                 S   rs  r^   r  r  r^   r^   r_   rt     r   z4QConvPointWiseBinaryPT2E.codegen.<locals>.<listcomp>r   r)   r   r  r  i)r  r'  r  r   r{  r   rS  rT  rW  rV  rU  rH   r  r  r  )r   rk  re   r  r   r  r  accumr  r  r   r  r  r  r  r  accum_scaleaccum_zpr	  r
  r  r  alphar  r  r  	conv_argsr^   r^   r_   rp    sx   
	z QConvPointWiseBinaryPT2E.codegenc                 C   s   | j | j  gS rb   )r  r  r   r   r^   r^   r_   r    r   z+QConvPointWiseBinaryPT2E.get_mutation_namesr   c                 C   r  rb   r  r   r^   r^   r_   r  
  r   z1QConvPointWiseBinaryPT2E.get_unbacked_symbol_defsr   rS   r  rp  r  r  r  r  r  r	  r  c                 C   s   d}d }t | |||
||||||
\}}}}| ||}|| |
d u r2|d |d |d< |d< n|d |d |d< |d< |  |	  |||	g }|||||||||||t||g }|dksgJ dtt| ||d}t|| |j	|j
 S )NFr   r)   r   ru  zCFor now, only post op sum is supported in QConvPointWiseBinaryPT2E.r  )r  r  r  r   r   r  r  r   r  r  r  )r6  r   r  r  r  r  r  rp  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r  r  r  rP  r  r  r  r^   r^   r_   r5    sf   



zQConvPointWiseBinaryPT2E.creater  )r   r  r  rf  rp  r  r   rT   r  r  rP  r	   r  r5  rR  r^   r^   r)  r_   r    s8    6@r  c                       s^   e Zd Z			d fdd	Zdd Zedd	d
ededd	dd	dd	dd	dedefddZ  Z	S )QLinearPointwisePT2Er^   TFc                    h   || _ || _t j|||d|rdnddd |rdnd| _d| _|r$d	nd
\}}d| d| d| _dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        Nz)torch.ops.onednn.qlinear_pointwise.tensorz*torch.ops.onednn.qlinear_pointwise.defaultonednn::qlinear_pointwiser  r  r   qlinear_pointwise
at::Tensorr  doubleint64_tI
            at::Tensor(
                at::Tensor act,
                 act_scale,
                a   act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                double output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view post_op_name,
                torch::List<c10::optional<at::Scalar>> post_op_args,
                c10::string_view post_op_algorithm)r   x_scale_zp_are_tensorsr(  rf  rU  rV  rW  r   r  r  rP  r   r$  x_scale_type_strx_zp_type_strr)  r^   r_   rf  f  s.   zQLinearPointwisePT2E.__init__c                 C   s(  dd | j D }g }||   |d }|d }| jr |d n|d }|d |d }}| jrNt|dks8J |d	 |d
 }	}
|dd  \}}}}}}nt|dksVJ |dd  \}	}
}}}}}}||	|
||||||||||f}||  | j| j	|| j
| j| j t| jtr| | d S d S )Nc                 S   rs  r^   r  r  r^   r^   r_   rt     r   z0QLinearPointwisePT2E.codegen.<locals>.<listcomp>r   r)   r   r  r  r   r  ir  ir  r'  r  r   r$  rw   r{  r   rS  rT  rW  rV  rU  rH   r  r  r  )r   rk  re   r  r   r  r  r  r  r  r  r	  r
  r  r  r  r  r  r^   r^   r_   rp    sn   


	zQLinearPointwisePT2E.codegenr   rS   r  r  rp  r  r  r  r	  r  c                 C   s   t | |||\}}}}t|tr&t|tr&|  |  |||g }d}nt|tr0t|ts2J |||g }d}|  |  |||g }|||	|
|t||g }|
d ure|
tjtj	fv sbJ |
|_
t||||d u|dS )NTFr  r  rP  r   r$  )r  rH   rS   r   rV  r  r   rM   r  rX  r   r  )r6  r   r  r  rp  r  r  r  r	  r  r  r  r  r  r  rP  r  r   r$  r^   r^   r_   r5    sF   	zQLinearPointwisePT2E.creater^   TF
r   r  r  rf  rp  rP  rV  r  r5  rR  r^   r^   r)  r_   r  e  s4    7=	
r  c                       sb   e Zd Z			d fdd	Zdd Zedd	d
ededd	dd	dd	dd	dededd	fddZ  Z	S )QLinearPointwiseBinaryPT2Er^   TFc                    r  )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp, x2]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp, x2]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        Nz0torch.ops.onednn.qlinear_pointwise.binary_tensorz)torch.ops.onednn.qlinear_pointwise.binaryr  r  binary_tensorr  qlinear_pointwise_binaryr  r  r!  r"  a   act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                double inv_output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::optional<at::Tensor> other,
                double other_scale,
                int64_t other_zero_point,
                c10::string_view binary_post_op,
                double binary_alpha,
                c10::string_view unary_post_op,
                torch::List<c10::optional<at::Scalar>> unary_post_op_args,
                c10::string_view unary_post_op_algorithm)r#  r%  r)  r^   r_   rf    s0   
z#QLinearPointwiseBinaryPT2E.__init__c                 C   sL  dd | j D }g }||   |d }|d }| jr |d n|d }|d |d |d }}}	| jrWt|d	ks=J |d
 |d }
}|dd  \
}}}}}}}}}}nt|dks_J |dd  \}
}}}}}}}}}}}||
|||||||||	|||||||f}||  | j| j	|| j
| j| j t| jtr| | d S d S )Nc                 S   rs  r^   r  r  r^   r^   r_   rt   W  r   z6QLinearPointwiseBinaryPT2E.codegen.<locals>.<listcomp>r   r)   r   r  r  r  r@  r(  ir  r  r)  )r   rk  re   r  r   r  r  r  r  rd  r  r  r	  r
  r  other_scaleother_zpr  r  r  r  r  r  r^   r^   r_   rp  U  s   

	z"QLinearPointwiseBinaryPT2E.codegenr   rS   r  r  rp  r  r  r  r	  r  rd  c                 C   s@  t | |||\}}}}t|tr&t|tr&|  |  |||g }d}nt|tr0t|ts2J |||g }d}|  |  |||g }|dkrR| ||}|| |||	|
|||||t||g
 }|dkrt	t
| |||d u|d}t|| |jd S |
d ur|
tjtjfv sJ |
|_t	||||d u|dS )NTFru  r*  r  )r  rH   rS   r   rV  r  r  r  r   r-  r  r   r  r  rM   r  rX  r   )r6  r   r  r  rp  r  r  r  r	  r  r  rd  r1  r2  r  r  r  r  r  r  rP  r  r  r$  r  r^   r^   r_   r5    sr   



z!QLinearPointwiseBinaryPT2E.creater+  r,  r^   r^   r)  r_   r-    s8    >J	
r-  c                   @   s|   e Zd ZU dZeed< dd Zdd Zdee	j
 fdd	ZdddZedd Zdd Zdd Zedd Zdd ZeZd
S )r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    r  c                 C   s4   t | j|}t|r|S tt| jj d| d)NrH  z not callable)rc   r  callableAttributeErrorrY   r   )r   rh   ri   r^   r^   r_   __getattr__   s   zMutableBox.__getattr__c                 C   r  rb   r  r   r^   r^   r_   r     r  zMutableBox.realizer   c                 C   r  rb   r  r   r^   r^   r_   r  	  r  z#MutableBox.get_unbacked_symbol_usesNc                 C   r  rb   )r  r   r   r^   r^   r_   r     r   zMutableBox.codegen_referencec                 C   r  rb   r  r   r^   r^   r_   r    r  zMutableBox.layoutc                 C   r   rb   r  r   r^   r^   r_   r     r   zMutableBox.get_layoutc                 C   r  rb   )r  r   r   r^   r^   r_   r     r  zMutableBox.get_sizec                 C   r  rb   r  r   r^   r^   r_   r     r  zMutableBox.dtypec                 C   sn   t | jtrt| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr  z))r?  
)rH   r  r  rY   r   r   rk   r   )r   line0endlr  r   r^   r^   r_   r%    s   


zMutableBox.__str__rb   )r   r  r  r  r   r  r5  r   r   rT   r  r  r   r  r  r   r   r   r%  rO  r^   r^   r^   r_   r    s   
 


r  c                   @   s   e Zd Zedd ZdS )rS   c                 C   r   rb   )rS   r  )r  r^   r^   r_   r5  2  r  zTensorBox.createN)r   r  r  r	  r5  r^   r^   r^   r_   rS   1  s    c                   @   sT   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Ze	dd Z
e	dd ZdS )r  c                 C   s&   t | jttfr| j tjjv S dS r1  )rH   r  r  r  r   rD   r   graph_inputsr   r^   r^   r_   r7  8  s   zStorageBox.is_input_bufferc                 C   s   t | jto| j tjjv S rb   )rH   r  re  r   rD   r   r  r   r^   r^   r_   r  =  s   zStorageBox.is_module_bufferc                 C   s   t | jtttttfr| j S t | jtt	t
fs J t| j| j }| j }td t| j | j | j d| jd| _tj| j| j_| j| j_|| j_|| j_| jjS )Nr  r  )rH   r  r  r  r  r  r  r   rX  r  rx  rY   r.  r   r   r   r   r   rD   r   r  rh   r   r#  r   )r   r#  r   r^   r^   r_   r   C  s6   

 

	
zStorageBox.realizec                 C   s<   t | jttfr|  dkr|  r|   dS dS dS dS )zL
        Called on buffers we expect to be forced to realize later.
        r)   N)rH   r  rX  r  r  8is_pointwise_non_scalar_tensor_num_reads_larger_than_oner   r   r^   r^   r_   r  a  s   zStorageBox.realize_hintc                 C   s"   t | jto|  tjkp|  S rb   )rH   r  rX  r  r*   realize_acc_reads_thresholdrH  r   r^   r^   r_   r   l  s   z!StorageBox.has_exceeded_max_readsc                 C   st   dt ttf fdd}|dkr2t| jttfr4|  tjks,|  s,t	| jr6|| jr8| 
  dS dS dS dS dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        loopsc                    s$   dg}|    t fdd|D S )zW
            The heuristic for realizing reused result of heavy ops on cpu
            expc                 3   s    | ]	}|d   v V  qdS )r  Nr^   )rp   opfn_strr^   r_   r  ~  r  zGStorageBox.mark_reuse.<locals>.should_realize_on_cpu.<locals>.<genexpr>)r"  rq  )r<  	heavy_opsr^   r?  r_   should_realize_on_cpux  s   z4StorageBox.mark_reuse.<locals>.should_realize_on_cpur)   N)r   rX  r  rH   r  r  r*   realize_reads_thresholdrH  r   r   )r   r  rB  r^   r^   r_   r  r  s   	zStorageBox.mark_reusec                 C   sz   | j }t|tttfrdS t|tr| }n!t|ttfs$J t	|td t
| | | d|d }t|jS )Nr)   r  r  )r  rH   r  r  r  r  r  rX  r  rY   r   r   r   r   rw   rK  )r   r  r  r^   r^   r_   r    s$   


	zStorageBox.num_readsc                 C   sD   t | jtr tdd | j D r tdd | j D dkS dS )Nc                 s   s    | ]
}t |tj V  qd S rb   )rH   r+   r  rp   readr^   r^   r_   r    s
    
zVStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one.<locals>.<genexpr>c                 s   s    | ]}|j d kV  qdS )r   Nry   rD  r^   r^   r_   r    rc  r)   T)rH   r  rX  r  r   ru  r   r^   r^   r_   r:    s   
zCStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_oneN)r   r  r  r7  r  r   r  r   r  r5   r  r:  r^   r^   r^   r_   r  7  s    
r  c                   @   s2   e Zd ZU eed< ejjed< dZe	d ed< dS )Subgraphrh   graph_moduleNrE   r   )
r   r  r  rk   r  rM   r  GraphModuler   r
   r^   r^   r^   r_   rF    s   
 rF  c                 C   s(   dd | D } t dd | D t | k S )Nc                 S   s"   g | ]}t |tr| n|qS r^   )rH   r  r  rp   r  r^   r^   r_   rt     s    z(_has_aliased_buffers.<locals>.<listcomp>c                 S   s   h | ]}t |qS r^   )idrI  r^   r^   r_   r     r   z'_has_aliased_buffers.<locals>.<setcomp>r  )buffersr^   r^   r_   _has_aliased_buffers  s   rL  c                
       s   e Zd ZU dZee ed< dZeee	  ed< dZ
ee ed< dZee ed< dZeee  ed< dedee	 dededef
 fdd	Zede	d
ededee	 fddZdd Z  ZS )ConditionalN	predicateoperandstrue_subgraphfalse_subgraphr:  r  c                    s^   || _ || _|| _|| _g }t|ts|| || t j	d ||d t
j| | _d S Nr6  )rN  rO  rP  rQ  rH   r  r  r'  r(  rf  rD   r   r  rh   )r   rN  rO  rP  rQ  r  r  r)  r^   r_   rf    s   


zConditional.__init__true_fnfalse_fnc              	      s"    |} fdd|D }tjjjd }dd |D }||fD ]/}|jd u rOtjj|j||jd|_t|j |jj	|  W d    n1 sJw   Y  q |jj
}|jj
}	d|fd|	ffD ]\}
}t|rrtd|
 d	| q`t|t|	ksJ ||	ftt||	D ]S\}\}}| | ksJ |||f| | ksJ |||f| | ksJ |||f| | ksJ |||f| j| jksJ |||fqt|ts| }nt|d
ksJ d|d
  }t||||t|dfddt|D }|_|S )Nc                    rF  r^   r  r  r  r^   r_   rt     r   z&Conditional.create.<locals>.<listcomp>r  c                 S      g | ]}|j d  qS r   rP  r  r^   r^   r_   rt     r   gmexample_inputssubgraph_namerS  rT  zVOutput aliasing is currently not supported in compiled torch.cond. The outputs of the z% subgraph of torch.cond are aliased: r   zQWhen predicate is not a Tensor, there must be at least one operand in torch.cond.)rN  rO  rP  rQ  r  c              
      F   g | ]\}}t t| | | | | jd  t|fgqS rc  	r  r  r   r   r   r   r   r  rI   rp   rq   ri  )conditionalr^   r_   rt     s    )r  rD   r   r  re   make_subgraphrG  rh   set_graph_handlerrungraph_outputsrL  r  rw   r   r}   r   r   r   r   r   r  rH   r  rM  r  r:  )r6  rN  rS  rT  rO  fx_operandsfake_operandssubgraphtrue_outputsfalse_outputsrh   r:  rq   tofor   r^   )r6  r_  r_   r5    sh   

$


zConditional.createc                 C   r%  rb   )codegen_conditionalro  r^   r^   r_   rp  1  rD  zConditional.codegen)r   r  r  rN  r
   r   r  rO  r	   rS   rP  rF  rQ  r:  r  r  rf  rP  r5  rp  rR  r^   r^   r)  r_   rM    s8   
 QrM  c                
       s   e Zd ZU dZeee  ed< dZeee  ed< dZ	ee
 ed< dZee
 ed< dZeee  ed< dee dee de
de
def
 fdd	Zed
e
de
dee dee fddZdd Z  ZS )	WhileLoopNcarried_inputsadditional_inputscond_subgraphbody_subgraphr:  r  c                    s@   || _ || _|| _|| _t jd ||| d tj| | _	d S rR  )
rm  rn  ro  rp  r(  rf  rD   r   r  rh   )r   rm  rn  ro  rp  r  r)  r^   r_   rf  =  s   zWhileLoop.__init__cond_fnbody_fnc              	      s   fdd|D } fdd|D }|| }t jjjd t jjjd  }dd |D }||fD ]/}|jd u r^t jj|j||jd|_t |j |jj|  W d    n1 sYw   Y  q/|jj	}	|jj	}
t
|
rrtd|
 t|	d	ks|J |	|	d
  tjksJ |	t|	d
  d
ksJ |	t|d
ksJ d|d
  }t|t|
ksJ ||
ftt||
D ]]\}\}}| | ksJ |||f| | ksJ |||f| |   kr|ksn J ||||f| | ksJ |||f| j| jksJ |||fqt||||t|dfddt|
D }t||D ]\}}| t jjv rKt jj|  q5|_|S )Nc                    rF  r^   r  r  r  r^   r_   rt   Z  r   z$WhileLoop.create.<locals>.<listcomp>c                    rF  r^   r  r  r  r^   r_   rt   [  r   r  r  c                 S   rU  r   rV  r  r^   r^   r_   rt   _  r   rW  zOutput aliasing is currently not supported in compiled torch.while_loop. The outputs of the body_fn subgraph of torch.while_loop are aliased: r)   r   z9torch.while_loop is assumed to have at least one operand.)rm  rn  ro  rp  r  c              
      r[  r\  r]  r^  )
while_loopr^   r_   rt     s    )rD   r   r  re   r`  rG  rh   ra  rb  rc  rL  r  rw   r   rM   r  r   r   r   r}   r   r   r  rl  r  r   r9  r  ro  r:  )r6  rq  rr  rm  rn  
all_inputsfx_all_inputsfake_all_inputsrf  cond_outputsbody_outputsr   rq   r>  bor:  r  r   r^   )r6  rs  r_   r5  R  sl   
. &
	zWhileLoop.createc                 C   r%  rb   )codegen_while_loopro  r^   r^   r_   rp    rD  zWhileLoop.codegen)r   r  r  rm  r
   r	   rS   r  rn  ro  rF  rp  r:  r  r  rf  rP  r5  rp  rR  r^   r^   r)  r_   rl  5  s8   
 \rl  c                       s:   e Zd Z	d	dd fddZ fddZdd Z  ZS )
rX   Nr-  c          
   	      sp   t  j|||||d |d ddlm} ||g ||R |}	|	d us$J |	| _tjj|	d | _	| tjj|	< d S )N)rf   rY  r   )get_effect_key)
r(  rf  torch._higher_order_ops.effectsr{  effect_typerD   r   effectful_opsrU  prev_effect_buffer)
r   r  r@  r  rC  r  rf   rY  r{  r}  r)  r^   r_   rf    s   
zEffectfulKernel.__init__c                    s0   t   }| jd ur|jt| j  |S rb   )r(  r  r  rK  ro  r+   r  r   )r   r  r)  r^   r_   r    s   

zEffectfulKernel.get_read_writesc                 C   r0  r  r^   r   r^   r^   r_   r    r3  z EffectfulKernel.has_side_effectsrb   )r   r  r  rf  r  r  rR  r^   r^   r)  r_   rX     s    	
rX   c                   @   s<   e Zd ZU eed< ejjed< dd Zdd Z	d
dd	Z
dS )r.  rh   r   c                 C   r   rb   rg   r   r^   r^   r_   r     r   zTorchBindObject.get_namec                 C   rZ  rb   r^   r   r^   r^   r_   r     r3  zTorchBindObject.get_deviceNc                 C   r   rb   rg   r   r^   r^   r_   r     r   z!TorchBindObject.codegen_referencerb   )r   r  r  rk   r  rM   _CScriptObjectr   r   r   r^   r^   r^   r_   r.    s   
 r.  c                       sX   e Zd Zeeddd Z fddZdej	j
def fdd	Z fd
dZ  ZS )InterpreterShimNc                   C   s   t jtS rb   )rM   r  symbolic_tracer   r^   r^   r^   r_   	_dummy_gm  s   zInterpreterShim._dummy_gmc                    s>   t  j|  dd | | _|| _|| _d| _|j| _d | _	d S )NF)garbage_collect_values)
r(  rf  r  moduler   
submodulesextra_tracebackrs  
fetch_attrr  r   r   r  r)  r^   r_   rf    s   
zInterpreterShim.__init__r9  r   c                    s   || _ t |S rb   )r  r(  run_node)r   r9  r)  r^   r_   r    s   zInterpreterShim.run_nodec                    s@   t |  t j|i |W  d    S 1 sw   Y  d S rb   )rD   set_interpreter_handlerr(  rb  )r   re   rf   r)  r^   r_   rb    s   $zInterpreterShim.run)r   r  r  r	  r  	lru_cacher  rf  rM   r  r  r   r  rb  rR  r^   r^   r)  r_   r    s    r  c                       sx   e Zd ZdZ fddZedd Zedd Zdd	 Zd
e	j
fddZdd Zdd Zdd Zdd Zdd Z  ZS )r  z
    Captures the body of a Loops subclass into an FX graph.  Persists any
    indexing simplifications and makes it easier to analyze loop bodies.
    c                    sj   t    || _i | _i | _g | _g | _i | _i | _g | _	d| j
i| _i | _g | _t| ||| _d | _d S )N	get_index)r(  rf  r  r  indexing_exprs_namerK  writesr  r  rd  r  r  	subblocksindirect_varsLoopBodyBlock
root_blockindexing)r   ri   re   r  r)  r^   r_   rf    s   

zLoopBody.__init__c                 C   s0   t | jjfdd | j D }dd |D S )Nc                 s   s    | ]}|j V  qd S rb   )r   )rp   blockr^   r^   r_   r  "  s    z%LoopBody.get_nodes.<locals>.<genexpr>c                 S   s   g | ]
}|j D ]}|qqS r^   )rZ   )rp   r   r[   r^   r^   r_   rt   $  r  z&LoopBody.get_nodes.<locals>.<listcomp>)r  rV  r  r   r  rL   )r   
all_graphsr^   r^   r_   	get_nodes  s
   zLoopBody.get_nodesc                 C   s   ddl m} || S )Nr)   )	BoundVars)boundsr  )r   r  r^   r^   r_   r  &  s   zLoopBody.boundsc                 C   s`   dt | j g}|dd | j D  |dd td| jfg| j D  d	|S )Nzvar_ranges = c                 S   s   g | ]\}}| d | qS )rJ  r^   )rp   rh   r!  r^   r^   r_   rt   /  r  z&LoopBody.debug_str.<locals>.<listcomp>c                 S   s   g | ]	\}}| |qS r^   )	debug_str)rp   rh   r  r^   r^   r_   rt   1  s    r?  r6  )
rK   r  r'  r  r  r  rV  r  r  r   r   r^   r^   r_   r  -  s   
zLoopBody.debug_strr  c                 C   sd   t | || |d ur|t | | d|< || jvr-dt| j }|| j|< || j|< | j| S )N
_name2exprrr   )rc   r  r  rw   r  )r   r  categorybuf_namerh   r^   r^   r_   add_index_expr:  s   



zLoopBody.add_index_exprc                 C   s<   |d   r|| jvr|}n	| t| j }|| j|< |S )zaNot actually for nn.Modules, but subblocks in generated code are mapped to FX call_module opcodesr  )	isnumericr  rw   )r   r  rG   rh   r^   r^   r_   add_submoduleD  s
   
zLoopBody.add_submodulec                 C   s"   t tjt| j}| j| |S rb   )r@   r(   INDIRECTrw   r  r  )r   r   r  r^   r^   r_   add_indirectM  s   zLoopBody.add_indirectc                    sB   t t  kr
dS | jdusJ  fdd| j D | _dS )z,Swap in a variable used in indirect indexingNc                    s    i | ]\}}|t | iqS r^   r  r  newr   r^   r_   r   W  r   z-LoopBody.replace_indirect.<locals>.<dictcomp>)rk   r  r  )r   r   r  r^   r  r_   replace_indirectR  s    zLoopBody.replace_indirectc                 C   s   | j d usJ | j | S rb   )r  r   r^   r^   r_   r  Y  r-  zLoopBody.get_indexc                    s   t tj|}t|tjksJ |jftfdd|D s%J ttj	 |  fddj
 D _ }d _|S )Nc                 3   s    | ]}| j vV  qd S rb   )r  r  r   r^   r_   r  `  rc  z$LoopBody.__call__.<locals>.<genexpr>c                    s   i | ]
\}}|t | qS r^   r  )rp   rh   r  r  r^   r_   r   b  rl  z%LoopBody.__call__.<locals>.<dictcomp>)rI   r  rV  from_iterablerw   r  r  rK   r}   r  r  r  r  r  )r   r  rr   r}  r^   )r  r   r_   __call__]  s    
zLoopBody.__call__)r   r  r  r  rf  r5   r  r  r  rT   r   r  r  r  r  r  r  rR  r^   r^   r)  r_   r    s    


	r  c                   @   sD   e Zd ZdZdededef dee fddZdd	 Z	dddZ
dS )r  a  
    Captures the body of a Loops subclass into an FX graph.
    In normal cases there will be a 1:1 mapping between LoopBody and
    LoopBodyBlock, hower in the case of ops.masked() the masked out
    operations will manifest as an extra LoopBodyBlock.
    r?  ri   .re   c           	         s   |_ dfdd	 G  fdddtj}tj tjjjd_	dddi }d	d
l
m} d	dlm} |||j j}tjrM||j j}t| t||  W d    n1 sdw   Y  j_d S )Nc              	      s    dd j| ||fi S )Ncall_moduler  )create_proxyr?  r  )r  r  r  r   tracerr^   r_   	add_indexv  s   z)LoopBodyBlock.__init__.<locals>.add_indexc                       s  e Zd Zd_dedejf fddZd% fdd	Z fd	d
Z	dd Z
 fddZ fddZdedejdejdef fddZededef ffddZedeeedf eedf geedf f ffddZdd Zed&fd!d"	Zefd#d$ZdS )'z/LoopBodyBlock.__init__.<locals>.CaptureIndexingCaptureIndexingrh   rr   c                    s    |d|}| j ||S )NrK  )_innerr  )r   rh   rr   r  r^   r_   r    s   z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.loadNc                    s    |d|}| j ||||S Nr  )r  r[  )r   rh   rr   r   rj  r  r^   r_   r[    s   z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.storec                    s    |d|}| j |||S r  )r  r  )r   rh   rr   r   r  r^   r_   r    s   z?LoopBodyBlock.__init__.<locals>.CaptureIndexing.store_reductionc                    s8   | j |||| d|v rt fddtdD S  S )Nr\  c                 3       | ]} | V  qd S rb   r^   ro   r}  r^   r_   r    r  zLLoopBodyBlock.__init__.<locals>.CaptureIndexing.reduction.<locals>.<genexpr>r   )r  r  rJ   rx   )r   r   r  r  r   r^   r  r_   r    s   z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.reductionc                    s:   t |ttjfr| jt||S  |d}| j||S Nrd  )rH   r  rT   r   r  rU  r  )r   rr   r   r  r^   r_   r    s   
z:LoopBodyBlock.__init__.<locals>.CaptureIndexing.index_exprc                    s&    |d} |d}| j ||||S r  )r  check_bounds)r   rr   r   r!  r"  r  r^   r_   r    s   

z<LoopBodyBlock.__init__.<locals>.CaptureIndexing.check_boundsoffsets_nameoffsets_sizeindexing_dtyper7  c                    s    |d}| j |||||S r  )r  	bucketize)r   rL   r  r  r  r7  r  r^   r_   r    s   

z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.bucketizemasked_body.c                    sH    fdd}j |d}tj |g   j j|< d|| |fi S )zb
                Recursively capture the masked out body in another LoopBodyBlock
                c                       t j|  |S rb   )rD   rC   rA  )r  rd  subblockr^   r_   shim  r  zDLoopBodyBlock.__init__.<locals>.CaptureIndexing.masked.<locals>.shimmasked_subblockr  )r?  r  r  r  r  )
mask_proxyr  other_proxyr  rh   r  r  r_   rA    s   z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.maskedr  c                    sL    fdd}j |d}d|| |fi tfddtt|D S )Nc                    r  rb   )rD   rC   r{  )rz  rL   )r  r^   r_   r    r  zBLoopBodyBlock.__init__.<locals>.CaptureIndexing.scan.<locals>.shimr{  r  c                 3   r  rb   r^   ro   r  r^   r_   r    r  zGLoopBodyBlock.__init__.<locals>.CaptureIndexing.scan.<locals>.<genexpr>)r?  r  r  rJ   rx   rw   )dtype_proxyr  value_proxyr  rh   r  )r  r}  r_   r{    s   z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.scanc                 S   s   | j |}|d |d fS r  )r  frexp)r   r  r}  r^   r^   r_   r    s   z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.frexpTc                    sD   j  fdd}dj |d | fi  S )z
                Flow data from tensors into indexing formulas.
                Introduce a call_module to update the indexing.
                c                    s   j tj|   d S rb   )r?  r  rD   rC   indirect_indexing)new_var)checkr   r   r  r^   r_   set_indirect  s   zWLoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexing.<locals>.set_indirectr  set_)r?  r  r  r  )index_proxyr   r  r  r  )r  r   r  r_   r    s   zALoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexingc                    s     dd| fi  d S )Nri  )r  r  )r  r^   r_   ri    s   z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.outputrb   T)r   r  r  rh   rk   rT   r   r  r[  r  r  r  r  rM   r   r  r  r	  r   r   rA  r   r{  r  r  ri  r^   r  r   r  r^   r_   r  ~  s<    "r  )
tracer_clsplaceholderrC   r^   r)   )IndexPropagation)SimplifyIndexingrb   )r?  rD   WrapperHandlerrM   r  TracerGraphr*  r   r  index_propagationr  r   r  r  r*   constant_and_index_propagationr>  rC   ri  )	r   r?  ri   re   r  	proxy_opsr  r  handlerr^   r  r_   rf  s  s"   
szLoopBodyBlock.__init__c                 C   s"   | j }| jj}t||t S rb   )r   r?  r  r  rb  rD   get_ops_handlerr  r^   r^   r_   r     s   zLoopBodyBlock.__call__r  c              
   C   s8   t j| jj| jj}tdd|	 
dd| dS )Nz;[^\n]*r   zdef forward(zdef r  )rM   r  rH  r?  r  r   coderesubstriprM  )r   rh   r  r^   r^   r_   r  
   s   zLoopBodyBlock.debug_strN)r  )r   r  r  r  r  r   r   r	   rf  r  r  r^   r^   r^   r_   r  k  s    " r  c                   @   sd   e Zd Zdd Zdd Zdd Zedeee	e f dd	fd
dZ
edeee	e f fddZd	S )_CollectiveKernelc                 C   r0  r1  r^   r   r^   r^   r_   r     r3  z!_CollectiveKernel.should_allocatec                 C   r0  r  r^   r   r^   r^   r_   r     r3  z"_CollectiveKernel.has_side_effectsc                 C   s^   ddl m} |jj| _|jj| _| jdd d| j | _||| _	dd |jj
D | _d S )Nr)   rM  rQ  r   c                 S   r^  r^   r_  r  r^   r^   r_   rt   %   r`  z4_CollectiveKernel.set_cpp_kernel.<locals>.<listcomp>)rR  rN  ra  rh   rT  rT  rU  rM  rV  rW  rb  rU  rX  r^   r^   r_   rY     s   


z _CollectiveKernel.set_cpp_kernelr  r   Nc                 O   s   |j }|dd}tjj | j||g|R i |\}}}	}
}W d    n1 s*w   Y  |r:J | d| |D ]}|  q<| t|d  |||	|
}||_	||_
t|gt|R   d S )NrQ  rH  r  r   )_namerM  rD   r   r  r  r   r  r   rT  rS  r  rx  tree_leaves)r6  r@  r  re   rf   rT  rS  r  r  r  r  rY  
tensor_argr  r^   r^   r_   create_inplace/   s0   

z _CollectiveKernel.create_inplacec                    s
  |j }|dd}tjj  j||g|R i |\}}}	}
}W d    n1 s*w   Y  |r:J | d| |D ]}|  q<t|trn 	||} t
||||	|
|_|_ fddt|D _jS   ||||	|
|_|_g_S )NrQ  rH  r  c                    s(   g | ]\}}t  |t|fgqS r^   )r  r~  rI   )rp   rq   r  r6  r  r^   r_   rt      s    z9_CollectiveKernel.create_out_of_place.<locals>.<listcomp>)r  rM  rD   r   r  r  r   rH   rI   r`  r  rT  rS  r   r:  r~  )r6  r@  r  re   rf   rT  rS  r  r  r  r  rY  r  r   r^   r  r_   create_out_of_placec   sP   


z%_CollectiveKernel.create_out_of_place)r   r  r  r  r  rY  rP  r   rS   r	   r  r  r^   r^   r^   r_   r     s    3r  c                       s:   e Zd Zdd ZededdfddZ fdd	Z  ZS )
_WaitKernelc                 C   s`   | j d }t|tr|j d gS t|tr.|j d }t|tr,|jd \}}|j | gS g S g S r   )r  rH   r  r  r  )r   r  collr   r   r^   r^   r_   get_volatile_reads   s   




z_WaitKernel.get_volatile_readsr  r   Nc           	      C   sz   t jj | ||\}}}}}W d    n1 sw   Y  |r*J | d| | t| ||||}t|| d S )Nr  )rD   r   r  r  r  r   r  )	r6  r@  r  r  r  r  r  rY  r  r^   r^   r_   create_wait   s$   


z_WaitKernel.create_waitc                    s6   t   }|  }|D ]}|jt|  q|S rb   )r(  r  r  rK  ro  r+   r  r   )r   r  volatile_readsvrr)  r^   r_   r     s
   
z_WaitKernel.get_read_writes)	r   r  r  r  rP  rS   r  r  rR  r^   r^   r)  r_   r     s
    r  c                 C   s`   t | ttjfrt| S t | ttfr#t }| D ]}|t|O }q|S t | t	j
r-t| S t S rb   )rH   r$   rT   r   r!   rJ   rI   r   r  rM   r  )r   r8  r   r^   r^   r_   r     s   r  r  )TFNF)FN)r   rS   rp  rS   r  rS   (  r)  r
  r  r  r  loggingr  textwrapr   r   r   typingr   r   r   r   r   r	   r
   r   r   r   r   r   unittest.mockr   rT   r   r   torch._export.serde.schema_exportserderD  r+  torch._loggingrM   torch.fxtorch.utils._pytreer?  _pytreerx  torch._dynamo.device_interfacer   torch._dynamo.utilsr   torch._export.serde.serializer   *torch._higher_order_ops.auto_functionalizer   torch._inductorr   torch._prims_commonr   r   r   r   r   torch._subclasses.fake_tensorr   %torch.fx.experimental.symbolic_shapesr   r   r    r!   r"   r#   r$   torch.utils._sympy.functionsr%   r&   r'   torch.utils._sympy.symbolr(   r   r*   r+   codegen.commonr,   r-   r.   r/   r0   ops_handlerr1   runtime.hintsr2   runtime.runtime_utilsr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   virtualizedrC   rD   r   rE   	getLoggerr   r  r   rG  ra   rl   r   r   r   rm  ro  r   r  r   r   r   r   r   r   r   r^  r  rW  rX  rg  r  r  r  r]  r^  rx  r  r   r  r  r  r  r  rP   r  r  r  rH  r  r  r/  r1  r5  r9  r;  r  r  r   r{  r  r  r  r  re  r  r  r  r  r  rV  r  rk   r  r  r  r  r  r!  r  r1  r2  rO  r  r  r  rE  r  r  r  r  r  r  r
  r  r  r  r  rQ   rR   r*  _embedding_bagrE  _fft_c2c'_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention
_scaled_mmaddmmr   bmmcopy_mmrepeat_interleaver  nonzeror|  r   view_as_realry  r,  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r-  r  rS   r  rF  rL  rM  rl  rX   r.  r  Interpreterr  r  r  r  r  r  r^   r^   r^   r_   <module>   s  8$	D
"	
\u
5     Z   6(WF): KK		 U$P   36"&$19     [)(r-3K0#0   i9	


 ,FYd:@FUm + V 2 c8v	w},c * 7