o
    "i                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZG ddl6mHZHmIZImJZJmKZKmLZLmMZMmNZNmOZO ddlPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZY ddlZm[Z[ ddl\m]Z]m^Z^m_Z_m`Z`maZambZb ddlcmdZdmeZe erd dlfmgZg eheiZjejkleidZmejkleidZnejojpZpeq Zre5s rFd dltmuZu ndd Zud d! Zvd"d# Zwd$d% Zxd&d' Zyd(d) ZzG d*d+ d+ej{j|Z}dS ),    N)defaultdict)contextmanager)
AnyCallableDefaultDictDictListOptionalSetTupleTYPE_CHECKINGUnion)get_decompositions)defakedynamo_timed)
LazyStringtrace_structured)make_channels_last_strides_for)
FakeTensor)BackwardState)magic_methodsmethod_to_operator)free_unbacked_symbolshas_free_symbolsresolve_unbacked_bindingsRuntimeAssertShapeEnvSymTypes)no_dispatch   )configir)DeviceOpOverridesget_device_op_overridesget_scheduling_for_deviceget_wrapper_codegen_for_deviceregister_backend_for_device)CppWrapperCpu)CppWrapperCuda)WrapperCodeGen)CppWrapperCodeGenErrorLoweringExceptionMissingOperatorWithDecompMissingOperatorWithoutDecomp)ConstantFixedLayoutInputBuffer	Pointwise	Reduction
StorageBox	TensorBoxTorchBindObject)	constrain_to_fx_stridesFALLBACK_ALLOW_LISTfallback_handler%fallback_node_due_to_unsupported_typelayout_constraints	loweringsmake_fallbackneeds_realized_inputsunsupported_output_tensor)SizeVarAllocator)convert_shape_to_inductorgather_origins get_cloned_parameter_buffer_nameget_sympy_Expr_dtype#maybe_get_suppress_shape_guards_ctxshould_assume_input_aligned)NullHandlerV)_EffectType
perf_hintsoutput_code)log_module_codec                  O   s   d S N )argskwargsrM   rM   S/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/torch/_inductor/graph.pyrK   n      rK   c                 C   st   t jt jt jt jt jt jt jt jt j	t j
t jt jt jh}|r6|t j |t j |t j |t j | |v S rL   )torchfloat32float64int64int32int16int8uint8boolbfloat16	complex32	complex64
complex128float16addfloat8_e4m3fnfloat8_e5m2float8_e4m3fnuzfloat8_e5m2fnuz)dtypecudasupported_dtyperM   rM   rP   supported_dtype_of_cpp_wrapperr   s(   rh   c                 C   sh   t | tjtjtjjjfsJ dt | tjjjrtjS t | tjr&t	| S | j
r,tjS | jr2tjS d S )Nzgget_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer)
isinstancesympySymbolExprcorenumbersIntegerrR   rU   rC   
is_integeris_floatrS   )constant_bufferrM   rM   rP   may_get_constant_buffer_dtype   s   rs   c                 C   s   dd t D }| |v S )Nc                 S   s   h | ]}t |qS rM   )r   ).0mrM   rM   rP   	<setcomp>   s    z"is_magic_method.<locals>.<setcomp>)r   )op	magic_opsrM   rM   rP   is_magic_method   s   ry   c                 C   sT   | d}| }t|D ]\}}t||s"tdd|d |  t||}q|S )N.z#Node referenced nonexistent target )split	enumeratehasattrRuntimeErrorjoingetattr)objtargettarget_atomsattr_itriatomrM   rM   rP   getattr_recursive   s   

r   c                 C   s   t jsdS tjtjh}tjtjtjtjtj	tj
tjtjtjtjtjtjh}dd }t| jD ]/}||}|s7q.||v r@d|jd< |jddr]|jD ]}||}|sSqJ||vr\d|jd< qJq.dS )a  
    Nodes like convolution/convolution_backward want its input to be dense.
    If we pad their inputs, we result in extra calls to copy kernels!  On the other hand, padding usually helps reduction.

    The pass finds nodes that dislike padding. These are nodes that can be reached
    from a convolution/convolution_backward in the backward direction without
    going thru a reduction.
    Nc                 S   s"   | j dkrt| jdr| jjS d S )Ncall_function_overloadpacket)rw   r}   r   r   )noderM   rM   rP   _get_overload_packet   s
   z8mark_nodes_dislike_padding.<locals>._get_overload_packetTdislike_paddingF)r    comprehensive_paddingatenconvolutionconvolution_backwardvar_meansummeanprodanyaminamaxminmaxargminargmaxscatter_reducereversednodesmetagetall_input_nodes)gops_dislike_paddingops_like_paddingr   currw   priorprior_oprM   rM   rP   mark_nodes_dislike_padding   sD   	


r   c                       s  e Zd ZU eej ed< dejfddZ	dejfddZ
dd Z							
	
							
	
								dqdejjdeeej  f fddZedefddZdedefddZdejjdeej dedd fddZdd Zdd ZdejfddZedd  Zd!efd"d#Zd!efd$d%Zd!efd&d'Ze fd(d)Z d
d*d+ej!d,efd-d.Z"d/ee fd0d1Z#d2d3 Z$defd4d5Z%defd6d7Z&d8d9 Z'drd:d;Z(ded<eej fd=d>Z)d?ef fd@dAZ* fdBdCZ+edDejdefdEdFZ,dGdH Z-dIdJ Z.dKdL Z/ fdMdNZ0dOdP Z1e2dQejj3fdRdSZ4dTe5e6e7ej8f dUf dej9fdVdWZ:dXejj3f fdYdZZ;d[d\ Z<d]d^ Z=d_d` Z>dadb Z?dcdd Z@dedf ZAedgd
dhdidj ZBdkdl ZCdmdn ZDdefdodpZE  ZFS )sGraphLoweringgraph_outputsexc                 C   sx   | j rt| t| fS ddlm} |dt| jj }| j	||\}}}dd |D }dd |D }||fS )z
        Support dynamic shapes and dynamic strides by assigning variables
        to each dimension.  We duck-shape tensors, so if two tensors
        have the same size they get assigned the same symbolic variable.
        r   )ConstantSource__inductor_unknown_tensor_c                 S   $   g | ]}t |tjr|jjn|qS rM   ri   rR   SymIntr   exprrt   r   rM   rM   rP   
<listcomp>     $ z8GraphLowering.symbolic_sizes_strides.<locals>.<listcomp>c                 S   r   rM   r   r   rM   rM   rP   r     r   )
reuse_shape_envr@   sizestridetorch._dynamo.sourcer   len
_shape_env
var_to_val,create_symbolic_sizes_strides_storage_offset)selfr   r   sourcer   r   _rM   rM   rP   symbolic_sizes_strides   s&   z$GraphLowering.symbolic_sizes_stridesc                 C   s,   dd |  D }dd | D }||fS )z+
        Primarily used to weights
        c                 S      g | ]}t |qS rM   rj   ro   r   rM   rM   rP   r         z6GraphLowering.static_sizes_strides.<locals>.<listcomp>c                 S   r   rM   r   r   rM   rM   rP   r     r   )r   r   )r   r   r   r   rM   rM   rP   static_sizes_strides  s   z"GraphLowering.static_sizes_stridesc                 C   sx   t dd u rddlm} td|tt t dd u r&ddlm} td|tt t dd u r:ddl	m
} td|t d S d S )Ncpur   )CppSchedulingrf   )CUDACombinedSchedulingxpu)TritonScheduling)r$   codegen.cppr   r&   r)   r'    codegen.cuda_combined_schedulingr   r(   codegen.tritonr   )r   r   r   r   rM   rM   rP   init_backend_registration  s   z'GraphLowering.init_backend_registrationNFgmexample_inputsc                    sr  t  | || _|d ur|n| j||
d| _d| _|
| _|| _|| _|| _	d| _
|d u r4t }d| _n|| _d| _|| _|  |j | _t | _t|| _g | _i | _i | _|r^|jnt | _|rg|jnt | _d| _g | _|ru|ni | _|rt| nt | _|r|j ni | _ i | _!i | _"t | _#t | _$t | _%t | _&t | _'d | _(d | _)g | _*|	| _+d | _,i | _-t | _.g | _/i | _0t1t2| _3t44 | _5|| _6|| _7|| _8i | _9|| _:|| _;t<t=| _>d | _?| jr| @ nt | _AtB|jC dh| _D|d ur|ni | _Ed| _Fd| _Gg | _Hd | _Ii | _J|K | _L| jMjNOdi | _P|d ur)|jQni | _Q| R  i | _St | _Td S )N)is_inferencer   FTzaten.convolution_backward  dynamo_flat_name_to_original_fqn)Usuper__init__r   decide_layout_opt
layout_optnum_channels_last_convr   is_const_graph
const_codeconst_moduleextra_tracebackr   r   r   freeze_runtime_assertsdeferred_runtime_assertscopyras_by_symbolsetbound_unbacked_symbolsr?   sizevarsgraph_input_namesgraph_inputsgraph_inputs_originaldevice_typesdevice_idxsrf   buffersconst_output_indexkeysfolded_constants	constantstorchbind_constantsconstant_reprsremoved_buffersremoved_inplace_buffersmutated_buffersnever_reuse_buffersinplaced_to_remove
device_opswrapper_codeextern_kernel_nodesextern_node_serializercurrent_nodelistsmutated_inputsmutated_input_idxsname_to_bufferr   listname_to_userstimecreation_timenamecpp_wrapperrecord_multi_kernel_choicemulti_kernel_to_choiceaot_modegraph_idnext_post_grad_graph_counterpost_grad_graph_id	schedulerfind_nodes_prefer_channels_lastnodes_prefer_channels_lastr   graph_warned_fallbackuser_visible_outputs	cache_key
cache_pathcache_linemapdisable_cudagraphs_reasondevice_node_mapping__copy__orig_gmmoduler   r   r   allocated_constant_namer   effectful_opsaligned_inputs)r   r   r   	shape_envr  r  r  r  r   r   r   r   r   r   r   r  	__class__rM   rP   r   (  s   






zGraphLowering.__init__returnc             
   C   s|  t jsdS t jr
dS dd | jjD }t|}|dkrdS tjjj	r3tjj
 r3tdd |D r3dS tt| jjd| krFtd	 dS td
d |D rVtd dS dd }dd }dd }|rddlm} tt}|D ]g}	tjj|	\}
}}|
r|dd#}tj |	j|i | W d   n1 sw   Y  W d   n1 sw   Y  | }||	rd}n||	rd}n	||	rd}nd}||  |7  < qqtd qqd}d}d}d}t| }|d | |d |  |d |  |d |  }||k}|std|| |S tt||rtd dS tt||r-td  dS tt||r<td! dS dS )"zl
        Decide if we should enable layout optimization for this graph based on
        heuristics.
        FTc                 S   s"   g | ]}|j tjjjjkr|qS rM   )r   rR   opsr   r   default)rt   nrM   rM   rP   r     s    z3GraphLowering.decide_layout_opt.<locals>.<listcomp>r   c                 s   s6    | ]}d D ]}|j | jd jtdkV  qqdS )r   r   valr   N)rN   r   devicerR   rt   r!  idxrM   rM   rP   	<genexpr>  s    z2GraphLowering.decide_layout_opt.<locals>.<genexpr>i,  z*Skipped layout opt because only a few convc                 s   s.    | ]}d D ]}t |j| jd V  qqdS )r"  r#  N)r   rN   r   r%  rM   rM   rP   r'    s    zeSee perf regression with dynamic shape. Follow up in https://github.com/pytorch/pytorch/issues/102670c                 S   s(   | j d dko| j d jd ddkS )Nr   r#  rN   r   r   r!  rM   rM   rP   
is_grouped  s   (z3GraphLowering.decide_layout_opt.<locals>.is_groupedc                 S   sJ   | j d jd dd | j d jd dko$| j d jd ddkS )Nr   r#  r      r)  r*  rM   rM   rP   is_in_out_channel  s   0z:GraphLowering.decide_layout_opt.<locals>.is_in_out_channelc                 S   s4   | j d jd ddko| j d jd ddkS )Nr   r#  r   @   r)  r*  rM   rM   rP   is_small_channel  s   z9GraphLowering.decide_layout_opt.<locals>.is_small_channel)FlopCounterMode)displayNgroupedsmallin_outr   zConv inputs meta not foundg|?5^?gtV?g333333?guV?zhSkipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %dzFSkip layout opt because found grouped convolution with >1 in_channels!zBSkip layout opt because some convolutions have smaller out_channelz>Skip layout opt because all convolution channels are too small)r    layout_optimizationforce_layout_optimizationr  r   r   rR   backendsmkldnnenabledis_availableallr   logdebugr   torch.utils.flop_counterr0  r   float	_inductorfx_utilsget_fake_args_kwargsrG   	fake_moder   get_total_flopsr   valuesmap)r   r   
conv_nodesnconvr+  r-  r/  r0  flop_countsr   successrN   rO   flop_counter_modecounted_flops	node_typeGROUPED_MULTIPLIERDEFAULT_MULTIPLIERIN_OUT_MULTIPLIERSMALL_MULTIPLIERtotal_flopsweighted_flopsdo_layout_optrM   rM   rP   r     s   
	





	
zGraphLowering.decide_layout_optr  c                 C   s   | j dur| j  d| S |S )z2Prepend the given name with the graph name if any.Nr   )r  r   r  rM   rM   rP   qualify_nameO  s   
zGraphLowering.qualify_namesubgraph_namec                 C   s(   t ||| j| j| j| j| j| |dS )a  
        Make a subgraph of the current graph with all inherited
        parts, except the graph module (`gm`) and `example_inputs`.
        The subgraphs are lowered separately, but intended to be
        inlined in the parent graph's codegening. Hence the need
        for maintaining the same `shape_env` and other properties.
        The subgraph name is qualified by the parent graph's name.
        )r   r   r  r  r  r   r   r  )r   r   r  r  r   r   rV  )r   r   r   rW  rM   rM   rP   make_subgraphU  s   zGraphLowering.make_subgraphc                 C   s   t  }t| jjjD ]"}|jtjjj	j
kr|| q
|jD ]}||v r+||  nqq
| jjjD ]}||v r>||j q2|S )aC  
        The rule to decide if an node prefer channels last is simple.
        1. if it's input/output of a convolution
        2. if one of its user prefers channels last

        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers
        channels last.

        Consider the scenario: conv -> batch-norm -> relu -> conv
        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
        1. the output of batch-norm should be channels last initially since its input is a conv's output.
           Forcing the batch-norm's output to be contiguous results in the first copy
        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
           We need convert it to channels last layout which results in the second copy.
        With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies
        can be saved.
        )r   r   r  r  r   r   rR   r  r   r   r   r`   usersupdate)r   
output_setr!  userrM   rM   rP   r  n  s    


z-GraphLowering.find_nodes_prefer_channels_lastc                 C   s*   || j vr| j | td| d S d S )NzUsing FallbackKernel: %s)r  r`   perf_hint_loginforU  rM   rM   rP   warn_fallback  s   
zGraphLowering.warn_fallbackr$  c                 C   sR   | j |j |jd ur| j|j tjjr%|| jvr'tjj| j|< d S d S d S rL   )	r   r`   typeindexr   rG   r  r   r  )r   r$  rM   rM   rP   add_device_info  s   
zGraphLowering.add_device_infoc                 C   s   t jS rL   )rG   rC  r   rM   rM   rP   rC    s   zGraphLowering.fake_modebuffer_namec                 C   sj   || j v r
| j | S || jv r| j| S || jv r3tjj| }t|tj|j|j	gtj
|R  S d S rL   )r   r   r   rG   r  r!   ConstantBufferr/   r$  re   r   )r   rd  datarM   rM   rP   
get_buffer  s   





zGraphLowering.get_bufferc                 C   st   || j v r| j | jS || jv r| j|  S || jv r#| j|  S td|}|r3| |dS td| )Nz1(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),r   could not find )	r   re   r   	get_dtyper   rematchgroupKeyError)r   rd  ru   rM   rM   rP   ri    s   


zGraphLowering.get_dtypec                 C   s|   ddl m} || jv r| j|  S || jv r+| j| }tt|dd |r'dS | S || jv r7| j|  S t	d| )Nr   )MultiOutputLayoutlayoutrh  )
r!   rn  r   numelr   ri   r   	get_numelr   rm  )r   rd  rn  bufrM   rM   rP   rq    s   



zGraphLowering.get_numelc                    s   t  j| S rL   )r   run)r   rN   r  rM   rP   rs    s   zGraphLowering.runset_namebufferru  c                C   sh   |  dt| j }| j| || j|< t|tjr | s-|	 d ur-| 
|	  |r2||_|S )Nrr  )rV  r   r   appendr   ri   r!   ComputedBufferis_zero_elements
get_devicerb  r  )r   rv  ru  r  rM   rM   rP   register_buffer  s   

zGraphLowering.register_bufferbuffer_namesc                 C   s"   |  dd| }|| j|< |S )Nlist_r   )rV  r   r   )r   r|  r  rM   rM   rP   register_list  s   
zGraphLowering.register_listc                    s    fdd  | d S )Nc                    s   t | ttfr| D ]} | q	t | tjrAt| dr0t | jtjr0t| jdr0t | jjtjs2d S |  D ]}j| 	|  q6d S d S )Nrf  )
ri   r   tupler!   IRNoder}   rf  get_read_namesr   rw  )valuex	read_nameregisterr   rM   rP   r    s"   

z1GraphLowering.register_users_of.<locals>.registerrM   )r   node_outputrM   r  rP   register_users_of  s   zGraphLowering.register_users_ofc                 C   sD   t |tsJ | j| || jvrdS | j| D ]}|  qdS )z
        When a buffer is mutated we need to make sure all the reads to
        the old version are realized before the mutation happens.
        N)ri   strr   r`   r   realize)r   r  r\  rM   rM   rP   mark_buffer_mutated  s   

z!GraphLowering.mark_buffer_mutatedc                 C   sP   || j v r
|| jv sJ d| t| j | }|| jjv r#| jj| S | j| S )z
        In AOTI, module buffers may have been mutated during the tracing and compilation.
        Thus we need to read from previously stored original buffers, to make sure the
        generated model.so uses correct initial values.
        z$Can not find the original value for )r  r   rB   r  r   )r   r  	orig_namerM   rM   rP   get_original_value_of_constant  s   
z,GraphLowering.get_original_value_of_constantc              
   C   s^  |}t jjsG| j D ];\}}|jsF| | krF| | krF|j|jkrF|j	|j	krF|
  |
  krF| | krF|  S q|d u rSdt| j }|d  r^d| }| |}tdd|}|}d}|| jv r| d| }|d7 }|| jv ss|| j|< |j	d|jdt| dt| dt|d	| j|< || j|< |S )	Nconstantr   	constant_z[^a-zA-Z0-9_]r   r    r  )r    aot_inductoruse_runtime_constant_foldingr   items	is_mkldnnr   r   re   r$  untyped_storagedata_ptrstorage_offsetr   isdigitrV  rj  subr  hashr   r  )r   r  rf  r  constant_namer  prefixcntrM   rM   rP   allocate_non_dup_const_name$  sL   









z)GraphLowering.allocate_non_dup_const_namec              
   C   s6   |  ||}tt|t|j|jg| |R  S rL   )	r  r4   creater!   re  r/   r$  re   r   )r   rf  r  new_namerM   rM   rP   add_tensor_constantJ  s   z!GraphLowering.add_tensor_constantdevice_overridec                 C   sz   | j | j|ks|du r|S tjj  | | d|j |jp!d | j | 	|W  d   S 1 s6w   Y  dS )z
        We AOT copy constants to the devices they are needed on.
        If device_override doesn't match the constant's device, then
        copy it and return a different name.
        Nr   r   )
r   r$  rR   utils_python_dispatch_disable_current_modesr  r`  ra  to)r   r  r  rM   rM   rP   r  S  s   $zGraphLowering.constant_namer   c           	   	      sF  t  |||}| j| t|tr|jj}|| j|< |S t|t	t
tfr2t|}|| j|< |S t|tr9d S t|tjsCJ ||jsN| |\}}n| |\}}| |}tt|t|j|j||}|| j|< |jj| j|< | |j t  t|r| j !| W d    |S W d    |S 1 sw   Y  |S rL   )"r   placeholderr   rw  ri   r   r   r   r   intrZ   r?  rj   sympifyr   rR   Tensor_has_symbolic_sizes_stridesr   r   rV  r4   r  r0   r/   r$  re   rf  r   rb  rD   rE   r  r`   )	r   r   rN   rO   exampler   sizesstridestensorr  rM   rP   r  c  sH   








zGraphLowering.placeholderc           
   
      sb  |t ju rt|d tttfrt |||S t|dr#||i |S  fdd}|t	vrt|t
jjs;J | d| dd }|tv rMt| n7tjrs||||\}}}t|gr`tnt}td|||| t|| nt|gr~t|||t|||ztdt	|  t	| |i |}|W S  ty }	 zt|	||||	jd d }	~	ww )	Nr   _inductor_lowering_functionc                    sL   d }t jjj| jv r!tjtdd}| jg|R i |\}}|}|||fS )NT)ignore_mutated_args_FIXME)	rR   _CTagneeds_fixed_stride_ordertags	functoolspartialr6   r   )r   rN   rO   layout_constraintconstrain_fnrc  rM   rP    get_custom_op_layout_constraints  s   
zEGraphLowering.call_function.<locals>.get_custom_op_layout_constraintsz is not an OpOverloadrz   z"Creating implicit fallback for:
%sz  via %s)operatorgetitemri   r   r  dictr   r   r}   r;   rR   _ops
OpOverloadr  r{   r7   r<   r    implicit_fallbacksr   r,   r-   r<  r^  operator_strr=  	Exceptionr+   with_traceback__traceback__)
r   r   rN   rO   r  	base_namer  errorouter  rc  rP   r     sT   



zGraphLowering.call_functiontc                 C   s   t | jdko| jd dkS )zM
        True if this is a small constant attr that will be inlined.
        r   r      )r   shape)r  rM   rM   rP   can_inline_constant  s   z!GraphLowering.can_inline_constantc                 C   s  t | j|}t|tjjrtj||dS t|tjj	r*|| j
|< d| j|< t||S tjjs5tjs5t|r;| ||S t ; |jdkrUt| |j|jW  d    S | |rrddlm} || |j|jdW  d    S W d    n1 s|w   Y  | ||S )N)r  graph_moduler   rM   r   )r  )re   r$  )r   r  ri   rR   fxGraphModuler!   Subgraphr  ScriptObjectr   r   r5   r    r  r  always_keep_tensor_constantsr>   r  r   r  r.   itemre   r$  r  loweringr  tolist)r   r   rN   rO   r  r  rM   rM   rP   get_attr  s2   




	zGraphLowering.get_attrc                 C      t rL   AssertionErrorr   r   rN   rO   rM   rM   rP   call_module  rQ   zGraphLowering.call_modulec                 C   r  rL   r  r  rM   rM   rP   call_method  rQ   zGraphLowering.call_methodc              	      s  t  |||}t|ttfs|f}t|ttfsJ t|tdd |D s,J |tjj	j
d }t|ttfs=|f}dd |D }g }t|t|ksPJ t||D ]!\}}t|tjtjfsh|| qU|| ||jd   qU|| _| j D ]e\}	}
t|
ttjfsJ dt|
 t|
tsq|
  t|
tsJ |
j}
t|
tjsJ |
}|
j}
t|
tr|
 |	krtj|
| j |	  z| j!|}| j |	 | j|< W q t"y   Y qw q| #  t$%d| j&| j'd ur| j' d S d	 d S )
Nc                 s   s<    | ]}t |ttjtd tjtjtjj	j
ttjfV  qd S rL   )ri   r4   r!   r.   r`  re  rj   rl   logicboolalgBooleanr  EffectfulKernelrt   r  rM   rM   rP   r'    s     
z'GraphLowering.output.<locals>.<genexpr>r   c                 S   s   g | ]}t j|qS rM   )r!   ExternKernelrealize_inputr  rM   rM   rP   r      s    z(GraphLowering.output.<locals>.<listcomp>r#  z'Unsupported inductor graph input type: zGForce channels last inputs for %d conv for the current graph with id %dr(  )(r   outputri   r  r   r`  r;  rG   r  r   rN   r   zipr!   r4   BaseViewrw  try_match_insignificant_stridesr   r   r   r   r  rj   rl   r  rf  r3   r0   get_nameMutationLayoutSHOULDREMOVErealize_intor   ra  
ValueErrorfinalizer<  r=  r   r  )r   r   rN   rO   resultfx_node_argsresult_correct_stridesrfx_noder  r  value_storage_boxindr  rM   rP   r    sr   



zGraphLowering.outputc                 C   s   | j D ]}|  qd S rL   )r   decide_layout)r   rr  rM   rM   rP   r  Q  s   

zGraphLowering.finalizer   c                 c   s*    | j }z|| _ d V  W || _ d S || _ w rL   )r   )r   r   oldrM   rM   rP   set_current_nodeU  s   zGraphLowering.set_current_nodemeta_strides_inp.c                    s   t jj|s	J dd |D }t fddt|| D r"|S  fdd}|| || s4|S t jj|\}}t	|j
}t| D ]\}}	 j|	drY|| ||< qHt jj|j|j|j||j}
tt jj||
S )a  
        Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
        dimensions - size 0 or 1 - will be updated.

        If there are real stride differences (NHWC vs NCHW) then the input will be returned.
        c                 S   r   rM   r   rt   srM   rM   rP   r   m  s    zAGraphLowering.try_match_insignificant_strides.<locals>.<listcomp>c                 3   s"    | ]\}} j ||V  qd S rL   )r   statically_known_equals)rt   s1s2rc  rM   rP   r'  q  s
    
z@GraphLowering.try_match_insignificant_strides.<locals>.<genexpr>c                    s@   t | ||D ]\}}} j|drq j||s dS qdS )Nr   FT)r  r   statically_known_leqr  )r  meta_stridestensor_stridesdimr  r  rc  rM   rP   significant_strides_equalw  s   zPGraphLowering.try_match_insignificant_strides.<locals>.significant_strides_equalr   )rR   r@  r!   is_storage_and_layoutr;  r  
get_strideget_sizeas_storage_and_layoutr   r   r|   r   r  r/   r$  re   r   offsetr4   ReinterpretView)r   r  r  r  r  storage
old_layout
new_strider   r  
new_layoutrM   rc  rP   r  ^  s6   

z-GraphLowering.try_match_insignificant_stridesr!  c           $         s  fdd}t jh}jdkr!\}}|t||O }tj|  t	 jdkrWj
tjurWtrW|d tj
dd|i |}nWjdkr~j
tv r~|d tj
 g|R i |\}}j
||}n0tj
r|d tjd	 tjtjtjfrjd	 jj}nt }n
|d
 t }tjjjjtjjj jtjjj!jtjjj"jtjjj#jg t$dd j%D }t$ fddj%D }j&ddrt|t'r|(  jd	 ) }	tj*j+j,|	 }
t-|dr|. |	kr|
st/|	}tj01||}|r0t|t'r0t|j2tj3r0|(  |s6|rtjd	 tj4rjd	 ) }	tj56jd	 }t t7|	dk}|s|rt |	rt/|	}t |8 dkrj9v rj:j;vr|stj<}j:j;vo| }tj0j1|||d}t t=j%}|dkrt|t'rj%D ]}|j
t>v ri|?  tjjj@jtjjjAjtjjjBjg}g }jCs|DtjjjEj tjFjGr4|tjjHjIjtjjHjIjJtjjjKjtjjLjMjtjjLjMjNtjjLjMjJtjjLjMjOg7 }|tjjHjPjtjjHjPjJtjjHjQjJtjjHjRjtjjLjSjtjjLjSjJg7 }tjFjTr4|tjjUjVjg7 }|j
|v rKtj0j1|t/jd	 ) dd}|j
|v ri|jWd u ritj01|t/tXjd	 jY}|jdkr}t|j2j2tZt[fr}|(  q|\t j% t|t'r|] r|?  t|t'rt|j2t^r|j2j2}t|tZr|_ r|(  W d    n	1 sw   Y  W d    n	1 sw   Y  W d    n	1 sw   Y  t|t'rNt|j2tj^rNt|j2j2tj`r|j2j2_anKt|j2j2tjbrN|j2j2_at|j2j2tjcr+t|j2j2j2tj`r+|j2j2j2_an#t|j2j2tjdrN|j2j2jesNt|j2j2jfd tjbrN|j2j2jfd _ag| t= }tht jD ]}|j| i O }q^fdd}jdkrCt	jjjkjl}|D ]}jmn|g }|jo| }|p q|sdd }||jr }d urjstt||jrk| d|jr dd ||ju }d urՈjstt||juk| d|ju dd |D ]4}t7|j}|jv } | rtw| dd d d }!jmx|!g D| q׈jstt|j|j dd qאq jv|O  _vtyt	jjjkjlj&d!i }"d"d# |"z D }#||#ksCJ d$| d|# d%{  d&|  |S )'Nc                    s   t dt j|  d S )Nzlowering %s %s)r<  r=  r   format_node)msgr*  rM   rP   r=    s   z%GraphLowering.run_node.<locals>.debugr   r8   F)add_to_fallback_setr:   ry   r#  r   c                 s   s    | ]}|j d kV  qdS )r  N)rw   rt   r\  rM   rM   rP   r'    s    z)GraphLowering.run_node.<locals>.<genexpr>c                 3   s    | ]}|j  v V  qd S rL   )r   r  )as_strided_opsrM   rP   r'    s    

inductor_realize_to_stridesr  r      )allow_paddingr   Tr  c                     s>   g } j  d  D ]}| d|  d| d q	d| S )Nzunbacked_symbol_defs=z in:

z***
)r   rw  get_unbacked_symbol_defsr   )r  b)buffer_watermarkr   rM   rP   format_bufferss  s   
z.GraphLowering.run_node.<locals>.format_buffersr  c                 S   s    zt | W S  ty   Y d S w rL   )r  	TypeError)r  rM   rM   rP   convert  s
   
z'GraphLowering.run_node.<locals>.convertz >= rt  z <= c                 S   s   t | S rL   )r  r  rM   rM   rP   <lambda>      z(GraphLowering.run_node.<locals>.<lambda>)keyunbacked_bindingsc                 S   s   h | ]}t jjj||qS rM   )rG   rC  r  unbacked_renamingsr   r  rM   rM   rP   rv     s    z)GraphLowering.run_node.<locals>.<setcomp>zfailed z (inductor >= fx)
fx node is: z
new buffers are:

)|r   r   rw   fetch_args_kwargs_from_envrA   r!   r  current_originsr  rG   r   r  r  r9   r8   r:   r   ry   ri   r   rR   r   SymFloatSymBoolr   r   r   run_noder  r   
as_stridedr   as_strided_as_strided_scatterresize	resize_asr   rY  r   r4   r  r   r@  r  any_is_symbolicr}   r  get_stride_orderr  require_stride_orderrf  r  r  _prims_commonis_non_overlapping_and_denser   r  r  r  r  NHWC_STRIDE_ORDERr   r=   realize_hintr   mm_int_mmr   rw  r   r  _has_mkldnnr8  _linear_pointwisebinarymkldnn_rnn_layeronednnqlinear_pointwiser  binary_tensor_convolution_pointwise_convolution_pointwise_ _convolution_transpose_pointwiseqconv2d_pointwisehas_mklmkl_mkl_linearrN   r   r  r1   r2   
mark_reusehas_exceeded_max_readsr3   has_large_inner_fnLoopsorigin_nodeBufferrx  MultiOutputindicesinputsr  ranger  r  r   r  r   popvar_to_range _default_unspecified_value_rangeissubsetlowerr{  AssertScalarupperr   sorted
setdefaultr   r   r  )$r   r!  r=  originsrN   rO   r  	is_outputis_input_for_as_stridedr  sym_stridesstride_orderdenseunbacked_symbols_in_stridesr  	num_usersr\  need_fixed_layoutneed_fixed_channels_last_layoutcurrnew_unbacked_defsr   r  r  i0rasvrr  rP  rR  rafvsmissingi1r  renamed_unbacked_bindingsr  )r  r  r!  r   rP   r%    s  


 




















	







    >






zGraphLowering.run_nodec                 C   s   t jrtdtjdvrtdtj | j D ],}d }t|tr'|	 }nt|t
jt
jt
jjjfr8t|}t|| jsEtd| qd S )NzC++ codegen is disabled)linuxdarwinzUnsupported platform zUnsupported input dtype )r    disable_cpp_codegenr*   sysplatformr   rE  ri   r4   ri  rj   rk   rl   rm   rn   ro   rs   rh   rf   )r   r  re   rM   rM   rP   !validate_can_generate_cpp_wrapper  s    


z/GraphLowering.validate_can_generate_cpp_wrapperc                 C   s   d| j v | _| jr|   | j  }|d |d t|dks,J dd|t|dk}|r6dn|	 }t
|| _t|| j}|d usQJ d| d	| | _| jrh| jjj| j_| jjj| j_d S d S )
Nrf   r   r   r   zDoes not support mixing {}+r   zDevice z not supported)r   rf   r  rn  r   discardr   formatr   rL  r#   r   r%   r   r   _names_itersrc_to_kernel)r   r   only_cpudevice_typewrapper_code_gen_clsrM   rM   rP   init_wrapper_code  s,   




zGraphLowering.init_wrapper_codec                    s  dj v rd_tddi  j}W d   n1 sw   Y  dd  tjj	 }|durWt
tjtsW|jr@|j  dd	 |jD } fd
d	t|tjD n fdd	t
tjtrejntjD jrddlm} fdd	tjD }|D ]
}|| |< qtjj  | W d   n1 sw   Y  d_j  j  tjjj   tjjj!  " S " S )ad  
        For CPU, the cpp wrapper codegen is done in one pass.
        For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python
        wrapper code and run it to generate autotuned kernel binaries in the first pass; and then
        generate cpp wrapper code and compile it to a dynamic library in the second pass.
        rf   Fztriton.store_cubinTNc                 S   sP   t | tjtjfr| jjS t | trt| S t | tjs&J dt	t
|  | S )Nz&Unknown type when creating real inputs)ri   rR   r   r#  r   hintr   r   r  r  r`  r  rM   rM   rP   materialize  s   
z;GraphLowering.codegen_with_cpp_wrapper.<locals>.materializec                 S   s   g | ]}|d ur|qS rL   rM   )rt   paramrM   rM   rP   r   '  s
    z:GraphLowering.codegen_with_cpp_wrapper.<locals>.<listcomp>c                       g | ]} |qS rM   rM   r  ry  rM   rP   r   ,  s    c                    r{  rM   rM   r  r|  rM   rP   r   3  s    r   )clone_preserve_stridesc                    s.   g | ]\}}|j v rt | tjr|qS rM   )r   ri   rR   r  )rt   r&  r  )real_inputsr   rM   rP   r   ?  s    
)#r   r  r    patchcompile_to_modulecallrR   _guardsTracingContexttry_getri   rG   r~  rF   output_stridesclearparams_flat	itertoolschainr   r   
compile_fxr}  r|   r   r  r  r  r   r   r  r   precomputed_replacementsinv_precomputed_replacementscodegen)r   compiledtracing_contextr  r}  r   r&  rM   )ry  r~  r   rP   codegen_with_cpp_wrapper  sR   





	


z&GraphLowering.codegen_with_cpp_wrapperc                 C   sf   ddl m} |   || j| _ tj| j| j j | j	
|  | j   | j	| j}| j	  |S )Nr   	Scheduler)r
  r  rw  r   rG   r=  draw_orig_fx_graphr  r   r   push_codegened_graphr  generater   pop_codegened_graph)r   r  r  rM   rM   rP   r  _  s   

zGraphLowering.codegenc                 C   s>   ddl m} |j| _|j| _|j| _|| j| _ | j   dS )a  
        This is a more compact version of the `codegen()` above
        where we codegen this graph as a subgraph of some parent
        graph. The parent graph is passed as an argument: the
        intention is to inline codegening of the subgraph in
        the parent graph's wrapper code (including the generated
        kerenls). The wrapper code is not finalized (via `.generate()`
        call), as this will be done in the parent graph's `codegen()`.
        r   r  N)r
  r  r   r   r  r   r  )r   parent_graphr  rM   rM   rP   codegen_subgraphm  s   
zGraphLowering.codegen_subgraphc                 C   sX   d}g }g }| j jD ]}| }||7 }|||d f ||| f q
|||fS )Nr   r  )r
  r   get_read_write_buffers_sizesrw  get_estimated_runtime)r   total_bytesnode_countsnode_runtimesr   	num_bytesrM   rM   rP   count_bytes  s   
zGraphLowering.count_bytescode_gen)
phase_namefwd_onlyc                    sF  ddl m} | jr|  n|  \ }td  zdd |D }| \}W n ty;   t	d fddd	  w t	dfd
d fddd	 |j
||i | j| jd}|| _| _|| _|jd usjJ t|j td|j td|j tjrtd|j tjd tj|j tjtj|jd d  |S )Nr   )PyCodeCacheOutput code: 
%sc                 S   s   g | ]	\}}||j fqS rM   )stack_trace)rt   line_nor   rM   rM   rP   r     s    z3GraphLowering.compile_to_module.<locals>.<listcomp>inductor_output_codec                          S rL   rM   rM   coderM   rP   r        z1GraphLowering.compile_to_module.<locals>.<lambda>)
payload_fnc                      s   d iS )NfilenamerM   rM   )pathrM   rP   r    r  c                      r  rL   rM   rM   r  rM   rP   r    r  )linemapattrszOutput code written to: %szCompiled module path: )filer   z.debug)	codecacher  r  r  r  output_code_logr=  writer  r   load_by_key_pathr   r   r  r  r  __file__rK   r<  r^  r    benchmark_kernelprintrl  stderrrG   rJ   r   osr  splitext)r   r  r  r  modrM   )r  r  rP   r    sJ   



zGraphLowering.compile_to_modulec                 C   s   | j r>ddlm} | jsJ d|  \}}td| d }t r4| j	r4| j
r4| 
| j	}td| |j| ||| jdS |  jS )Nr   )AotCodeCompilerz"AOT mode only supports C++ wrapperr  z#Serialized Extern Kernel Nodes: 
%s)rf   )r  r  r  r  r  r  r=  r    	is_fbcoder   r   compilerf   r  r  )r   r  r  r  serialized_extern_kernel_nodesrM   rM   rP   compile_to_fn  s.   

zGraphLowering.compile_to_fnc                 C   s   dd | j D S )Nc                 S   s,   g | ]}t |tjst |tjs| qS rM   )ri   r!   NoneAsConstantBufferShapeAsConstantBufferr  )rt   r   rM   rM   rP   r     s    

z2GraphLowering.get_output_names.<locals>.<listcomp>)r   rc  rM   rM   rP   get_output_names  s   zGraphLowering.get_output_namesc                 C   s4   || j  v o| j |  dko| j |  jdkS )Nr   r   )r   r   rq  rz  r`  rU  rM   rM   rP   is_unspec_arg  s
   zGraphLowering.is_unspec_arg)NNNFFNNNFFNNNNrL   )G__name__
__module____qualname__r   r!   r  __annotations__rR   r  r   r   r   r  r  r	   r   staticmethodrZ   r   r  rV  rX  r  r_  r$  rb  propertyrC  rg  ri  rq  r   rs  rG  r{  r~  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r   Noder  r   r   r  r   r4   r  r%  rn  rw  r  r  r  r  r  r  r  r  __classcell__rM   rM   r  rP   r      s   
 #  "
2

&	6AL
7  @[

0r   )~r  r  loggingr  r  rj  rl  r   collectionsr   
contextlibr   typingr   r   r   r   r   r	   r
   r   r   r   rj   rR   torch._loggingtorch.fxtorch._decompr   torch._dynamo.utilsr   r   r   r   torch._prims_commonr   torch._subclasses.fake_tensorr   %torch.fx.experimental._backward_stater   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   r   torch.utils._mode_utilsr   r   r    r!   codegen.commonr"   r#   r$   r%   r&   codegen.cpp_wrapper_cpur'   codegen.cpp_wrapper_cudar(   codegen.wrapperr)   excr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r  r6   r7   r8   r9   r:   r;   r<   r=   r>   r   r?   r  r@   rA   rB   rC   rD   rE   virtualizedrF   rG   torch._higher_order_ops.effectsrH   	getLoggerr  r<  _logginggetArtifactLoggerr]  r  r  r   countr  r  torch._inductor.fb.utilsrK   rh   rs   ry   r   r   r  Interpreterr   rM   rM   rM   rP   <module>   sf   0 (,
 

7