o
    Ti"                     @   s   d dl Z z5d dlm  mZ d dlmZ d dlmZm	Z	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ eZW n	 eyB   Y nw ddlmZ dd	lmZ d
edefddZdd Zded
efddZdd ZdS )    N)create_aot_dispatcher_function)register_lowering	fallbacksadd_needs_realized_inputs)	TensorBoxFallbackKernelLayoutIRNode)V)	Scheduler   )get_input_nodes)DSGraphParamManagerz3_partitionbwdc                    s    fdd}|S )Nc              	      s   | |}|d u rd S rsg } r#  | j\}}dd |D }n j}t| j}t||D ];\}}	|j|v }
|
rhddlm} ddlm	} ||	sUJ d|	 d|
|tjdg|	j|	jd|	j q2|
|	 q2t|}n|}| |S )	Nc                 S   s   g | ]}|j qS  )name).0nr   r   N/home/ubuntu/.local/lib/python3.10/site-packages/deepspeed/compile/inductor.py
<listcomp>'   s    z<patch_compiler.<locals>.wrapped_compiler.<locals>.<listcomp>r   )is_fake)to_fake_tensorzInput z should be fake tensordtypedevice)get_bwd_mappinggraphparam_namesr   zipr   torch._subclasses.fake_tensorr   torch._dynamo.utilsr   appendtorchemptyr   r   	fake_modetuple)gmfake_inputs	mod_graphpatched_inputsparam_nodes_bw_r   input_nodesin_nodein_vds_paramr   r   r   dc_compilergraph_idgraph_param_manageroriginal_compilerr   r   r   wrapped_compiler   s.   





z(patch_compiler.<locals>.wrapped_compilerr   )r5   r2   r   r3   r4   r   r6   r   r1   r   patch_compiler   s   #r7   c                    s    fdd}|S )Nc                     sH   | i |\}}t |j  fdd}||j ||j ||fS )Nc                    sN   | j D ]!}|jdkr$|j jv r$tjdg|jd j|jd jd|jd< qd S )Nplaceholderr   valr   )	nodesopr   r   r#   r$   metar   r   )r   r   pmr   r   fix_placeholder_metaI   s
   
(zMwrap_partition_fn.<locals>.wrapped_partition_fn.<locals>.fix_placeholder_meta)r   r   )argskwargs	fw_module	bw_moduler?   param_indicespartition_fnreal_inputsr=   r   wrapped_partition_fnB   s   

z/wrap_partition_fn.<locals>.wrapped_partition_fnr   )rF   rG   rE   rH   r   rD   r   wrap_partition_fn@   s   rI   r3   c              	      s:   ddl m  dd l f	dd}|  d S )Nr   )AotAutogradc               	      sL   t dr	j_j   	fdd}  _| _d S )N__original_initc                    sp   t |d  dd|d< t |d  dd|d< |d |d< r.t|d |d< | fi | d S )Nfw_compilerF)r   bw_compilerTinference_compilerrF   )r7   rI   )selfrA   )r3   make_bw_graphmake_fw_graphoriginal_initrE   param_managerrG   r   r   r   patched_initc   s$   

zUpatch_create_aot_dispatcher_function.<locals>.patch_aotautograd.<locals>.patched_init)hasattrrK   __init__wraps)rT   	rJ   	functoolsr3   rP   rQ   rE   rS   rG   r   )rR   r   patch_aotautograd\   s   

z?patch_create_aot_dispatcher_function.<locals>.patch_aotautograd)torch._dynamo.backends.commonrJ   rY   )r3   r   rQ   rP   rG   rE   rS   rZ   r   rX   r   $patch_create_aot_dispatcher_functionV   s   
r\   c                     s   	ddd 			d fdd	} | t jjjjddd | t jjjjddd | t jjjjddd | t jjjjdddd | t jjjjddd t	t
d	rPt
jsZdt
_d
d t
_d S d S )NTc                    s$   |rt   fdd}|S )Nc                     sN   fdd}G  fdddt } r|nt }t||jg| R i |S )Nc                    s@   t | tjjjrt| n| }|d ur rtjj	
|  |S N)
isinstancer#   	_inductorirr	   r   creater
   r   never_reuse_buffersaddget_name)xout)never_reuse_outputr   r   wrap_tensors   s   z]register_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler.<locals>.wrap_tensorsc                       s:   e Zd Z fddZdefddZ fddZ  ZS )z_register_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler.<locals>.CustomDCKernelc                    s:   t  j|g|R i | dd }rt|| d S d S )Nc                 S   s>   t | trt| dsJ d| j tjj|   d S d S )Nrd   zx doesn't have get_name )	r^   r	   rU   	__class__r
   r   rb   rc   rd   )re   r   r   r   add_to_never_reuse   s   
zregister_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler.<locals>.CustomDCKernel.__init__.<locals>.add_to_never_reuse)superrV   pytreetree_map)rO   r;   r@   rA   rj   )ri   never_reuse_inputr   r   rV      s
   zhregister_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler.<locals>.CustomDCKernel.__init__argc                 S   s2   |  r|S dd l}|d|}|r|dS d S )Nr   zreinterpret_tensor\((\w+),r   )isidentifierrematchgroup)rO   ro   rq   rr   r   r   r   get_var_name_for_arg   s   
ztregister_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler.<locals>.CustomDCKernel.get_var_name_for_argc                    s   st  |S | j}| | g |  |  }tjj	| | t
| jtr-| | | |d }|r>|| d | | d S )Nr   z = None)rk   codegenop_overloadcodegen_commentcodegen_argscodegen_kwargsr
   r   wrapper_codegenerate_fallback_kernelr^   layoutr   codegen_size_assertsrt   	writelinecodegen_unbacked_symbol_defs)rO   wrapperkernelr@   var_name)ri   force_free_inputr   r   ru      s   

zgregister_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler.<locals>.CustomDCKernel.codegen)__name__
__module____qualname__rV   strrt   ru   __classcell__r   )r   rn   )ri   r   CustomDCKernel   s    
r   )r   rl   rm   ra   )r@   rA   rh   r   
kernel_clsr   r   rn   rg   r   r   handler   s   ) zGregister_custom_ops.<locals>.fallback_handler_no_reuse.<locals>.handler)r   rc   )r   rn   rg   r   add_to_fallback_setr   r   r   r   fallback_handler_no_reuse   s   
4z6register_custom_ops.<locals>.fallback_handler_no_reuseFc                    s$   t |  t| d d | |||dS )N)type_promotion_kindrn   rg   r   )r   r   )rv   rn   rg   r   r   r   r   register_fallback_no_reuse   s   z7register_custom_ops.<locals>.register_fallback_no_reuse)rn   rg   r   is_dc_patchedc                 S   s   d S r]   r   )r,   r   r   r   <lambda>   s    z%register_custom_ops.<locals>.<lambda>)T)FFF)r#   opsdcallgather_paramdefaultwait_allgatherrelease_paramreduce_gradfree_tensorsrU   r   r   dead_node_elimination)r   r   r   r   register_custom_ops~   s&   
?r   ) r#   torch.utils._pytreeutils_pytreerl   torch._functorch.aot_autogradr   torch._inductor.loweringr   r   r   torch._inductor.irr   r   r   r	   torch._inductor.virtualizedr
   torch._inductor.schedulerr   'original_create_aot_dispatcher_functionImportErrorutilr   graph_paramr   boolr7   rI   intr\   r   r   r   r   r   <module>   s$   ((