o
    i'                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlm Z  ddl!m"Z" de
j#j$de%e& fddZ'de
j#j(de)e
j*e
j#j+f fddZ,de
j#j(de&de	e- fddZ.de
j#j(de&de	e- fddZ/de
j#j(de&fddZ0de
j#j(de1e	e-  fddZ2de
j#j(d ee defd!d"Z3G d#d$ d$Z4e"d"e4 d% 	&	&d.d'ed(ef d)ee d*e5d+e5ded(ee f f
d,d-Z6dS )/a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Sequence)AnyCallableOptional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc                 C   s  dt ttf dtfdd}tt}d}t }| jD ]q}|jdkr<t||jt	j
r7|t||j  | |d7 }q|jdkrt|jd	sHq|jj}t|jD ]8\}}|t|jk rb|j| }	n|j|jvriqQ|j|j }	d
}
|jrz|jjrzd}
|
r||t||	j  O }qQq|S )Nmetar   c                 S   s   d| v r| d S | d S )Nvalfake_result )r   r   r   ^/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk7   s   z%find_input_mutations.<locals>.meta_fkr   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr"   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r   inputs	input_idxmutated_inputsnschemaiargargumentmut_argr   r   r   find_input_mutations6   s:   



r@   gmc                 C   sD   i }| j jD ]}|jdd }t|tjr|j|vr|||j< q|S )Nr   )graphr&   r   getr(   r)   r*   device)rA   device_node_mappingr:   tr   r   r   get_device_node_mapping]   s   
rG   	aot_model	num_fixedc                 C   s2   t | jtt| }|sd S t| j}t||S N)r@   rB   r%   ranger   r   )rH   rI   mutation_indicesplaceholdersr   r   r   3check_for_mutation_ignore_cuda_graph_managed_tensorh   s
   

rN   c                 C   sN   t jst| | }r|S tt|  }r|S t|  }r%td|j dS d S )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrN   r   rG   r   r   r3   )rH   rI   mut_skipskipnoder   r   r   check_for_skips   s   rT   c                 C   s$   t tt| }|jdksJ |jS )Ncuda)nextiterrG   typeindex)rA   rD   r   r   r   get_device_index   s   rZ   c                 C   s@   t | }t|jdksJ |jd }t|dsg S dd |D S )Nr   r   __iter__c                 S   s&   g | ]}t |tjjjr|jnd qS rJ   )r(   r)   fxrS   Nodestack_trace).0r=   r   r   r   
<listcomp>   s    z$get_stack_traces.<locals>.<listcomp>)r   r1   r2   r-   )rA   outputr2   r   r   r   get_stack_traces   s   

rb   dynamo_modeldynamo_inputsc              	      s   ddl m tdtd  	ddtjjdtt dt	dtf fd	d
}dtjjdtt dtf fdd}t
||tj|ddtjjjd}|| S )Nr   )cudagraphify_implTFrH   
aot_inputsis_inferencer   c                    s   t | |}ttt|}t| | }r#t td|  |S  t|  ||t	| j
ddt| t| jt| jd	}d|_|S )Nskipping cudagraphs due to Fdevice_indexis_backwardrg   stack_tracesrM   mutated_input_idxsT)r	   r   r1   rT   r   disabler   r%   rZ   rK   valuerb   r   rB   r@   _boxed_call)rH   rf   rg   interpfixedskip_msgoutboxed_device_indexre   do_cudagraphsrd   r   r   forward_cudagraphs   s,   

z&cudagraphs.<locals>.forward_cudagraphsc                    s   t  |}s	 S t }t | }rFtd|  j}|d u r$d}tjjj|ddd us3J dt	t
 dt
f fdd}d	|_|S ||t|t d	dt t jt jd
	}d	|_|S )Nrh   r   F)create_if_none_existsr7   r   c                    s       | S rJ   )set_to_running_backward)r7   rH   managerr   r   fn   s   z3cudagraphs.<locals>.backward_cudagraphs.<locals>.fnTri   )r	   r   rT   r   ro   r)   	_inductorcudagraph_treesget_managerlistr   rp   rK   rZ   rb   r   rB   r@   )rH   rf   rq   rr   rs   
device_idxr}   rt   )rv   re   rw   r{   r   backward_cudagraphs   s>   
z'cudagraphs.<locals>.backward_cudagraphs)rg   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesre   r   r
   r)   r\   GraphModuler   r   boolr   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)rc   rd   rx   r   aot_cudagraphsr   ru   r   
cudagraphs   s6   +
r   c                   @   s@   e Zd ZdZed
ddZedejjde	e
 de
fdd	ZdS )CudagraphsBackendr   r   Nc                  C   s   ddl m}  |   d S )Nr   reset_cudagraph_trees)r   r   r   r   r   r   reset   s   
zCudagraphsBackend.resetmodelr7   c                 C   s
   t | |S rJ   )r   )r   r7   r   r   r   __call__   s   
zCudagraphsBackend.__call__)r   N)__name__
__module____qualname__compiler_namestaticmethodr   r)   r\   r   r   r   r   r   r   r   r   r      s    $r   )r3   compiler_fnTr   .r7   copy_outputscopy_inputsc                    s$  t |ttfs	J  rdd |D nt|tj  tj }|tj  tj	| | |  W d   n1 s>w   Y  |  tj | tj  tj
 tjj|d |  W d   n1 spw   Y  t ttfsfdtdtt f fdd}|S )	zBThis isn't registered as a backend, but is used in some benchmarksc                 S   s   g | ]}t |qS r   )r)   
zeros_liker_   xr   r   r   r`   
  s    z$cudagraphs_inner.<locals>.<listcomp>N)stream
new_inputsr   c                     sT   t t | ks
J  rt| D ]	\}}|| q  r(dd D S S )Nc                 S   s   g | ]}|  qS r   )cloner   r   r   r   r`   &  s    z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>)r1   zipcopy_replay)r   dstsrcr   r   rB   static_inputsstatic_outputsr   r   run  s   zcudagraphs_inner.<locals>.run)r(   r   tupler)   rU   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrB   r   r   )r   r7   r   r   r   r   r   r   r   cudagraphs_inner  s*   





"r   )TT)7__doc__r   collectionsr   collections.abcr   typingr   r   r   r)   torch.fxtorch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr	   torch._inductor.cudagraph_utilsr
   r   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   r\   Graphr%   intr@   r   r#   rD   r]   rG   r$   rN   rT   rZ   r   rb   r   r   r   r   r   r   r   r   <module>   s\     '

W
