import os
import time
from dataclasses import dataclass
from types import MethodType
from typing import Any, Dict

import lightning.pytorch as pl
import torch
from lightning.pytorch import LightningModule
from lightning.pytorch.callbacks import Callback
from lightning.pytorch.loops.optimization.automatic import ClosureResult
from lightning.pytorch.trainer.connectors.logger_connector.result import _ResultCollection, _ResultMetric
from lightning.pytorch.utilities import CombinedLoader, rank_zero_info
from lightning.pytorch.utilities.signature_utils import is_param_in_hook_signature
from lightning.pytorch.utilities.types import STEP_OUTPUT
from torch.nn.parallel import DistributedDataParallel

__all__ = ["CUDAGraphCallback"]


def struct_copy_one(src):
    # Deep-copy a (possibly nested) container of tensors onto the GPU so the
    # copies can serve as static buffers for CUDA graph capture.
    if isinstance(src, tuple):
        return tuple(struct_copy_one(i) for i in src)
    elif isinstance(src, list):
        return list(struct_copy_one(i) for i in src)
    elif isinstance(src, dict):
        return {k: struct_copy_one(src[k]) for k in src}
    elif isinstance(src, torch.Tensor):
        return src.clone().detach().cuda()
    else:
        return src


def struct_copy_two(tgt, src):
    # Copy values from `src` into the preallocated buffers of `tgt` without
    # changing any tensor storage address.
    if isinstance(src, tuple):
        raise Exception(f"Unsupported copy for tuple yet: {type(src)}")
    elif isinstance(src, list):
        for i in range(len(src)):
            if isinstance(src[i], (tuple, list, dict, torch.Tensor)):
                struct_copy_two(tgt[i], src[i])
            else:
                tgt[i] = src[i]
    elif isinstance(src, dict):
        for k in src:
            if isinstance(src[k], (tuple, list, dict, torch.Tensor)):
                struct_copy_two(tgt[k], src[k])
            else:
                tgt[k] = src[k]
    elif isinstance(src, torch.Tensor):
        tgt.copy_(src, non_blocking=True)
    else:
        raise Exception(f"Expect top-level as container type but got: {type(src)}")


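# Illustrative example (not part of the callback's control flow): the two
# helpers above implement a "deep-copy once, then copy-into" pattern. With a
# hypothetical batch:
#
#     batch = {"tokens": torch.ones(2, 4), "labels": [torch.zeros(2)]}
#     static = struct_copy_one(batch)   # fresh CUDA tensors, same structure
#     struct_copy_two(static, batch)    # refresh values; addresses unchanged
#
# Keeping the addresses inside `static` fixed is what lets a captured CUDA
# graph read the right memory on every replay.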
class StaticBufferLoader:
    """Load data to static buffers."""

    def __init__(self, loader):
        self.loader = loader
        self.stream = torch.cuda.Stream()
        self.static = None

    def __iter__(self):
        for inputs in self.loader:
            if self.static is None:
                with torch.cuda.stream(self.stream):
                    self.static = struct_copy_one(inputs)

            with torch.cuda.stream(self.stream):
                struct_copy_two(self.static, inputs)
            torch.cuda.current_stream().wait_stream(self.stream)
            yield self.static

    def __len__(self):
        return len(self.loader)
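# A minimal consumption sketch (assumes a CUDA device; `make_loader` is a
# hypothetical factory returning any iterable of tensor containers):
#
#     loader = StaticBufferLoader(make_loader())
#     for batch in loader:
#         ...  # `batch` is the same container object every iteration,
#              # refreshed in place on a side stream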
def get_lr(lr_scheduler):
    # Replacement for `lr_scheduler.get_lr()` that writes newly computed
    # learning rates into static tensors, so the captured graph picks them up.
    lrs = lr_scheduler.__orig_get_lr__()
    if not hasattr(lr_scheduler, "static_lrs"):
        lr_scheduler.static_lrs = lrs
    for i in range(len(lrs)):
        lr_scheduler.static_lrs[i].copy_(lrs[i])
    return lr_scheduler.static_lrs
def zero_grad(optimizer, *args, **kwargs):
    # Zeroing gradients inside the capture region would be baked into the
    # graph, so make it a no-op while the capture stream is recording.
    if torch.cuda.is_current_stream_capturing():
        rank_zero_info("CUDAGraphCallback: set optimizer.zero_grad as nop during graph capturing.")
    else:
        optimizer.__orig_zero_grad__(*args, **kwargs)
def to_tensor(self, value, name):
    # Replacement for `LightningModule.__to_tensor`: values logged from inside
    # the captured step must reduce to a single-element tensor.
    value = value.clone().detach() if isinstance(value, torch.Tensor) else torch.tensor(value)
    if not torch.numel(value) == 1:
        raise ValueError(
            f"`self.log({name}, {value})` was called, but the tensor must have a single element."
            f" You can try doing `self.log({name}, {value}.mean())`"
        )
    value = value.squeeze()
    return value
  tj	 j |jdi | | j||||d W d    n1 sVw   Y  tj	
  j  j jkrtj	  td td| jj tj	j j jd	d
 |jdi | | j||||d W d    n1 sw   Y  tj	   j j  krdkrn n j  t j|_t | dr|     jd7  _d S )Nsupport_set_to_noneset_to_noneTexplicitr   )optimizer_closurerM   z6CUDAGraphCallback: capturing CUDA graph for module %s.global)r4   capture_error_modenon_cuda_graph_capturabler   )rC   r   rL   rV   current_iterationcapture_iterationr4   r9   r$   r(   r8   __orig_optimizer_step__synchronizetimesleepr   	__class__r=   graphreplayr   from_training_step_outputoutput_resultr]   )r6   epoch	batch_idxrI   rZ   zero_grad_kwargsstater   r   optimizer_step   sN   

$




z*get_optimizer_step.<locals>.optimizer_stepr   )rU   Nr   )rn   ro   r   rm   r   get_optimizer_step   s
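# The warmup/capture/replay scheme above mirrors the plain PyTorch whole-step
# CUDA graph recipe. A self-contained sketch with a bare module (illustrative
# only; the callback captures the whole Lightning optimizer step instead):
#
#     model = torch.nn.Linear(16, 16).cuda()
#     opt = torch.optim.SGD(model.parameters(), lr=0.1)
#     static_x = torch.randn(8, 16, device="cuda")
#
#     s = torch.cuda.Stream()
#     s.wait_stream(torch.cuda.current_stream())
#     with torch.cuda.stream(s):          # eager warmup on a side stream
#         for _ in range(3):
#             opt.zero_grad(set_to_none=True)
#             model(static_x).sum().backward()
#             opt.step()
#     torch.cuda.current_stream().wait_stream(s)
#
#     g = torch.cuda.CUDAGraph()
#     opt.zero_grad(set_to_none=True)     # grads re-allocated in graph pool
#     with torch.cuda.graph(g):           # record one full step
#         model(static_x).sum().backward()
#         opt.step()
#
#     static_x.copy_(torch.randn(8, 16, device="cuda"))
#     g.replay()                          # re-run the recorded step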
def get_training_step(state):
    def training_step(self, batch):
        results = self.__orig_training_step__(batch)
        if state.output is None:
            state.output = struct_copy_one(results)

        # Copy the step output into the static buffer so PL's bookkeeping
        # reads stable addresses after graph replay.
        with torch.no_grad():
            struct_copy_two(state.output, results)
        return results

    return training_step
def get_amp_autocast_init(state):
    def amp_autocast_init(self, *args, **kwargs):
        # CUDA graph capture conflicts with the autocast cache.
        if "cache_enabled" not in kwargs:
            kwargs["cache_enabled"] = False
        if state.current_iteration == 0:
            rank_zero_info("CUDAGraphCallback: disable autocast cache.")
        return self.__orig_init__(*args, **kwargs)

    return amp_autocast_init


def get_ddp_init(state):
    def init(self, *args, **kwargs):
        rank_zero_info("CUDAGraphCallback: init DDP on side stream.")
        with torch.cuda.stream(state.stream):
            self.__orig_init__(*args, **kwargs)

    return init


@dataclass
class CUDAGraphState:
    current_iteration: int = 0
    capture_iteration: int = 0  # iteration to capture at; < 0 disables capture
    stream: torch.cuda.Stream = None  # side stream for warmup and capture
    graph: torch.cuda.CUDAGraph = None
    output: STEP_OUTPUT = None  # static buffer holding the training step output
class CUDAGraphCallback(Callback):
    """Full iteration CUDA graph callback.

    Dataloader and LR scheduler are not included in the CUDA graph with this callback.
    """

    def __init__(self, capture_iteration=-1):
        super().__init__()
        if 0 <= capture_iteration <= 11:
            raise Exception("Warmup must run at least 11 DDP-enabled eager iterations before capture.")
        if torch.distributed.is_initialized():
            raise Exception("CUDAGraphCallback should be initialized before process group.")

        # NCCL async error handling is incompatible with CUDA graph capture,
        # so it has to be disabled before the process group is created.
        os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"

        self.state = CUDAGraphState(capture_iteration=capture_iteration)
zCUDAGraphCallback.__init__trainer
pl.Trainer	pl_modulepl.LightningModulestagerU   Nc                 C   sB   | j jdk rdS tjjtj_t| j tj_tjt_t| j t_dS )z9Called when fit, validate, test, predict, or tune begins.r   N)	rn   r_   r$   autocastr7   ry   r|   r   r~   r6   r   r   r   r   r   r   setup  s   zCUDAGraphCallback.setupc                 C   s2   | j jdk rdS tjjtj_tj`tjt_t`dS )z7Called when fit, validate, test, predict, or tune ends.r   N)rn   r_   r$   r   ry   r7   r   r   r   r   r   teardown  s   zCUDAGraphCallback.teardownc                 C   sH   | j jdk rdS t|jdddrtdtj | j _tj	 | j _
dS )zCalled when fit begins.r   Ndataloader_iterTrX   a  Found `dataloader_iter` argument in the `training_step`. This is not supported by full iteration CUDA graph capturing yet since dataloader will be within the CUDA graph capturing range.
Try to change `dataloader_iter` to `batch` and remove `next(dataloader_iter)` from `training_step`.)rn   r_   r   rv   r*   r$   r(   r3   r4   r   re   r6   r   r   r   r   r   on_fit_start*  s   	zCUDAGraphCallback.on_fit_startc                 C   s   | j jdk rdS dS )zCalled when fit ends.r   N)rn   r_   r   r   r   r   
    def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when fit ends."""
        if self.state.capture_iteration < 0:
            return

    def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the train begins."""
        if self.state.capture_iteration < 0:
            return

        # Ensure the training dataloader loads data into static buffers.
        dataloader = trainer.fit_loop._combined_loader._iterables
        assert isinstance(
            dataloader, torch.utils.data.dataloader.DataLoader
        ), f"Expect Dataloader type but got {type(dataloader)}"
        static_loader = StaticBufferLoader(dataloader)
        _mode = trainer.fit_loop._combined_loader._mode
        combined_loader = CombinedLoader(static_loader, mode=_mode)
        trainer.fit_loop.__orig_combined_loader__ = trainer.fit_loop._combined_loader
        trainer.fit_loop._combined_loader = combined_loader
        trainer.fit_loop._data_fetcher.setup(combined_loader)
        iter(trainer.fit_loop._data_fetcher)

        # Warn if `optimizer.zero_grad()` is invoked during graph capturing.
        for optimizer in trainer.optimizers:
            assert isinstance(optimizer, torch.optim.Optimizer), f"Expect Optimizer type but got {type(optimizer)}"
            optimizer.__orig_zero_grad__ = optimizer.zero_grad
            optimizer.zero_grad = MethodType(zero_grad, optimizer)

        # Ensure the LR scheduler writes learning rates into static buffers.
        for config in trainer.lr_scheduler_configs:
            assert isinstance(
                config.scheduler, torch.optim.lr_scheduler._LRScheduler
            ), f"Expect _LRScheduler type but got {type(config.scheduler)}"
            config.scheduler.__orig_get_lr__ = config.scheduler.get_lr
            config.scheduler.get_lr = MethodType(get_lr, config.scheduler)

        # Ensure logged values land in tensors that survive graph replay.
        LightningModule.__orig_to_tensor__ = LightningModule._LightningModule__to_tensor
        LightningModule._LightningModule__to_tensor = to_tensor

        # Capture the CUDA graph from model forward propagation to optimizer step.
        pl_module.__orig_training_step__ = pl_module.training_step
        training_step = get_training_step(self.state)
        pl_module.training_step = MethodType(training_step, pl_module)
        pl_module.__orig_optimizer_step__ = pl_module.optimizer_step
        optimizer_step = get_optimizer_step(self.state)
        pl_module.optimizer_step = MethodType(optimizer_step, pl_module)
    def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
        """Called when the train ends."""
        if self.state.capture_iteration < 0:
            return

        # Undo every monkey patch applied in `on_train_start`.
        trainer.fit_loop._combined_loader = trainer.fit_loop.__orig_combined_loader__
        trainer.fit_loop._data_fetcher.setup(trainer.fit_loop._combined_loader)
        iter(trainer.fit_loop._data_fetcher)
        del trainer.fit_loop.__orig_combined_loader__

        for optimizer in trainer.optimizers:
            optimizer.zero_grad = optimizer.__orig_zero_grad__
            del optimizer.__orig_zero_grad__

        for config in trainer.lr_scheduler_configs:
            config.scheduler.get_lr = config.scheduler.__orig_get_lr__
            del config.scheduler.__orig_get_lr__

        LightningModule._LightningModule__to_tensor = LightningModule.__orig_to_tensor__
        del LightningModule.__orig_to_tensor__

        pl_module.training_step = pl_module.__orig_training_step__
        del pl_module.__orig_training_step__
        pl_module.optimizer_step = pl_module.__orig_optimizer_step__
        del pl_module.__orig_optimizer_step__
zCUDAGraphCallback.on_train_endc                 C      dS )z#Called when the train epoch begins.Nr   r   r   r   r   on_train_epoch_start  s   z&CUDAGraphCallback.on_train_epoch_startc                 C   r   )aI  Called when the train epoch ends.

        To access all batch outputs at the end of the epoch, either:

        1. Implement `training_epoch_end` in the `LightningModule` and access outputs via the module OR
        2. Cache data across train batch hooks inside the callback implementation to post-process in this hook.
        Nr   r   r   r   r   on_train_epoch_end  s   z$CUDAGraphCallback.on_train_epoch_endrt   rk   c                 C   r   )z#Called when the train batch begins.Nr   )r6   r   r   rt   rk   r   r   r   on_train_batch_start  s   z&CUDAGraphCallback.on_train_batch_startoutputsc                 C   r   )zCalled when the train batch ends.
    def on_train_batch_end(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", outputs: STEP_OUTPUT, batch: Any, batch_idx: int
    ) -> None:
        """Called when the train batch ends.

        Note:
            The value ``outputs["loss"]`` here will be the normalized value w.r.t ``accumulate_grad_batches`` of the
            loss returned from ``training_step``.
        """
    def on_save_checkpoint(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", checkpoint: Dict[str, Any]
    ) -> None:
        """
        Called when saving a checkpoint to give you a chance to store anything else you might want to save.

        Args:
            trainer: the current :class:`~lightning.pytorch.trainer.Trainer` instance.
            pl_module: the current :class:`~lightning.pytorch.core.module.LightningModule` instance.
            checkpoint: the checkpoint dictionary that will be saved.
        """
        # The bound methods patched onto optimizers and LR schedulers are not
        # picklable; drop them from the state dicts before saving.
        if "optimizer_states" in checkpoint:
            for optimizer_state in checkpoint["optimizer_states"]:
                for k in list(optimizer_state.keys()):
                    v = optimizer_state[k]
                    if isinstance(v, MethodType) and hasattr(v, "__self__"):
                        del optimizer_state[k]
        if "lr_schedulers" in checkpoint:
            for lr_scheduler in checkpoint["lr_schedulers"]:
                for k in list(lr_scheduler.keys()):
                    v = lr_scheduler[k]
                    if isinstance(v, MethodType) and hasattr(v, "__self__"):
                        del lr_scheduler[k]
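# Example wiring (illustrative; assumes a standard Lightning script where
# `MyModule` and `datamodule` are user-defined):
#
#     import lightning.pytorch as pl
#
#     callback = CUDAGraphCallback(capture_iteration=100)  # eager warmup for 100 steps
#     trainer = pl.Trainer(max_steps=1000, callbacks=[callback])
#     trainer.fit(MyModule(), datamodule)
#
# The callback must be constructed before the process group is initialized,
# and the capture iteration must leave room for the DDP warmup iterations
# checked in `__init__`.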