o
    ;iQ/                     @   s   U d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZ edZeed	< d
Zeed< dZeed< ee Zeed< G dd deZG dd deZeG dd dZG dd dZdS )    N)ThreadPoolExecutoras_completed)	dataclass)Enum)Path)ListOptional)configloggercuda_checkpoint_pathCUDA_CHECKPOINT_PATHg     f@CUDA_CHECKPOINT_TIMEOUT   "CUDA_CHECKPOINT_TOGGLE_NUM_RETRIESCUDA_CHECKPOINT_TOGGLE_TIMEOUTc                   @   s    e Zd ZdZdZdZdZdZdS )CudaCheckpointStatezyState representation from the CUDA API [1].

    [1] https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.htmlrunninglockedcheckpointedfailedN)__name__
__module____qualname____doc__RUNNINGLOCKEDCHECKPOINTEDFAILED r   r   V/home/ubuntu/.local/lib/python3.10/site-packages/modal/_runtime/gpu_memory_snapshot.pyr      s    r   c                   @   s   e Zd ZdZdS )CudaCheckpointExceptionz0Exception raised for CUDA checkpoint operations.N)r   r   r   r   r   r   r   r   r    *   s    r    c                	   @   sl   e Zd ZU dZeed< eed< ddededdfd	d
Z	ddede	dedefddZ
dddZdddZdS )CudaCheckpointProcesszContains a reference to a PID with active CUDA session. This also provides
    methods for checkpointing and restoring GPU memory.pidstateFtarget_stateskip_first_refreshreturnNc           	      C   s:  t d| j d|j  t }d}t}d}| j|||o |dk dr|d7 }z|   t	dd|  d}t
| d}W nA ty } z5|d7 }||kr\td| j d	| d
| t d| j d| d| d|  t
d|  W Y d}~nd}~ww | j|||o|dk ds$t d| j d|j d dS )zToggle CUDA checkpoint state for current process, moving GPU memory to the
        CPU and back depending on the current process state when called.
        PID: z# Toggling CUDA checkpoint state to r   )refresh   g?   g      ?z Failed to toggle state after z
 retries: z Retry /z after error: g      ?Nz Target state z reached)r
   debugr"   valuetime	monotonicr   _should_continue_toggle_execute_toggle_commandminsleepr    )	selfr$   r%   
start_timeretry_countmax_retriesattempts
sleep_timeer   r   r   toggle8   s8   
$ zCudaCheckpointProcess.toggleTr5   r(   c              	   C   s~   |r|    | j|krdS | jtjkrtd| j d| j t | }|tkr=td| j d|dd|j	 d| j dS )	zMCheck if toggle operation should continue based on current state and timeout.Fr'   z CUDA process state is z Timeout after z.2fzs waiting for state z. Current state: T)
refresh_stater#   r   r   r    r"   r.   r/   r   r-   )r4   r$   r5   r(   elapsedr   r   r   r0   X   s   
z-CudaCheckpointProcess._should_continue_togglec              
   C   s   zt jtddt| jgdddtd}td| j d W dS  t jy> } zd| j d|j	 }t| t
|d}~w t jyU   d| j d	}t| t
|w )
z+Execute the cuda-checkpoint toggle command.z--toggle--pidTcheckcapture_outputtexttimeoutr'   z+ Successfully toggled CUDA checkpoint statez) Failed to toggle CUDA checkpoint state: Nz Toggle command timed out)
subprocessrunr   strr"   r   r
   r,   CalledProcessErrorstderrr    TimeoutExpired)r4   _r:   	error_msgr   r   r   r1   n   s&   

z-CudaCheckpointProcess._execute_toggle_commandc              
   C   s   z t jtddt| jgdddtd}|j  }t	|| _
W dS  t jy@ } zd| j d|j }t| t|d}~w t jyW   d| j d}t| t|w )	z=Refreshes the current CUDA checkpoint state for this process.--get-stater>   Tr?   r'   z& Failed to get CUDA checkpoint state: Nz Get state command timed out)rD   rE   r   rF   r"   r   stdoutstriplowerr   r#   rG   rH   r
   r,   r    rI   )r4   result	state_strr:   rK   r   r   r   r<      s(   

z#CudaCheckpointProcess.refresh_state)F)Tr&   N)r   r   r   r   int__annotations__r   boolr;   floatr0   r1   r<   r   r   r   r   r!   0   s"   
 !

r!   c                   @   s|   e Zd ZdZdd Zdee fddZdede	e fdd	Z
dddZdddZdefddZdeeeef  fddZd
S )CudaCheckpointSessionzGManages the checkpointing state of processes with active CUDA sessions.c                 C   sJ   |   | _| jrtdt| j ddd | jD   d S td d S )NzFound z PID(s) with CUDA sessions: c                 S   s   g | ]}|j qS r   r"   ).0cr   r   r   
<listcomp>   s    z2CudaCheckpointSession.__init__.<locals>.<listcomp>zNo CUDA sessions found.)_get_cuda_pidscuda_processesr
   r,   lenr4   r   r   r   __init__   s   
 zCudaCheckpointSession.__init__r&   c           	         s   g }t d}| stddd | D }ttdt|dG  fdd|D }t|D ]0}|| }z| }|rB|	| W q0 t
y` } ztd	| d
|  W Y d}~q0d}~ww W d   n1 skw   Y  |jdd d |S )zWIterates over all PIDs and identifies the ones that have running
        CUDA sessions.z/proczPOS does not have /proc path rendering it incompatible with GPU memory snapshots.c                 S   s   g | ]	}|j  r|qS r   )nameisdigitrY   entryr   r   r   r[      s    z8CudaCheckpointSession._get_cuda_pids.<locals>.<listcomp>2   )max_workersc                    s(   i | ]}  jt|jt|jqS r   )submit_check_cuda_sessionrS   ra   rc   executorr4   r   r   
<dictcomp>   s    z8CudaCheckpointSession._get_cuda_pids.<locals>.<dictcomp>Error checking PID : Nc                 S   s   | j S NrX   )xr   r   r   <lambda>   s    z6CudaCheckpointSession._get_cuda_pids.<locals>.<lambda>)key)r   existsr    iterdirr   r2   r^   r   rP   append	Exceptionr
   r,   sort)	r4   	cuda_pidsproc_dirpid_dirsfuture_to_pidfuturer"   cuda_processr:   r   ri   r   r\      s4   
"z$CudaCheckpointSession._get_cuda_pidsr"   c              
   C   s   z)t jtddt|gddtd}|jdkr'|j  }t	|}t
||dW S W d
S  t jy4   Y d
S  t jyF   td|  Y d
S  tyc } ztd| d	|  W Y d
}~d
S d
}~ww )z+Check if a specific PID has a CUDA session.rL   r>   T)rA   rB   rC   r   )r"   r#   z$Timeout checking CUDA state for PID rl   rm   N)rD   rE   r   rF   r   
returncoderM   rN   rO   r   r!   rG   rI   r
   r,   ru   )r4   r"   rP   rQ   r#   r:   r   r   r   rh      s0   
 z)CudaCheckpointSession._check_cuda_sessionNc                    sV  | j s
td dS | j D ]}|  |jtjkr+td|j dtjj	 d|jj	 qt
 }dtddfdd	 t O fd
d| j D }g }t|D ]}z|  W qN tym } z|| W Y d}~qNd}~ww |rtdt| dddd |D  W d   n1 sw   Y  t
 | }tdt| j  d|dd dS )z8Checkpoint all CUDA processes, moving GPU memory to CPU.z No CUDA processes to checkpoint.NzPID z: CUDA session not in z state. Current state: procr&   c                 S   s   |  tj d S rn   )r;   r   r   r~   r   r   r   checkpoint_impl   s   z9CudaCheckpointSession.checkpoint.<locals>.checkpoint_implc                    s   g | ]}  |qS r   rg   rY   r~   r   rj   r   r   r[          z4CudaCheckpointSession.checkpoint.<locals>.<listcomp>zFailed to checkpoint  processes: ; c                 s       | ]}t |V  qd S rn   rF   rY   r:   r   r   r   	<genexpr>      z3CudaCheckpointSession.checkpoint.<locals>.<genexpr>zCheckpointing z CUDA sessions took => .3fs)r]   r
   r,   r<   r#   r   r   r    r"   r-   r.   perf_counterr!   r   r   rP   ru   rt   r^   join)r4   r~   startfutures
exceptionsr{   r:   r=   r   r   r   
checkpoint   s@   

"$z CudaCheckpointSession.checkpointc                    s  | j s
td dS t }dtddfddt O  fdd| j D }g }t|D ]}z|  W q, t	yK } z|
| W Y d}~q,d}~ww |rbtd	t| d
ddd |D  W d   n1 slw   Y  t | }tdt| j  d|dd dS )z?Restore all CUDA processes, moving memory back from CPU to GPU.zNo CUDA sessions to restore.Nr~   r&   c                 S   s   | j tjdd d S )NT)r%   )r;   r   r   r   r   r   r   restore_process  s   z6CudaCheckpointSession.restore.<locals>.restore_processc                    s   g | ]}  |qS r   r   r   rj   r   r   r   r[     r   z1CudaCheckpointSession.restore.<locals>.<listcomp>zFailed to restore r   r   c                 s   r   rn   r   r   r   r   r   r   $  r   z0CudaCheckpointSession.restore.<locals>.<genexpr>z
Restoring z CUDA session(s) took => r   r   )r]   r
   r,   r.   r   r!   r   r   rP   ru   rt   r    r^   r   )r4   r   r   r   r{   r:   r=   r   r   r   restore  s.   
"$zCudaCheckpointSession.restorec                 C   s
   t | jS )z9Get the number of CUDA processes managed by this session.)r^   r]   r_   r   r   r   get_process_count*  s   
z'CudaCheckpointSession.get_process_countc                 C   s.   g }| j D ]}|  ||j|jf q|S )z,Get current states of all managed processes.)r]   r<   rt   r"   r#   )r4   statesr~   r   r   r   get_process_states.  s
   
z(CudaCheckpointSession.get_process_statesrR   )r   r   r   r   r`   r   r!   r\   rS   r   rh   r   r   r   tupler   r   r   r   r   r   rW      s    	"

)rW   )rD   r.   concurrent.futuresr   r   dataclassesr   enumr   pathlibr   typingr   r   modal.configr	   r
   getr   rF   rT   r   rV   r   rS   r   r   ru   r    r!   rW   r   r   r   r   <module>   s"   
i