o
    ۷i                     @  s   d Z ddlmZ ddlZddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZ eeZG d	d
 d
eZdS )zJBase worker class for vLLM-Omni with process-scoped GPU memory accounting.    )annotationsN)init_logger)
format_gibmemory_profiling)Worker)detect_pid_host)get_process_gpu_memory"is_process_scoped_memory_availablec                   @  s"   e Zd ZdZe dddZdS )OmniGPUWorkerBasea'  Base GPU worker for vLLM-Omni with process-scoped memory accounting.

    This class overrides determine_available_memory() to use per-process GPU
    memory tracking via pynvml, allowing multiple stages to initialize
    concurrently on the same GPU without memory accounting interference.
    returnintc              	   C  sd  | j j }r| j  tdt| |S t| jt	| jj
d}| j  W d   n1 s0w   Y  |j| _|j| _t rHt rHt| jnd}|durxtd| j| | _tdt | jt| jt|t| j tjdt| jdd n5t	| jj
|j |j }td| j| | _td	t | jt| jt|t| j tjd
t| jdd t	| jS )a  Process-scoped GPU memory profiling for concurrent stage initialization.

        Algorithm:
            1. requested_memory = total_gpu_memory * gpu_memory_utilization
               (computed in init_device from cache_config)

            2. process_memory = memory used by THIS process only (via pynvml)
               - Uses nvmlDeviceGetComputeRunningProcesses to get per-PID memory
               - Supports CUDA_VISIBLE_DEVICES with indices, UUIDs, or MIG IDs

            3. available_kv_cache = requested_memory - process_memory

        Fallback:
            If NVML is unavailable, falls back to profiling data:
            available = requested - (weights + activations + non_torch)
        z,Using explicit kv_cache_memory_bytes: %s GiB)weights_memoryNr   zKProcess-scoped memory (PID %d, GPU %d): requested=%s, used=%s, available=%sz2Available KV cache memory: %s GiB (process-scoped)local)scopezLProfiling fallback (PID %d, GPU %d): requested=%s, profiled=%s, available=%sz6Available KV cache memory: %s GiB (profiling fallback))cache_configkv_cache_memory_bytesmodel_runnerprofile_runloggerinfor   r   init_snapshotr   model_memory_usagenon_torch_increasenon_torch_memorytorch_peak_increasepeak_activation_memoryr	   r   r   
local_rankmaxrequested_memoryavailable_kv_cache_memory_bytesdebugosgetpid	info_once)selfr   profile_resultprocess_memoryprofiled_usage r(   K/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/worker/base.pydetermine_available_memory   st   



z,OmniGPUWorkerBase.determine_available_memoryN)r   r   )__name__
__module____qualname____doc__torchinference_moder*   r(   r(   r(   r)   r
      s    r
   )r.   
__future__r   r!   r/   vllm.loggerr   vllm.utils.mem_utilsr   r   vllm.v1.worker.gpu_workerr   	GPUWorkervllm_omni.entrypoints.utilsr   !vllm_omni.worker.gpu_memory_utilsr   r	   r+   r   r
   r(   r(   r(   r)   <module>   s    