o
    i                     @   sx   d Z ddlZddlZddlmZmZmZ eeZ	G dd dZ
G dd dZ				dd
ededededef
ddZdS )a  
Multi-Engine vLLM Wrapper - transparent dispatch across N engine instances.

OPTIMIZATION: A 0.5B model uses ~1.3GB weights + ~5-10GB KV cache per engine.
On an 80GB A100, we can comfortably run 3 engines at ~12GB each (36GB total).
This triples effective prefill capacity, directly cutting streaming TTFB under
concurrent load by ~3x.

The wrapper is transparent to pipeline code: it exposes the same `.engine.generate()`
interface, but routes calls to different underlying engines.

Usage:
    # Creates N engines, each with their own vLLM subprocess
    model = create_multi_engine_model(model_path, num_engines=3, gpu_memory_per_engine=0.12)
    
    # Pipeline uses it identically to a single engine:
    model.engine.generate(prompt, sampling_params, request_id)  # routed to engine N
    N)AnyAsyncGeneratorListc                   @   sn   e Zd ZdZdefddZdeeef fddZ	dedd	fd
dZ
defddZdee fddZdd Zd	S )LoadAwareEnginez
    Wraps multiple AsyncLLMEngine instances with least-inflight dispatch.

    This reduces tail latency under uneven request completion times versus strict
    round-robin, while preserving fair spread with a rotating cursor on ties.
    enginesc                 C   s*   || _ dd |D | _d| _t | _d S )Nc                 S   s   g | ]}d qS )r    ).0_r   r   8/home/ubuntu/veenaModal/veena3modal/core/multi_engine.py
<listcomp>%   s    z,LoadAwareEngine.__init__.<locals>.<listcomp>r   )_engines	_inflight_cursor	threadingLock_lock)selfr   r   r   r
   __init__#   s   zLoadAwareEngine.__init__returnc                 C   s   | j P t| j}|dkrtdt| j}| j| }|}t|D ]}|| | }| j| |kr4|} nq!| j|  d7  < |d | | _|| j| fW  d    S 1 sVw   Y  d S )Nr   zNo engines configured   )r   lenr   RuntimeErrorminr   r   range)r   nmin_loadstartselectedoffsetidxr   r   r
   _acquire_engine)   s"   


$zLoadAwareEngine._acquire_enginer   Nc                 C   s   | j 5 d|  krt| jk r0n ntd| j| d | j|< W d    d S W d    d S W d    d S 1 s;w   Y  d S )Nr   r   )r   r   r   max)r   r   r   r   r
   _release_engine:   s   "zLoadAwareEngine._release_enginec                    sX     \}z
|j|i | W n ty     w dtf fdd}| S )zFRoute generate() to the least-loaded engine and release on completion.r   c               	     s:   z 2 z	3 d H W } | V  q6 W   d S   w N)r"   )itembase_generator
engine_idxr   r   r
   _wrappedH   s   z*LoadAwareEngine.generate.<locals>._wrapped)r    generate	Exceptionr"   r   )r   argskwargsenginer(   r   r%   r
   r)   ?   s   
zLoadAwareEngine.generatec                 C   s4   | j  t| jW  d   S 1 sw   Y  dS )z1Current in-flight request count per engine index.N)r   listr   r   r   r   r
   inflight_snapshotQ   s   $z!LoadAwareEngine.inflight_snapshotc                 C   s   t | jd |S )z7Forward any other attribute access to the first engine.r   )getattrr   )r   namer   r   r
   __getattr__V   s   zLoadAwareEngine.__getattr__)__name__
__module____qualname____doc__r.   r   tupleintr   r    r"   r   r)   r   r0   r3   r   r   r   r
   r      s    r   c                   @   s6   e Zd ZdZdefddZdd Zdd Zd	d
 ZdS )MultiEngineModela  
    Drop-in replacement for SparkTTSModel that wraps N engines.
    
    Exposes:
        .engine    -> LoadAwareEngine (dispatches generate() calls)
        .tokenizer -> shared tokenizer (same across all engines)
        .model_path, .model_type, etc.
    modelsc                 C   sT   || _ |d | _| jj| _| jj| _| jj| _| jj| _dd |D }t|| _dS )ze
        Args:
            models: List of SparkTTSModel instances (each has its own engine)
        r   c                 S   s   g | ]}|j qS r   r-   )r   mr   r   r
   r   t   s    z-MultiEngineModel.__init__.<locals>.<listcomp>N)_models_primary	tokenizer
model_path
model_typehf_tokenr   r-   )r   r;   r   r   r   r
   r   e   s   




zMultiEngineModel.__init__c                 C   s
   | j  S r#   )r?   get_model_typer/   r   r   r
   rD   w   s   
zMultiEngineModel.get_model_typec                 C      | j S r#   )r@   r/   r   r   r
   get_tokenizerz      zMultiEngineModel.get_tokenizerc                 C   rE   r#   r<   r/   r   r   r
   
get_engine}   rG   zMultiEngineModel.get_engineN)	r4   r5   r6   r7   r.   r   rD   rF   rH   r   r   r   r
   r:   [   s    	r:      Q?rA   num_enginesrC   gpu_memory_per_enginer   c           
      K   s   ddl m} td| d|dd g }t|D ]+}td|d  d	| d
 |d| ||d|}|| td|d  d qt|}	td| d |	S )a  
    Create a multi-engine model with N vLLM instances on the same GPU.
    
    Memory budget per engine (A100-80GB):
        - Model weights: ~1.3GB (shared via CUDA, but each engine loads independently)
        - KV cache at 0.12 utilization: ~8GB
        - Total per engine: ~10GB
        - 3 engines: ~30GB, leaves 50GB free
    
    Args:
        model_path: Path to model directory
        num_engines: Number of vLLM engine instances (default: 2)
        hf_token: HuggingFace token
        gpu_memory_per_engine: GPU memory fraction per engine
        **engine_kwargs: Additional AsyncEngineArgs overrides passed to each engine
    
    Returns:
        MultiEngineModel with round-robin dispatch
    r   )SparkTTSModelzCreating multi-engine model: z engines x z.0%z	 GPU eachz  Initializing engine r   /z...)rA   rC   gpu_memory_utilizationz	  Engine z readyzMulti-engine model ready: z engines, round-robin dispatchNr   )veena3modal.core.model_loaderrM   loggerinfor   appendr:   )
rA   rK   rC   rL   engine_kwargsrM   r;   imodelmultir   r   r
   create_multi_engine_model   s"   
rX   )rI   NrJ   )r7   loggingr   typingr   r   r   	getLoggerr4   rQ   r   r:   strr9   floatrX   r   r   r   r
   <module>   s*    
@(