o
     i                      @  sn   d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	m
Z
 eeZG dd dZe	G dd	 d	ZdS )
a  
Dual vLLM Engine Router - Distributes requests across multiple engine instances.

OPTIMIZATION: With gpu_memory_utilization=0.25, a 0.5B model uses ~20GB on an 80GB GPU.
Running 2 engines at 0.12 each uses ~20GB total while doubling prefill parallelism.
This directly halves the #1 streaming bottleneck: prefill contention under concurrency.

The router is transparent to the pipeline -- it exposes the same .engine.generate() interface
as a single SparkTTSModel via a RoutedEngine wrapper.
    )annotationsN)AnyOptional)	dataclassfieldc                   @  s"   e Zd ZdZd	ddZdd ZdS )
RoutedEnginea  
    Wraps multiple vLLM AsyncLLMEngine instances behind a single .generate() interface.
    
    Round-robins generate() calls across engines so concurrent requests
    spread prefill work instead of queuing on a single engine.
    Thread-safe via atomic counter.
    engineslistc                 C  s   || _ d| _t | _dS )zM
        Args:
            engines: List of AsyncLLMEngine instances
        r   N)_engines_counter	threadingLock_lock)selfr    r   7/home/ubuntu/veenaModal/veena3modal/core/dual_engine.py__init__   s   zRoutedEngine.__init__c                 K  sf   | j  | jt| j }|  jd7  _W d   n1 sw   Y  | j| }|jd|||d|S )z
        Route a generate() call to the next engine via round-robin.
        
        Returns the async generator from the selected engine.
        Same interface as AsyncLLMEngine.generate().
           N)promptsampling_params
request_idr   )r   r   lenr
   generate)r   r   r   r   kwargsidxenginer   r   r   r   (   s   
zRoutedEngine.generateN)r   r	   )__name__
__module____qualname____doc__r   r   r   r   r   r   r      s    
	r   c                   @  s   e Zd ZU dZdZded< dZded< dZd	ed
< dZded< e	e
dZded< dZded< edddZedd Zdd ZdddZdS )DualEngineRoutera  
    Manages multiple vLLM engine instances on the same GPU.
    
    Memory Budget (A100-80GB, 2 engines at 0.12 each):
        - Engine A: 1.3GB model + ~8GB KV cache = ~9.3GB
        - Engine B: 1.3GB model + ~8GB KV cache = ~9.3GB  
        - BiCodec decoder: ~0.6GB
        - Total: ~19.2GB (leaves ~61GB free)
        - Each engine: ~170 concurrent sequences
        - Combined: ~340 concurrent with 2x prefill throughput
     str
model_path   intnum_enginesgQ?floatgpu_memory_per_engineNzOptional[str]hf_token)default_factoryr	   _modelszOptional[RoutedEngine]_routed_enginereturnr   c                 C  s   | j du r	td| j S )z:The routed engine that distributes calls across instances.Nz:DualEngineRouter not initialized. Call initialize() first.)r,   RuntimeErrorr   r   r   r   r   R   s   
zDualEngineRouter.enginec                 C  s   | j std| j d jS )zDTokenizer from the first model (all share the same tokenizer/vocab).z DualEngineRouter not initializedr   )r+   r.   	tokenizerr/   r   r   r   r0   Y   s   zDualEngineRouter.tokenizerc                 C  s   ddl m} td| j d| j d g }t| jD ]2}td|d  d| j d	 || j| j| jd
}| j	
| |
|j td|d  d qt|| _td| j d dS )z2Initialize all vLLM engine instances sequentially.r   )SparkTTSModelzInitializing z vLLM engines (gpu_memory=z	 each)...z  Loading engine r   /z...)r#   r)   gpu_memory_utilizationz	  Engine z readyzDualEngineRouter: z, engines active, round-robin routing enabledN)veena3modal.core.model_loaderr1   loggerinfor&   r(   ranger#   r)   r+   appendr   r   r,   )r   r1   raw_enginesimodelr   r   r   
initialize`   s"   

zDualEngineRouter.initializec                 C  s   | j r
| j d  S dS )z3Compatibility: return model type from first engine.r   indic_speakers)r+   get_model_typer/   r   r   r   r>   y   s   zDualEngineRouter.get_model_type)r-   r   )r-   r"   )r   r   r   r   r#   __annotations__r&   r(   r)   r   r	   r+   r,   propertyr   r0   r<   r>   r   r   r   r   r    <   s   
 
r    )r   
__future__r   loggingr   typingr   r   dataclassesr   r   	getLoggerr   r5   r   r    r   r   r   r   <module>   s    
&