o
    i~                     @  s  U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m
Z
mZ ddlmZmZmZmZmZmZ eeZdmdnddZdodpddZe
G dd dZG dd deZG dd dZe
G dd dZdaded< dqddZdrdd Z dsd"d#Z!dtd%d&Z"dud'd(Z#dvd,d-Z$				.		/	0	1							dwdxdDdEZ%dydFdGZ&dzdHdIZ'	J	K	L	M	N		Od{d|dZd[Z(d}d^d_Z)	J	K	L	M	N		`	Od~ddbdcZ*	J	K	L	M	N		d		ddddidjZ+ddkdlZ,dS )a  
TTS runtime wrapper (container-scoped singleton).

In Modal, each container should load the model once and then handle many requests.
This wrapper owns:
- SparkTTSModel (vLLM engine)
- IndicPromptBuilder
- BiCodecDecoder
- SparkTTSPipeline + Veena3SlidingWindowPipeline
- optional SuperResolutionService

Imports framework-agnostic inference + processing code from `veena3modal/core`,
`veena3modal/processing`, and `veena3modal/audio`.
    )annotationsN)	dataclassfield)AnyAsyncGeneratorDictListOptionalTuplenamestrdefaultintminimumreturnc                 C  D   t j| d}|s|S zt|}W n ty   | Y S w t||S N )osenvirongetr   
ValueErrormaxr   r   r   rawvalue r   ;/home/ubuntu/veenaModal/veena3modal/services/tts_runtime.py_env_int      
r           floatc                 C  r   r   )r   r   r   r!   r   r   r   r   r   r   
_env_float)   r   r"   c                   @  sZ   e Zd ZU dZdZded< dZded< dZd	ed
< dZd	ed< dZ	d	ed< dZ
ded< dS )StreamingAdmissionLeasez2Admission ticket for one active streaming request.r    r!   wait_msFboolqueuedr   r   inflight_on_grantwaiters_on_grantqueue_depth_on_entryreleasedN)__name__
__module____qualname____doc__r$   __annotations__r&   r'   r(   r)   r*   r   r   r   r   r#   4   s   
 r#   c                      s*   e Zd ZdZdddd fddZ  ZS )StreamingAdmissionErrorz@Raised when streaming admission control cannot accept a request.r    N)retry_after_mssnapshotmessager   reasonr1   r!   r2   Optional[Dict[str, float]]c                  s0   t  | || _ttd|| _|pi | _d S )Nr    )super__init__r4   r!   r   r1   r2   )selfr3   r4   r1   r2   	__class__r   r   r7   C   s   z StreamingAdmissionError.__init__)r3   r   r4   r   r1   r!   r2   r5   )r+   r,   r-   r.   r7   __classcell__r   r   r9   r   r0   @   s
    r0   c                   @  sP   e Zd ZdZddd	ZedddZdddZdddZdddZ	d ddZ
dS )!StreamingAdmissionControllerz
    Lightweight admission controller for streaming requests.

    Keeps a hard cap on active streams (`max_inflight`) and an optional bounded
    wait queue (`max_queue`) with per-request wait timeout (`max_wait_ms`).
    max_inflightr   	max_queuemax_wait_msr!   poll_msc                 C  s   t td|| _t td|| _ttd|| _ttd|| _| jdk| _t	 | _
d| _d| _d| _d| _d| _d| _d| _d S )Nr   r          ?)r   r   r=   r>   r!   r?   r@   enabled	threadingLock_lock	_inflight_waiters_total_admitted_total_queue_entries_total_queue_wait_ms_total_rejected_queue_full_total_rejected_timeout)r8   r=   r>   r?   r@   r   r   r   r7   Y   s   

z%StreamingAdmissionController.__init__r   'StreamingAdmissionController'c              	   C  s8   | t ddddt ddddtddddtddd	dd
S )N$VEENA3_STREAM_ADMISSION_MAX_INFLIGHTr   )r   !VEENA3_STREAM_ADMISSION_MAX_QUEUE#VEENA3_STREAM_ADMISSION_MAX_WAIT_MSr    VEENA3_STREAM_ADMISSION_POLL_MSg       @rA   )r=   r>   r?   r@   )r   r"   )clsr   r   r   from_envo   s   z%StreamingAdmissionController.from_envDict[str, float]c                 C  s   | j dkr| j| j  nd}| jrdndt| jt| jt| jt| jt| jt| j	t| j
t| j t|t| jt| jdS )Nr   r          ?)rB   r=   r>   r?   r@   inflight_nowwaiters_nowadmitted_totalqueue_entries_totalqueue_wait_ms_avgrejected_queue_full_totalrejected_timeout_total)rI   rJ   rB   r!   r=   r>   r?   r@   rF   rG   rH   rK   rL   )r8   	queue_avgr   r   r   _snapshot_lockedx   s"   
z-StreamingAdmissionController._snapshot_lockedc                 C  s2   | j  |  W  d    S 1 sw   Y  d S N)rE   r^   )r8   r   r   r   r2      s   $z%StreamingAdmissionController.snapshotr#   c                   s  t  }d}d}| jdkr|| jd  nd }	 t  }| j | jr)| j| jk rn|| d }|  jd7  _|rKtd| jd | _|  j	td|7  _	|  j
d7  _
tt|t|t| jt| jt|dW  d    S |s| j| jkr|  jd7  _|  }tdd	| j|d
d}|  jd7  _| j}|  jd7  _W d    n1 sw   Y  |d ur||kr| j |rtd| jd | _|  jd7  _|  }W d    n1 sw   Y  tdd| j|d
t| jd I d H  q)NFr   r         @@T   )r$   r&   r'   r(   r)   zstreaming admission queue full
queue_full)r4   r1   r2   z streaming admission wait timeoutwait_timeout)timeperf_counterr?   rE   rB   rF   r=   r   rG   rJ   rH   r#   r!   r%   r   r>   rK   r^   r0   r@   rI   rL   asynciosleep)r8   startr&   r)   deadlinenowr$   r2   r   r   r   acquire   sr   

z$StreamingAdmissionController.acquirelease!Optional[StreamingAdmissionLease]Nonec                 C  sV   |d u s|j r	d S d|_ | j td| jd | _W d    d S 1 s$w   Y  d S )NTr   ra   )r*   rE   r   rF   )r8   rl   r   r   r   release   s   "z$StreamingAdmissionController.releaseN)r=   r   r>   r   r?   r!   r@   r!   )r   rM   r   rT   r   r#   rl   rm   r   rn   )r+   r,   r-   r.   r7   classmethodrS   r^   r2   rk   ro   r   r   r   r   r<   Q   s    



:r<   c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded< dZ
ded	< dZd
ed< dZded< dZded< dZded< dZded< dZded< dZded< dZded< eedZded< dZded < dS )!
TTSRuntimea,  
    Holds long-lived, per-container inference objects.
    
    Thread Safety:
        This class is designed for async concurrency within a single container.
        The vLLM engine handles internal batching and scheduling.
        Do NOT share across processes without proper synchronization.
    Nr   modelpipelinestreaming_pipelinelong_text_processorprompt_builderbicodec_decoderzOptional[Any]
sr_service
not_loadedr   model_versionFr%   	is_loadedr    r!   load_time_msr   
model_pathbicodec_pathOptional[str]sr_checkpoint_dircudadevice)default_factoryzDict[str, List[int]]speaker_global_cachez&Optional[StreamingAdmissionController]stream_admission)r+   r,   r-   r.   ru   r/   rv   rw   rx   ry   rz   r{   r}   r~   r   r   r   r   r   r   dictr   r   r   r   r   r   rt      s$   
 rt   Optional[TTSRuntime]_runtimec                   C  s   t S )z&Get the current TTS runtime singleton.)r   r   r   r   r   get_runtime   s   r   r%   c                   C  s   t duot jS )z.Check if the runtime is initialized and ready.N)r   r~   r   r   r   r   is_initialized   s   r   runtimec                 C  s0   | du rt d| j}|du rt }|| _|S )zCReturn initialized streaming admission controller for this runtime.NTTS runtime not initialized)RuntimeErrorr   r<   rS   r   
controllerr   r   r    _get_stream_admission_controller  s   r   rT   c                  C  s   t  } | du r	i S t|  S )z
    Return current admission state + cumulative counters.

    Safe to call anytime; when runtime is not initialized returns an empty dict.
    N)r   r   r2   )r   r   r   r    get_streaming_admission_snapshot  s   r   c                    s,   t  stdt } t| }| I dH S )zFAcquire one streaming admission slot or raise StreamingAdmissionError.r   N)r   r   r   r   rk   r   r   r   r   acquire_streaming_slot  s   r   rl   rm   rn   c                 C  s(   t  }|du r	dS t|}||  dS )z:Release a previously acquired streaming slot (idempotent).N)r   r   ro   )rl   r   r   r   r   r   release_streaming_slot#  s
   r   r         ?Fra   r   r   r   r   r   hf_tokengpu_memory_utilization	enable_srnum_enginesmax_num_batched_tokensOptional[int]max_num_seqsenable_chunked_prefillOptional[bool]enable_prefix_cachingdisable_log_statsenforce_eagerprecompute_speaker_globalsc           *      C  s  t   }| ptjdtjdd} |ptjd| }|p"tjd}|p0tjdp0tjd}|du rDtjd	d
  }|dv }td td|   td|  td| d|  zddlm	} ddl
m} ddlm} ddlm} ddlm} i }t|tr|dkr||d< t|	tr|	dkr|	|d< |
durt|
|d< |durt||d< |durt||d< |durt||d< |dkrddlm} || }td| d |d!d" |dL| |||d#|}ntd$ |dL| ||d%|}td& ||j|d'}td( tjd)d*  }|dv }||||d+}td, ||||d-}d}zdd.lm}  td/ | |||d0}W n tyd }! ztd1|!  W Y d}!~!nd}!~!ww |||d2}"d}#|r|rz*dd3lm}$ td4| d5 |$|d6}#|#j|d7rtd8 ntd9 d}#W n ty }! ztd:|!  W Y d}!~!nd}!~!ww tj !| "d;}%|%sd<}%t   | d= }&t#$ }'t%||||"|||#|%d>|&| ||||'d?a&td@|&dAdB tdC|%  |'j'r
tdD|'j(|'j)|'j*|'j+ ntdE |r!|dkr!tdF t,t& n|dkr,tdG ntdH zddIl-m.}( |(|% W t&W S  tyK   Y t&W S w  typ }! zt/dJ|!  ddl0})|)1  t2dK|! |!d}!~!ww )Ma  
    Initialize the TTS runtime with all components.
    
    This should be called once per container (e.g., in Modal's @modal.enter).
    
    Args:
        model_path: Path to Spark TTS model (env: SPARK_TTS_MODEL_PATH)
        bicodec_path: Path to BiCodec model (env: BICODEC_MODEL_PATH, defaults to model_path)
        sr_checkpoint_dir: Path to super-resolution checkpoints (env: AP_BWE_CHECKPOINT_DIR)
        device: Device for inference (cuda/cpu)
        hf_token: HuggingFace token for private models (env: HF_TOKEN)
        gpu_memory_utilization: vLLM GPU memory fraction (default: 0.25)
        enable_sr: Enable super-resolution service
        num_engines: Number of vLLM engine instances (default: 1, set to 2-3 for Tier 3 optimization)
        max_num_batched_tokens: Optional vLLM scheduler cap override.
        max_num_seqs: Optional vLLM concurrent sequence cap override.
        enable_chunked_prefill: Optional vLLM chunked prefill toggle.
        enable_prefix_caching: Optional vLLM prefix caching toggle.
        disable_log_stats: Optional vLLM internal stats log toggle.
        enforce_eager: Optional vLLM eager-mode toggle (disables CUDA graphs when true).
        precompute_speaker_globals: Whether to warm speaker global tokens at startup.
            If None, reads PRECOMPUTE_SPEAKER_GLOBALS env var (default: false).
    
    Returns:
        Initialized TTSRuntime instance
    
    Raises:
        RuntimeError: If initialization fails
    SPARK_TTS_MODEL_PATH
MODEL_PATHz/models/spark_tts_4speakerBICODEC_MODEL_PATHAP_BWE_CHECKPOINT_DIRHF_TOKENHUGGING_FACE_HUB_TOKENNPRECOMPUTE_SPEAKER_GLOBALSfalse>   1onyestruezInitializing TTS runtime...z  Model path: z  BiCodec path: z  SR enabled: z, path: r   )SparkTTSModel)SparkTTSPipeline)BiCodecDecoder)IndicPromptBuilderLongTextProcessorr   r   r   r   r   r   ra   )create_multi_engine_modelzLoading z vLLM engines (z.2fz GPU mem each)...)r   r   r   gpu_memory_per_enginez#Loading SparkTTS model with vLLM...)r   r   r   zInitializing prompt builder...)	tokenizerru   zInitializing BiCodec decoder...VEENA3_BICODEC_BATCHINGr   )r   r   enable_batchingzInitializing TTS pipeline...)ru   ry   rz   )Veena3SlidingWindowPipelinez"Initializing streaming pipeline...)ru   ry   snac_decoderz"Streaming pipeline not available: rv   rw   )SuperResolutionServicez#Initializing super-resolution from z...)checkpoint_dir)r   u.   ✅ Super-resolution model loaded successfullyz%Super-resolution model failed to loadz Super-resolution not available: /z	spark-tts  T)ru   rv   rw   rx   ry   rz   r{   r}   r~   r   r   r   r   r   r   u   ✅ TTS runtime initialized in .0fmsz   Model version: zG   Streaming admission: inflight<=%d queue<=%d wait<=%.1fms poll=%.1fmsz    Streaming admission: disabledz*Speaker globals startup precompute enabledzDSpeaker globals cache startup precompute skipped (multi-engine mode)z;Speaker globals cache startup precompute skipped (disabled))set_model_versionu&   ❌ Failed to initialize TTS runtime: z#TTS runtime initialization failed: r   )3rd   r   r   r   striplowerloggerinfoveena3modal.core.model_loaderr   veena3modal.core.pipeliner    veena3modal.core.bicodec_decoderr   %veena3modal.processing.prompt_builderr   *veena3modal.processing.long_text_processorr   
isinstancer   r%   veena3modal.core.multi_enginer   r   #veena3modal.core.streaming_pipeliner   ImportErrorwarning!veena3modal.core.super_resolutionr   
load_model	Exceptionpathbasenamerstripr<   rS   rt   r   rB   r=   r>   r?   r@   _precompute_speaker_globalsveena3modal.api.fastapi_appr   error	traceback	print_excr   )*r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   
start_timeraw_flagr   r   r   r   r   engine_kwargsr   per_engine_memru   ry   	batch_rawenable_bicodec_batchingrz   rv   rw   r   erx   r{   r   r}   	load_timer   r   r   r   r   r   initialize_runtime,  sD  0
















r   c              	     sX  ddl ddlddlm} ddlm m}m}m}m	} t
d t }d||||d|dd	dfdd fddz6 }| rrddl}|j }	|	fdd  W d   n1 slw   Y  n|  W n ty     Y nw t | d }
tj}t }t
d| d| d|
dd dS )as  
    Pre-compute and cache global tokens for all 12 speakers at startup.
    
    OPTIMIZATION: BiCodec streaming requires 32 "global tokens" before any audio can be emitted.
    These encode speaker identity via FSQ quantization. Since we have only 12 fixed speakers,
    we generate one short utterance per speaker at startup, capture the 32 global tokens,
    and cache them. During streaming, we inject cached globals via build_prefix_with_globals(),
    skipping the ~110ms global token pre-roll phase entirely.
    
    This runs synchronously at startup (adds ~5-10s to cold start, saves ~110ms per streaming TTFB).
    r   N)SamplingParams)SPEAKER_MAPTRAINING_STOP_TOKEN_IDSDEFAULT_TEMPERATUREDEFAULT_TOP_KDEFAULT_TOP_Pz/Pre-computing global tokens for all speakers...zHello, this is a voice test.   F)temperaturetop_ktop_p
max_tokensstopskip_special_tokensspeaker_namer   r   Optional[List[int]]c                   s   j | }d|  dtt d  }d}jjj||d2 z3 dH W }|}q"6 |du r2dS |jd j} 	d|}dd	 |D }t
|d
krQ|dd
 S dS )z<Generate a short utterance and capture the 32 global tokens.zwarmup--r   N)promptsampling_params
request_idr   z<\|bicodec_global_(\d+)\|>c                 S  s   g | ]}t |qS r   )r   ).0tr   r   r   
<listcomp>T  s    zU_precompute_speaker_globals.<locals>._capture_globals_for_speaker.<locals>.<listcomp>    )ry   build_prefixr   rd   ru   enginegenerateoutputstextfindalllen)r   r   r   final_outputrequest_outputgenerated_textglobal_matches
global_ids)rer   r   	test_textr   r   _capture_globals_for_speakerB  s&   zA_precompute_speaker_globals.<locals>._capture_globals_for_speakerc                    s      D ]F} z'| I d H }|r%t|dkr%|j| < td|   ntd|   W q tyK } ztd|  d|  W Y d }~qd }~ww d S )Nr   z  Cached globals for z   Failed to capture globals for z  Error caching globals for z: )keysr  r   r   r   r   r   )r   r
  r   )r   r  r   r   r   _run_allZ  s   
"z-_precompute_speaker_globals.<locals>._run_allc                     s      S r_   )runr   )r  rf   r   r   <lambda>m  s    z-_precompute_speaker_globals.<locals>.<lambda>r   zSpeaker globals cached: r   z speakers in r   r   )r   r   r   r   )rf   r  vllmr   veena3modal.core.constantsr   r   r   r   r   r   r   rd   get_event_loop
is_runningconcurrent.futuresfuturesThreadPoolExecutorsubmitresultrun_until_completer   r  r  r   )r   r   r   r   r   r   t_startloop
concurrentpoolelapsedcachedtotalr   )r   r  r  rf   r  r   r   r  r   r      sD   
	
$r   c                 C  s.   | j du rddlm} || j| jd| _ | j S )z>Return cached LongTextProcessor, creating it lazily if needed.Nr   r   r   )rx   r   r   rv   rw   )r   r   r   r   r   _get_long_text_processorz  s   
r#  皙?2   rU      ?16khzr  speakerr   r   r   r   repetition_penaltyseedoutput_sample_rate&Tuple[Optional[bytes], Dict[str, Any]]c	              
     s  t  stdt }	ddddddd}
t }z|	jj|| ||||||dI dH \}}|r3|
| t | }t|d	 |
d
< |
d
 |
d< |rd}|dksUt	t
jrgtd||	jdu|	jrd|	jjnd |dkr|	jr|	jjrz:t }tdt| d t||	j}t | d	 }t||
d< d|
d< d}td|ddt| d W n" ty } ztd|  ddl}|  W Y d}~nd}~ww ||
d< t|d |d  }td||
d< ||
fW S  ty } z	td|   d}~ww )a  
    Generate speech audio (non-streaming).
    
    Args:
        text: Text to synthesize (already normalized)
        speaker: Internal speaker name (resolved)
        temperature: Sampling temperature
        top_k: Top-k sampling
        top_p: Nucleus sampling
        max_tokens: Maximum tokens to generate
        repetition_penalty: Repetition penalty
        seed: Random seed for reproducibility
        output_sample_rate: "16khz" or "48khz" (triggers super-resolution)
    
    Returns:
        Tuple of (audio_bytes, metrics_dict)
        audio_bytes: WAV audio bytes (16kHz or 48kHz, 16-bit PCM) or None if failed
        metrics_dict: Dictionary with timing metrics
    
    Raises:
        RuntimeError: If runtime not initialized
    r   r   r    F>  )ttfb_msgeneration_mstokens_generatedaudio_duration_seconds
sr_appliedr,  r)  r  r   r   r   r   r*  r+  Nr   r0  r/  48khzz<SR check: output_sample_rate=%r, sr_service=%s, is_loaded=%szApplying super-resolution to z	 bytes...sr_msTr3  逻      ✅ Super-resolution applied in .1fms, output= bytes*Super-resolution failed, returning 16kHz: r,  ,      r2  zSpeech generation failed: )r   r   r   rd   rv   generate_speech_profiledupdater   r   isEnabledForloggingDEBUGdebugr{   r~   r   r  _apply_super_resolutionr   r   r   r   r   r   )r  r)  r   r   r   r   r*  r+  r,  r   metricsr   audio_bytesperfgeneration_timesample_ratesr_startsr_timer   r   audio_durationr   r   r   generate_speech  s|   !	

"
rN  rG  bytesc                 C  s   ddl }ddl}ddl}t| dk rtd|j| dd |jd}||jd }|	|}|
|}|    }	||	d dd|j}
d	}d
}d}|| | d }|| d }t|
d }|ddd| dddd
|||||d|}||
  S )z
    Apply super-resolution to audio bytes.
    
    Args:
        audio_bytes: WAV audio at 16kHz
        sr_service: SuperResolutionService instance
    
    Returns:
        WAV audio at 48kHz
    r   Nr=  zInvalid WAV data)dtypeg      @i i  r7  ra         r>  z<4sI4s4sIHHIIHH4sIs   RIFF$   s   WAVEs   fmt s   data)numpystructtorchr  r   
frombufferint16astypefloat32
from_numpyprocess_chunksqueezecpuclippacktobytes)rG  r{   nprU  rV  pcm_dataaudio_floataudio_tensorupsampled_tensor	upsampledupsampled_int16rJ  num_channelsbits_per_sample	byte_rateblock_align	data_size
wav_headerr   r   r   rE    sD   

rE  r.  rJ  c
                   s  t  stdt }
dddddddd}t }zt|
}|| r9d|d< |j| ||||||||d		I d
H }n|
jj|| ||||||dI d
H \}}|rS|	| t | }t
|d |d< |d |d< |rd}|	dkr|
jr|
jjrz:t }tdt| d t||
j}t | d }t
||d< d|d< d}td|ddt| d W n" ty } ztd|  dd
l}|  W Y d
}~nd
}~ww ||d< t|d |d  }td||d< ||fW S  ty } z	td|   d
}~ww )a  
    Generate speech with automatic text chunking for long inputs.
    
    Uses LongTextProcessor to split text and stitch audio.
    
    Args:
        text: Text to synthesize
        speaker: Internal speaker name
        ... (same as generate_speech)
        sample_rate: Internal generation sample rate (always 16000)
        output_sample_rate: "16khz" or "48khz" (triggers super-resolution)
    
    Returns:
        Tuple of (audio_bytes, metrics_dict)
    r   r   r    Fr.  )r/  r0  chunks_processedr2  text_chunkedr3  r,  Trp  )	r  r)  r   r   r   r   r*  r+  rJ  Nr4  r   r0  r/  r5  z,Applying super-resolution to chunked audio (z
 bytes)...r6  r3  r7  r8  r9  r:  r;  r<  r,  r=  r>  r2  z"Chunked speech generation failed: )r   r   r   rd   r#  should_chunkgenerate_with_chunkingrv   r?  r@  r   r{   r~   r   r   r  rE  r   r   r   r   r   r   )r  r)  r   r   r   r   r*  r+  rJ  r,  r   rF  r   long_processorrG  rH  rI  final_sample_raterK  rL  r   r   rM  r   r   r   generate_speech_chunked2  s   



"
ru  Tenable_chunkingadmission_leaserelease_admission_lease2AsyncGenerator[Tuple[bytes, Dict[str, Any]], None]c                 C sj  t  stdt }|jdu rtd|	}|du r t I dH }t|}tt|ddp,d}ddlm	} dddddt|t
t|d	dtt|d
dpKdtt|ddpTdtt|ddp]dt|jt|jt|jd}t |d  }d}d}d}z/t|}|o|| }|rd|d< t||| ||||||||2 zV3 dH W \}}|du rt }t|| d |d< ||dd}|| }|t||d dkrdnd 7 }|d  d7  < |d |d< ||d  |d< |r|| ||fV  q6 n|j|}|r|jj|| |||||||dd
}n|jj|| ||||||dd	}|2 z~3 dH W }i }|}t|trGt|dkrGt|d trG|d }|d }|rO|| |sTq |du rqt }t|| d |d< ||dd}|| }|t||d dkr~dnd 7 }|d  d7  < |d |d< ||d  |d< ||fV  q 6 W |
rt| dS dS |
rt| w w )a  
    Generate speech audio with true streaming (yields chunks as they're generated).
    
    This is the core streaming implementation for M4.
    First yield includes WAV header + first PCM chunk.
    Subsequent yields are raw PCM chunks.
    
    Args:
        text: Text to synthesize (already normalized)
        speaker: Internal speaker name (resolved)
        temperature: Sampling temperature
        top_k: Top-k sampling
        top_p: Nucleus sampling
        max_tokens: Maximum tokens to generate
        repetition_penalty: Repetition penalty
        seed: Random seed for reproducibility
        enable_chunking: Enable text chunking for long inputs with voice consistency
        admission_lease: Optional pre-acquired admission lease.
        release_admission_lease: Whether to release lease before generator returns.
    
    Yields:
        Tuple of (audio_bytes, metrics_dict)
        - First yield: WAV header (44 bytes) prepended to first PCM chunk
        - Subsequent yields: Raw PCM chunks (int16, 16kHz)
        - Final yield: metrics_dict has final timing info
    
    Raises:
        RuntimeError: If runtime not initialized or streaming pipeline unavailable
    r   Nz Streaming pipeline not availabler$   r    r   )create_wav_headerFr&   r'   r(   r)   )r/  chunks_senttotal_bytesr2  rp  admission_wait_msadmission_queuedadmission_inflight_on_grantadmission_waiters_on_grantadmission_queue_depth_on_entryadmission_max_inflightadmission_max_queueadmission_max_wait_msr`   r.  Trp  r   r/  )rJ  rm  r{  r=  ra   r|  r>  r2  )
r)  r  r
  r   r   r   r   r*  r+  emit_progress)	r)  r  r   r   r   r   r*  r+  r  )r   r   r   rw   r   r   r!   getattrveena3modal.audio.utilsrz  r%   r   r=   r>   r?   rd   r#  rq  _stream_chunked_textr  r@  r   r   )generate_speech_stream_indic_continuationgenerate_speech_stream_indicr   tupler   r   )r  r)  r   r   r   r   r*  r+  rv  rw  rx  r   rl   r   r}  rz  rF  r   first_chunk_timetotal_pcm_bytesrJ  rs  needs_chunkingaudio_chunkchunk_metricsrn  cached_globals
stream_genstream_itemstream_metricsr   r   r   generate_speech_streaming  s   *
 


" 

r  c                 C s  ddl m} ||}|sdS d}d}ddi}t|D ]\}}|d |d< |dkr\| jj||||||||	d2 z!3 dH W \}}|du rH|rH|}||||
dd\}}|rY||fV  q86 q|du rtd	|d  d
 | jj||||||||	d2 z3 dH W }||||
dd\}}|r||fV  qy6 q| jj	|||||||||	d	2 z3 dH W }||||
dd\}}|r||fV  q6 q|r||fV  dS dS )z
    Internal helper: stream chunked text with voice consistency.
    
    Uses global token caching from first chunk to maintain voice across chunks.
    r   )crossfade_bytes_int16Nro  ra   r4  r%  )sample_rate_hzcrossfade_mszNo captured globals for chunk z, using regular streaming)	r)  r  r
  r   r   r   r   r*  r+  )
veena3modal.audio.crossfader  
chunk_text	enumeraterw   (generate_speech_stream_indic_first_chunkr   r   r  r  )r   rs  r  r)  r   r   r   r   r*  r+  rJ  r  chunkscaptured_globalsprevious_chunk_tailr  ir  rG  r
  to_emitr   r   r   r  U  s   








r  )r   )r   r   r   r   r   r   r   r   )r    )r   r   r   r!   r   r!   r   r!   )r   r   )r   r%   )r   r   r   r<   rp   rq   rr   )NNNr   Nr   Fra   NNNNNNN) r   r   r   r   r   r   r   r   r   r   r   r!   r   r%   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rt   )r   rt   r   rn   )r   rt   )r$  r%  rU   r&  r'  Nr(  )r  r   r)  r   r   r!   r   r   r   r!   r   r   r*  r!   r+  r   r,  r   r   r-  )rG  rO  r   rO  )r$  r%  rU   r&  r'  Nr.  r(  )r  r   r)  r   r   r!   r   r   r   r!   r   r   r*  r!   r+  r   rJ  r   r,  r   r   r-  )	r$  r%  rU   r&  r'  NTNT)r  r   r)  r   r   r!   r   r   r   r!   r   r   r*  r!   r+  r   rv  r%   rw  rm   rx  r%   r   ry  )r   rt   r  r   r)  r   r   r!   r   r   r   r!   r   r   r*  r!   r+  r   rJ  r   r   ry  )-r.   
__future__r   r   sysrd   rB  rC   rf   dataclassesr   r   typingr   r   r   r   r	   r
   	getLoggerr+   r   r   r"   r#   r   r0   r<   rt   r   r/   r   r   r   r   r   r   r   r   r#  rN  rE  ru  r  r  r   r   r   r   <module>   s     
 "





	
 
u
Z
jEo 8