o
    i@                  	   @   s  d Z ddlZddlZddlZddlZddlmZmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZm Z m!Z! e"e#Z$ej%&d	d
' ( Z)e)dv Z*d+de+de,de,de,fddZ-de+de.de.fddZ/dZ0e-ddddZ1e-ddddZ2e-ddddZ3e-ddddZ4e/d d!Z5e-d"e2ddZ6e-d#e7e2d$ddZ8e-d%dddZ9ej%&d&d'' ( d(v Z:G d)d* d*Z;dS ),u}  
Veena3 Streaming Pipeline - Sliding Window Approach

Implements streaming for both SNAC and BiCodec tokens.

For BiCodec (Spark TTS):
1. Generate tokens from vLLM
2. Parse NEW token IDs incrementally (O(1) per token via cache)
3. Buffer semantic + global tokens
4. Apply sliding window (every N token pairs)
5. Decode and stream audio chunks

OPTIMIZATION (Dec 2025):
- Replaced O(n²) pattern (decode-all + regex-all per iteration)
- Now uses incremental token parsing with BiCodecTokenParser
- ~10x CPU reduction in streaming hot loop

For SNAC (legacy):
1. Filter SNAC token IDs directly
2. Apply sliding window
3. Decode and stream
    N)AnyAsyncGeneratorDictListOptionalTupleUnion)SamplingParams)RequestOutputKind)crossfade_bytes_int16)BiCodecTokenParser)CODE_END_TOKEN_IDCODE_START_TOKEN_IDSNAC_MIN_IDSNAC_MAX_IDTRAINING_STOP_TOKEN_IDSDEFAULT_TEMPERATUREDEFAULT_TOP_KDEFAULT_TOP_PDEFAULT_MAX_TOKENSDEFAULT_MIN_TOKENSDEFAULT_REPETITION_PENALTYDEFAULT_SEEDVEENA3_STREAM_OUTPUT_KINDdelta>   1donyestruer   namedefaultminimumreturnc                 C   sD   t j| d}|s|S zt|}W n ty   | Y S w t||S )N )osenvirongetint
ValueErrormax)r    r!   r"   rawvalue r-   >/home/ubuntu/veenaModal/veena3modal/core/streaming_pipeline.py_env_int6   s   
r/   c                 C   s&   t j| d}|s|S |  dv S )Nr$   >   r   r   r   r   )r%   r&   r'   striplower)r    r!   r+   r-   r-   r.   	_env_flagA   s   r2        VEENA3_STREAM_MIN_SEMANTIC_FIRST
      )r"   VEENA3_STREAM_DECODE_INTERVAL0   VEENA3_STREAM_CROSSFADE_MS2   VEENA3_STREAM_WINDOW_SIZE      VEENA3_STREAM_ADAPTIVE_DECODEF#VEENA3_STREAM_DECODE_INTERVAL_FIRST"VEENA3_STREAM_DECODE_INTERVAL_BUSY@   !VEENA3_STREAM_DECODE_BUSY_PENDINGVEENA3_STREAM_WINDOWED_DECODEr   >   r   r   r   r   c                   @   s4  e Zd ZdZdd Zedee defddZede	de
e fd	d
Zde	deeef fddZdeee	f dedee ddfddZdefddZdedeeef fddZeeeedfdedededededede
e deedf fdd Zdedeee ee f fd!d"Zeeeeedd#fd$ededed%edededede
e d&edeeeeeeee	f f f df fd'd(Z eeeeedfd$ededed%edededede
e deedf fd)d*Z!eeeeedd#fd$eded+ee ded%edededede
e d&edeeeeeeee	f f f df fd,d-Z"dS ).Veena3SlidingWindowPipelinea^  
    Streaming TTS pipeline using sliding window approach.
    
    This eliminates choppy audio and popping artifacts by:
    - Decoding overlapping 28-token windows (4 frames)
    - Keeping only the middle 2048 samples from each decode
    - Creating natural continuity between chunks
    
    Based on the official Canopy Labs implementation.
    c                 C   sR   || _ || _|| _t|dd}|du rt|jdd}|r t|nd| _td dS )z
        Initialize sliding window streaming pipeline.
        
        Args:
            model: Veena3Model instance
            prompt_builder: Veena3PromptBuilder instance
            snac_decoder: SNACDecoder instance (with batching enabled)
        	tokenizerNuH   🌊 Veena3SlidingWindowPipeline initialized (sliding window: 28 tokens))modelprompt_buildersnac_decodergetattrenginer   token_parserprint)selfrF   rG   rH   rE   r-   r-   r.   __init__i   s   z$Veena3SlidingWindowPipeline.__init__valuesr#   c                 C   s$   | sdS t | }|t|d d  S )N        r6      )sortedlen)rO   orderedr-   r-   r.   _p50   s   z Veena3SlidingWindowPipeline._p50r,   c              	   C   sF   | d u rd S t | ttfrt| S zt| W S  ttfy"   Y d S w )N)
isinstancer(   float	TypeErrorr)   )r,   r-   r-   r.   
_to_number   s   
z&Veena3SlidingWindowPipeline._to_numberraw_metricsc           
   	   C   s  |d u ri S i }g }t |tr||  nEt|dr2z|t|j  W n	 ty1   Y nw t|D ]%}|dr>q6zt	||}W n	 tyN   Y q6w t
|rTq6|||f q6t }|D ]\}}t|}||v rnqa|| | |}	|	d ur|	||< qa|S )N__dict___)rV   dictextenditemshasattrr[   	Exceptiondir
startswithrI   callableappendsetstraddrY   )
rM   rZ   numericr_   r    r,   seenkeykey_sparsedr-   r-   r.   _extract_request_metrics   sD   




z4Veena3SlidingWindowPipeline._extract_request_metricsperfprefixNc                 C   sf   |sd S t t||| d< t t||| d< t t||| d< t | ||| d< d S )N_total_min_maxrU   )rW   summinr*   rU   )rM   ro   rp   rO   r-   r-   r.   _apply_stats   s   z(Veena3SlidingWindowPipeline._apply_statsc                 C   sB   t | jdd }t|sdS z	tdt| W S  ty    Y dS w )Nget_pending_requestsr   )rI   rH   rd   r*   r(   ra   )rM   getterr-   r-   r.   _get_decoder_pending_requests   s   z9Veena3SlidingWindowPipeline._get_decoder_pending_requestsfirst_chunk_emittedc                 C   sP   t t}d}ts||fS |st t}|  }|tkr!t|t t}td||fS )Nr   r6   )r(   _STREAM_DECODE_INTERVAL_STREAM_ADAPTIVE_DECODE_STREAM_DECODE_INTERVAL_FIRSTry   _STREAM_DECODE_BUSY_PENDINGr*   _STREAM_DECODE_INTERVAL_BUSY)rM   rz   intervalpendingr-   r-   r.   _resolve_decode_interval   s   z4Veena3SlidingWindowPipeline._resolve_decode_intervaldescriptiontexttemperaturetop_p
max_tokensrepetition_penaltyseedc              	   C  s  t d t d|dd  t dt| | j||}t dt| t|||t|tg|d}	t d|| g }
d	}d	}td
 d	dl	}d	dl
}d| jdd  dt|
 d  }| jjj||	|d}|2 zo3 dH W }|jd	 j}||d }t|}|D ]V}t|  krtkrn q|
| t|
d d	krt|
dkr|
dd }| jjr| jj|dddI dH }n	| jj|ddd}|r|d7 }|dkrt dt| |V  qqm6 t d|| dS )a'  
        Generate speech audio with sliding window streaming.
        
        Yields audio chunks using overlapping windows for smooth playback.
        
        Args:
            description: Character/voice description
            text: Text to synthesize (with optional <emotion> tags)
            temperature: Sampling temperature
            top_p: Nucleus sampling
            max_tokens: Max SNAC tokens to generate
            repetition_penalty: Prevent loops
        
        Yields:
            Audio bytes (int16 PCM, 24kHz mono)
        z+sliding-window streaming generation startedzdescription_prefix=%sNP   ztext_len=%dzprompt built chars=%d)r   r   r   
min_tokensr   stop_token_idsr   z+sampling temp=%s top_p=%s sliding_window=28r   u!   🔮 Starting token generation...zslide-   -@B promptsampling_params
request_id      iFT)trim_warmupuse_sliding_windowr6   z+first sliding-window chunk decoded bytes=%dz+sliding-window complete tokens=%d chunks=%d)loggerdebugrS   rG   build_prefixr	   r   r   rL   uuidtimeuuid4hexr(   rF   rJ   generateoutputs	token_idsr   r   re   rH   enable_batchingdecode_single_asyncdecode_to_bytes)rM   r   r   r   r   r   r   r   r   r   token_buffertotal_tokens_generatedtotal_audio_chunksr   r   r   results_generatorrequest_outputgenerated_ids
new_tokenstoken_idwindow_tokensaudio_bytesr-   r-   r.   generate_speech_stream   sz   

*
*z2Veena3SlidingWindowPipeline.generate_speech_streamc                 C   s<   t d|}dd |D }t d|}dd |D }||fS )z
        Extract BiCodec semantic and global tokens from generated text.
        
        Args:
            text: Generated text containing BiCodec token markers
        
        Returns:
            Tuple of (semantic_ids, global_ids)
        z<\|bicodec_semantic_(\d+)\|>c                 S      g | ]}t |qS r-   r(   .0tr-   r-   r.   
<listcomp>W      zQVeena3SlidingWindowPipeline._extract_bicodec_tokens_from_text.<locals>.<listcomp>z<\|bicodec_global_(\d+)\|>c                 S   r   r-   r   r   r-   r-   r.   r   Z  r   )refindall)rM   r   semantic_matchessemantic_idsglobal_matches
global_idsr-   r-   r.   !_extract_bicodec_tokens_from_textL  s
   
z=Veena3SlidingWindowPipeline._extract_bicodec_tokens_from_textFspeakertop_kemit_progressc
           _        sX  ddl ddl}
 ddddgd dhdtdtf fdd	} }| j||} | d
 }|d  }t||||t|t	d|t
rRtjntjd
} | d
 }|d d|
 jdd  dt  d  }| j}|du rt| jddpt| jjdd}|rt|nd}|du rtd| jjj|||d}g }g }d}d}d}d}d}d}d}d}d} i }!d}"g }#g }$g }%g }&g }'g }(g })g }*g }+g },g }- }.|.}/|2 z3 dH W }0 }1|1|/ d
 }2|1}/|0jpg }3|3sq|3d j}4t
r|4}5|t|57 }n
|4|d }5t|4}t|5}6|6dkr q||67 }|#t|2 |(t|6 dvr<|d n
|#rF|%t|2 | t|0dd}7|7r|7}!d}8d|7v r`|7d }8n	d|7v ri|7d }8|8dur|"du rx|8d
 }9n|8|"kr|8|" d
 }9n|8d
 }9|8}"|9dkr|$t|9 t|#dkr|&t|9  }:||5|| |' |: d
  t|tkrt|};|;tkrdvr|d | j | dud\}<}=|;| }>|>|<kr|>}?|;}|,t|< |-t|= |dt }@t!r|;t"kr|t" d }An|}A }B| j#j$|A|@dddI dH }C }D|D|B d
 }E|)t|E |*tt|A |durI|+|D| d
  |D}|Crt|Cd }Ft!ru|;t"kru|?d }G|F|Gkrl|F|G d }Hnd}H|C|Hd }In|F|kr|d }H|C|Hd }Ind }I|Irt%||Id!t&d"\}J}|Jr|d7 }|t|Jd 7 }|du r|D}|d# | du r } |d$ |	r|Ji fV  q|JV  q6  |. d
 }K|d% t||krvt|tkrv|}A }B| j#j'|A|dt ddd&}C }D|D|B d
 }E|)t|E |*tt|A |dur|+|D| d
  |D}|Crvt|Cd }F|F|krv|d }H|C|Hd }It%||Id!t&d"\}J}|Jrv|d7 }|t|Jd 7 }|du r]|D}|d# | du rj } |d$ |	rs|Ji fV  n|JV  |r|d7 }|t|d 7 }| du r } |d$ |	r|i fV  n|V  |d' |d( i d)t|d*t|d+t|Kd,tt|#d-t|d.tt|d/tt|d0tt(|'d1tt|)d2tt|)d3t|d4t|d5 d6t| r|  d
 ndd7t
rd8nd9d:tt)d;tt*d<tt+tt,t-t.tt"t-t!t/t  d
 d=}Li }Mt0| j#d>rRz
| j#1 pCi }MW n t2yQ   i }MY nw |MrZ|L3|M | 4|Ld?|# | 4|Ld@|$ | 4|LdA|% | 4|LdB|& | 4|LdC|( | 4|LdD|' | 4|LdE|) | 4|LdF|* | 4|LdG|+ | 4|LdH|, | 4|LdI|- t|L5dJd|LdE< d|LdK< t|L5dJd|LdL< |dkrt|Kt| |LdM< |#rtt(|#tt|# |LdN< |$rtt(|$tt|$ |LdO< D ])}N|N5d}O|Osqt|N5dPd|LdQ|O dR< t|N5dSd|LdQ|O dT< q|L5dU}P|L5dV}Q|L5dW}Rt6|Pttfr9t|P|LdX< t6|QttfrGt|Q|LdY< t6|Pttfrdt6|Qttfrd|Q|Pkrdt|Q|P |LdZ< t6|Qttfrt6|Rttfr|R|Qkrt|R|Q |Ld[< d\d]d^d_d`da}S|S7 D ]\}T}U|T|!v rt|!|T d
 |L|U< qdbt8t dct8t dt8t fddde}V|!5df}W|!5dg}X|!5dh}Y|!5di}Z|V|W|X}[|V|X|Y}\|V|Y|Z}]|V|W|Z}^|[dur|[|Ldj< |[|Ld\< |\dur|\|Ldk< |\|Ld]< |]dur|]|Ldl< |]|Ld_< |^dur|^|Ldm< |^|Ldn< t9:do|Ld4 |Ld6 |Ld3 |Ld-  |	r*d |LfV  dS dS )puR  
        Generate speech audio with sliding window streaming for Indic model.
        
        Yields audio chunks using overlapping windows for smooth playback.
        
        Args:
            speaker: Speaker name (one of 12 predefined speakers)
            text: Text to synthesize with inline emotion tags
                Examples:
                - "Hello! Welcome."
                - "<laugh> Hello there!"
                - "नमस्ते! <excited> आज का दिन बहुत अच्छा है।"
            temperature: Sampling temperature
            top_p: Nucleus sampling
            max_tokens: Max SNAC tokens to generate
            repetition_penalty: Prevent loops
            seed: Random seed for reproducibility
        
        Yields:
            Audio bytes (int16 PCM, 16kHz mono - BiCodec)
        r   Nrequest_startrP   staget_msdt_msr   r#   c                    sH      d }|  }| | t|t|d |  t|S )N     @@r   )perf_counterre   rW   rh   )r   now_msr   last_mark_msmark_stage_sett_request_startr   timeline_marksr-   r.   _mark  s   
zGVeena3SlidingWindowPipeline.generate_speech_stream_indic.<locals>._markr   prompt_readyF
r   r   r   r   r   r   stopskip_special_tokensr   output_kindsampling_readyzbicodec-stream-r   r   r   rE   z,BiCodecTokenParser unavailable for streamingr   llm_first_batchmetricsmodel_execute_timemodel_forward_timer6   first_chunk_readyrz   r   r   r   r   rQ   @      >  sample_rate_hzcrossfade_msfirst_chunk_decodedrz   llm_doner   r   r   r   stream_donerequest_doneprompt_build_mssampling_params_msllm_generation_wall_msllm_batch_countllm_token_totalsemantic_token_totalglobal_token_totalllm_parse_msbicodec_decode_callsllm_decode_callschunks_sentaudio_duration_secondsg     @@ttfb_msstream_output_kindr   
cumulativestream_decode_interval_tokens#stream_decode_interval_first_tokens"stream_decode_interval_busy_tokens)$stream_decode_busy_pending_thresholdstream_adaptive_decodestream_window_size_tokensstream_windowed_decodetimeline_markerstimeline_total_msget_batching_statsllm_batch_wall_msllm_batch_gpu_msllm_decode_wall_msllm_decode_gpu_mstokens_per_batchllm_parse_step_msbicodec_decode_wall_msbicodec_decode_tokensbicodec_decode_interval_msdecode_interval_applied_tokensdecode_pending_requestsbicodec_decode_wall_ms_totalbicodec_decode_gpu_msbicodec_decode_cpu_msllm_time_per_token_msllm_time_per_batch_wall_msllm_time_per_batch_gpu_msr   	timeline__msr   	_delta_mstimeline_llm_first_batch_mstimeline_first_chunk_emitted_mstimeline_request_done_mstimeline_to_first_batch_ms"timeline_to_first_chunk_emitted_ms.timeline_first_batch_to_first_chunk_emitted_ms'timeline_first_chunk_to_request_done_msllm_time_in_queue_msllm_scheduler_msllm_model_forward_msllm_model_execute_msllm_first_token_ms)time_in_queuescheduler_timer   r   first_token_latencystart_send_sc                 S   s,   | d u s|d u r
d S || k rd S ||  d S )Nr   r-   )r#  r$  r-   r-   r.   r    s
   zKVeena3SlidingWindowPipeline.generate_speech_stream_indic.<locals>._delta_ms	queued_tsscheduled_tsfirst_token_tslast_token_tsllm_queued_to_scheduled_msllm_scheduled_to_first_token_msllm_first_to_last_token_msllm_queued_to_last_token_msllm_request_lifecycle_mszBstreaming complete audio=%.2fs ttfb=%.0fms chunks=%d llm_tokens=%d);r   r   r   rg   rW   rG   r   r	   r   r   _STREAM_USE_DELTA_OUTPUTr
   DELTA
CUMULATIVEr   r   r(   rK   rI   rF   rJ   r   RuntimeErrorr   r   r   rS   re   rn   parse_incremental_STREAM_EXPECTED_GLOBAL_COUNT _STREAM_MIN_SEMANTIC_FIRST_CHUNKr   _STREAM_WINDOWED_DECODE_STREAM_WINDOW_SIZErH   r   r   _STREAM_CROSSFADE_MSdecode_streamingrt   r{   r}   r   r~   boolr|   listr`   r   ra   updaterv   r'   rV   r_   r   r   r   )_rM   r   r   r   r   r   r   r   r   r   r   r   	prompt_t0r   r   sampling_t0r   r   r   rK   rE   r   semantic_bufferglobal_bufferprocessed_token_counttotal_generated_tokensr   total_samples_emitted_to_userprevious_chunk_taillast_decode_countlast_decode_end_tst_first_chunk_decode_donet_first_chunk_emittedfinal_request_metricsprev_gpu_cumulative_sr   r  r  r  llm_parse_ms_samplesr  r  r  bicodec_decode_intervals_msr	  r
  t_llm_startt_last_batchr   now
batch_wallr   r   new_token_idsnew_token_countraw_req_metricscumulative_gpu_sgpu_delta_msparse_t0semantic_countdecode_intervalpending_requestssemantic_deltadecode_step_tokensdecode_globaldecode_semantic	decode_t0r   	decode_t1decode_wall_mstotal_samples_decodednew_samples_approxnew_bytes_startnew_audio_bytesto_emitr   ro   decoder_batch_statsmarkr   tl_first_batchtl_first_emittl_done
sec_fieldssrcdstr  r%  r&  r'  r(  
q_to_schedsched_to_firstfirst_to_lastqueued_to_lastr-   r   r.   generate_speech_stream_indic^  s  !	*
























  







	



 


**
"











z8Veena3SlidingWindowPipeline.generate_speech_stream_indicc	           -      C  s  ddl }	|	  }
| j||}t||||t|td|trtjntj	d
}g }g }d}d}d}d}d}d}ddl
}d	| jdd
  dt|	  d  }| j}| jjj|||d}d}d}d}g }|2 z3 dH W }|jd j}tru|}n
||d }t|}|||| |st||kr|d| }t||kr	t|} | |kr	| | |kr	| }|}!|d| }"| jjr| jj|!|"dddI dH }#n
| jj|!|"ddd}#|#r	t|#d }$|$|kr	|d }%|#|%d }&t||&d|d\}'}t|'d }(||(7 }|'r	|d7 }|'|fV  qd6 t||kr_t||kr_| jj||d| ddd}#|#r_t|#d }$|$|kr_|d }%|#|%d }&t||&d|d\}'}t|'d })||)7 }|'r_|d7 }|'|fV  |ru|d7 }t|d }*||*7 }||fV  |	  }+|d },td|,|t| dS )aF  
        Generate speech for the FIRST chunk of multi-chunk text, capturing global tokens.
        
        This method is specifically for chunked generation: it yields (audio_bytes, global_ids)
        tuples instead of just audio_bytes. The caller captures global_ids from the first
        yield and passes them to generate_speech_stream_indic_continuation() for subsequent chunks.
        
        Use Case (chunked streaming with voice consistency):
            globals_captured = None
            async for audio_bytes, global_ids in pipeline.generate_speech_stream_indic_first_chunk(...):
                if globals_captured is None and global_ids:
                    globals_captured = global_ids  # Capture once
                yield audio_bytes
            # Now use globals_captured for subsequent chunks
        
        Args:
            speaker: Speaker name
            text: First text chunk to synthesize
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Nucleus sampling
            max_tokens: Max tokens to generate
            repetition_penalty: Prevent repetition
            seed: Random seed for reproducibility
        
        Yields:
            Tuple of (audio_bytes, global_ids)
            - audio_bytes: Raw PCM audio (int16, 16kHz)
            - global_ids: List of 32 global token IDs (populated after first decode, else empty)
        
        Thread Safety:
            This method is stateless and thread-safe. Each call creates its own
            request-scoped state. No global state is modified or shared.
        r   NFr   r3   r5   r8   r:   zbicodec-first-r   r   r   r   r   r   rQ   r   r   r6   z>first-chunk complete audio=%.2fs chunks=%d captured_globals=%d)r   rG   r   r	   r   r   r.  r
   r/  r0  r   r   r   r(   rK   rF   rJ   r   r   r   rS   r2  rH   r   r   r8  r   r   r   )-rM   r   r   r   r   r   r   r   r   r   t_startr   r   r>  r?  r@  r   EXPECTED_GLOBAL_COUNTMIN_SEMANTIC_FOR_FIRST_CHUNKDECODE_INTERVALCROSSFADE_MSr   r   rK   r   rD  rB  rC  captured_globalsr   r   rP  rV  all_semanticwindow_globalr   r`  rb  rc  rd  samples_emitted_in_this_chunksamples_emitted_in_finaltail_samplest_endaudio_duration_sr-   r-   r.   (generate_speech_stream_indic_first_chunk  s   -*



?




zDVeena3SlidingWindowPipeline.generate_speech_stream_indic_first_chunkr   c           9      C  s  ddl }|  }t|tkrtdt dt| d| j|||}t||||t|td|	t	r3t
jnt
jd
}g }g }d}d}d}t}t}t}t}ddl}d| jdd	  d
t|  d  }| j}| jjj|||d}d}d}d}d}d} d}!g }"g }#|2 z3 dH W }$|$jd j}%t	r|%}&|t|&7 }n
|%|d }&t|%}|t|&7 }||&|| t|}'|'|kr| j|dud\}(})|'| }*|*|(kr|*}+|'}|"t|( |#t|) tr|'|kr|| d },n|},|  }-| j!j"r| j!j#|,|dddI dH }.n
| j!j$|,|ddd}.| d7 } |!|  |- d 7 }!|.rt|.d }/trH|'|krH|+d }0|/|0kr?|/|0 d }1nd}1|.|1d }2n|/|krX|d }1|.|1d }2nd}2|2rt%||2d|d\}3}|3r|d7 }|t|3d 7 }|du r|  }|
r|3i fV  q|3V  q6 t||kr|  }-| j!j$||ddd}.| d7 } |!|  |- d 7 }!|.rt|.d }/|/|kr|d }1|.|1d }2t%||2d|d\}3}|3r|d7 }|t|3d 7 }|du r|  }|
r|3i fV  n|3V  |r|d7 }t|d }4||47 }|du r|  }|
r|i fV  n|V  |  }5|d }6t&'d|6| |
ri dt|dtt|dtt|dt| dt| dt|!ddd t|!d!t|d"t|6d#t|rn|| d ndd$t	rwd%nd&d't|d(tt(d)tt)d*tt*d+t+t,t|t+td,}7| -|7d-|" | -|7d.|# i }8t.| j!d/rz
| j!/ pi }8W n t0y   i }8Y nw |8r|71|8 d|7fV  dS dS )0ax  
        Generate speech for CONTINUATION chunks using pre-captured global tokens.
        
        This is the key method for voice consistency in chunked generation:
        - Uses global_ids from the first chunk to maintain identical voice
        - Model generates only semantic tokens (skips global token generation)
        - Ensures no voice drift across chunks
        
        CRITICAL for production:
        - global_ids is request-scoped, passed explicitly (no shared state)
        - Thread-safe: each request has its own state
        - No global caching that could cause cross-request contamination
        
        Args:
            speaker: Speaker name (must match first chunk)
            text: Continuation text chunk to synthesize
            global_ids: 32 global token IDs captured from first chunk
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Nucleus sampling
            max_tokens: Max tokens to generate
            repetition_penalty: Prevent repetition
            seed: Random seed (use same as first chunk for consistency)
        
        Yields:
            Audio bytes (int16 PCM, 16kHz mono)
        
        Raises:
            ValueError: If global_ids doesn't contain exactly 32 tokens
        
        Thread Safety:
            Fully thread-safe. All state is request-scoped and passed explicitly.
        r   NzExpected exactly z global tokens, got z:. global_ids must be captured from first chunk generation.Fr   zbicodec-cont-r   r   r   r   rP   r   r   r   r6   r   rQ   r   r   r   r   z+continuation complete audio=%.2fs chunks=%dr   r   r   r   r   r  r  r  r   r   r   r   r   r   r   r   r   r   r   )r   r   r	  r
  r   )2r   rS   r3  r)   rG   build_prefix_with_globalsr	   r   r   r.  r
   r/  r0  r4  r{   r7  r6  r   r   r   r(   rK   rF   rJ   r   r   r   r2  r   re   rW   r5  r   rH   r   r   r8  r   r   r   r}   r   r~   r9  r|   rv   r`   r   ra   r;  )9rM   r   r   r   r   r   r   r   r   r   r   r   rr  r   r   r>  global_buffer_unusedr@  r   rA  rt  ru  rv  WINDOW_SIZEr   r   rK   r   rD  rB  rC  rG  decode_callsdecode_wall_ms_totalr	  r
  r   r   rP  rV  rW  rX  rY  rZ  r\  r]  r   r`  ra  rb  rc  rd  r|  r}  r~  ro   re  r-   r-   r.   )generate_speech_stream_indic_continuation  s  .*







R



	

zEVeena3SlidingWindowPipeline.generate_speech_stream_indic_continuation)#__name__
__module____qualname____doc__rN   staticmethodr   rW   rU   r   r   rY   r   rg   rn   rv   r(   ry   r9  r   r   r   r   r   r   r   bytesr   tupler   r   r   rq  r  r  r-   r-   r-   r.   rD   ]   s    
&%	
	
"r	
"
   <	


 S	
"rD   )r   )<r  asynciologgingr%   r   typingr   r   r   r   r   r   r   vllmr	   vllm.sampling_paramsr
   veena3modal.audio.crossfader   veena3modal.core.token_utilsr   veena3modal.core.constantsr   r   r   r   r   r   r   r   r   r   r   r   	getLoggerr  r   r&   r'   r0   r1   _STREAM_OUTPUT_KIND_RAWr.  rg   r(   r/   r9  r2   r3  r4  r{   r7  r6  r|   r}   r*   r   r~   r5  rD   r-   r-   r-   r.   <module>   s>    $8

