o
    ˜i´m  ã                   @   s>  d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlZe e¡Zej dd¡ ¡  ¡ d	v Zej d
d¡ ¡  ¡ d	v Zej dd¡ ¡  ¡ d	v Zej dd¡ ¡  ¡ d	v Zdedeeef ddfdd„Z ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( G dd„ dƒZ)dS )z¡
Spark TTS Generation Pipeline

End-to-end pipeline for TTS generation with BiCodec audio tokenizer.
Migrated from Veena3/Orpheus (SNAC) to Spark TTS (BiCodec).
é    N)ÚasdictÚis_dataclass)ÚOptionalÚListÚDictÚAnyÚTuple)ÚSamplingParams)ÚRequestOutputKindÚVEENA3_PERF_STAGE_LOGÚ >   Ú1ÚonÚyesÚtrueÚVEENA3_PERF_GPU_TIMINGÚVEENA3_ASYNC_BICODEC_DECODEÚVEENA3_NON_STREAM_FINAL_ONLYÚeventÚextraÚreturnc                 C   s8   t rtj| |d dS t tj¡rtj| |d dS dS )zPEmit hot-path perf logs only when explicitly enabled or DEBUG logging is active.©r   N)Ú_PERF_STAGE_LOG_ENABLEDÚloggerÚinfoÚisEnabledForÚloggingÚDEBUGÚdebug)r   r   © r   ú4/home/ubuntu/veenaModal/veena3modal/core/pipeline.pyÚ_log_perf_stage.   s
   ÿr!   )ÚTRAINING_STOP_TOKEN_IDSÚDEFAULT_TEMPERATUREÚDEFAULT_TOP_KÚDEFAULT_TOP_PÚDEFAULT_MAX_TOKENSÚDEFAULT_SEEDÚAUDIO_SAMPLE_RATEc                   @   sŠ  e Zd ZdZdd„ Zedee defdd„ƒZede	de
e fd	d
„ƒZde	deeef fdd„Zdeee	f dedee ddfdd„Zeeeeddfdededededededede
e dee
e eee	f f fdd„Zeeeeddfdededededededede
e de
e fdd„Zeeeeddfdededededededede
e de
e fd d!„Zd"edeee ee f fd#d$„ZdS )%ÚSparkTTSPipelinez
    End-to-end TTS pipeline for Spark TTS with BiCodec.
    
    Replaces SNAC-based token extraction with BiCodec regex-based extraction.
    c                 C   s   || _ || _|| _tdƒ dS )zí
        Initialize pipeline.
        
        Args:
            model: SparkTTS Model instance
            prompt_builder: IndicPromptBuilder instance (with Spark TTS format)
            bicodec_decoder: BiCodecDecoder instance
        u!   ðŸš€ SparkTTSPipeline initializedN)ÚmodelÚprompt_builderÚbicodec_decoderÚprint)Úselfr*   r+   r,   r   r   r    Ú__init__G   s   zSparkTTSPipeline.__init__Úvaluesr   c                 C   s$   | sdS t | ƒ}|t|ƒd d  S )z%Compute a stable p50 for small lists.ç        é   é   )ÚsortedÚlen)r0   Úorderedr   r   r    Ú_p50[   s   zSparkTTSPipeline._p50Úvaluec              	   C   sF   | du rdS t | ttfƒrt| ƒS zt| ƒW S  ttfy"   Y dS w )z1Best-effort conversion of scalar values to float.N)Ú
isinstanceÚintÚfloatÚ	TypeErrorÚ
ValueError)r8   r   r   r    Ú
_to_numberc   s   
ÿzSparkTTSPipeline._to_numberÚraw_metricsc           
   	   C   s8  |du ri S i }g }t |tƒr| | ¡ ¡ n^t|ƒr0z| t|ƒ ¡ ¡ W n	 ty/   Y nw t|dƒrKz| t|jƒ ¡ ¡ W n	 tyJ   Y nw t	|ƒD ]%}| 
d¡rWqOzt||ƒ}W n	 tyg   Y qOw t|ƒrmqO| ||f¡ qOtƒ }|D ]\}}t|ƒ}||v r‡qz| |¡ |  |¡}	|	dur™|	||< qz|S )z§
        Convert vLLM request_output.metrics into a numeric dictionary.

        vLLM may return a dataclass-like object or a plain dict depending on version.
        NÚ__dict__Ú_)r9   ÚdictÚextendÚitemsr   r   Ú	ExceptionÚhasattrr@   ÚdirÚ
startswithÚgetattrÚcallableÚappendÚsetÚstrÚaddr>   )
r.   r?   ÚnumericrD   Únamer8   ÚseenÚkeyÚkey_sÚparsedr   r   r    Ú_extract_request_metricso   sP   
ÿ
ÿ
ÿ

€z)SparkTTSPipeline._extract_request_metricsÚperfÚprefixNc                 C   sf   |sdS t t|ƒƒ||› d< t t|ƒƒ||› d< t t|ƒƒ||› d< t |  |¡ƒ||› d< dS )z:Populate total/min/max/p50 stats for a list of ms samples.NÚ_totalÚ_minÚ_maxr7   )r;   ÚsumÚminÚmaxr7   )r.   rV   rW   r0   r   r   r    Ú_apply_stats¢   s   zSparkTTSPipeline._apply_statsgÍÌÌÌÌÌð?ÚspeakerÚtextÚtemperatureÚtop_kÚtop_pÚ
max_tokensÚrepetition_penaltyÚseedc	           F      ƒ   s‚
  ddl m}	 ddddddddddddœ‰t ¡ ‰ddddœg‰d‰ˆˆd< dˆd< d	td
tf‡‡‡‡fdd„‰ dn‡ ‡‡fdd„}
t ¡ }| j ||¡}t ¡ | d ˆd< ˆ dƒ zt| j	j
j|ddƒˆd< W n tyv   dˆd< Y nw t ¡ }t|||||td|tr‰tjntjd	}t ¡ | d ˆd< ˆ dƒ dt ¡ jdd… › }td||t|ƒ|dœd | j	jj|||d}d}i }g }g }g }g }g }d}d}t ¡ }|}|2 zÙ3 dH W }t ¡ }|| d }|}|jpïg }|sõ|}qÙ|d }t|jƒ} td| | ƒ}!| }|  t|ddƒ¡}"|"r|"}|!dkr|}qÙˆd   d!7  < ˆd  d!krAˆ d"ƒ}#td#|t|d$ƒ|!t|#d$ƒd%œd | t|!ƒ¡ | t|ƒ¡ ˆd  d!kr]| t|ƒ¡ d}$|"rud&|"v rl|"d& }$n	d'|"v ru|"d' }$|$dur°|du r„|$d }%n|$|kr|$| d }%n|$d }%|$}|%dkr°| t|%ƒ¡ ˆd  d!kr°| t|%ƒ¡ |}qÙ6 t ¡ | d ˆd(< ˆ d)ƒ}&td*|tˆd( d$ƒˆd  t|&d$ƒd+œd |du rít ¡ ˆ d ˆd,< |
ƒ  dˆfS |jd j }'|jd j}(t|(ƒˆd-< t ¡ }|  !|'¡\})}*t ¡ | d ˆd.< t|)ƒˆd/< t|*ƒˆd0< ˆ d1ƒ}+td2|tˆd. d$ƒˆd/ ˆd0 t|+d$ƒd3œd |  "ˆd4|¡ |  "ˆd5|¡ |  "ˆd6|¡ |  "ˆd7|¡ |  "ˆd8|¡ |rxt|d ƒˆd9< |rst|d ƒndˆd:< ntˆd( ƒˆd9< dˆd:< |s¦| #d&¡dur¦t|d& ƒd },|,ˆd;< ˆd  d!kr¦|,ˆd<< ˆd- dkr»tˆd( ƒtˆd- ƒ ˆd=< ˆd  dkrétˆ #d>d¡ƒtˆd  ƒ ˆd?< ˆ #d;d¡dkrétˆd; ƒtˆd  ƒ ˆd@< dAdBdCdDdEdFœ}-|- $¡ D ]\}.}/|.|v rt||. ƒd ˆ|/< qõdGt%t dHt%t d
t%t fdIdJ„}0| #dK¡}1| #dL¡}2| #dM¡}3| #dN¡}4|0|1|2ƒ}5|0|2|3ƒ}6|0|3|4ƒ}7|0|1|4ƒ}8|5durP|5ˆdO< |5ˆdA< |6dur]|6ˆdP< |6ˆdB< |7durj|7ˆdQ< |7ˆdD< |8durw|8ˆdR< |8ˆdS< dT|v rˆ #d-d¡dkrt&|dT ƒˆd-< | #dU¡}9| #dV¡}:| #dW¡};|9dur³|:dur³|:|9kr³|:|9 d ˆdE< |9durÊ|;durÊ|;|9krÊ|;|9 d ˆdS< |)rÐ|*sát ¡ ˆ d ˆd,< |
ƒ  dˆfS t ¡ }| j' (|)|*¡st ¡ | d ˆdX< ˆ dYƒ t ¡ ˆ d ˆd,< |
ƒ  dˆfS t ¡ | d ˆdX< ˆ dYƒ t ¡ }<d}=d!ˆdZ< t)o=t*j+ ,¡ o=t-| j'd[ƒo=t| j'j.d\d]ƒd^k}>|>rz8| j'j.}?t*j+j/d_d`}@t*j+j/d_d`}At*j+j0|?da |@ 1¡  | j' 2|)|*¡}B|A 1¡  t*j+j0|?da t|@ 3|A¡ƒ}=W n( tyŒ   | j' 2|)|*¡}Bd}=Y nw t4r›| j' 5|)|*¡I dH }Bn| j' 2|)|*¡}Bt ¡ |< d }Ct|Cƒˆdb< ttd|=ƒƒˆdc< ttd|C|= ƒƒˆdd< ˆ deƒ}Dtdf|tˆdb d$ƒtˆdc d$ƒtˆdd d$ƒt|Dd$ƒdgœd |Bdu rüt ¡ ˆ d ˆd,< |
ƒ  dˆfS t ¡ }|	|Bt6dh}Et ¡ | d ˆdi< ˆ djƒ t ¡ ˆ d ˆd,< |
ƒ  tdk|tˆd, d$ƒˆ #d-¡ttˆ #dld¡ƒd$ƒdmœd |EˆfS )oz€
        Generate speech with detailed per-request timing metrics.

        Returns:
            (wav_bytes, perf_dict)
        r   )Úadd_wav_headerr1   )Úllm_prompt_token_totalÚllm_token_totalÚsemantic_token_totalÚglobal_token_totalÚllm_batch_countÚllm_parse_msÚllm_decode_callsÚbicodec_decode_wall_msÚbicodec_decode_gpu_msÚbicodec_decode_cpu_msÚwav_pack_msÚrequest_start©ÚstageÚt_msÚdt_msÚtimeline_markersÚtimeline_request_start_msru   r   c                    sf   t  ¡ ˆ d }|ˆ  }|‰ ˆ | t|ƒt|ƒdœ¡ t|ƒˆd| › d< t|ƒˆd| › d< t|ƒS )Néè  rt   Ú	timeline_Ú_msÚ	_delta_ms)ÚtimeÚperf_counterrK   r;   )ru   Únow_msrw   )Úlast_mark_msrV   Út_request_startÚtimeline_marksr   r    Ú_markÒ   s   ýÿz8SparkTTSPipeline.generate_speech_profiled.<locals>._markNc                     sj  ˆrˆd   d¡dkrˆ dƒ tˆƒˆd< tˆ  dd¡ƒˆd< ˆ  d¡} ˆ  d	¡}ˆ  d
¡}ˆ  d¡}ˆ  d¡}t| ttfƒrEt| ƒˆd< t| ttfƒr_t|ttfƒr_|| kr_t||  ƒˆd< t|ttfƒryt|ttfƒry||kryt|| ƒˆd< t|ttfƒr“t|ttfƒr“||kr“t|| ƒˆd< t|ttfƒr¯t|ttfƒr±||kr³t|| ƒˆd< d S d S d S d S )Néÿÿÿÿru   Úrequest_donerx   Útimeline_request_done_msr1   Útimeline_total_msÚtimeline_llm_first_batch_msÚtimeline_llm_done_msÚtimeline_parse_done_msÚtimeline_bicodec_done_msÚtimeline_to_first_batch_msÚ#timeline_first_batch_to_llm_done_msÚtimeline_post_llm_msÚ!timeline_parse_to_bicodec_done_msÚ#timeline_bicodec_to_request_done_ms)ÚgetÚlistr;   r9   r:   )Úfirst_batch_msÚllm_done_msÚparse_done_msÚbicodec_done_msÚrequest_done_ms)r„   rV   rƒ   r   r    Ú_finalize_timelineâ   s(   




$$$$ÿzESparkTTSPipeline.generate_speech_profiled.<locals>._finalize_timelinerz   Úprompt_build_msÚprompt_readyF)Úadd_special_tokensrh   )	ra   rb   rc   rd   re   ÚstopÚskip_special_tokensrf   Úoutput_kindÚsampling_params_msÚsampling_readyzreq-é   Úperf_non_stream_start)Ú
request_idr_   Útext_lengthrd   r   )ÚpromptÚsampling_paramsr¤   Úmetricsrl   r2   Úllm_first_batchÚperf_first_llm_batché   )r¤   Úbatch_wall_msÚ
new_tokensÚtimeline_msÚmodel_execute_timeÚmodel_forward_timeÚllm_generation_wall_msÚllm_doneÚperf_llm_generation_done)r¤   r±   rl   r®   Úpipeline_total_msri   rm   rj   rk   Ú
parse_doneÚperf_token_parse_done)r¤   Úparse_msÚsemantic_tokensÚglobal_tokensr®   Úllm_batch_wall_msÚllm_batch_gpu_msÚllm_decode_wall_msÚllm_decode_gpu_msÚtokens_per_batchÚllm_prefill_wall_msÚllm_prefill_gpu_msÚllm_batch_gpu_ms_totalÚllm_decode_gpu_ms_totalÚllm_time_per_token_msÚllm_batch_wall_ms_totalÚllm_time_per_batch_wall_msÚllm_time_per_batch_gpu_msÚllm_time_in_queue_msÚllm_scheduler_msÚllm_model_forward_msÚllm_model_execute_msÚllm_first_token_ms)Útime_in_queueÚscheduler_timer°   r¯   Úfirst_token_latencyÚstart_sÚend_sc                 S   s,   | d u s|d u r
d S || k rd S ||  d S )Ng     @@r   )rÏ   rÐ   r   r   r    r}   Æ  s
   z<SparkTTSPipeline.generate_speech_profiled.<locals>._delta_msÚ	queued_tsÚscheduled_tsÚfirst_token_tsÚlast_token_tsÚllm_queued_to_scheduled_msÚllm_scheduled_to_first_token_msÚllm_first_to_last_token_msÚllm_queued_to_last_token_msÚllm_request_lifecycle_msÚnum_generation_tokensÚarrival_timeÚfirst_token_timeÚfinished_timeÚtoken_validation_msÚvalidation_donern   ÚdeviceÚtyper   ÚcudaT)Úenable_timing)rà   ro   rp   rq   Úbicodec_doneÚperf_bicodec_decode_done)r¤   Údecode_wall_msÚdecode_gpu_msÚdecode_cpu_msr®   )Úsample_raterr   Úwav_doneÚperf_non_stream_donerˆ   )r¤   r´   ri   rˆ   )r   N)7Úveena3modal.audio.utilsrg   r~   r   rM   r;   r+   Úbuild_prefixr5   r*   Ú	tokenizerÚencoderE   r	   r"   Ú_NON_STREAM_FINAL_ONLY_ENABLEDr
   Ú
FINAL_ONLYÚ
CUMULATIVEÚuuidÚuuid4Úhexr!   ÚengineÚgenerateÚoutputsÚ	token_idsr]   rU   rI   ÚroundrK   r`   Ú_extract_bicodec_tokensr^   r’   rD   r   r:   r,   Úvalidate_tokensÚ_PERF_GPU_TIMING_ENABLEDÚtorchrâ   Úis_availablerF   rà   ÚEventÚsynchronizeÚrecordÚdecode_to_bytesÚelapsed_timeÚ_ASYNC_BICODEC_DECODE_ENABLEDÚdecode_to_bytes_asyncr(   )Fr.   r_   r`   ra   rb   rc   rd   re   rf   rg   r™   Út0r¦   r§   r¤   Úresults_generatorÚfinal_outputÚfinal_request_metricsrº   r»   r¼   r½   r¾   Úprev_generated_countÚprev_gpu_cumulative_sÚt_gen_startÚt_last_yieldÚrequest_outputÚnowÚ
batch_wallrø   ÚoutputÚgenerated_countr­   Úraw_req_metricsÚfirst_batch_timeline_msÚcumulative_gpu_sÚgpu_delta_msÚllm_done_timeline_msÚgenerated_textÚgenerated_idsÚsemantic_idsÚ
global_idsÚparse_done_timeline_msÚtotal_gpu_msÚ
sec_fieldsÚsrcÚdstr}   rÑ   rÒ   rÓ   rÔ   Ú
q_to_schedÚsched_to_firstÚfirst_to_lastÚqueued_to_lastÚarrivalÚfirst_tokenÚfinishedÚ	decode_t0rç   Úuse_cuda_timingrà   Ústart_eventÚ	end_eventÚaudio_bytesræ   Úbicodec_done_timeline_msÚ	wav_bytesr   )r„   r   rV   r‚   rƒ   r    Úgenerate_speech_profiled«   sD  €õÿÿ÷üþ	ý


üþ	







Â@üþ

ûþ û
€"














þ
ýûþûþ
üþ	z)SparkTTSPipeline.generate_speech_profiledc	              
   Ã   s*   | j ||||||||dI dH \}	}
|	S )a÷  
        Generate speech audio (non-streaming) using Spark TTS.
        
        NOTE: This method signature changed from description-based to speaker-based
        to align with Spark TTS architecture.
        
        Args:
            speaker: Speaker name (one of 12 predefined speakers)
            text: Text to synthesize (with optional [emotion] tags)
            temperature: Sampling temperature (default: 0.8 for Spark TTS)
            top_k: Top-k sampling (default: 50)
            top_p: Nucleus sampling (default: 1.0)
            max_tokens: Max BiCodec tokens to generate (default: 2048)
            seed: Random seed for reproducibility
        
        Returns:
            Audio bytes (int16 PCM WAV, 16kHz mono) or None if failed
        ©r_   r`   ra   rb   rc   rd   re   rf   N)r0  )r.   r_   r`   ra   rb   rc   rd   re   rf   r/  rA   r   r   r    Úgenerate_speechI  s   €ø
z SparkTTSPipeline.generate_speechc	           	   
   Ã   s"   | j ||||||||dI dH S )uÒ  
        Generate speech audio for Spark TTS (speaker-based).
        
        This method is kept for backward compatibility with existing API.
        It delegates to generate_speech() with the same implementation.
        
        Args:
            speaker: Speaker name (one of 12 predefined speakers)
            text: Text to synthesize with inline emotion tags
                Examples:
                - "Hello! Welcome."
                - "[laughs] Hello there!"
                - "Hello <laugh> this is fun!" (will be normalized)
                - "à¤¨à¤®à¤¸à¥à¤¤à¥‡! [excited] à¤†à¤œ à¤•à¤¾ à¤¦à¤¿à¤¨ à¤¬à¤¹à¥à¤¤ à¤…à¤šà¥à¤›à¤¾ à¤¹à¥ˆà¥¤"
            temperature: Sampling temperature
            top_k: Top-k sampling
            top_p: Nucleus sampling
            max_tokens: Max BiCodec tokens to generate
            seed: Random seed for reproducibility
        
        Returns:
            Audio bytes (int16 PCM WAV, 16kHz mono) or None if failed
        r1  N)r2  )	r.   r_   r`   ra   rb   rc   rd   re   rf   r   r   r    Úgenerate_speech_indicr  s   €#øz&SparkTTSPipeline.generate_speech_indicr  c                 C   s\   t  d|¡}dd„ |D ƒ}t  d|¡}dd„ |D ƒ}|s*|s*t d|dd… › ¡ ||fS )	a»  
        Extract BiCodec semantic and global tokens from generated text using regex.
        
        Spark TTS generates tokens in the format:
        - <|bicodec_semantic_{id}|>
        - <|bicodec_global_{id}|>
        
        Args:
            generated_text: Generated text containing BiCodec token markers
        
        Returns:
            Tuple of (semantic_ids, global_ids)
            Returns ([], []) if no tokens found
        z<\|bicodec_semantic_(\d+)\|>c                 S   ó   g | ]}t |ƒ‘qS r   ©r:   ©Ú.0Útr   r   r    Ú
<listcomp>±  ó    z<SparkTTSPipeline._extract_bicodec_tokens.<locals>.<listcomp>z<\|bicodec_global_(\d+)\|>c                 S   r4  r   r5  r6  r   r   r    r9  µ  r:  u@   âŒ No BiCodec tokens found! Generated text (first 1000 chars):
Nrz   )ÚreÚfindallr   Úerror)r.   r  Úsemantic_matchesr  Úglobal_matchesr  r   r   r    rû      s   z(SparkTTSPipeline._extract_bicodec_tokens)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r/   Ústaticmethodr   r;   r7   r   r   r>   r   rM   rU   r^   r#   r$   r%   r&   r:   r   Úbytesr0  r2  r3  rû   r   r   r   r    r)   @   s°    &3÷þýüûúùø	÷

ö   %÷þýüûúùø	÷

ö-÷þýüûúùø	÷

ö&.r)   )*rC  r   Úosr;  r~   ró   Údataclassesr   r   Útypingr   r   r   r   r   Úvllmr	   Úvllm.sampling_paramsr
   rþ   Ú	getLoggerr@  r   Úenvironr’   ÚstripÚlowerr   rý   r  rð   rM   r!   Úveena3modal.core.constantsr"   r#   r$   r%   r&   r'   r(   r)   r   r   r   r    Ú<module>   s&    
$