o
    4à“iÌ7  ã                   @   sœ   d Z ddlZddlZddlZddlmZmZmZ ddl	Z	ddl
mZmZ ddlmZ ddlmZ e	 e¡Zdd
ededededef
dd„ZG dd„ dƒZdS )u  
Long Text Processing Service

Handles text chunking and audio stitching for long text inputs.

Why This Approach:
- Text-based chunking: Preserves semantic meaning
- SNAC token sliding window: Separate layer for smooth audio
- Crossfade stitching: Seamless audio transitions between chunks
- Unlimited text: Can process any length text

Architecture:
1. Long text â†’ IndicSentenceChunker â†’ Text chunks
2. Each text chunk â†’ TTS Model â†’ Audio chunk  
3. All audio chunks â†’ Crossfade stitching â†’ Final audio
é    N)ÚListÚOptionalÚAsyncGenerator)ÚIndicSentenceChunkerÚcrossfade_audio)ÚVeena3SlidingWindowPipeline)ÚSparkTTSPipelineéÀ]  é   é   Ú	data_sizeÚsample_rateÚchannelsÚbits_per_sampleÚreturnc                 C   sd   || | d }|| d }t  ddd|  d¡}|t  dddd	|||||¡	7 }|t  d
d| ¡7 }|S )a  
    Create WAV file header for PCM data.
    
    Args:
        data_size: Size of PCM data in bytes
        sample_rate: Sample rate (Hz)
        channels: Number of channels
        bits_per_sample: Bits per sample
    
    Returns:
        WAV header bytes (44 bytes)
    é   z<4sI4ss   RIFFé$   s   WAVEz
<4sIHHIIHHs   fmt r   r
   z<4sIs   data)ÚstructÚpack)r   r   r   r   Ú	byte_rateÚblock_alignÚheader© r   úE/home/ubuntu/veenaModal/veena3modal/processing/long_text_processor.pyÚcreate_wav_header   s   r   c                   @   s  e Zd ZdZdZdZdZdZ	d&dede	e
 fd	d
„Zdedefdd„Zdedefdd„Z							d'dededededededede	e dede	e fdd„Z		d(d ee ded!edefd"d#„Z				d)dededededede	e deedf fd$d%„ZdS )*ÚLongTextProcessoru/  
    Process long texts by chunking, generating audio, and stitching.
    
    Features:
    - Intelligent text chunking (respects sentence boundaries, Indic languages)
    - Parallel audio generation (if desired)
    - Crossfade stitching for seamless audio
    - Memory efficient (processes chunks sequentially by default)
    
    Why text chunking, not token chunking:
    - Text maintains semantic meaning
    - Natural sentence boundaries = better prosody
    - Simpler implementation
    - Model handles tokenization internally
    - SNAC token sliding window is separate (already implemented)
    
    Max Input Length Reference (post-normalization):
    "à¤‡à¤¸à¤•à¥‡ à¤¨à¤¿à¤°à¥à¤®à¤¾à¤£ à¤®à¥‡à¤‚ à¤®à¥à¤–à¥à¤¯ à¤µà¤¾à¤¸à¥à¤¤à¥à¤•à¤¾à¤° à¤‰à¤¸à¥à¤¤à¤¾à¤¦ à¤…à¤¹à¤®à¤¦ à¤²à¤¾à¤¹à¥Œà¤°à¥€ à¤•à¥‡ à¤¨à¥‡à¤¤à¥ƒà¤¤à¥à¤µ à¤®à¥‡à¤‚ à¤²à¤—à¤­à¤— 
    20,000 à¤•à¤¾à¤°à¥€à¤—à¤°à¥‹à¤‚ à¤”à¤° à¤¶à¤¿à¤²à¥à¤ªà¤•à¤¾à¤°à¥‹à¤‚ à¤¨à¥‡ à¤¦à¤¿à¤¨-à¤°à¤¾à¤¤ à¤®à¥‡à¤¹à¤¨à¤¤ à¤•à¥€ à¤¥à¥€à¥¤ à¤‡à¤®à¤¾à¤°à¤¤ à¤•à¥‡ à¤¸à¤¾à¤®à¤¨à¥‡ à¤¬à¤¨à¤¾ 
    'à¤šà¤¾à¤°à¤¬à¤¾à¤—' à¤¶à¥ˆà¤²à¥€ à¤•à¤¾ à¤‰à¤¦à¥à¤¯à¤¾à¤¨ à¤”à¤° à¤ªà¤¾à¤¨à¥€ à¤•à¥€ à¤¨à¤¹à¤°à¥‡à¤‚ à¤‡à¤¸à¤•à¥€ à¤¸à¥à¤‚à¤¦à¤°à¤¤à¤¾ à¤®à¥‡à¤‚ à¤šà¤¾à¤° à¤šà¤¾à¤à¤¦ à¤²à¤—à¤¾ à¤¦à¥‡à¤¤à¥‡ à¤¹à¥ˆà¤‚à¥¤"
    = ~232 characters after normalization (numbers expanded to words)
    éæ   éÜ   éª   é2   NÚpipelineÚstreaming_pipelinec                 C   s>   || _ || _t| jd| _tjd| j| j| j| j	dœd dS )zÊ
        Initialize long text processor.
        
        Args:
            pipeline: Non-streaming TTS pipeline
            streaming_pipeline: Optional streaming pipeline (for streaming mode)
        )Úmax_chunk_lengthzLongTextProcessor initialized)Úmax_model_input_lengthÚchunking_thresholdÚ
chunk_sizeÚcrossfade_ms©ÚextraN)
r    r!   r   Ú
CHUNK_SIZEÚchunkerÚloggerÚinfoÚMAX_MODEL_INPUT_LENGTHÚCHUNKING_THRESHOLDÚCROSSFADE_MS)Úselfr    r!   r   r   r   Ú__init__`   s   ü
þzLongTextProcessor.__init__Útextr   c                 C   s   t |ƒ| jkS )z$Determine if text should be chunked.)Úlenr.   ©r0   r2   r   r   r   Úshould_chunkz   s   zLongTextProcessor.should_chunkc                 C   s   | j  |¡S )a@  
        Chunk text using the internal chunker.
        
        Delegates to self.chunker.chunk_text() for actual chunking logic.
        This method is used by _stream_chunked_text in tts_runtime.
        
        Args:
            text: Text to chunk
        
        Returns:
            List of text chunks
        )r*   Ú
chunk_textr4   r   r   r   r6   ~   s   zLongTextProcessor.chunk_textçš™™™™™Ù?çÍÌÌÌÌÌì?é   çÍÌÌÌÌÌð?é€>  ÚspeakerÚtemperatureÚtop_kÚtop_pÚ
max_tokensÚrepetition_penaltyÚseedr   c
                 Ã   sÜ  | j  |¡}
tjdt|ƒ|
d |
|dœd | j  |¡}tjdt|ƒ› dt|ƒdd„ |D ƒd	œd g }tjd
t|ƒ› dt|ƒ› dt|ƒt|ƒ|dœd t|ƒD ]]\}}ttt|ƒd ƒdƒ}t	|dƒ}| j
j||||||||dI dH }|du ržtjd|d › dt|ƒ› |t|ƒ|dd… t|ƒdœd  dS t|ƒdkrª|dd… n|}| |¡ qT| j||	| jd}tt|ƒ|	ddd}|| }t|ƒ|	d  }tjdt|ƒ› d|d›d t|ƒ|t|ƒd! d"œd |S )#a­  
        Generate audio for long text with chunking and stitching.
        
        Process:
        1. Chunk text at natural boundaries
        2. Generate audio for each chunk
        3. Stitch audio with crossfade
        
        Args:
            text: Long text to synthesize
            speaker: Speaker name (lipakshi, reet, etc.)
            temperature: Sampling temperature
            top_p: Nucleus sampling
            max_tokens: Max tokens per chunk
            seed: Random seed (same seed = same voice across chunks)
            sample_rate: Audio sample rate
        
        Returns:
            Stitched audio bytes (int16 PCM) or None if generation fails
        z"Processing long text with chunkingÚprimary)Útext_lengthÚprimary_languageÚlang_mixr<   r'   zText chunked into ú chunksc                 S   s   g | ]}t |ƒ‘qS r   )r3   )Ú.0Úchunkr   r   r   Ú
<listcomp>Á   s    z<LongTextProcessor.generate_with_chunking.<locals>.<listcomp>)Ú
num_chunksÚchunk_sizesu   ðŸŽµ Generating z chunks for z chars)Útotal_chunksÚtotal_charsr<   é   r9   éd   )r<   r2   r=   r>   r?   r@   rA   rB   Nu'   âŒ Failed to generate audio for chunk r
   ú/é–   )Úchunk_indexÚchunk_lengthÚchunk_text_previewrM   é,   )r   r&   r   )r   r   r   r   é   u   âœ… Generated u    chunks â†’ z.1fzs audioi   )rM   Úaudio_durationÚaudio_size_mb)r*   Údetect_language_mixr+   r,   r3   r6   Ú	enumerateÚminÚintÚmaxr    Úgenerate_speech_indicÚerrorÚappendÚstitch_audio_chunksr/   r   )r0   r2   r<   r=   r>   r?   r@   rA   rB   r   Ú	lang_infoÚtext_chunksÚaudio_chunksÚirI   Úchunk_max_tokensÚaudio_bytesÚpcm_dataÚstitched_pcmÚ
wav_headerÚfinal_audioÚtotal_audio_durationr   r   r   Úgenerate_with_chunking   s”   €!üþþþ	ýþ	þ
ø
üþ	ýü
ýþ	z(LongTextProcessor.generate_with_chunkingre   r&   c                 C   s‚   |sdS t |ƒdkr|d S t|d | ƒ}|d }tdt |ƒƒD ]}t||| ||d}tjd|d › |t |ƒdœd q!|S )	aà  
        Stitch multiple audio chunks with crossfade transitions.
        
        Why crossfade:
        - Prevents pops/clicks at chunk boundaries
        - Smooth transitions between chunks
        - Professional audio quality
        
        Args:
            audio_chunks: List of audio bytes (int16 PCM)
            sample_rate: Sample rate (Hz)
            crossfade_ms: Crossfade duration (milliseconds)
        
        Returns:
            Stitched audio bytes
        ó    r
   r   g     @@)Úcrossfade_samplesr   zCrossfaded chunk )rS   Úcurrent_length_bytesr'   )r3   r]   Úranger   r+   Údebug)r0   re   r   r&   rp   Úresultrf   r   r   r   rb     s(   üþþz%LongTextProcessor.stitch_audio_chunksc              	   C  s¤   | j du r
tdƒ‚| j |¡}tjdt|ƒ› ddt|ƒid t|ƒD ]*\}}	t d|d › d	t|ƒ› ¡ | j j||	||||d
2 z	3 dH W }
|
V  qD6 q%dS )a²  
        Generate audio for long text with streaming (experimental).
        
        Note: Streaming long texts is more complex:
        - Need to stream each chunk
        - Need to buffer for crossfade
        - Adds latency for stitching
        
        For now, recommend using non-streaming for long texts.
        
        Args:
            text: Long text to synthesize
            speaker: Speaker name
            temperature: Sampling temperature
            top_p: Nucleus sampling
            max_tokens: Max tokens per chunk
            repetition_penalty: Repetition penalty
            seed: Random seed
        
        Yields:
            Audio bytes (in chunks)
        Nz Streaming pipeline not availablezStreaming long text with rG   rK   r'   zStreaming chunk r
   rQ   )r<   r2   r=   r?   r@   rB   )	r!   Ú
ValueErrorr*   r6   r+   r,   r3   r[   Úgenerate_speech_stream_indic)r0   r2   r<   r=   r?   r@   rB   rd   rf   rI   rh   r   r   r   Ú generate_with_chunking_streamingL  s*   €

þúøüz2LongTextProcessor.generate_with_chunking_streaming)N)r7   r   r8   r9   r:   Nr;   )r;   r   )r7   r8   r9   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r-   r.   r)   r/   r   r   r   r1   ÚstrÚboolr5   Úlistr6   Úfloatr]   Úbytesrn   r   rb   r   rw   r   r   r   r   r   6   s’    ýþ
ýöþýüûúùø	÷
ö
õ üþýü
û9ùþýüûúù
ør   )r	   r
   r   )r{   ÚasyncioÚnumpyÚnpr   Útypingr   r   r   ÚloggingÚ#veena3modal.processing.text_chunkerr   r   Ú#veena3modal.core.streaming_pipeliner   Úveena3modal.core.pipeliner   Ú	getLoggerrx   r+   r]   r€   r   r   r   r   r   r   Ú<module>   s    
 