o
    ²ið(  ã                   @   sN   d Z ddlZddlmZmZ ddlmZmZmZ ddl	m
Z
 G dd„ dƒZdS )zÓ
Spark TTS Indic Prompt Builder

Builds prompts for Spark TTS with Indic speaker system.
Format: <|task_controllable_tts|><|start_content|>...<|end_content|>...

Migrated from Orpheus to Spark TTS architecture.
é    N)ÚListÚTuple)ÚINDIC_SPEAKERSÚINDIC_EMOTION_TAGSÚSPEAKER_MAP)Únormalize_emotion_tagsc                   @   sê   e Zd ZdZddd„Z	ddededed	efd
d„Z	ddededed	ee	 fdd„Z
dedefdd„Zdefdd„Zded	eeee	f  fdd„Z	ddededee	 ded	ef
dd„Zed	ee fdd„ƒZed	ee fdd„ƒZdS )ÚIndicPromptBuilderaƒ  
    Builds prompts in Spark TTS format with Indic speaker system.
    
    Format:
    <|task_controllable_tts|>
    <|start_content|>{text with [emotions]}<|end_content|>
    <|start_style_label|><|speaker_{id}|><|end_style_label|>
    <|start_global_token|>
    
    CRITICAL: This format MUST match Spark TTS training exactly.
    
    Model Details:
    - HuggingFace: BayAreaBoys/spark_tts_4speaker
    - Architecture: Qwen2ForCausalLM with BiCodec audio tokenizer
    - Languages: Telugu, Hindi, English, and more
    - Speakers: 12 predefined (case-sensitive!) - must match training map
    - Emotions: 10 tags in [bracket] format
    Nc                 C   s   || _ || _dS )zÓ
        Initialize Indic prompt builder for Spark TTS.
        
        Args:
            tokenizer: Transformers tokenizer
            model: Model instance (optional, kept for backward compatibility)
        N)Ú	tokenizerÚmodel)Úselfr	   r
   © r   ú@/home/ubuntu/veenaModal/veena3modal/processing/prompt_builder.pyÚ__init__%   s   
zIndicPromptBuilder.__init__TÚspeakerÚtextÚvalidateÚreturnc              
   C   sl   t |ƒ}|r|  ||¡ t |¡}|du r#td|› dtt ¡ ƒ› ƒ‚d dd|ddd	|› d
ddg¡}|S )u   
        Build Spark TTS format prompt with Indic speaker.
        
        Args:
            speaker: Speaker name (one of 12 predefined speakers, case-sensitive!)
                Examples: 'lipakshi', 'reet', 'Nandini', 'Nilay', 'vardan', 'anika', 'adarsh',
                'krishna', 'Aarvi', 'Asha', 'Bittu', 'Mira'
            text: Text to synthesize with inline emotion tags
                Examples:
                - "Hello! Welcome to the demo."
                - "[laughs] The results were amazing!"
                - "The results were amazing and then [giggle] we celebrated!"
                - "à¤¨à¤®à¤¸à¥à¤¤à¥‡! [excited] à¤†à¤œ à¤•à¤¾ à¤¦à¤¿à¤¨ à¤¬à¤¹à¥à¤¤ à¤…à¤šà¥à¤›à¤¾ à¤¹à¥ˆà¥¤" (Hindi)
                - "[curious] à°®à±€à°°à± à°Žà°²à°¾ à°‰à°¨à±à°¨à°¾à°°à±?" (Telugu)
                - "Hello <laugh> this works too!" (will be normalized to [laughs])
            validate: Check format correctness (recommended for first use)
        
        Returns:
            Formatted prompt string ready for tokenization
            Format: <|task_controllable_tts|><|start_content|>text<|end_content|>...
        
        Note: Emotion tags should be inline in text. The API will normalize:
        - <laugh> â†’ [laughs] (old format)
        - laughing â†’ [laughs] (natural language)
        NúInvalid speaker: ú. Valid speakers: Ú ú<|task_controllable_tts|>ú<|start_content|>ú<|end_content|>ú<|start_style_label|>ú
<|speaker_ú|>ú<|end_style_label|>ú<|start_global_token|>)r   Ú_validate_inputsr   ÚgetÚ
ValueErrorÚlistÚkeysÚjoin)r   r   r   r   Ú
speaker_idÚpromptr   r   r   Úbuild_prefix0   s"    

øzIndicPromptBuilder.build_prefixc                 C   s"   |   |||¡}| jj|dd}|S )a/  
        Build prefix as token IDs (for testing/debugging).
        
        Args:
            speaker: Speaker name
            text: Text to synthesize (with inline emotions)
            validate: Check format correctness
        
        Returns:
            List of token IDs for the prefix
        F)Úadd_special_tokens)r&   r	   Úencode)r   r   r   r   r%   Ú	token_idsr   r   r   Úbuild_prefix_idsi   s   z#IndicPromptBuilder.build_prefix_idsc                 C   s@   |t vrtd|› dd t ¡› ƒ‚| ¡ stdƒ‚|  |¡ dS )zï
        Validate speaker and text format.
        
        Args:
            speaker: Speaker name
            text: Text to synthesize (with inline emotions)
        
        Raises:
            ValueError: If inputs are invalid
        zInvalid speaker 'z$'. Must be one of (case-sensitive): z, zText cannot be emptyN)r   r    r#   ÚstripÚ_validate_emotion_tags)r   r   r   r   r   r   r   ~   s   ÿÿz#IndicPromptBuilder._validate_inputsc                 C   s¦   d}t  ||¡}|sdS g }|D ]}d|› d}|tvr!| |¡ q|rQtdt|ƒ› dƒ |D ]
}td|› dƒ q0td	ƒ tD ]	}td
|› ƒ qAtdƒ dS dS )z÷
        Validate emotion tags in text are in the allowed set for Spark TTS.
        
        Warns if unknown emotion tags are found (won't break, but won't work).
        
        Args:
            text: Text with potential emotion tags
        ú\[([a-z\s]+)\]Nú[ú]u   âš ï¸  Warning: Found z& unknown emotion tag(s) for Spark TTS:z    - z" (will be treated as regular text)u#   
ðŸ“‹ Valid Spark TTS emotion tags:z    Ú
)ÚreÚfindallr   ÚappendÚprintÚlen)r   r   Úemotion_patternÚfound_emotionsÚunknown_tagsÚemotionÚtagr   r   r   r,   —   s&   

€ùz)IndicPromptBuilder._validate_emotion_tagsc                 C   sB   d}g }t  ||¡D ]}| d¡}|tv r| || ¡ f¡ q
|S )zÒ
        Extract emotion tags and their positions from text.
        
        Args:
            text: Text with emotion tags
        
        Returns:
            List of (emotion_tag, position) tuples
        r-   r   )r1   ÚfinditerÚgroupr   r3   Ústart)r   r   r6   ÚmatchesÚmatchr:   r   r   r   Úextract_emotion_tags·   s   

€z'IndicPromptBuilder.extract_emotion_tagsÚ
global_idsc           	      C   s®   d}t |ƒ|krtd|› dt |ƒ› dƒ‚t|ƒ}|r!|  ||¡ t |¡}|du r8td|› dtt ¡ ƒ› ƒ‚d d	d
„ |D ƒ¡}d dd|ddd|› ddd|dg
¡}|S )uÿ  
        Build prompt with pre-generated global tokens for voice consistency in chunked generation.
        
        This is critical for multi-chunk text processing: the 32 global tokens encode
        speaker identity (voice DNA). By injecting the same global tokens from the first
        chunk into subsequent chunks, we ensure consistent voice across the entire text.
        
        Use Case:
        - Chunk 1: Use build_prefix() â†’ model generates 32 global tokens + semantic tokens
        - Chunk 2+: Use build_prefix_with_globals() with captured global tokens
                    â†’ model skips global generation, generates only semantic tokens
        
        Args:
            speaker: Speaker name (same as first chunk)
            text: Text chunk to synthesize
            global_ids: List of 32 global token IDs captured from first chunk
            validate: Check format correctness
        
        Returns:
            Formatted prompt with pre-filled global tokens
            
        Raises:
            ValueError: If global_ids doesn't contain exactly 32 tokens
        
        Thread Safety:
            This method is stateless and thread-safe. Global tokens are passed
            explicitly per-request, not stored in any shared state.
        é    zExpected exactly z global tokens, got z=. This likely indicates an issue with first chunk generation.Nr   r   r   c                 S   s   g | ]}d |› d‘qS )z<|bicodec_global_r   r   )Ú.0Úgidr   r   r   Ú
<listcomp>  s    z@IndicPromptBuilder.build_prefix_with_globals.<locals>.<listcomp>r   r   r   r   r   r   r   r   z<|start_semantic_token|>)	r5   r    r   r   r   r   r!   r"   r#   )	r   r   r   rA   r   ÚEXPECTED_GLOBAL_COUNTr$   Úglobal_tokens_strr%   r   r   r   Úbuild_prefix_with_globalsË   s2   $ÿ

öz,IndicPromptBuilder.build_prefix_with_globalsc                   C   s   t  ¡ S )zGet list of available speakers.)r   Úcopyr   r   r   r   Úget_available_speakers  s   z)IndicPromptBuilder.get_available_speakersc                   C   s   dd„ t D ƒS )z-Get list of available emotions (without [ ]).c                 S   s   g | ]}|  d ¡‘qS )z[])r+   )rC   Úer   r   r   rE     s    z=IndicPromptBuilder.get_available_emotions.<locals>.<listcomp>)r   r   r   r   r   Úget_available_emotions  s   z)IndicPromptBuilder.get_available_emotions)N)T)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   ÚstrÚboolr&   r   Úintr*   r   r,   r   r@   rH   ÚstaticmethodrJ   rL   r   r   r   r   r      sT    
üþýü
û=üþýü
û ûþýüû
úKr   )rP   r1   Útypingr   r   Úveena3modal.core.constantsr   r   r   Ú)veena3modal.processing.emotion_normalizerr   r   r   r   r   r   Ú<module>   s    	