o
    i                     @   sz   d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZmZmZmZ G dd dZdS )	z
Spark TTS Model Loader

Loads Spark TTS model with vLLM engine and validates speakers/emotions.
Migrated from Veena3/Orpheus to Spark TTS architecture.
    N)OptionalDictAny)AutoTokenizer)AsyncLLMEngineAsyncEngineArgsSamplingParams)INDIC_EMOTION_TAGSINDIC_SPEAKERSSPEAKER_MAPDEFAULT_MODEL_PATHDEFAULT_MAX_MODEL_LENVLLM_CONFIGc                   @   s\   e Zd ZdZ			ddededefddZd	d
 Zdd ZdefddZdd Z	dd Z
dS )SparkTTSModelu  
    Spark TTS Model with vLLM inference engine and BiCodec audio tokenizer.
    
    Model: BayAreaBoys/spark_tts_4speaker
    Architecture: Qwen2ForCausalLM with BiCodec audio tokenization
    Speakers: 12 predefined (mapped to speaker_0-11 as per training)
    Emotions: 10 tags in [bracket] format
    
    CRITICAL REQUIREMENTS:
    - Load tokenizer with special Spark TTS tokens
    - Configure vLLM with optimizations (prefix caching, CUDA graphs)
    - Support speaker mapping (lipakshi → speaker_0, etc.)
    Nindic_speakers
model_path
model_typehf_tokenc           	      K   s  d| _ |du rtjdt}|| _|| _td td|  td td td d	d
i}|r5||d< dti}t	j
|fi || _| j| tdt| j d |   |   td t }|||d |rs|tjd< || td|d   td|d   td|d   td|dd  td|dd
  tdi |}t|| _td td dS ) a  
        Initialize Spark TTS model with vLLM.
        
        Args:
            model_path: Path to model directory (local or HF). If None, uses env var or default.
            model_type: Model type (kept for compatibility, always uses indic_speakers)
            hf_token: HuggingFace token for private models
            **engine_kwargs: Additional vLLM engine arguments (override defaults from VLLM_CONFIG)
        r   NSPARK_TTS_MODEL_PATHu!   🚀 Initializing Spark TTS Modelu   📁 Model path: u{   🗣️  Speakers: 12 predefined (lipakshi, vardan, reet, Nandini, krishna, anika, adarsh, Nilay, Aarvi, Asha, Bittu, Mira)u*   🎭 Emotions: 10 tags in [bracket] formatu%   
📝 Loading tokenizer from model...trust_remote_codeTtokenadditional_special_tokensu   ✅ Tokenizer loaded:  tokensu4   
🔧 Initializing vLLM engine with optimizations...)model	tokenizerHUGGING_FACE_HUB_TOKENz  - dtype: dtypez  - max_model_len: max_model_lenz  - gpu_memory_utilization: gpu_memory_utilizationz  - enable_prefix_caching: enable_prefix_cachingFz)  - enforce_eager (disable CUDA graphs): enforce_eageru)   ✅ vLLM engine initialized successfully!u+   
🎉 Spark TTS Model ready for inference!
 )r   osenvirongetr   r   r   printr	   r   from_pretrainedr   add_special_tokenslen_validate_speakers_validate_emotion_tagsr   copyupdater   r   from_engine_argsengine)	selfr   r   r   engine_kwargstokenizer_kwargsspecial_tokensengine_kwargs_dictengine_argsr!   r!   8/home/ubuntu/veenaModal/veena3modal/core/model_loader.py__init__'   s\   

zSparkTTSModel.__init__c                 C   s   t dtt d tD ]}| jj|dd}|s t d| d qt dtt d t d	 ttd
d D ]\}}| jj|dd}t d| dt| d q7ttdkr]t d d
S d
S )z
        Validate that emotion tags are properly configured for Spark TTS.
        
        For Spark TTS, emotion tags use [bracket] format and should be
        added as special tokens to the tokenizer.
           
🔍 Validating z emotion tags for Spark TTS...Fr'   u   ⚠️  Warning: Emotion tag z produced no tokens   ✅ All z emotion tags validatedu   
📋 Sample emotion tags:N     : r   z  ...)r%   r(   r	   r   encode	enumerate)r/   tag	token_idsir!   r!   r5   r*      s   z$SparkTTSModel._validate_emotion_tagsc              	   C   s   t dtt d tD ]1}|tvrt d| d qt| }d| d}| jj|dd}t d	| d
| dt| d qt dtt d dS )z
        Validate that all speakers are properly mapped for Spark TTS.
        
        Spark TTS uses speaker tokens like <|speaker_0|> to <|speaker_7|>.
        User-facing names (lipakshi, vardan, etc.) are mapped to these IDs.
        r7   z speakers for Spark TTS...u   ⚠️  Warning: Speaker z not in SPEAKER_MAPz
<|speaker_z|>Fr8   r;   u    → r<   r   r9   z  speakers validated with mappingN)r%   r(   r
   r   r   r=   )r/   speaker_name
speaker_idspeaker_tokenr@   r!   r!   r5   r)      s   "z SparkTTSModel._validate_speakersreturnc                 C      | j S )z;Get the model type (always 'indic_speakers' for Spark TTS).)r   r/   r!   r!   r5   get_model_type      zSparkTTSModel.get_model_typec                 C   rF   )zGet the tokenizer.)r   rG   r!   r!   r5   get_tokenizer   rI   zSparkTTSModel.get_tokenizerc                 C   rF   )zGet the vLLM engine.)r.   rG   r!   r!   r5   
get_engine   rI   zSparkTTSModel.get_engine)Nr   N)__name__
__module____qualname____doc__strr6   r*   r)   rH   rJ   rK   r!   r!   r!   r5   r      s"    
^r   )rO   r"   torchtypingr   r   r   transformersr   vllmr   r   r   veena3modal.core.constantsr	   r
   r   r   r   r   r   r!   r!   r!   r5   <module>   s     
