"""
Spark TTS Constants

Token IDs, special tokens, speakers, and emotions for Spark TTS model.
Migrated from Veena3/Orpheus to Spark TTS (BiCodec) architecture.
"""

import os

# Generation stop tokens.
# Spark TTS terminates decoding on the chat-style <|im_end|> marker.
TRAINING_STOP_TOKEN_IDS = ["<|im_end|>"]  # String-based stop token for Spark TTS

# Spark TTS speaker roster: 12 voices from training.
# Position in this tuple *is* the internal speaker ID (speaker_0 .. speaker_11),
# so ordering here is load-bearing — do not reorder.
# Model: BayAreaBoys/spark_tts_4speaker (HuggingFace) has 12 speaker tokens.
_SPEAKER_ORDER = (
    "lipakshi",   # speaker_0
    "vardan",     # speaker_1
    "reet",       # speaker_2
    "Nandini",    # speaker_3
    "krishna",    # speaker_4
    "anika",      # speaker_5
    "adarsh",     # speaker_6
    "Nilay",      # speaker_7
    "Aarvi",      # speaker_8
    "Asha",       # speaker_9
    "Bittu",      # speaker_10
    "Mira",       # speaker_11
)

# Maps user-facing speaker names to internal speaker IDs.
SPEAKER_MAP = {speaker: speaker_id for speaker_id, speaker in enumerate(_SPEAKER_ORDER)}

# Friendly speaker aliases (user-facing → internal).
# Callers may use these names in place of the internal training names.
_FRIENDLY_ALIASES = {
    "Mitra": "lipakshi",
    "Aaranya": "reet",
    "Taru": "Nandini",
    "Neer": "Nilay",
    "Dhruva": "vardan",
    "Ira": "anika",
    "Veda": "adarsh",
    "Aria": "krishna",
}

# The four newer speakers have no separate alias: their friendly name is the
# internal name itself, so they map to themselves.
FRIENDLY_SPEAKER_MAP = {
    **_FRIENDLY_ALIASES,
    **{speaker: speaker for speaker in ("Aarvi", "Asha", "Bittu", "Mira")},
}

# Valid speaker names (for API validation).
# Union of internal names and friendly names, de-duplicated while preserving
# first-seen order: "Aarvi", "Asha", "Bittu", and "Mira" appear in both
# SPEAKER_MAP and FRIENDLY_SPEAKER_MAP, and without the dedupe they were
# listed twice in the ValueError raised by resolve_speaker_name().
INDIC_SPEAKERS = list(SPEAKER_MAP.keys())
ALL_SPEAKER_NAMES = list(dict.fromkeys(INDIC_SPEAKERS + list(FRIENDLY_SPEAKER_MAP.keys())))

def resolve_speaker_name(name: str) -> str:
    """
    Resolve friendly speaker name to internal name.

    Args:
        name: Speaker name (friendly or internal)

    Returns:
        Internal speaker name

    Raises:
        ValueError: If *name* is neither a friendly nor an internal speaker name.

    Examples:
        resolve_speaker_name("Mitra") -> "lipakshi"
        resolve_speaker_name("lipakshi") -> "lipakshi"
    """
    # Friendly aliases win first; fall through on a miss (EAFP).
    try:
        return FRIENDLY_SPEAKER_MAP[name]
    except KeyError:
        pass

    # Already an internal speaker name — accept it unchanged.
    if name in INDIC_SPEAKERS:
        return name

    # Unknown speaker: report every accepted spelling to the caller.
    raise ValueError(f"Invalid speaker name: {name}. Valid names: {', '.join(ALL_SPEAKER_NAMES)}")

# Legacy → current emotion-tag mapping, used for backward compatibility.
# Earlier releases accepted angle-bracket tags (<emotion>); Spark TTS uses
# square-bracket tags ([emotion]).
LEGACY_EMOTION_MAP = {
    "<angry>": "[angry]",
    "<curious>": "[curious]",
    "<excited>": "[excited]",
    "<giggle>": "[giggle]",
    "<laugh_harder>": "[laughs harder]",
    "<laugh>": "[laughs]",
    "<scream>": "[screams]",
    "<sigh>": "[sighs]",
    "<sing>": "[sings]",
    "<whisper>": "[whispers]",
}

# Spark TTS emotion tags (bracket format: [emotion]), derived from the legacy
# map so the two can never drift apart.
# NOTE: API users must now use [emotion] instead of <emotion>.
INDIC_EMOTION_TAGS = list(LEGACY_EMOTION_MAP.values())

# All emotion tags (for validation) — alias of INDIC_EMOTION_TAGS.
ALL_EMOTION_TAGS = INDIC_EMOTION_TAGS

# Model configuration
DEFAULT_MODEL_PATH = "BayAreaBoys/spark_tts_4speaker"  # HuggingFace repo ID for the LM
DEFAULT_MAX_MODEL_LEN = 4096  # Lower than old model (was 8192); mirrored in VLLM_CONFIG["max_model_len"]

# BiCodec Audio Tokenizer
# Location: resolved from env var MODEL_PATH, then SPARK_TTS_MODEL_PATH,
# then a baked-in container default. Note the nested get(): an explicitly set
# but empty MODEL_PATH ("") is honored as-is, not skipped.
BICODEC_TOKENIZER_PATH = os.environ.get('MODEL_PATH', os.environ.get('SPARK_TTS_MODEL_PATH', '/models/spark_tts_4speaker'))

# Audio configuration (output PCM format)
AUDIO_SAMPLE_RATE = 16000  # Changed from 24kHz to 16kHz for BiCodec
AUDIO_CHANNELS = 1  # mono
AUDIO_BITS_PER_SAMPLE = 16  # 16-bit signed PCM

# Generation defaults for Spark TTS (sampling parameters passed to the LM)
DEFAULT_TEMPERATURE = 0.8  # Higher than old model (was 0.4)
DEFAULT_TOP_K = 50
DEFAULT_TOP_P = 1.0  # 1.0 = nucleus sampling effectively disabled
DEFAULT_MAX_TOKENS = 4096  # Increased to handle complex multilingual generation
DEFAULT_MIN_TOKENS = 28  # Minimum tokens for generation (legacy compatibility)
DEFAULT_REPETITION_PENALTY = 1.0  # 1.0 = no penalty (legacy compatibility)
DEFAULT_SEED = None  # None = random, set integer for reproducibility

# vLLM Configuration optimizations for Spark TTS
# Keys mirror vLLM engine arguments; passed through at engine construction.
# NOTE: gpu_memory_utilization adjusted based on available memory
# Production: 0.85, Limited memory: 0.3-0.5
#
# OPTIMIZATION Dec 2025:
# - enable_chunked_prefill: Prevents long prompts from blocking concurrent streams
# - async_scheduling: DISABLED - caused EngineCore issues in production
# OPTIMIZATION Feb 2026:
# Reduced gpu_memory_utilization from 0.85 to 0.25
# Model is only 0.5B (~1.3GB weights). At 0.85, vLLM pre-allocated 65GB for KV cache
# (enough for 1,399 concurrent seqs) when peak actual need is ~100 concurrent.
# At 0.25: ~18GB KV cache, supports ~380 concurrent seqs (4x peak), frees ~48GB VRAM.
# This enables fitting on L4 (24GB) or even T4 (16GB) GPUs.
VLLM_CONFIG = {
    "dtype": "bfloat16",
    "gpu_memory_utilization": 0.25,  # 0.5B model needs <2GB; 0.25 gives ~380 concurrent seqs
    "max_model_len": 4096,  # keep in sync with DEFAULT_MAX_MODEL_LEN above
    "max_num_batched_tokens": 4096,
    "enable_prefix_caching": True,  # Cache common prompts for faster TTFB
    "enable_chunked_prefill": True,  # Chunk long prefills to avoid blocking concurrent streams
    "enforce_eager": False,  # Enable CUDA graphs for low latency
    "disable_log_stats": False,  # keep vLLM stats logging on for production monitoring
    "trust_remote_code": True,  # model repo ships custom code; required to load it
    "tensor_parallel_size": 1,  # single-GPU deployment
}

# Streaming configuration (may need adjustment for BiCodec)
STREAM_BUFFER_SIZE = 50  # Buffer size for streaming (BiCodec-specific, TBD)
BICODEC_BATCH_SIZE = 64  # tokens per BiCodec decode batch — presumably tuned empirically; confirm
BICODEC_BATCH_TIMEOUT_MS = 15  # max wait before flushing a partial batch

# Backward compatibility: Old SNAC constants for files not yet migrated
# These are NOT used by Spark TTS but needed by legacy streaming code
# TODO: Remove once streaming_pipeline.py is fully migrated to BiCodec
CODE_START_TOKEN_ID = 128257  # Legacy SNAC constant
CODE_END_TOKEN_ID = 128258    # Legacy SNAC constant
SNAC_MIN_ID = 128266          # Legacy SNAC constant (lower bound of SNAC audio-token ID range)
SNAC_MAX_ID = 156937          # Legacy SNAC constant (upper bound of SNAC audio-token ID range)

