"""
Text Normalization for TTS Pipeline

Production-ready text normalization that prepares text for TTS models by:
- Expanding numbers, dates, URLs, symbols to words
- Language-aware punctuation cleanup (EN, HI, TE)
- Preserving meaning while removing distracting elements
- Deterministic and idempotent transformations

Design principles:
- Deterministic: same input → same output
- Meaning-preserving: expand before removing
- Practical: cover common cases, don't over-complicate
- Language-aware: support English, Hindi, Telugu
"""

import re
import unicodedata
from typing import Dict, Tuple, Optional
from enum import Enum


class Language(Enum):
    """Supported languages for normalization"""
    ENGLISH = "en"
    HINDI = "hi"
    TELUGU = "te"
    UNKNOWN = "unknown"


class TextNormalizer:
    """
    Text normalizer for TTS pipeline.
    
    Implements a deterministic, order-dependent normalization pipeline:
    1. Input sanitation
    2. Language detection
    3. Unicode normalization
    4. Entity expansion (URLs, emails, currency, etc.)
    5. Number expansion (to English words)
    6. Date/time expansion
    7. Symbol cleanup (language-aware)
    8. Whitespace normalization
    9. Final validation
    """
    
    # Language detection patterns (simple heuristic-based)
    DEVANAGARI_PATTERN = re.compile(r'[\u0900-\u097F]')  # Hindi
    TELUGU_PATTERN = re.compile(r'[\u0C00-\u0C7F]')  # Telugu
    
    # Currency symbols mapping (symbol → spoken currency name)
    CURRENCY_MAP = {
        '₹': 'rupees',
        '$': 'dollars',
        '€': 'euros',
        '£': 'pounds',
        '¥': 'yen',
        '₩': 'won',
    }
    
    # Math symbols → spoken words (minus variants handled specially, see
    # _expand_math_symbols, because '-' is ambiguous with ranges/hyphens)
    MATH_SYMBOLS = {
        '+': 'plus',
        '−': 'minus',
        '–': 'minus',  # en-dash used as minus
        '×': 'times',
        '÷': 'divided by',
        '=': 'equals',
        '<': 'less than',
        '>': 'greater than',
        '≤': 'less than or equal to',
        '≥': 'greater than or equal to',
        '≠': 'not equal to',
        '≈': 'approximately',
    }
    
    # Unit patterns (common SI and other units); number + optional space + unit
    UNIT_PATTERN = re.compile(
        r'\b(\d+(?:\.\d+)?)\s*(km|m|cm|mm|kg|g|mg|ml|l|°C|°F|km/h|mph|mb|gb|tb)\b',
        re.IGNORECASE
    )
    
    # Ordinal patterns (1st, 2nd, 3rd, 4th, ...)
    ORDINAL_PATTERN = re.compile(r'\b(\d+)(st|nd|rd|th)\b', re.IGNORECASE)
    
    # Number words for expansion (0-19 and tens multiples)
    ONES = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',
            'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen',
            'seventeen', 'eighteen', 'nineteen']
    TENS = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    
    # Month names (index 0 = January)
    MONTHS = ['january', 'february', 'march', 'april', 'may', 'june',
              'july', 'august', 'september', 'october', 'november', 'december']
    
    # Emotion tags that MUST be preserved (for TTS model)
    EMOTION_TAGS = [
        "[angry]",
        "[curious]",
        "[excited]",
        "[giggle]",
        "[laughs harder]",
        "[laughs]",
        "[screams]",
        "[sighs]",
        "[sings]",
        "[whispers]"
    ]
    
    # Abbreviation whitelist (expand these).
    # NOTE: bare 'am'/'pm' are intentionally NOT listed — they collide with the
    # English words "am"/"PM" ("I am happy" must not become "I a m happy").
    # Time-context meridiems ("5 pm", "3:45 PM") are expanded in
    # _expand_dates_times instead, where the numeric context disambiguates them.
    ABBREVIATIONS = {
        'mr.': 'mister',
        'mrs.': 'missus',
        'ms.': 'miss',
        'dr.': 'doctor',
        'prof.': 'professor',
        'st.': 'saint',
        'vs.': 'versus',
        'etc.': 'etcetera',
        'i.e.': 'that is',
        'e.g.': 'for example',
        'aka': 'also known as',
        'fyi': 'for your information',
        'diy': 'do it yourself',
        'ok': 'okay',
        'a.m.': 'a m',
        'p.m.': 'p m',
    }
    
    def __init__(self):
        """Initialize normalizer with default settings"""
        self.verbose = False
        # Match emotion tags (with variants that need normalization)
        # CRITICAL: Model expects EXACT tags from INDIC_EMOTION_TAGS
        # Pattern captures variants and normalizes to exact model format
        self._emotion_tag_pattern = re.compile(
            r'(\[(?:angry|curious|excited|giggles?|laughs?(?: harder)?|screams?|sighs?|sings?|whispers?)\])', 
            re.IGNORECASE
        )
    
    def normalize(self, text: str, verbose: bool = False) -> str:
        """
        Normalize text for TTS.
        
        Args:
            text: Input text to normalize
            verbose: Debug flag; stored on the instance for use by helpers.
                (Currently reserved — it does not change the return value.)
            
        Returns:
            Normalized text ready for TTS
        """
        if not text or not text.strip():
            return ""
        
        self.verbose = verbose
        
        # CRITICAL: Protect emotion tags FIRST (before any processing that removes brackets)
        text, protected_emotions = self._protect_emotion_tags(text)
        
        # Run pipeline in order (ORDER MATTERS!)
        text = self._sanitize_input(text)
        language = self._detect_language(text)
        text = self._normalize_unicode(text)
        text = self._expand_entities(text)
        text = self._expand_dates_times(text)  # BEFORE numbers (to match digit patterns)
        text = self._expand_phones(text)  # BEFORE numbers (to match digit patterns)
        text = self._expand_numbers(text)  # AFTER dates/phones (to avoid breaking patterns)
        text = self._expand_abbreviations(text)
        text = self._cleanup_symbols(text, language)
        text = self._normalize_whitespace(text)
        text = self._final_checks(text, language)
        
        # CRITICAL: Restore emotion tags LAST (after all processing)
        text = self._restore_emotion_tags(text, protected_emotions)
        
        return text.strip()
    
    def _protect_emotion_tags(self, text: str) -> Tuple[str, Dict[str, str]]:
        """
        Protect emotion tags from normalization.
        
        Replaces emotion tags with unique placeholders, stores mapping for later restoration.
        CRITICAL: Must preserve exact emotion tags for TTS model.
        
        Normalizes variants to match EXACT INDIC_EMOTION_TAGS format:
        - [laugh] → [laughs]
        - [giggles] → [giggle]  (NOTE: model uses singular!)
        - [whisper] → [whispers]
        etc.
        
        Placeholder format: XEMOTIONX0X (alphanumeric only, so no later pipeline
        stage strips or splits it; the embedded digit has no word boundary, so
        number expansion leaves it alone too).
        """
        # Emotion tag normalization to match EXACT INDIC_EMOTION_TAGS
        # Model expects these EXACT tags (note: giggle is singular!)
        emotion_normalizations = {
            '[laugh]': '[laughs]',
            '[giggles]': '[giggle]',  # Model uses singular!
            '[scream]': '[screams]',
            '[sigh]': '[sighs]',
            '[sing]': '[sings]',
            '[whisper]': '[whispers]',
            '[laugh harder]': '[laughs harder]',
        }
        
        protected = {}
        counter = 0
        
        def replace_emotion(match):
            nonlocal counter
            tag = match.group(1).lower()  # Normalize to lowercase for comparison
            
            # Normalize variants to match exact model tags
            normalized_tag = emotion_normalizations.get(tag, tag)
            
            # Use alphanumeric-only placeholder that won't be modified
            placeholder = f"XEMOTIONX{counter}X"
            protected[placeholder] = normalized_tag
            counter += 1
            return placeholder
        
        text = self._emotion_tag_pattern.sub(replace_emotion, text)
        return text, protected
    
    def _restore_emotion_tags(self, text: str, protected: Dict[str, str]) -> str:
        """Restore protected emotion tags"""
        for placeholder, original_tag in protected.items():
            text = text.replace(placeholder, original_tag)
        return text
    
    def _sanitize_input(self, text: str) -> str:
        """Step 1: Input sanitation"""
        # Convert to unicode
        if isinstance(text, bytes):
            text = text.decode('utf-8', errors='ignore')
        
        # Remove BOM
        text = text.lstrip('\ufeff')
        
        # Normalize line endings
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        
        return text
    
    def _detect_language(self, text: str) -> Language:
        """
        Step 2: Simple language detection.
        
        Returns primary language based on character sets.
        """
        # Count script characters
        devanagari_count = len(self.DEVANAGARI_PATTERN.findall(text))
        telugu_count = len(self.TELUGU_PATTERN.findall(text))
        
        # Simple majority rule (threshold of 5 avoids misclassifying
        # mostly-English text with a stray Indic character or two)
        if devanagari_count > telugu_count and devanagari_count > 5:
            return Language.HINDI
        elif telugu_count > devanagari_count and telugu_count > 5:
            return Language.TELUGU
        else:
            return Language.ENGLISH
    
    def _normalize_unicode(self, text: str) -> str:
        """Step 3: Unicode normalization & safety"""
        # Apply NFKC normalization
        text = unicodedata.normalize('NFKC', text)
        
        # Remove zero-width and control characters
        text = re.sub(r'[\u200b-\u200f\u202a-\u202e\ufeff]', '', text)
        
        # Map fancy punctuation to plain equivalents.
        # FIX: written with explicit escapes — the previous literal curly-quote
        # keys had been mangled into plain ASCII quotes (duplicate dict keys),
        # so curly quotes were never actually replaced.
        replacements = {
            '\u201c': '"',   # left curly double quote
            '\u201d': '"',   # right curly double quote
            '\u2018': "'",   # left curly single quote
            '\u2019': "'",   # right curly single quote
            '\u2014': '-',   # em-dash
            '\u2013': '-',   # en-dash
            '\u2026': '...', # ellipsis (NFKC usually handles this; kept as belt-and-braces)
            # Hindi danda characters intentionally left untouched to preserve punctuation
        }
        
        for fancy, plain in replacements.items():
            text = text.replace(fancy, plain)
        
        return text
    
    def _expand_entities(self, text: str) -> str:
        """
        Step 4: Expand entities (URLs, emails, currency, etc.)
        
        Must happen BEFORE symbol removal to preserve meaning.
        Order matters: emails before URLs to avoid conflict.
        """
        # Remove emojis completely (per user request)
        text = self._remove_emojis(text)
        
        # Expand emails FIRST (before URLs to avoid @ symbol conflicts)
        text = self._expand_emails(text)
        
        # Expand URLs
        text = self._expand_urls(text)
        
        # Expand social handles
        text = self._expand_social(text)
        
        # Expand currency
        text = self._expand_currency(text)
        
        # NOTE: Math symbols expanded in _expand_numbers step (after dates/phones)
        
        return text
    
    def _remove_emojis(self, text: str) -> str:
        """Remove all emojis and emoticons"""
        # Remove emoji characters (Unicode emoji blocks).
        # FIX: the previous version included the over-broad range
        # U+24C2-U+1F251, which swallowed ALL CJK ideographs, kana and Hangul.
        # Only genuine emoji/pictograph blocks are listed now.
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
            "\U00002600-\U000027BF"  # misc symbols & dingbats (☀-➿, incl. ✂ U+2702)
            "\U00002B00-\U00002BFF"  # arrows / stars (⭐ ⭕)
            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
            "]+",
            flags=re.UNICODE
        )
        text = emoji_pattern.sub('', text)
        
        # Remove common emoticons like :) ;-( :D <3 XD
        emoticon_pattern = re.compile(r'[:;]-?[()DPO]|<3|XD', re.IGNORECASE)
        text = emoticon_pattern.sub('', text)
        
        return text
    
    def _expand_urls(self, text: str) -> str:
        """Expand URLs to readable form (scheme and path are dropped)"""
        # Pattern for URLs - labels must start with a letter (avoids matching
        # decimals like 3.14) and the final label (TLD) must be at least two
        # letters. FIX: the old pattern accepted 1-letter TLDs and mangled
        # abbreviations such as "i.e." / "e.g." into "i dot e" before the
        # abbreviation pass could expand them.
        url_pattern = re.compile(
            r'\b(?:https?://)?(?:www\.)?'
            r'([a-zA-Z][a-zA-Z0-9-]*(?:\.[a-zA-Z][a-zA-Z0-9-]*)*\.[a-zA-Z]{2,})'
            r'(?:/[^\s]*)?',
            re.IGNORECASE
        )
        
        def replace_url(match):
            domain = match.group(1)
            # Replace all dots with " dot "
            domain = domain.replace('.', ' dot ')
            return domain
        
        return url_pattern.sub(replace_url, text)
    
    def _expand_emails(self, text: str) -> str:
        """Expand email addresses (user@host.tld → "user at host dot tld")"""
        email_pattern = re.compile(r'\b([a-zA-Z0-9._+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b')
        
        def replace_email(match):
            user = match.group(1)
            domain = match.group(2).replace('.', ' dot ')
            # Surrounding spaces prevent word concatenation with neighbours
            return f" {user} at {domain} "
        
        text = email_pattern.sub(replace_email, text)
        # Clean up potential double spaces
        text = re.sub(r' +', ' ', text)
        return text
    
    def _expand_social(self, text: str) -> str:
        """Expand social handles and hashtags"""
        # @ symbol expansion (but NOT if followed by time pattern)
        # Pattern: @ before time like "@ 11:59 PM" → "at 11:59 PM"
        text = re.sub(r'@\s+(\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?)', r'at \1', text)
        
        # @mentions (like @username)
        text = re.sub(r'@([a-zA-Z0-9_]+)', r'at \1', text)
        
        # #hashtags - split camelCase
        def replace_hashtag(match):
            tag = match.group(1)
            # Split on capitals for readability
            tag = re.sub(r'([a-z])([A-Z])', r'\1 \2', tag)
            return f"hashtag {tag.lower()}"
        
        text = re.sub(r'#([a-zA-Z0-9]+)', replace_hashtag, text)
        
        return text
    
    def _expand_currency(self, text: str) -> str:
        """Expand currency amounts (₹1,234.56 / $50 → words + currency name)"""
        # Pattern: currency symbol + number
        for symbol, name in self.CURRENCY_MAP.items():
            # Match: ₹1,234.56 or $50
            pattern = re.compile(rf'{re.escape(symbol)}\s*(\d[\d,]*(?:\.\d+)?)')
            
            def replace_currency(match):
                amount = match.group(1).replace(',', '')
                amount_words = self._number_to_words(amount)
                return f"{amount_words} {name}"
            
            text = pattern.sub(replace_currency, text)
        
        return text
    
    def _expand_math_symbols(self, text: str) -> str:
        """Expand mathematical symbols to words"""
        # Order matters! Handle in priority:
        # 1. Negative numbers (at start or after space): -5 → minus 5
        # 2. Subtraction with spaces: 5 - 3 → 5 minus 3
        # 3. Ranges handled separately (10-12 with no spaces)
        
        # Negative numbers: -5, -100 (after space or at start)
        text = re.sub(r'(^|\s)([-−–])(\d+)', r'\1minus \3', text)
        
        # Subtraction with spaces: 5 - 3
        text = re.sub(r'(\d+)\s+([-−–])\s+(\d+)', r'\1 minus \3', text)
        
        # Other math symbols
        for symbol, word in self.MATH_SYMBOLS.items():
            if symbol in ['-', '−', '–']:
                continue  # Already handled above
            # Only replace when surrounded by spaces
            pattern = re.compile(rf'(\s|^){re.escape(symbol)}(\s|$)')
            text = pattern.sub(rf'\1{word}\2', text)
        
        # Percentage
        text = re.sub(r'(\d+(?:\.\d+)?)\s*%', lambda m: f"{self._number_to_words(m.group(1))} percent", text)
        
        return text
    
    def _expand_numbers(self, text: str) -> str:
        """
        Step 5: Expand numbers to English words.
        
        Per user request: expand to English, NOT localized to language.
        Order matters: math before ranges, ordinals before units.
        """
        # Expand math symbols FIRST (to handle minus: 5 - 3 → 5 minus 3)
        text = self._expand_math_symbols(text)
        
        # Expand ordinals (1st, 2nd, etc.)
        text = self._expand_ordinals(text)
        
        # Expand units (5km, 32°C, etc.)
        text = self._expand_units(text)
        
        # Expand ranges LAST (10-12, but NOT 5 - 3 which is already "5 minus 3")
        text = self._expand_ranges(text)
        
        # Expand general numbers
        text = self._expand_general_numbers(text)
        
        return text
    
    def _expand_ordinals(self, text: str) -> str:
        """Expand ordinal numbers (1st → first, 121st → one hundred twenty first)"""
        ordinal_words = {
            1: 'first', 2: 'second', 3: 'third', 4: 'fourth', 5: 'fifth',
            6: 'sixth', 7: 'seventh', 8: 'eighth', 9: 'ninth', 10: 'tenth',
            11: 'eleventh', 12: 'twelfth', 13: 'thirteenth', 14: 'fourteenth', 15: 'fifteenth',
            16: 'sixteenth', 17: 'seventeenth', 18: 'eighteenth', 19: 'nineteenth', 20: 'twentieth',
            30: 'thirtieth', 40: 'fortieth', 50: 'fiftieth', 60: 'sixtieth',
            70: 'seventieth', 80: 'eightieth', 90: 'ninetieth'
        }
        
        def to_ordinal(num: int) -> str:
            # Direct lookup covers 1-20 and every tens multiple below 100
            if num in ordinal_words:
                return ordinal_words[num]
            
            # 21-99 (not a tens multiple): "twenty" + "first"
            if num < 100:
                return f"{self.TENS[num // 10]} {ordinal_words[num % 10]}"
            
            # >= 100: speak the leading part as cardinal, only the trailing
            # 1-99 as ordinal (FIX: previously produced "twenty oneth")
            remainder = num % 100
            if remainder:
                return f"{self._integer_to_words(num - remainder)} {to_ordinal(remainder)}"
            
            # Ends in 00: "one hundred" + "th" → "one hundredth", etc.
            return self._number_to_words(str(num)) + 'th'
        
        def replace_ordinal(match):
            return to_ordinal(int(match.group(1)))
        
        return self.ORDINAL_PATTERN.sub(replace_ordinal, text)
    
    def _expand_units(self, text: str) -> str:
        """Expand numbers with units (5km → five kilometers)"""
        unit_expansions = {
            'km': 'kilometers',
            'm': 'meters',
            'cm': 'centimeters',
            'mm': 'millimeters',
            'kg': 'kilograms',
            'g': 'grams',
            'mg': 'milligrams',
            'ml': 'milliliters',
            'l': 'liters',
            '°c': 'degrees celsius',
            '°f': 'degrees fahrenheit',
            'km/h': 'kilometers per hour',
            'mph': 'miles per hour',
            'mb': 'megabytes',
            'gb': 'gigabytes',
            'tb': 'terabytes',
        }
        
        def replace_unit(match):
            number = match.group(1)
            unit = match.group(2).lower()
            
            number_words = self._number_to_words(number)
            unit_word = unit_expansions.get(unit, unit)
            
            return f"{number_words} {unit_word}"
        
        return self.UNIT_PATTERN.sub(replace_unit, text)
    
    def _expand_ranges(self, text: str) -> str:
        """
        Expand number ranges (10-12 → ten to twelve)
        
        NOTE: Math symbols (minus) are handled separately in _expand_math_symbols
        This only handles actual ranges (no spaces around hyphen, or clear range context)
        """
        # Match patterns like: 10-12 (no spaces) or with percentage context
        # But NOT dates like 2025-11-13 (avoid 4-digit patterns)
        # AND not "5 - 3" which should be "5 minus 3" (handled in math symbols)
        
        # Only match when NO spaces around hyphen (tight binding = range not math)
        range_pattern = re.compile(r'(?<!\d)(\d{1,3})[-–](\d{1,3})(?!\d)')
        
        def replace_range(match):
            start = self._number_to_words(match.group(1))
            end = self._number_to_words(match.group(2))
            return f"{start} to {end}"
        
        return range_pattern.sub(replace_range, text)
    
    def _expand_general_numbers(self, text: str) -> str:
        """Expand standalone numbers"""
        # Remove thousands separators so comma-grouped numbers stay intact (e.g., 20,000 → 20000)
        text = re.sub(r'(?<=\d),(?=\d)', '', text)
        
        # Match integers and decimals, but not in already-expanded contexts
        number_pattern = re.compile(r'\b(\d+(?:\.\d+)?)\b')
        
        def replace_number(match):
            return self._number_to_words(match.group(1))
        
        return number_pattern.sub(replace_number, text)
    
    def _number_to_words(self, number_str: str) -> str:
        """
        Convert number string to English words.
        
        Handles:
        - Integers: 123 → "one hundred twenty three"
        - Decimals: 3.14 → "three point one four"
        - Negatives: -5 → "minus five"
        """
        number_str = number_str.strip()
        
        # Handle negative
        if number_str.startswith('-'):
            return f"minus {self._number_to_words(number_str[1:])}"
        
        # Handle decimal (digits after the point are read one by one)
        if '.' in number_str:
            integer_part, decimal_part = number_str.split('.')
            integer_words = self._integer_to_words(int(integer_part)) if integer_part else 'zero'
            decimal_words = ' '.join(self._integer_to_words(int(d)) for d in decimal_part)
            return f"{integer_words} point {decimal_words}"
        
        # Handle integer
        return self._integer_to_words(int(number_str))
    
    def _integer_to_words(self, n: int) -> str:
        """Convert integer to English words"""
        if n == 0:
            return 'zero'
        
        if n < 0:
            return f"minus {self._integer_to_words(-n)}"
        
        if n < 20:
            return self.ONES[n]
        
        if n < 100:
            tens = n // 10
            ones = n % 10
            if ones == 0:
                return self.TENS[tens]
            return f"{self.TENS[tens]} {self.ONES[ones]}"
        
        if n < 1000:
            hundreds = n // 100
            remainder = n % 100
            if remainder == 0:
                return f"{self.ONES[hundreds]} hundred"
            return f"{self.ONES[hundreds]} hundred {self._integer_to_words(remainder)}"
        
        if n < 1000000:
            thousands = n // 1000
            remainder = n % 1000
            if remainder == 0:
                return f"{self._integer_to_words(thousands)} thousand"
            return f"{self._integer_to_words(thousands)} thousand {self._integer_to_words(remainder)}"
        
        if n < 1000000000:
            millions = n // 1000000
            remainder = n % 1000000
            if remainder == 0:
                return f"{self._integer_to_words(millions)} million"
            return f"{self._integer_to_words(millions)} million {self._integer_to_words(remainder)}"
        
        # For very large numbers, fall back to digit-by-digit
        return ' '.join(self.ONES[int(d)] for d in str(n))
    
    def _expand_dates_times(self, text: str) -> str:
        """
        Step 6: Expand dates and times.
        
        Handles:
        - ISO dates: 2025-11-13
        - Numeric dates: 13/11/2025, 11/13/2025
        - Times: 3:45 PM, 14:30
        - Bare hour + meridiem: 5 PM, 11 a.m.
        """
        # ISO dates (YYYY-MM-DD)
        iso_pattern = re.compile(r'\b(\d{4})-(\d{2})-(\d{2})\b')
        
        def replace_iso_date(match):
            year = match.group(1)
            month = int(match.group(2))
            day = int(match.group(3))
            
            month_name = self.MONTHS[month - 1] if 1 <= month <= 12 else str(month)
            day_word = self._integer_to_words(day)
            year_word = self._year_to_words(int(year))
            
            return f"{day_word} {month_name} {year_word}"
        
        text = iso_pattern.sub(replace_iso_date, text)
        
        # Numeric dates (DD/MM/YYYY or MM/DD/YYYY) - assume day/month/year
        date_pattern = re.compile(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b')
        
        def replace_date(match):
            first = int(match.group(1))
            second = int(match.group(2))
            year = int(match.group(3))
            
            # Heuristic: if first > 12, it's day/month, else assume day/month by default
            if first > 12:
                day, month = first, second
            else:
                day, month = first, second  # Default to day/month
            
            if 1 <= month <= 12:
                month_name = self.MONTHS[month - 1]
                day_word = self._integer_to_words(day)
                year_word = self._year_to_words(year)
                return f"{day_word} {month_name} {year_word}"
            
            return match.group(0)  # Keep as-is if invalid
        
        text = date_pattern.sub(replace_date, text)
        
        # Times (14:30, 3:45 PM)
        time_pattern = re.compile(r'\b(\d{1,2}):(\d{2})\s*(am|pm|AM|PM)?\b')
        
        def replace_time(match):
            hour = int(match.group(1))
            minute = int(match.group(2))
            period = match.group(3)
            
            hour_word = self._integer_to_words(hour)
            minute_word = self._integer_to_words(minute)
            
            result = f"{hour_word} {minute_word}"
            
            if period:
                # Emit the TTS-friendly spelled-out form directly
                result += ' a m' if period.lower().startswith('a') else ' p m'
            
            return result
        
        text = time_pattern.sub(replace_time, text)
        
        # Bare hour + meridiem ("5 PM", "11 a.m.").  Handled HERE, where the
        # adjacent digit disambiguates, so that the plain English word "am"
        # ("I am happy") is never touched by abbreviation expansion.
        # Dotted alternatives come first so "p.m." is not half-matched as "pm".
        bare_meridiem = re.compile(r'\b(\d{1,2})\s*(a\.m\.|p\.m\.|am|pm)(?!\w)', re.IGNORECASE)
        
        def replace_bare_meridiem(match):
            hour_word = self._integer_to_words(int(match.group(1)))
            meridiem = 'a m' if match.group(2).lower().startswith('a') else 'p m'
            return f"{hour_word} {meridiem}"
        
        text = bare_meridiem.sub(replace_bare_meridiem, text)
        
        return text
    
    def _year_to_words(self, year: int) -> str:
        """Convert year to words (2025 → two thousand twenty five)"""
        if year < 2000:
            # 1999 → nineteen ninety nine
            if year < 100:
                return self._integer_to_words(year)
            century = year // 100
            remainder = year % 100
            if remainder == 0:
                return f"{self._integer_to_words(century)} hundred"
            if remainder < 10:
                # 1905 → "nineteen oh five" (conventional year reading)
                return f"{self._integer_to_words(century)} oh {self._integer_to_words(remainder)}"
            return f"{self._integer_to_words(century)} {self._integer_to_words(remainder)}"
        else:
            # 2025 → two thousand twenty five
            return self._integer_to_words(year)
    
    def _expand_phones(self, text: str) -> str:
        """
        Step 7: Expand phone numbers to digit groups.
        
        Handles:
        - US format: (415) 555-2671
        - International: +91 98765 43210
        """
        # US format
        us_phone = re.compile(r'\((\d{3})\)\s*(\d{3})-(\d{4})')
        
        def replace_us_phone(match):
            area = ' '.join(self._integer_to_words(int(d)) for d in match.group(1))
            prefix = ' '.join(self._integer_to_words(int(d)) for d in match.group(2))
            line = ' '.join(self._integer_to_words(int(d)) for d in match.group(3))
            return f"{area}, {prefix}, {line}"
        
        text = us_phone.sub(replace_us_phone, text)
        
        # International format (+XX XXXXX XXXXX)
        intl_phone = re.compile(r'\+(\d{1,3})\s+(\d+)\s+(\d+)')
        
        def replace_intl_phone(match):
            code = ' '.join(self._integer_to_words(int(d)) for d in match.group(1))
            part1 = ' '.join(self._integer_to_words(int(d)) for d in match.group(2))
            part2 = ' '.join(self._integer_to_words(int(d)) for d in match.group(3))
            return f"plus {code}, {part1}, {part2}"
        
        text = intl_phone.sub(replace_intl_phone, text)
        
        return text
    
    def _expand_abbreviations(self, text: str) -> str:
        """Step 8: Expand known abbreviations"""
        # MUST happen BEFORE cleanup_symbols to avoid dots being removed.
        # FIX: a trailing \b after a dot-terminated abbreviation can never
        # match before a space ('\b' needs a word char on one side), so
        # "Mr. Smith" was previously never expanded.  Use (?!\w) instead:
        # match as long as the abbreviation is not glued to more word chars.
        for abbr, expansion in self.ABBREVIATIONS.items():
            pattern = re.compile(r'\b' + re.escape(abbr) + r'(?!\w)', re.IGNORECASE)
            text = pattern.sub(expansion, text)
        
        return text
    
    def _cleanup_symbols(self, text: str, language: Language) -> str:
        """
        Step 9: Symbol & punctuation cleanup (language-aware).
        
        Keep only allowed punctuation for each language:
        - EN: , . ?
        - HI: , | ?
        - TE: , ?
        
        CRITICAL: Must preserve Unicode word integrity for Hindi/Telugu.
        DO NOT iterate character-by-character as it breaks combining marks.
        """
        # Define allowed punctuation per language
        if language == Language.HINDI:
            allowed_punct = {',', '|', '?'}
            # Replace sentence-ending punctuation with |
            text = re.sub(r'[.!;:](\s|$)', r'|\1', text)
        elif language == Language.TELUGU:
            allowed_punct = {',', '?'}
            # Replace sentence-ending punctuation with ,
            text = re.sub(r'[.!;:](\s|$)', r',\1', text)
        else:  # English or unknown
            allowed_punct = {',', '.', '?'}
            # Replace ! with .
            text = re.sub(r'!', '.', text)
        
        # For Hindi/Telugu: preserve word structure, only remove standalone symbols
        # Use regex to remove unwanted punctuation while keeping words intact
        if language in [Language.HINDI, Language.TELUGU]:
            # CRITICAL: Indic scripts use combining characters that must stay together
            # Define comprehensive Indic character ranges including combining marks
            # Hindi (Devanagari): U+0900-U+097F
            # Telugu: U+0C00-U+0C7F
            # Also keep Latin alphanumeric for mixed text
            
            # Build list of safe characters to keep
            # We'll only remove specific ASCII punctuation
            allowed_chars = allowed_punct
            
            # Remove specific ASCII punctuation only (not all non-alphanumeric)
            # This preserves Indic combining characters
            ascii_punct_to_remove = '!"#$%&\'()*+/:;<=>@[\\]^_`{|}~'
            
            for punct in ascii_punct_to_remove:
                if punct not in allowed_punct:
                    # Replace with space, but avoid creating multiple spaces
                    text = text.replace(punct, ' ')
        else:
            # For English: character-by-character is safe
            # BUT: also preserve Indic characters even if detected as English (mixed text)
            cleaned_chars = []
            for char in text:
                # Keep alphanumeric, spaces, allowed punctuation, OR Indic characters
                is_indic = '\u0900' <= char <= '\u097F' or '\u0C00' <= char <= '\u0C7F'
                
                if char.isalnum() or char.isspace() or char in allowed_punct or is_indic:
                    cleaned_chars.append(char)
                elif not cleaned_chars or cleaned_chars[-1] != ' ':
                    # Replace removed symbols with space
                    cleaned_chars.append(' ')
            text = ''.join(cleaned_chars)
        
        return text
    
    def _normalize_whitespace(self, text: str) -> str:
        """Step 10: Whitespace normalization"""
        # Collapse multiple spaces
        text = re.sub(r' +', ' ', text)
        
        # Collapse multiple newlines
        text = re.sub(r'\n+', '\n', text)
        
        # Collapse spaces around newlines
        text = re.sub(r' *\n *', '\n', text)
        
        # Remove space before punctuation
        text = re.sub(r'\s+([,.|?])', r'\1', text)
        
        # Ensure space after punctuation (but not at end of line)
        text = re.sub(r'([,.|?])([^\s\n])', r'\1 \2', text)
        
        return text
    
    def _final_checks(self, text: str, language: Language) -> str:
        """Step 11: Final validation and cleanup"""
        # Remove empty lines
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        text = '\n'.join(lines)
        
        # Collapse repeated punctuation
        text = re.sub(r'\?+', '?', text)
        text = re.sub(r',+', ',', text)
        text = re.sub(r'\.+', '.', text)
        text = re.sub(r'\|+', '|', text)
        
        # Remove punctuation-only sentences (but keep single punctuation marks that end sentences)
        # This prevents removing trailing ? or . that are legitimate sentence endings
        # Only remove if there are MULTIPLE punctuation marks without content
        text = re.sub(r'([.,|?])\s*([.,|?])+', r'\1', text)  # Collapse multiple punctuation with optional spaces
        
        return text


# Module-level singleton so callers can simply `from <module> import normalize_text`
_normalizer = TextNormalizer()


def normalize_text(text: str, verbose: bool = False) -> str:
    """
    Normalize *text* for TTS via the shared TextNormalizer instance.

    Args:
        text: Raw input text.
        verbose: Debug flag, forwarded to TextNormalizer.normalize.

    Returns:
        Normalized text ready for TTS synthesis.
    """
    return _normalizer.normalize(text, verbose=verbose)

