o
    i                     @   st   d Z ddlZddlZddlmZmZmZ ddlmZ G dd deZ	G dd dZ
e
 Zdd
ededefddZdS )u  
Text Normalization for TTS Pipeline

Production-ready text normalization that prepares text for TTS models by:
- Expanding numbers, dates, URLs, symbols to words
- Language-aware punctuation cleanup (EN, HI, TE)
- Preserving meaning while removing distracting elements
- Deterministic and idempotent transformations

Design principles:
- Deterministic: same input → same output
- Meaning-preserving: expand before removing
- Practical: cover common cases, don't over-complicate
- Language-aware: support English, Hindi, Telugu
    N)DictTupleOptional)Enumc                   @   s    e Zd ZdZdZdZdZdZdS )Languagez%Supported languages for normalizationenhiteunknownN)__name__
__module____qualname____doc__ENGLISHHINDITELUGUUNKNOWN r   r   A/home/ubuntu/veenaModal/veena3modal/processing/text_normalizer.pyr      s    r   c                   @   s  e Zd ZdZedZedZdddddd	d
ZdddddddddddddZ	edej
Zedej
Zg dZg dZg dZg dZi dddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d:d<d=d>d=iZd?d@ ZddBedCedDefdEdFZdBedDeeeeef f fdGdHZdBedIeeef dDefdJdKZdBedDefdLdMZdBedDefdNdOZdBedDefdPdQZdBedDefdRdSZdBedDefdTdUZdBedDefdVdWZ dBedDefdXdYZ!dBedDefdZd[Z"dBedDefd\d]Z#dBedDefd^d_Z$dBedDefd`daZ%dBedDefdbdcZ&dBedDefdddeZ'dBedDefdfdgZ(dBedDefdhdiZ)djedDefdkdlZ*dme+dDefdndoZ,dBedDefdpdqZ-dre+dDefdsdtZ.dBedDefdudvZ/dBedDefdwdxZ0dBedyedDefdzd{Z1dBedDefd|d}Z2dBedyedDefd~dZ3dS )TextNormalizera  
    Text normalizer for TTS pipeline.
    
    Implements a deterministic, order-dependent normalization pipeline:
    1. Input sanitation
    2. Language detection
    3. Unicode normalization
    4. Entity expansion (URLs, emails, currency, etc.)
    5. Number expansion (to English words)
    6. Date/time expansion
    7. Symbol cleanup (language-aware)
    8. Whitespace normalization
    9. Final validation
    z[\u0900-\u097F]z[\u0C00-\u0C7F]rupeesdollarseurospoundsyenwon)u   ₹$u   €   £   ¥u   ₩plusminustimesz
divided byequalsz	less thanzgreater thanzless than or equal tozgreater than or equal toznot equal toapproximately)+   −   –   ×   ÷=<>u   ≤u   ≥u   ≠u   ≈uI   \b(\d+(?:\.\d+)?)\s*(km|m|cm|mm|kg|g|mg|ml|l|°C|°F|km/h|mph|mb|gb|tb)\bz\b(\d+)(st|nd|rd|th)\b)zeroonetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteenfifteensixteen	seventeeneighteennineteen)
 r@   twentythirtyfortyfiftysixtyseventyeightyninety)januaryfebruarymarchaprilmayjunejulyaugust	septemberoctobernovemberdecember)
z[angry]z	[curious]z	[excited][giggle][laughs harder][laughs]	[screams][sighs][sings]
[whispers]zmr.misterzmrs.missuszms.misszdr.doctorzprof.	professorzst.saintzvs.versuszetc.etceterazi.e.zthat isze.g.zfor exampleakazalso known asfyizfor your informationdiyzdo it yourselfokokayza.m.za mamzp.m.zp mpmc                 C   s   d| _ tdtj| _dS )z+Initialize normalizer with default settingsFz](\[(?:angry|curious|excited|giggles?|laughs?(?: harder)?|screams?|sighs?|sings?|whispers?)\])N)verboserecompile
IGNORECASE_emotion_tag_patternselfr   r   r   __init__   s
   
zTextNormalizer.__init__Ftextrk   returnc                 C   s   |r|  sdS || _| |\}}| |}| |}| |}| |}| |}| |}| 	|}| 
|}| ||}| |}| ||}| ||}|  S )z
        Normalize text for TTS.
        
        Args:
            text: Input text to normalize
            verbose: If True, return detailed transformation info
            
        Returns:
            Normalized text ready for TTS
        r@   )striprk   _protect_emotion_tags_sanitize_input_detect_language_normalize_unicode_expand_entities_expand_dates_times_expand_phones_expand_numbers_expand_abbreviations_cleanup_symbols_normalize_whitespace_final_checks_restore_emotion_tags)rq   rs   rk   protected_emotionslanguager   r   r   	normalize   s"   








zTextNormalizer.normalizec                    sB   ddddddddi d	  fd
d}| j ||}|fS )u  
        Protect emotion tags from normalization.
        
        Replaces emotion tags with unique placeholders, stores mapping for later restoration.
        CRITICAL: Must preserve exact emotion tags for TTS model.
        
        Normalizes variants to match EXACT INDIC_EMOTION_TAGS format:
        - [laugh] → [laughs]
        - [giggles] → [giggle]  (NOTE: model uses singular!)
        - [whisper] → [whispers]
        etc.
        
        Placeholder format: XEMOTIONX0X (no special chars that get removed)
        rW   rU   rX   rY   rZ   r[   rV   )z[laugh]z	[giggles]z[scream]z[sigh]z[sing]z	[whisper]z[laugh harder]r   c                    s:   |  d }||}d  d}||<  d7  |S )N   	XEMOTIONXX)grouplowerget)matchtagnormalized_tagplaceholdercounteremotion_normalizations	protectedr   r   replace_emotion   s   z=TextNormalizer._protect_emotion_tags.<locals>.replace_emotion)ro   sub)rq   rs   r   r   r   r   rv      s   
z$TextNormalizer._protect_emotion_tagsr   c                 C   s"   |  D ]
\}}|||}q|S )zRestore protected emotion tags)itemsreplace)rq   rs   r   r   original_tagr   r   r   r      s   z$TextNormalizer._restore_emotion_tagsc                 C   s:   t |tr|jddd}|d}|dddd}|S )zStep 1: Input sanitationzutf-8ignore)errorsu   ﻿z

)
isinstancebytesdecodelstripr   rq   rs   r   r   r   rw      s
   

zTextNormalizer._sanitize_inputc                 C   sR   t | j|}t | j|}||kr|dkrtjS ||kr&|dkr&tjS tjS )zw
        Step 2: Simple language detection.
        
        Returns primary language based on character sets.
           )lenDEVANAGARI_PATTERNfindallTELUGU_PATTERNr   r   r   r   )rq   rs   devanagari_counttelugu_countr   r   r   rx      s   zTextNormalizer._detect_languagec                 C   sN   t d|}tdd|}ddddddd}| D ]
\}}|||}q|S )	z&Step 3: Unicode normalization & safetyNFKCz"[\u200b-\u200f\u202a-\u202e\ufeff]r@   "'-z...)r   r   z: "'", u   —r&   u   …)unicodedatar   rl   r   r   r   )rq   rs   replacementsfancyplainr   r   r   ry     s   z!TextNormalizer._normalize_unicodec                 C   6   |  |}| |}| |}| |}| |}|S )z
        Step 4: Expand entities (URLs, emails, currency, etc.)
        
        Must happen BEFORE symbol removal to preserve meaning.
        Order matters: emails before URLs to avoid conflict.
        )_remove_emojis_expand_emails_expand_urls_expand_social_expand_currencyr   r   r   r   rz     s   




zTextNormalizer._expand_entitiesc                 C   s:   t jdt jd}|d|}t dt j}|d|}|S )zRemove all emojis and emoticonsuH   [😀-🙏🌀-🗿🚀-🛿🇠-🇿✂-➰Ⓜ-🉑🤀-🧿🩰-🫿]+)flagsr@   z[:;]-?[()DPO]|<3|XD)rl   rm   UNICODEr   rn   )rq   rs   emoji_patternemoticon_patternr   r   r   r   8  s   
zTextNormalizer._remove_emojisc                 C   s"   t dt j}dd }|||S )zExpand URLs to readable formzY\b(?:https?://)?(?:www\.)?([a-zA-Z][a-zA-Z0-9-]*(?:\.[a-zA-Z][a-zA-Z0-9-]*)+)(?:/[^\s]*)?c                 S   s   |  d}|dd}|S )Nr   . dot r   r   )r   domainr   r   r   replace_urlY  s   
z0TextNormalizer._expand_urls.<locals>.replace_url)rl   rm   rn   r   )rq   rs   url_patternr   r   r   r   r   P  s   zTextNormalizer._expand_urlsc                 C   s0   t d}dd }|||}t dd|}|S )zExpand email addressesz5\b([a-zA-Z0-9._+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\bc                 S   s.   |  d}|  ddd}d| d| dS )Nr      r   r    z at r   )r   userr   r   r   r   replace_emaile  s   
z4TextNormalizer._expand_emails.<locals>.replace_email +r   rl   rm   r   )rq   rs   email_patternr   r   r   r   r   a  s
   
zTextNormalizer._expand_emailsc                 C   s6   t dd|}t dd|}dd }t d||}|S )z"Expand social handles and hashtagsz&@\s+(\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?)zat \1z@([a-zA-Z0-9_]+)c                 S   s&   |  d}tdd|}d|  S )Nr   z([a-z])([A-Z])\1 \2zhashtag )r   rl   r   r   )r   r   r   r   r   replace_hashtagz  s   
z6TextNormalizer._expand_social.<locals>.replace_hashtagz#([a-zA-Z0-9]+)rl   r   )rq   rs   r   r   r   r   r   p  s
   zTextNormalizer._expand_socialc                    sH   j  D ]\} tt| d} fdd}|||}q|S )zExpand currency amountsz\s*(\d[\d,]*(?:\.\d+)?)c                    s*   |  ddd}|}| d  S )Nr   ,r@   r   )r   r   _number_to_words)r   amountamount_wordsnamerq   r   r   replace_currency  s   
z9TextNormalizer._expand_currency.<locals>.replace_currency)CURRENCY_MAPr   rl   rm   escaper   )rq   rs   symbolpatternr   r   r   r   r     s
   zTextNormalizer._expand_currencyc                    s   t dd|}t dd|} j D ]\}}|dv rqt dt | d}|d| d	|}qt d
 fdd|}|S )z$Expand mathematical symbols to wordsu   (^|\s)([-−–])(\d+)z
\1minus \3u   (\d+)\s+([-−–])\s+(\d+)z\1 minus \3)r   r%   r&   z(\s|^)z(\s|$)\1z\2z(\d+(?:\.\d+)?)\s*%c                    s     | d dS )Nr   z percentr   r   )mrp   r   r   <lambda>  s    z5TextNormalizer._expand_math_symbols.<locals>.<lambda>)rl   r   MATH_SYMBOLSr   rm   r   )rq   rs   r   wordr   r   rp   r   _expand_math_symbols  s   z#TextNormalizer._expand_math_symbolsc                 C   r   )z
        Step 5: Expand numbers to English words.
        
        Per user request: expand to English, NOT localized to language.
        Order matters: math before ranges, ordinals before units.
        )r   _expand_ordinals_expand_units_expand_ranges_expand_general_numbersr   r   r   r   r}     s   




zTextNormalizer._expand_numbersc                    s   i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-
  fd.d/}j ||S )0u&   Expand ordinal numbers (1st → first)r   firstr   second   third   fourthr   fifth   sixth   seventh   eighth	   ninth
   tenth   eleventh   twelfth   
thirteenth   
fourteenth   	fifteenth   	sixteenth   seventeenth
eighteenth
nineteenth	twentieth	thirtiethfortiethfiftiethsixtieth
seventieth	eightieth	ninetieth)
            (   2   <   F   P   Z   c                    s   t | d}| v r | S |dk rA|d d }|d }|dkr* || dS j|d  } |j| }| d| S t|d S )Nr   d   r   r   thr   )intr   r   TENSONESr   str)r   numtensones	tens_word	ones_wordordinal_wordsrq   r   r   replace_ordinal  s   z8TextNormalizer._expand_ordinals.<locals>.replace_ordinal)ORDINAL_PATTERNr   )rq   rs   r!  r   r  r   r     sR   	zTextNormalizer._expand_ordinalsc                    s   i ddddddddd	d
dddddddddddddddddddddd  fd!d"} j ||S )#zExpand numbers with unitskm
kilometersr   meterscmcentimetersmmmillimeterskg	kilogramsggramsmg
milligramsmlmillilitersllitersu   °czdegrees celsiusu   °fzdegrees fahrenheitzkm/hzkilometers per hourmphzmiles per hourmb	megabytesgb	gigabytestb	terabytesc                    s<   |  d}|  d } |}||}| d| S )Nr   r   r   )r   r   r   r   )r   numberunitnumber_words	unit_wordrq   unit_expansionsr   r   replace_unit  s
   

z2TextNormalizer._expand_units.<locals>.replace_unit)UNIT_PATTERNr   )rq   rs   rA  r   r?  r   r     sF   	
	zTextNormalizer._expand_unitsc                    s"   t d} fdd}|||S )u   
        Expand number ranges (10-12 → ten to twelve)
        
        NOTE: Math symbols (minus) are handled separately in _expand_math_symbols
        This only handles actual ranges (no spaces around hyphen, or clear range context)
        u%   (?<!\d)(\d{1,3})[-–](\d{1,3})(?!\d)c                    s.     | d}  | d}| d| S )Nr   r   z to r   )r   startendrp   r   r   replace_range  s   z4TextNormalizer._expand_ranges.<locals>.replace_ranger   )rq   rs   range_patternrE  r   rp   r   r     s   
zTextNormalizer._expand_rangesc                    s0   t dd|}t d} fdd}|||S )zExpand standalone numbersz(?<=\d),(?=\d)r@   z\b(\d+(?:\.\d+)?)\bc                    s     | dS )Nr   r   )r   rp   r   r   replace_number%  s   z>TextNormalizer._expand_general_numbers.<locals>.replace_number)rl   r   rm   )rq   rs   number_patternrG  r   rp   r   r     s   
z&TextNormalizer._expand_general_numbers
number_strc                    s   |  }|drd |dd  S d|v r>|d\}}|r) t|nd}d fdd	|D }| d
| S  t|S )u   
        Convert number string to English words.
        
        Handles:
        - Integers: 123 → "one hundred twenty three"
        - Decimals: 3.14 → "three point one four"
        - Negatives: -5 → "minus five"
        r   minus r   Nr   r,   r   c                 3       | ]
}  t|V  qd S N_integer_to_wordsr  .0drp   r   r   	<genexpr>=      z2TextNormalizer._number_to_words.<locals>.<genexpr>z point )ru   
startswithr   splitrN  r  join)rq   rI  integer_partdecimal_partinteger_wordsdecimal_wordsr   rp   r   r   *  s   	
zTextNormalizer._number_to_wordsnc                    sv  |dkrdS |dk rd  |  S |dk r j| S |dk r>|d }|d }|dkr1 j| S  j|  d j|  S |dk rc|d }|d }|dkrV j|  d	S  j|  d
  | S |dk r|d }|d }|dkr{  | dS   | d  | S |dk r|d }|d }|dkr  | dS   | d  | S d fddt|D S )z Convert integer to English wordsr   r,   rJ  r  r  r   r   i   hundredz	 hundred i@B z	 thousandz
 thousand i ʚ;z millionz	 million c                 3   s    | ]
} j t| V  qd S rL  )r  r  rO  rp   r   r   rR  k  rS  z3TextNormalizer._integer_to_words.<locals>.<genexpr>)rN  r  r  rV  r  )rq   r[  r  r  hundreds	remainder	thousandsmillionsr   rp   r   rN  C  s>   

z TextNormalizer._integer_to_wordsc                    sj   t d} fdd}|||}t d} fdd}|||}t d} fdd	}|||}|S )
z
        Step 6: Expand dates and times.
        
        Handles:
        - ISO dates: 2025-11-13
        - Numeric dates: 13/11/2025, 11/13/2025
        - Times: 3:45 PM, 14:30
        z\b(\d{4})-(\d{2})-(\d{2})\bc                    s   |  d}t|  d}t|  d}d|  krdkr&n n j|d  nt|} |} t|}| d| d| S )Nr   r   r   r   r   )r   r  MONTHSr  rN  _year_to_words)r   yearmonthday
month_nameday_word	year_wordrp   r   r   replace_iso_datey  s   
.
z<TextNormalizer._expand_dates_times.<locals>.replace_iso_datez\b(\d{1,2})/(\d{1,2})/(\d{4})\bc           	         s   t | d}t | d}t | d}|dkr||}}n||}}d|  kr.dkrKn n j|d  } |} |}| d| d| S | dS )Nr   r   r   r   r   r   )r  r   ra  rN  rb  )	r   r   r   rc  re  rd  rf  rg  rh  rp   r   r   replace_date  s   



z8TextNormalizer._expand_dates_times.<locals>.replace_datez&\b(\d{1,2}):(\d{2})\s*(am|pm|AM|PM)?\bc                    sj   t | d}t | d}| d} |} |}| d| }|r3|d| dd 7 }|S )Nr   r   r   r   r   )r  r   rN  r   r   )r   hourminuteperiod	hour_wordminute_wordresultrp   r   r   replace_time  s   


z8TextNormalizer._expand_dates_times.<locals>.replace_timer   )rq   rs   iso_patternri  date_patternrj  time_patternrq  r   rp   r   r{   m  s   



z"TextNormalizer._expand_dates_timesrc  c                 C   sf   |dk r.|dk r|  |S |d }|d }|dkr!|  | dS |  | d|  | S |  |S )u9   Convert year to words (2025 → two thousand twenty five)i  r  r   r\  r   )rN  )rq   rc  centuryr^  r   r   r   rb    s   

zTextNormalizer._year_to_wordsc                    sH   t d} fdd}|||}t d} fdd}|||}|S )z
        Step 7: Expand phone numbers to digit groups.
        
        Handles:
        - US format: (415) 555-2671
        - International: +91 98765 43210
        z\((\d{3})\)\s*(\d{3})-(\d{4})c                    sn   d  fdd| dD }d  fdd| dD }d  fdd| dD }| d	| d	| S )
Nr   c                 3   rK  rL  rM  rO  rp   r   r   rR    rS  zJTextNormalizer._expand_phones.<locals>.replace_us_phone.<locals>.<genexpr>r   c                 3   rK  rL  rM  rO  rp   r   r   rR    rS  r   c                 3   rK  rL  rM  rO  rp   r   r   rR    rS  r   , rV  r   )r   areaprefixlinerp   r   r   replace_us_phone  s   z7TextNormalizer._expand_phones.<locals>.replace_us_phonez\+(\d{1,3})\s+(\d+)\s+(\d+)c                    sp   d  fdd| dD }d  fdd| dD }d  fdd| dD }d	| d
| d
| S )Nr   c                 3   rK  rL  rM  rO  rp   r   r   rR    rS  zLTextNormalizer._expand_phones.<locals>.replace_intl_phone.<locals>.<genexpr>r   c                 3   rK  rL  rM  rO  rp   r   r   rR    rS  r   c                 3   rK  rL  rM  rO  rp   r   r   rR    rS  r   zplus rv  rw  )r   codepart1part2rp   r   r   replace_intl_phone  s   z9TextNormalizer._expand_phones.<locals>.replace_intl_phoner   )rq   rs   us_phoner{  
intl_phoner  r   rp   r   r|     s   
	
zTextNormalizer._expand_phonesc                 C   s@   | j  D ]\}}tdt| d tj}|||}q|S )z"Step 8: Expand known abbreviationsz\b)ABBREVIATIONSr   rl   rm   r   rn   r   )rq   rs   abbr	expansionr   r   r   r   r~     s   z$TextNormalizer._expand_abbreviationsr   c           
      C   s"  |t jkrh d}tdd|}n|t jkr"ddh}tdd|}nh d}tdd	|}|t jt jfv rJ|}d
}|D ]}||vrG||d}q;|S g }|D ];}d|  koYdkn  pgd|  koedkn  }	| sv| sv||v sv|	r||| qN|r|d dkr|d qNd	|}|S )ab  
        Step 9: Symbol & punctuation cleanup (language-aware).
        
        Keep only allowed punctuation for each language:
        - EN: , . ?
        - HI: , | ?
        - TE: , ?
        
        CRITICAL: Must preserve Unicode word integrity for Hindi/Telugu.
        DO NOT iterate character-by-character as it breaks combining marks.
        >   ?|r   z[.!;:](\s|$)z|\1r   r  z,\1>   r   r  r   !r   z!"#$%&'()*+/:;<=>@[\]^_`{|}~r   u   ऀu   ॿu   ఀu   ౿r@   )
r   r   rl   r   r   r   isalnumisspaceappendrV  )
rq   rs   r   allowed_punctallowed_charsascii_punct_to_removepunctcleaned_charscharis_indicr   r   r   r     s4   

	0

zTextNormalizer._cleanup_symbolsc                 C   sJ   t dd|}t dd|}t dd|}t dd|}t dd	|}|S )
z!Step 10: Whitespace normalizationr   r   z\n+r   z *\n *z\s+([,.|?])r   z([,.|?])([^\s\n])r   r   r   r   r   r   r   /  s   z$TextNormalizer._normalize_whitespacec                 C   sh   dd | dD }d|}tdd|}tdd|}tdd	|}td
d|}tdd|}|S )z%Step 11: Final validation and cleanupc                 S   s   g | ]
}|  r|  qS r   )ru   )rP  rz  r   r   r   
<listcomp>E  s    z0TextNormalizer._final_checks.<locals>.<listcomp>r   z\?+r  z,+r   z\.+r   z\|+r  z([.,|?])\s*([.,|?])+r   )rU  rV  rl   r   )rq   rs   r   linesr   r   r   r   B  s   
zTextNormalizer._final_checksNF)4r   r   r   r   rl   rm   r   r   r   r   rn   rB  r"  r  r  ra  EMOTION_TAGSr  rr   r  boolr   r   r   rv   r   rw   r   rx   ry   rz   r   r   r   r   r   r   r}   r   r   r   r   r   r  rN  r{   rb  r|   r~   r   r   r   r   r   r   r   r      s    

	
"%." *G @r   Frs   rk   rt   c                 C   s   t j| |dS )z
    Convenience function for text normalization.
    
    Args:
        text: Input text to normalize
        verbose: If True, return additional debug info
        
    Returns:
        Normalized text ready for TTS
    )rk   )_normalizerr   )rs   rk   r   r   r   normalize_textY  s   r  r  )r   rl   r   typingr   r   r   enumr   r   r   r  r  r  r  r   r   r   r   <module>   s          >