o
    5¤˜i  ã                   @   sj   d Z ddlZddlmZ dadd„ Zdedefdd	„Zdedefd
d„Zdedede	fdd„Z
dd„ ZdS )uL  
Deterministic Romanization Module
==================================

Converts native Indic script text to MMS_FA-compatible Latin romanization
using the uroman library. This replaces Gemini-generated romanization with
a deterministic, reproducible transform.

Why deterministic:
  - Gemini's "romanized as pronounced" is creative and inconsistent across runs
  - uroman produces identical output for identical input, every time
  - MMS_FA was trained with uroman-style romanization, so alignment is optimal

MMS_FA normalization: [a-z' ] only, lowercase, collapsed spaces.
Per: https://docs.pytorch.org/audio/2.8/tutorials/forced_alignment_for_multilingual_data_tutorial.html

Usage:
    from src.romanization import romanize, romanize_for_alignment

    # Basic romanization
    roman = romanize("à°¨à°¾à°•à± à°•à±Šà°¨à±à°¨à°¿ à°¯à°¾à°¡à±à°¸à± à°—à±à°°à±à°¤à±à°‚à°Ÿà°¾à°¯à°¿")
    # -> "naaku konni yaadds gurtumttaayi"

    # MMS_FA-ready (stripped to [a-z' ] only)
    aligned = romanize_for_alignment("à°¨à°¾à°•à± à°•à±Šà°¨à±à°¨à°¿, à°¯à°¾à°¡à±à°¸à±!")
    # -> "naaku konni yaadds"
é    N)ÚOptionalc                  C   s   t du rddl} |  ¡ a t S )z;Lazy load uroman. First call ~3s, subsequent calls instant.Nr   )Ú_uroman_instanceÚuromanÚUroman)r   © r   ú1/home/ubuntu/maya3_transcribe/src/romanization.pyÚ_get_uroman#   s   r   ÚtextÚreturnc                 C   s$   | r|   ¡ sdS tƒ }| |   ¡ ¡S )a6  
    Romanize text from any script to Latin using uroman.

    Deterministic: same input always produces same output.
    Handles Indic scripts (Telugu, Hindi, Tamil, etc.) + passes Latin through.

    Args:
        text: Text in any script (Indic, Latin, mixed)

    Returns:
        Romanized Latin text
    Ú )Ústripr   Úromanize_string)r	   Úur   r   r   Úromanize,   s   r   c                 C   s`   | r|   ¡ sdS t dd| ¡}t dd|¡}t|ƒ}| ¡ }t dd|¡}t dd|¡  ¡ }|S )aì  
    Romanize and normalize for MMS_FA forced alignment.

    MMS_FA expects: lowercase [a-z' ] only.
    Steps:
      1. Strip tags like [laugh], [UNK], [INAUDIBLE] etc.
      2. Romanize via uroman
      3. Lowercase
      4. Strip everything except [a-z' ]
      5. Collapse multiple spaces
      6. Strip leading/trailing whitespace

    Args:
        text: Native script text (may include punctuation, tags)

    Returns:
        Normalized romanized text ready for MMS_FA alignment
    r   z
\[[\w_]+\]u   [à¥¤,.!?;:\'"()\-â€“â€”â€¦]Ú ú[^a-z' ]ú\s+)r   ÚreÚsubr   Úlower)r	   ÚcleanedÚromanr   r   r   Úromanize_for_alignment?   s   r   Únative_textÚgemini_romanizedc                 C   s  t | ƒ}t dd| ¡ ¡}t dd|¡ ¡ }|s|rdS dS |}|}t|ƒ}t|ƒ}|dkr6|dkr4dS dS tt|d ƒƒ}td|d ƒD ]>}	|	gdg|  }
td|d ƒD ]*}||	d  ||d  krfdnd}t|
|d  d || d ||d  | ƒ|
|< qV|
}qEtd|| | ƒS )a  
    Compute Character Error Rate between uroman-derived and Gemini-provided romanization.

    Low CER (<3%) = Gemini's romanization is consistent with deterministic version.
    High CER (>10%) = Gemini hallucinated or used non-standard romanization.

    Useful for flagging romanization drift without blocking transcriptions.

    Args:
        native_text: Native script text (source of truth)
        gemini_romanized: Gemini's romanized output

    Returns:
        CER as float (0.0 = identical, 1.0 = completely different)
    r   r   r   g      ð?g        r   é   )	r   r   r   r   r   ÚlenÚlistÚrangeÚmin)r   r   Úuroman_romanÚgemini_normÚrefÚhypÚnÚmÚprevÚiÚcurrÚjÚcostr   r   r   Úcompute_romanization_ceri   s.    

ýr+   c                   C   s   da dS )zRelease uroman resources.N)r   r   r   r   r   Úcleanupž   s   r,   )Ú__doc__r   Útypingr   r   r   Ústrr   r   Úfloatr+   r,   r   r   r   r   Ú<module>   s    	*ÿþ
ý5