o
    5i\W                     @   s  d Z ddlZddlmZmZ ddlmZmZmZm	Z	 ddgdddd	d
gddddd
gdddddgdddddgdddddgdddddgdddddgdddddgdddddgdddddgddddddgddddZ
edZed Zh d!Zdad"ed#ed$e	e fd%d&Z	'	(dbd)ed"ed#ed$efd*d+Z	'	,dcd)ed"ed-ed$efd.d/ZeG d0d1 d1Zdadad"efd2d3Zd4d5 Zd6ed7ed$efd8d9Zd)ed$efd:d;Zd)ed"ed$efd<d=Z	'	>		,ddd?ed@ed"edAedBed-ed$efdCdDZdEdF Z ded@ed"ed$efdGdHZ!e"dIkrddl#Z#e$e#j%dJk r%e&dK e#'dL e#j%dL dMkrVdN(e#j%dOd Z)e!e)Z*e&dPe*dQ   e*dQ sTe&dRe*dS   dS dS e#j%dL Z+dN(e#j%dOd Z,e&dT ee+e,Z*e   e&dUe*j-.   e&dVe*j/  e&dWe*j0dX e&dYe*j1dZ e&d[e*j2dZ e&d\e*j3dZ e*j4re&d]e*j4dd^   e*j5re&d_ e*j5D ]Z6e&d`e6  qdS dS dS )fa  
Simple Transcription Validator (v4)
====================================

Architecture change: validator now derives its own romanization via uroman
instead of trusting Gemini's creative romanization. This makes validation
deterministic and reproducible.

Four checks:
  Step 0: Structural sanity (no ML, instant) - unicode, tags, length/duration
  Step 1: Character validation - catch garbage/alien characters
  Step 2: Native CTC alignment - verify native script matches audio
  Step 3: Romanized MMS alignment - verify uroman-derived text matches audio

Combined scoring: S = 0.45*N + 0.55*R - 0.10*abs(N-R)
  N = native CTC score, R = romanized MMS score (using uroman, not Gemini)

Stricter thresholds with per-word analysis:
  ACCEPT: avg >= 0.85 AND min_internal_word >= 0.70
  REVIEW: avg in [0.70, 0.85) or boundary issues
  RETRY:  avg in [0.55, 0.70)
  REJECT: avg < 0.55 or structural failure

Usage:
    from src.validators.simple_validator import validate_transcription
    result = validate_transcription("audio.flac", native_text, language="te")
    # result.status: "accept" / "review" / "retry" / "reject"
    # result.combined_score: 0-1
    N)	dataclassfield)ListDictOptionalSetTelugu)i   i  T)namerangesallow_ascii_punctallow_digitsHindi)i 	  i	  MarathiTamil)i  i  Kannada)i  i  	Malayalam)i   i  Bengali)i	  i	  AssameseGujarati)i
  i
  Punjabi)i 
  i
  Odia)i   i  English)A   Z   )a   z   )tehimrtaknmlbnasgupaorenu$    	
.,!?;:'"()-–—।॥01234567894abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ>   [UNK][sigh][cough][laugh][music][noise][breath]	[singing]
[applause][INAUDIBLE][NO_SPEECH]languageallow_englishreturnc                 C   s~   t  }tD ]	}|t| qt| td }|d D ]\}}t||d D ]}|| q&q|r=tD ]	}|t| q3|S )z3Get set of valid Unicode codepoints for a language.r   r
      )setALLOWED_COMMONaddordSCRIPT_RANGESgetrangeENGLISH_CHARS)r4   r5   validclang_configstartendcp rF   @/home/ubuntu/maya3_transcribe/src/validators/simple_validator.pyget_valid_charsy   s   rH   r   Ftextc                    s   | r|   sdg dddS t||}t|td }g }d}d}t| D ]5\}}	t|	 t fdd|d D }
|
rD|d	7 }|d	7 }q$ |vrY||	 |d
 |	tv rY|d	7 }q$t	|dk|dd t	||dkrq|| dS ddS )z>Check if text contains only valid characters for the language.Fr   empty)r@   invalid_charsscript_ratioreasonr   c                 3   ,    | ]\}}|   ko|kn  V  qd S NrF   .0rC   rD   rE   rF   rG   	<genexpr>      * z#check_characters.<locals>.<genexpr>r
   r7   )char	codepointpositionN
   )r@   rK   invalid_countrL   )
striprH   r<   r=   	enumerater;   anyappendr?   len)rI   r4   r5   valid_charsrB   rK   script_counttotal_alphairU   	is_nativerF   rR   rG   check_characters   s2   



rd           duration_secc                    sr  g }| r|   sddgdS |   }|dv rdg dS td| }|D ]}|tvr1|d|  q#|dkr|td	d
 | D }d}t|td }	| D ]}
t|
 t	 fdd
|	d D rb|d7 }qK|| }|dkr||| dk r||d| d|  |dkrt
|  }|t|d }|dkr|d|dd |dk r|dkr|d| d|dd t
|dk|dS )a  
    Fast structural checks before any ML model runs.
    Catches obvious garbage without wasting GPU cycles.

    Checks:
    1. Unicode block: native text must be in target script (no Latin in non-English)
    2. Tag format: only allowed bracketed tags, properly formatted
    3. Length vs duration: reject impossibly dense text (>15 words/sec)
    4. Empty/whitespace-only: instant reject

    Returns: {"pass": bool, "reasons": [...]}
    F
empty_text)passreasons)r3   r2   Tz\[\w+\]zinvalid_tag:r'   c                 s   s    | ]	}|t v rd V  qdS )r7   N)r?   rQ   rA   rF   rF   rG   rS          z*structural_sanity_check.<locals>.<genexpr>r   r   c                 3   rN   rO   rF   rP   rR   rF   rG   rS      rT   r
   r7   皙?ztoo_few_native_chars:/   zimpossibly_dense:z.1fzw/sg      ?rX   ztoo_many_words_for_short_audio:zw/s)rZ   refindall
VALID_TAGSr]   sumr<   r=   r;   r\   r^   splitmax)rI   r4   rf   ri   stripped
found_tagstaglatin_charsnative_charsrB   rA   total
word_countwords_per_secrF   rR   rG   structural_sanity_check   s>   
r~   c                   @   s   e Zd ZU dZeed< dZeed< dZe	e
 ed< dZeed< dZeed	< dZeed
< dZe	e ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZe	e ed< dZeed< dZe	e ed< de
fddZdS )ValidationResultz5Result of transcription validation with dual scoring.statusT
char_validNrK   re   rL   alignment_scorenative_ctc_scorelow_confidence_wordslow_confidence_ratioroman_mms_scoremms_min_word_scoremms_boundary_word_avgmms_internal_below_thresholdcombined_scorestructural_passstructural_reasons uroman_romanizedri   r6   c                 C   sz   | j | j| jpg t| jdt| jdt| jdt| jdt| jdt| j	dt| j
d| jp/g t| jd| j| jp:g dS )N      )r   r   rK   rL   r   r   r   r   r   r   r   r   r   ri   )r   r   rK   roundrL   r   r   r   r   r   r   r   r   r   ri   )selfrF   rF   rG   to_dict  s   







zValidationResult.to_dict)__name__
__module____qualname____doc__str__annotations__r   boolrK   r   r   rL   floatr   r   r   r   r   r   r   r   r   r   r   r   ri   r   rF   rF   rF   rG   r      s(   
 r   c                 C   s"   t du rddlm} || da t S )zLazy load native CTC aligner.Nr7   )CTCForcedAligner)r4   )_ctc_alignerctc_forced_alignerr   )r4   r   rF   rF   rG   _get_ctc_aligner(  s   
r   c                  C   s   t du rddlm}  |  a t S )z Lazy load romanized MMS aligner.Nr7   
MMSAligner)_mms_alignermms_alignerr   r   rF   rF   rG   _get_mms_aligner0  s   r   
native_ctc	roman_mmsc                 C   s   | |}}d| d|  dt ||   }|dkr.t||dkr.t || dkr+d}nd}n|d	kr5d}n	|dkr<d
}nd}t|d|fS )a  
    Compute weighted validation score with disagreement penalty.

    Formula: S = 0.45*N + 0.55*R - 0.10*abs(N-R)
    MMS weighted higher (0.55): more stable on code-mixed Indic audio.
    Disagreement penalty: flags when validators diverge.

    v4 stricter thresholds (calibrated against test segments):
      ACCEPT: S >= 0.75 AND min(N,R) >= 0.50
      REVIEW: S in [0.65, 0.75) OR high disagreement
      RETRY:  S in [0.55, 0.65)
      REJECT: S < 0.55
    g?皙?rl         ?      ?      ?reviewacceptg?retryrejectr   )absminr   )r   r   NRSverdictrF   rF   rG   compute_combined_score9  s   
 r   c                 C   s.   t dd| }t dd|}t dd| S )zOStrip tags and punctuation for alignment. Keeps native script + Latin + spaces.z
\[[\w_]+\]r   z![\u0964.,!?;:\'"()\-\u2013\u2014]\s+ )rp   subrZ   )rI   cleanedrF   rF   rG   _strip_tags_and_punct\  s   r   c                    s   t | }|dkr
|S t|td }g }|D ]}t| t fdd|d D }|s/|dv r4|| qtddd	|	 S )
zStrip Latin characters for native CTC alignment on code-mixed text.
    CTC models are language-specific and choke on English words mixed in.
    We remove Latin words and only align the native-script portions.r'   r   c                 3   rN   rO   rF   rP   rR   rF   rG   rS   n  rT   z(_strip_to_native_only.<locals>.<genexpr>r
   z 	r   r   r   )
r   r<   r=   r;   r\   r]   rp   r   joinrZ   )rI   r4   r   rB   resultrU   rc   rF   rR   rG   _strip_to_native_onlyc  s   
r   r   
audio_pathtranscriptionromanized_textcheck_audioc              
   C   s  g }t |}t||}t|||}	|	d s(tdd|	d dd|	d  gdS t||dd	}
|
d
 sAtdd|
d |
d dgdS |
d dk rS|d|
d dd ddlm} ||}d}g }d}|r|rzt	|}|
| |}|j}|j}|j}W n  ty } z|dt|dd   W Y d}~nd}~ww d}d}d}d}|r8|r8zkt }|
| |}|j}t|dr|jr|j}|rt|}g }t|dkr||d  t|dkr||d  |rt|t| nd}t|dkr|dd ng }|rtdd |D }|t| }W n! ty7 } z|dt|dd   W Y d}~nd}~ww |rh|dkrh|dkrht||\}}|d krgt|| d!krg|d"|d#d$|d# nb|r|dkr|}|d%krzd&}n
|d'krd(}nd}|dkr|r|d) n8|r|dkr|}|d%krd&}n
|d'krd(}nd}|d* n|rd}d }|d+ n	d}|rd nd&}|
d
 sd}|d(kr|d,|d#d-|d#d.|d#d n|dkr|dkr|d/|d#d-|d#d.|d#d td=i d0|d1|
d
 d|
dd|
d d2|d3|d4|d5|d6|d7|d8|d9|d:|d;dd<|d|rD|S dS )>a  
    Validate a transcription (v4: uroman-based, stricter, structural checks).

    Flow:
      Step 0: Structural sanity checks (instant, no ML)
      Step 1: Character validation (instant)
      Step 2: Native CTC forced alignment (language-specific wav2vec2)
      Step 3: Romanized MMS forced alignment (uroman-derived, NOT Gemini's)
      Step 4: Weighted combined score with per-word analysis

    Key v4 change: romanized_text parameter is IGNORED for MMS alignment.
    Validator derives its own romanization via uroman from the native text.
    This makes validation deterministic and independent of Gemini's output.

    Args:
        audio_path: Path to audio file
        transcription: Native script text
        language: Language code (te, hi, ta, etc.)
        romanized_text: IGNORED in v4 (kept for API compat). Validator uses uroman.
        check_audio: Whether to run audio alignment checks
        duration_sec: Audio duration (for structural checks, optional)
    rh   r   Fri   zstructural_fail: z, )r   r   r   ri   Tr5   r@   rK   rL   zInvalid/alien characters found)r   r   rK   rL   ri   r   Too few native chars (.0%)r   )romanize_for_alignmentre   zCTC alignment failed: N2   word_scoresr7      c                 s   s    | ]	}|d k rdV  qdS )g333333?r7   NrF   )rQ   ro   rF   rF   rG   rS     rk   z)validate_transcription.<locals>.<genexpr>zMMS alignment failed: r   r   zValidator disagreement: CTC=z.2fz MMS=r   r   r   r   zMMS failed, using CTC onlyzCTC failed, using MMS onlyzBoth validators returned 0zBelow threshold: combined=z (CTC=z, MMS=zPoor alignment: combined=r   r   r   r   r   r   r   r   r   r   r   r   r   rF   )r   r   r~   r   r   rd   r]   src.romanizationr   r   alignr   r   r   	Exceptionr   r   hasattrr   r   r^   rs   r   r   r=   )r   r   r4   r   r   rf   ri   
clean_textnative_only
structural
char_checkr   uroman_romanr   low_conf_wordslow_conf_ratioaligner
ctc_resulter   mms_min_wordmms_boundary_avgmms_internal_belowmms
mms_resultscoresboundaryinternalbelowr   r   rF   rF   rG   validate_transcriptiont  s4  
	
((










	

r   c                  C   s>   t dur
t   da tdurt  daddlm}  |   dS )z3Release all aligner resources (CTC + MMS + uroman).Nr   )cleanup)r   r   r   r   )roman_cleanuprF   rF   rG   r   0  s   
r   c                 C   s   | r|   sdddS t| |}|d sdd|d  dS t| |dd}|d s>d	d
 |d dd D }dd| dS |d dk rPdd|d dddS d|d dS )zFQuick character-only + structural validation for native transcription.FzEmpty transcription)r@   rM   rh   zStructural: ri   r   r@   c                 S   s   g | ]}|d  qS )rU   rF   rj   rF   rF   rG   
<listcomp>L  s    z"quick_validate.<locals>.<listcomp>rK   N   zInvalid chars: rL   r   r   r   r   T)r@   rL   )rZ   r~   rd   )r   r4   r   r   invalidrF   rF   rG   quick_validate?  s   

r   __main__r   a  
Simple Validator v4
====================

Usage:
    # Full validation (with audio check, uroman-based)
    python simple_validator.py <audio_path> <transcription>

    # Quick validation (character + structural check only)
    python simple_validator.py --quick <transcription>
r7   z--quickr   r   zValid: r@   zReason: rM   z Validating (v4: uroman-based)...z	
Status: zCharacter valid: zScript ratio: z.1%zCTC score: z.4fzMMS score: z
Combined: zUroman: P   z	
Reasons:z  - )T)r   F)r   re   )r   r   Tre   )r   )7r   rp   dataclassesr   r   typingr   r   r   r   r<   r8   r9   r?   rr   r   r   intrH   rd   r   r~   r   r   r   r   r   tupler   r   r   r   r   r   r   sysr^   argvprintexitr   rI   r   r   r   r   upperr   rL   r   r   r   r   ri   rrF   rF   rF   rG   <module>   s@   K
'
?8	
#
 =





'