o
    5iX$                     @   st   d Z ddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlmZmZmZ e	G dd dZG dd dZdS )	u  
MMS Forced Aligner (Romanized Text)
====================================

Language-agnostic forced alignment using Meta's MMS_FA model via torchaudio.
Works on romanized/Latin-script text. Complements the native CTC aligner.

MMS_FA is trained on 1000+ languages — handles romanized Indic + English equally well.
Used alongside native CTC to produce dual validation scores.

Usage:
    aligner = MMSAligner()
    result = aligner.align("audio.flac", "naaku konni ads gurtuntaayi")
    print(f"Score: {result.alignment_score}")  # 0-1
    N)	dataclassfield)ListDictOptionalc                   @   s   e Zd ZU dZeed< eed< eed< eed< eedZ	e
e ed< dZeed	< dZeed
< dZeed< dZeed< defddZdS )MMSAlignmentResultz1Result of MMS forced alignment on romanized text.
audio_pathtranscriptionalignment_scoremean_log_prob)default_factoryword_scoresr   
num_tokens
num_frames        audio_duration_secprocessing_time_secreturnc                 C   sT   | j | jd d t| jdt| jddd | jD | j| jt| jdt| j	dd	S )Nd      c                 S   s   g | ]}t |d qS )r   )round).0s r   ;/home/ubuntu/maya3_transcribe/src/validators/mms_aligner.py
<listcomp>0       z.MMSAlignmentResult.to_dict.<locals>.<listcomp>   	r   r	   r
   r   r   r   r   r   r   )
r   r	   r   r
   r   r   r   r   r   r   selfr   r   r   to_dict*   s   



zMMSAlignmentResult.to_dictN)__name__
__module____qualname____doc__str__annotations__floatr   listr   r   r   intr   r   r   r   r!   r   r   r   r   r      s   
 r   c                	   @   s   e Zd ZdZddefddZdefddZd	edej	fd
dZ
dedee fddZdee dee dee dee fddZd	ededefddZdd ZdS )
MMSAligneru   
    Language-agnostic forced alignment using torchaudio MMS_FA.

    Tokenization: space → * (word boundary, idx 28), CTC blank = - (idx 0).
    Score normalization: log_prob range ~[-10, 0] mapped to [0, 1].
    autodevicec                 C   sR   |dkrt j rdnd| _n|| _d | _d | _d | _d | _d| _d| _	d| _
d S )Nr,   cudacpuFr      )torchr.   is_availabler-   modellabels	label2idxsample_rate_setup_complete	BLANK_IDXSTAR_IDX)r    r-   r   r   r   __init__@   s   
zMMSAligner.__init__r   c              
   C   s   | j rdS zOtjj}| | j| _| j  |	 | _
|j| _dd t| j
D | _| jdd| _| jdd| _td| j d	t| j
 d
| j d d| _ W dS  tyn } ztd|  W Y d}~dS d}~ww )zLazy load MMS_FA model.Tc                 S   s   i | ]\}}||qS r   r   )r   ilr   r   r   
<dictcomp>[   r   z$MMSAligner.setup.<locals>.<dictcomp>-r   *r0   z[MMSAligner] Ready on z (z labels, SR=)z[MMSAligner] Setup failed: NF)r7   
torchaudio	pipelinesMMS_FA	get_modeltor-   r3   eval
get_labelsr4   r6   	enumerater5   getr8   r9   printlen	Exception)r    bundleer   r   r   setupP   s.   


zMMSAligner.setupr   c                 C   s^   t |\}}t|jdkr|jdd}tj|tjdd}|| j	kr-t
j||| j	}|S )z3Load and resample audio to MMS sample rate (16kHz).   )axisdtyper   )sfreadrK   shapemeanr1   tensorfloat32	unsqueezer6   rA   
functionalresample)r    r   datasrwaveformr   r   r   _load_audioh   s   
zMMSAligner._load_audiotextc                 C   s   g }|   D ]+}|dkr|r|d | jkr|| j q|| jv r3| j| | jkr3|| j|  q|rK|d | jkrK|d |rK|d | jks=|ra|d | jkra|  |ra|d | jksT|S )u   
        Convert romanized text to MMS token indices.
        Space → * (word boundary). Unknown chars skipped.
        Punctuation stripped.
         r   )lowerstripr9   appendr5   r8   pop)r    ra   tokenschr   r   r   _text_to_tokenst   s    
zMMSAligner._text_to_tokensrh   	alignmentscoresc                 C   s   d}i }t |D ]\}}|| jkr|d7 }q|||< q|d }dd t|D }	d}
t t||D ]$\}\}}|| jkrU|
t|k rU|
|v rQ||
 }|	| | |
d7 }
q1g }|	D ]#}|rxt|t| }tdt	dd|d  }|| qZ|d qZ|S )z
        Compute per-word alignment scores from frame-level scores.

        Words are delimited by * (STAR_IDX) tokens. For each word, average
        the log-prob scores of its frames, then convert to 0-1 confidence.
        r   rP   c                 S   s   g | ]}g qS r   r   )r   _r   r   r   r      s    z3MMSAligner._compute_word_scores.<locals>.<listcomp>r         ?      @)
rH   r9   rangezipr8   rK   rf   summaxmin)r    rh   rk   rl   word_idxtoken_to_wordr;   tokn_wordsword_log_probstoken_cursor	frame_idxaligned_tokscorewiresultlpsmean_lpconfr   r   r   _compute_word_scores   s0   



zMMSAligner._compute_word_scoresromanized_textc                 C   s  | j s|  st||dddS t }z| |}|jd | j }| |}|s7t||dd|t | dW S t	  | 
|| j\}}W d   n1 sQw   Y  | }tj|gtjd}	tjj||	| jd\}
}|  }tdtd	d	|d
  }| ||
d  |d  }t|||||t||jd |t | d	W S  ty } zddl}|  t||ddt | dW  Y d}~S d}~ww )a  
        Perform forced alignment of romanized text against audio.

        Args:
            audio_path: Path to audio file
            romanized_text: Latin-script text to align

        Returns:
            MMSAlignmentResult with alignment score
        r   g      $)r   r	   r
   r   rP   )r   r	   r
   r   r   r   NrR   )blankrn   ro   r   r   )r   r	   r
   r   r   )r7   rO   r   timer`   rV   r6   rj   r1   no_gradr3   rE   r-   r/   rX   int32rA   r[   forced_alignr8   rW   itemrs   rt   r   tolistrK   rL   	traceback	print_exc)r    r   r   
start_timer_   	audio_durrh   emissionrm   token_tensorrk   rl   r   r
   r   rN   r   r   r   r   align   st   



	


zMMSAligner.alignc                 C   s6   | j dur
| ` d| _ d| _tj rtj  dS dS )zRelease resources.NF)r3   r7   r1   r.   r2   empty_cacher   r   r   r   cleanup  s   

zMMSAligner.cleanupN)r,   )r"   r#   r$   r%   r&   r:   boolrO   r1   Tensorr`   r   r*   rj   r(   r   r   r   r   r   r   r   r   r+   8   s"    
*Or+   )r%   r1   rA   	soundfilerT   numpynpr   dataclassesr   r   typingr   r   r   r   r+   r   r   r   r   <module>   s    