o
    i>                     @   s  d Z ddlZddlZddlZddlZddlmZm	Z	 ddl
mZmZmZmZ ddlmZ ddlZeG dd dZeG dd	 d	ZG d
d dZ	d5dedededefddZedkrddlZeejdk rped ed ejd Zdejdd ZeeeZ edd  ed ed  ede d d ede d  d ed!e d"   ed#e d$ d% ed& e d' D ].Z!d(e"e!d) d*  Z#ed+e!d, d-de!d) d.de# d/e!d0 d1d2e!d3 d1d4 qdS dS )6a>  
CTC Forced Aligner
==================

Proper forced alignment using torchaudio's CTC forced_align.
This is equivalent to MFA but using neural CTC instead of GMM-HMM.

Provides:
- Per-character alignment scores (log-prob)
- Per-word alignment scores  
- Overall alignment quality score
- Identifies low-confidence regions

Usage:
    aligner = CTCForcedAligner(language="te")
    result = aligner.align("audio.flac", "transcription text")
    
    print(f"Alignment Score: {result.alignment_score}")  # 0-1
    print(f"Low confidence words: {result.low_confidence_words}")
    N)	dataclassfield)ListDictOptionalTuple)Pathc                   @   s   e Zd ZU dZeed< eed< eed< eed< eed< eed< eed< ee	d	Z
ee ed
< edefddZedefddZdS )WordAlignmentz!Alignment info for a single word.wordstart_frame	end_frame
start_timeend_timelog_prob
confidencedefault_factorychar_scoresreturnc                 C   s   | j | j S )N)r   r   self r   $src/validators/ctc_forced_aligner.pyduration+   s   zWordAlignment.durationc                 C   s
   | j dk S )N      ?)r   r   r   r   r   is_low_confidence/   s   
zWordAlignment.is_low_confidenceN)__name__
__module____qualname____doc__str__annotations__intfloatr   listr   r   propertyr   boolr   r   r   r   r   r	      s   
 r	   c                   @   s   e Zd ZU dZeed< eed< eed< eed< eedZ	e
e ed< eedZe
e ed< d	Zeed
< d	Zeed< d	Zeed< defddZdS )AlignmentResultzResult of forced alignment.
audio_pathtranscriptionalignment_scoremean_log_probr   word_alignmentslow_confidence_words        low_confidence_ratioaudio_duration_secprocessing_time_secr   c                 C   sZ   | j | jt| jdt| jdt| j| jt| jddd | jD t| j	dt| j
dd
S )N   c              	   S   s>   g | ]}|j t|jd t|jd t|jdt|jddqS )   r2   )r
   r   r   r   r   )r
   roundr   r   r   r   .0war   r   r   
<listcomp>R   s    



z+AlignmentResult.to_dict.<locals>.<listcomp>r3   )
r(   r)   r*   r+   
word_countr-   r/   r,   r0   r1   )r(   r)   r4   r*   r+   lenr,   r-   r/   r0   r1   r   r   r   r   to_dictI   s   





zAlignmentResult.to_dictN)r   r   r   r   r    r!   r#   r   r$   r,   r   r	   r-   r/   r0   r1   r   r;   r   r   r   r   r'   4   s   
 r'   c                   @   s   e Zd ZdZdddddddddddd	dd
Z			d*dededefddZdefddZdede	e
jef fddZde
jdejfddZdedefddZdedejd ejd!edee f
d"d#Zded$edefd%d&Zd'd( Zd)S )+CTCForcedAlignerz
    CTC-based forced alignment using torchaudio.
    
    This provides MFA-equivalent functionality using neural CTC models.
    z(anuragshas/wav2vec2-large-xlsr-53-teluguz#theainerd/Wav2Vec2-large-xlsr-hindiz*Harveenchadha/wav2vec2-large-xlsr-53-tamilz,Harveenchadha/wav2vec2-large-xlsr-53-kannadaz!gvs/wav2vec2-large-xlsr-malayalamz#arijitx/wav2vec2-large-xlsr-bengalizfacebook/wav2vec2-large-xlsr-53z&facebook/wav2vec2-large-960h-lv60-self)tehimrtaknmlbnasgupaorendefaultr=   auto./models/ctc_alignerlanguagedevice	cache_dirc                 C   sl   |  d d | _t|| _| jjddd |dkr%tj r!dnd| _n|| _d | _	d | _
d | _d| _d S )N   T)parentsexist_okrJ   cudacpuF)lowerrL   r   rN   mkdirtorchrR   is_availablerM   model	processorvocab_setup_complete)r   rL   rM   rN   r   r   r   __init__{   s   

zCTCForcedAligner.__init__r   c              
   C   s   | j rdS zTddlm}m} | j| j| jd }td| d |j|| j	d| _
|j|| j	d| _| j| j | j  | j
j | _| j
jj| _td| j  d| _ W dS  tys } ztd	|  W Y d
}~dS d
}~ww )zLoad model and processor.Tr   )Wav2Vec2ForCTCWav2Vec2ProcessorrI   z[CTCAligner] Loading z...)rN   z[CTCAligner] Ready on z[CTCAligner] Setup failed: NF)r[   transformersr]   r^   MODELSgetrL   printfrom_pretrainedrN   rY   rX   torM   eval	tokenizer	get_vocabrZ   pad_token_idblank_id	Exception)r   r]   r^   
model_nameer   r   r   setup   s0   
zCTCForcedAligner.setupr(   c                 C   sh   t |\}}t|jdkr|jdd}|dkr0t| d}t	j
||d  }d}||fS )z!Load and resample audio to 16kHz.   )axis>  r   )sfreadr:   shapemeanrV   
from_numpyr#   	unsqueeze
torchaudio
functionalresamplesqueezenumpy)r   r(   datasrwaveformr   r   r   _load_audio   s   zCTCForcedAligner._load_audioaudioc                 C   sf   | j |dddd}t  | |j| jj}W d   n1 s#w   Y  tj|	 dd}|S )z"Get CTC log-probability emissions.rp   ptT)sampling_ratereturn_tensorspaddingN)dim)
rY   rV   no_gradrX   input_valuesrd   rM   logitslog_softmaxrS   )r   r   inputsr   	emissionsr   r   r   _get_emissions   s   
zCTCForcedAligner._get_emissionstextc                 C   s   g }d}d}|D ].}|dkrd| j v r|| j d  q|| j v r.|| j |  |d7 }q|d7 }|d7 }q|t|d }|dkrQtd|dd| d	| d
 tj|gtjd|fS )zConvert text to token IDs. Returns (tensor, oov_ratio).
        OOV chars are dropped but tracked. High OOV = unreliable score.r    |rn   皙?z[CTCAligner] WARNING: z.0%z OOV chars (/))dtype)rZ   appendmaxrb   rV   tensorint32)r   r   tokenstotal_chars	oov_charschar	oov_ratior   r   r   _text_to_tokens   s*   




z CTCForcedAligner._text_to_tokens	alignmentscoresframe_durationc                 C   s  |  }g }t|D ]\}}|D ]}	|| q|d q
|r,|d dkr,|dd }dd tt|D }
dd tt|D }|d  }|d  }d}tt||D ]+\}\}}|| jkr|t|k r|| }|dkr~|
| | || | |d7 }qWg }t|D ]N\}}|
|g }||g }|rt	
|}t	|}tdtd	|}t|}t|d }nd
}d	}d}d}|t||||| || t|t||d q|S )z7Extract word-level alignments from frame-level results.r   Nc                 S      i | ]}|g qS r   r   r6   ir   r   r   
<dictcomp>       z=CTCForcedAligner._compute_word_alignments.<locals>.<dictcomp>c                 S   r   r   r   r   r   r   r   r      r   r   rn         ?r.         $)r
   r   r   r   r   r   r   r   )split	enumerater   ranger:   r{   zipri   ra   nprt   expminr   r	   r#   )r   r   r   r   r   wordschar_to_wordword_idxr
   _word_scoresword_framesalignment_np	scores_npchar_idx	frame_idx	token_idxscore
alignmentsword_score_list
frame_listr+   r   r   r   r   r   r   _compute_word_alignments   s\   


z)CTCForcedAligner._compute_word_alignmentsr)   c                 C   s  | j s|  st||dddS t }z| |\}}t|| }| |}|jd }|| }	| |\}
}|
jd dkrLt||dd|t | dW S t	
|g}t	
|
jd g}tjj||
||| jd\}}| ||||	}|  }tdtdd|d	  }|d
kr|d|d  9 }dd |D }t|tt|d }t | }t|||||||||d	W S  ty } zddl}|  t||ddt | dW  Y d}~S d}~ww )z
        Perform forced alignment.
        
        Args:
            audio_path: Path to audio file
            transcription: Text to align
            
        Returns:
            AlignmentResult with word-level alignments and scores
        r.   r   )r(   r)   r*   r+   rn   r   )r(   r)   r*   r+   r0   r1   )blankr   g      @r   r   c                 S   s   g | ]}|j r|jqS r   )r   r
   r5   r   r   r   r8   u  s    z*CTCForcedAligner.align.<locals>.<listcomp>)	r(   r)   r*   r+   r,   r-   r/   r0   r1   N)r(   r)   r*   r+   r1   )r[   rm   r'   timer   r:   r   rs   r   rV   r   rw   rx   forced_alignri   r   rt   itemr   r   rj   	traceback	print_exc)r   r(   r)   r   r   r}   audio_durationr   
num_framesr   targetsr   input_lengthstarget_lengthsr   r   r,   r+   r*   low_conf_wordslow_conf_ratioprocessing_timerl   r   r   r   r   align'  s   




	
zCTCForcedAligner.alignc                 C   sD   | j dur
| ` d| _ | jdur| `d| _tj r tj  dS dS )zRelease resources.N)rX   rY   rV   rR   rW   empty_cacher   r   r   r   cleanup  s   


zCTCForcedAligner.cleanupN)r=   rJ   rK   )r   r   r   r   r`   r    r\   r&   rm   r   r   ndarrayr"   r   rV   Tensorr   tupler   r#   r   r	   r   r'   r   r   r   r   r   r   r<   a   sb    

 
H
jr<   r=   r(   r)   rL   r   c                 C   s&   t |d}|| |}|  | S )zQuick forced alignment.)rL   )r<   r   r   r;   )r(   r)   rL   alignerresultr   r   r   r     s   
r   __main__r3   z@Usage: python ctc_forced_aligner.py <audio_path> <transcription>rn   r   rO   
z<============================================================zFORCED ALIGNMENT RESULTzAlignment Score: r*   z.4fzMean Log Prob: r+   zLow Confidence Words: r-   zLow Confidence Ratio: r/   z.2%z
Word Alignments:r,   u   █r   
   z  r
   z<20z.3fz [r   z.2f-r   zs])r=   )$r   rV   rw   r{   r   	soundfilerq   dataclassesr   r   typingr   r   r   r   pathlibr   r   r	   r'   r<   r    r   r   sysr:   argvrb   exitr(   joinr)   r   r7   r"   conf_barr   r   r   r   <module>   s^    ,  C




B