o
    i3                     @   s  d Z ddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZ e	G dd dZe	G dd	 d	ZG d
d dZ	d3dedededeeef fddZedkrddlZeejdk rved ed ejd Zdejdd Z eee Z!ede!d d ede!d d ede!d d ed e!d!   ed"e!d# d$d% ed& e!d' dd( D ] Z"ed)e"d*  d+e"d, dd-e"d. d/d0e"d1 d/d2	 qdS dS )4a#  
Alignment Scorer
================

CTC-based scoring for transcription validation.
Provides word-level confidence scores and overall alignment quality.

This module uses Wav2Vec2 CTC models to:
1. Score how well transcription matches audio
2. Provide word-level confidence estimates
3. Calculate overall alignment quality

Usage:
    scorer = AlignmentScorer(language="te")
    result = scorer.score_transcription(audio_path, transcription)
    print(f"Score: {result['alignment_score']}")
    print(f"Word confidences: {result['word_scores']}")
    N)	dataclassfield)ListDictOptionalAnyTuple)Pathc                   @   sH   e Zd ZU dZeed< eed< eed< eed< eedZ	e
e ed< dS )		WordScorezScore for a single word.word
start_timeend_time
confidencedefault_factory
char_probsN)__name__
__module____qualname____doc__str__annotations__floatr   listr   r    r   r   @/home/ubuntu/maya3_transcribe/src/validators/alignment_scorer.pyr
      s   
 r
   c                   @   s   e Zd ZU dZeed< eed< eed< eed< eed< eedZ	e
e ed< d	Zeed
< d	Zeed< dZeed< dZeed< deeef fddZdS )AlignmentResultzResult of alignment scoring.
audio_pathtranscriptionalignment_scoreaverage_confidencemin_confidencer   word_scores        audio_duration_secprocessing_time_secctcmethodtelanguagereturnc                 C   s^   | j | jt| jdt| jdt| jdt| jdd | jD t| jdt| j	d| j
| jdS )N   c                 S   s4   g | ]}|j t|jd t|jd t|jddqS )   r+   )r   r   r   r   )r   roundr   r   r   .0wsr   r   r   
<listcomp>E   s    


z+AlignmentResult.to_dict.<locals>.<listcomp>r,   )r   r   r   r    r!   
word_countr"   r$   r%   r'   r)   )r   r   r-   r   r    r!   lenr"   r$   r%   r'   r)   selfr   r   r   to_dict=   s   



	
zAlignmentResult.to_dictN)r   r   r   r   r   r   r   r   r   r"   r   r
   r$   r%   r'   r)   r   r   r6   r   r   r   r   r   '   s   
 r   c                
   @   s   e Zd ZdZddddddZ				
d)dededefddZdefddZdede	e
jef fddZde
jdejfddZdejde
jfddZde
jdedede	ee ef fdd Zdededefd!d"Zd#eeeef  dee fd$d%Zd&d' Zd(S )*AlignmentScorerz
    CTC-based alignment scorer for transcription validation.
    
    Uses Wav2Vec2 model to compute frame-level probabilities and
    align transcription to audio for scoring.
    z(anuragshas/wav2vec2-large-xlsr-53-teluguz#theainerd/Wav2Vec2-large-xlsr-hindiz*Harveenchadha/wav2vec2-large-xlsr-53-tamilzai4bharat/indicwav2vec-hindizfacebook/wav2vec2-large-xlsr-53)r(   hitabndefaultr(   auto./models/alignmentr)   device	cache_dirc                 C   sf   |  dd | _t|| _| jjddd |dkr%tj r!dnd| _n|| _d| _	d| _
d| _dS )	z
        Initialize alignment scorer.
        
        Args:
            language: Language code (te, hi, ta, etc.)
            device: Device to use (auto, cuda, cpu)
            cache_dir: Directory for model cache
        N   T)parentsexist_okr<   cudacpuF)lowerr)   r	   r?   mkdirtorchrC   is_availabler>   model	processor_setup_complete)r5   r)   r>   r?   r   r   r   __init__f   s   

zAlignmentScorer.__init__r*   c              
   C   s   | j rdS zGddlm}m} | j| j| jd }td| d |j|| j	d| _
|j|| j	d| _| j| j | j  td| j  d| _ W dS  tyn } ztd	|  dd
l}|  W Y d
}~dS d
}~ww )zLoad model and processor.Tr   )Wav2Vec2ForCTCWav2Vec2Processorr;   z[AlignmentScorer] Loading z...)r?   z[AlignmentScorer] Ready on z [AlignmentScorer] Setup failed: NF)rK   transformersrM   rN   MODELSgetr)   printfrom_pretrainedr?   rJ   rI   tor>   eval	Exception	traceback	print_exc)r5   rM   rN   
model_nameerW   r   r   r   setup   s4   
zAlignmentScorer.setupr   c                 C   sz   t |\}}t|jdkr|jdd}|dkr9ddl}t| 	d}|j
|d}||}|  }d}||fS )z!Load and resample audio to 16kHz.   axis>  r   N)sfreadr3   shapemean
torchaudiorG   
from_numpyr   	unsqueeze
transformsResamplesqueezenumpy)r5   r   datasrrd   waveform	resamplerr   r   r   _load_audio   s   zAlignmentScorer._load_audioaudioc                 C   sb   | j |dddd}|j| j}t  | |}|j }W d   |S 1 s*w   Y  |S )zGet CTC logits from audio.r_   ptT)sampling_ratereturn_tensorspaddingN)	rJ   input_valuesrT   r>   rG   no_gradrI   logitsrD   )r5   rp   inputsru   outputsrw   r   r   r   _get_ctc_logits   s   


zAlignmentScorer._get_ctc_logitsrw   c                 C   s   t j|dd}|d S )z,Convert logits to frame-level probabilities.)dimr   )rG   softmaxri   rj   )r5   rw   probsr   r   r   _compute_frame_probs   s   z$AlignmentScorer._compute_frame_probsframe_probsr   audio_durationc              
   C   s  |  }|s
g dfS |jd }|| }|jdd}tdd |D }|dkr*g dfS g }	d}
|D ]I}tt|| | }td|}t|
| |}|
|k ra||
| }t|dkr^t| nd}nd}|
| }|| }|		t
||||g d |}
q0t| }|	|fS )	z
        Align words to frames and compute confidence scores.
        
        Uses a simple proportional alignment with confidence from max probs.
        r#   r   r\   r]   c                 s   s    | ]}t |V  qd S )N)r3   )r/   wr   r   r   	<genexpr>   s    z/AlignmentScorer._align_words.<locals>.<genexpr>g      ?)r   r   r   r   r   )splitrb   maxsumintr3   minr   rc   appendr
   )r5   r   r   r   words
num_framesframe_durationframe_confidencestotal_charsr"   current_framer   word_frames	end_frameword_frame_probsword_confidencer   r   overall_scorer   r   r   _align_words   s>   

zAlignmentScorer._align_wordsc                 C   s&  | j s|  st||dddddS t }zO| |\}}t|| }| |}| |}| |||\}	}
dd |	D }|rDt	
|nd}|rMt	|nd}t | }t|||
|||	||d| jd
W S  ty } z"dd	l}|  t||dddt | d
t| dW  Y d	}~S d	}~ww )z
        Score a transcription against audio.
        
        Args:
            audio_path: Path to audio file
            transcription: Text to score
            
        Returns:
            AlignmentResult with scores
        r#   failed)r   r   r   r    r!   r'   c                 S   s   g | ]}|j qS r   )r   r.   r   r   r   r1   6  s    z7AlignmentScorer.score_transcription.<locals>.<listcomp>r&   )
r   r   r   r    r!   r"   r$   r%   r'   r)   r   Nzerror: )r   r   r   r    r!   r%   r'   )rK   r[   r   timero   r3   rz   r   r   nprc   r   r)   rV   rW   rX   r   )r5   r   r   r   rp   rl   r   rw   r   r"   r   confidencesavg_confmin_confprocessing_timerZ   rW   r   r   r   score_transcription	  sb   	


z#AlignmentScorer.score_transcriptionitemsc                 C   s0   g }|D ]}|  |d |d }|| q|S )z
        Score multiple transcriptions.
        
        Args:
            items: List of {"audio_path": ..., "transcription": ...}
            
        Returns:
            List of AlignmentResult
        r   r   )r   r   )r5   r   resultsitemresultr   r   r   score_batchV  s   zAlignmentScorer.score_batchc                 C   sD   | j dur
| ` d| _ | jdur| `d| _tj r tj  dS dS )zRelease model resources.N)rI   rJ   rG   rC   rH   empty_cacher4   r   r   r   cleanupl  s   


zAlignmentScorer.cleanupN)r(   r<   r=   )r   r   r   r   rP   r   rL   boolr[   r   r   ndarrayr   ro   rG   Tensorrz   r   r   r   r
   r   r   r   r   r   r   r   r   r   r   r7   U   sX    	

!
=
M
r7   r(   r   r   r)   r*   c                 C   s&   t |d}|| |}|  | S )z
    Quick scoring function.
    
    Args:
        audio_path: Path to audio file
        transcription: Text to score
        language: Language code
        
    Returns:
        Dict with scores
    )r)   )r7   r   r   r6   )r   r   r)   scorerr   r   r   r   r   z  s   
r   __main__r,   z>Usage: python alignment_scorer.py <audio_path> <transcription>r\    r@   z
Alignment Score: r   z.4fzAverage Confidence: r    zMin Confidence: r!   zWord Count: r2   zProcessing Time: r%   z.3fsz
Word Scores:r"   
   z  r   z: r   z [r   z.2f-r   zs])r(   )#r   osr   rG   rj   r   	soundfiler`   dataclassesr   r   typingr   r   r   r   r   pathlibr	   r
   r   r7   r   r   r   sysr3   argvrR   exitr   joinr   r   r0   r   r   r   r   <module>   sX    	-  *




: