o
    %i=                     @  s   d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlm  mZ ddlmZmZ ddlmZmZ dd	lmZmZ eeZd
ZedZdZG dd dZdS )aX  
English CTC model: facebook/wav2vec2-large-960h-lv60-self
315M params, 1.9% WER on LibriSpeech, character-level vocab.
Enables proper CTC log-likelihood scoring P(text|audio) for English segments.

Replaces MMS-1B-All which was redundant with IndicConformer for Indic languages.
Model loaded from R2 (no HuggingFace network calls at runtime).
    )annotationsN)Path)Optional)Wav2Vec2ForCTCWav2Vec2Processor   )WAV2VEC_BATCH_SIZEAUDIO_SAMPLE_RATE)compute_ctc_scorecharacter_error_ratez&facebook/wav2vec2-large-960h-lv60-selfz/tmp/wav2vec2-large-960hzmodels/wav2vec2-large-960h.tarc                   @  s   e Zd ZdZdejfd)ddZed*ddZd+d,ddZ	e
d-ddZdd Ze 	d.d/ddZd0d d!Zd1d$d%Ze
d2d'd(ZdS )3
EnglishCTCzz
    wav2vec2-large English CTC model.
    Provides both decoded transcription AND proper CTC log-likelihood scoring.
    cudadevicestrdtypetorch.dtypec                 C  s"   || _ || _t| _d | _d | _d S N)r   r   r   
batch_sizemodel	processor)selfr   r    r   ;/home/ubuntu/transcripts/validations/models/wav2vec_lang.py__init__&   s
   
zEnglishCTC.__init__returnset[str]c                 C  s   dhS )Nenr   r   r   r   r   available_languages-   s   zEnglishCTC.available_languages hf_tokenc                 C  s   t d| j  |  }tj|dd| _tj|| jdd	| j
 | _tdd | j D d }t d|d	d
| jjj  d S )Nu'   Loading wav2vec2-large English CTC → T)local_files_only)torch_dtyper!   c                 s  s     | ]}|  |  V  qd S r   )numelelement_size).0pr   r   r   	<genexpr>8   s    z"EnglishCTC.load.<locals>.<genexpr>    .Az$wav2vec2-large English CTC loaded: ~.0fzMB VRAM, vocab=)loggerinfor   _ensure_localr   from_pretrainedr   r   r   toevalr   sum
parameters	tokenizer
vocab_size)r   r    localparam_mbr   r   r   load1   s   "zEnglishCTC.loadc                  C  s  t d  rtd tt S ddl} tdd}td}td| d	t	  | j
d
tdtdtddd}||t	t| | jd }td|dd t|d}|jddd W d   n1 smw   Y  |jdd tdt   tt S )z?Download wav2vec2-large from R2 if not already present locally.zconfig.jsonz%wav2vec2-large already cached locallyr   NR2_VALIDATION_MODEL_BUCKETzvalidation-resultsz/tmp/wav2vec2-large-960h.tarz)Downloading wav2vec2-large from R2: s3:///s3R2_ENDPOINT_URLR2_ACCESS_KEY_IDR2_SECRET_ACCESS_KEYauto)endpoint_urlaws_access_key_idaws_secret_access_keyregion_namer(   zDownloaded r)   zMB, extracting...zr:*z/tmpdata)filterT)
missing_okzwav2vec2-large extracted to )
_LOCAL_DIRexistsr*   r+   r   boto3osgetenvr   _R2_KEYclientdownload_filestatst_sizetarfileopen
extractallunlink)rG   buckettar_pathr9   size_mbtfr   r   r   r,   ;   s,   
zEnglishCTC._ensure_localc                 C  s"   | ` | `d  | _ | _tj  d S r   )r   r   torchr   empty_cacher   r   r   r   unloadX   s   zEnglishCTC.unloadN	waveformslist[torch.Tensor]
lang_codes	list[str]reference_textsOptional[list[str]]
list[dict]c                   s   dd t |D }|  gt }|s|S tdt|| jD ]2}|||| j  }fdd|D } fdd|D }	| ||	}
t |D ]
\}}|
| ||< qFq|S )z
        Run English CTC on segments. Non-English segments get empty results.
        Groups English segments for efficient batching.
        c                 S  s   g | ]
\}}|d kr|qS )r   r   )r%   ilangr   r   r   
<listcomp>f       z,EnglishCTC.predict_batch.<locals>.<listcomp>r   c                   s   g | ]} | qS r   r   r%   ra   )rZ   r   r   rc   n       c                   s   g | ]
} r
 | nd qS r   r   re   )r^   r   r   rc   o   rd   )	enumerate_empty_resultlenranger   _infer_batch)r   rZ   r\   r^   
en_indicesresultsbatch_start	batch_idx
batch_wavs
batch_refsbatch_resultsjidxr   )r^   rZ   r   predict_batch]   s   	zEnglishCTC.predict_batchlist[Optional[str]]c                   s  dd |D } j |tddd} fdd| D } jdi |j}tj| dd	}tj	|dd	} j 
|}g }	tt|D ]}
|
t|k rN||
 nd
}||
 }|d d td}||
 }|rʈ |}|r|jd }|t|krt||dd\}}||d< ||d< nE|rt||}|jdd	j  }t|d|d< td| d|d< n"|rt||}|jdd	j  }t|d|d< td| d|d< |	| qB|	S )Nc                 S  s   g | ]}|  qS r   )numpy)r%   wr   r   r   rc   z   rf   z+EnglishCTC._infer_batch.<locals>.<listcomp>ptT)sampling_ratereturn_tensorspaddingc                   s2   i | ]\}}||j  j|jjr jn|jd qS ))r   )r.   r   r   is_floating_point)r%   kvr   r   r   
<dictcomp>   s    &z+EnglishCTC._infer_batch.<locals>.<dictcomp>)dimr   wav2vec_transcriptionwav2vec_ctc_rawwav2vec_ctc_normalizedwav2vec_model_usedr   )blank_idr   r      g      ?r   )r   r	   itemsr   logitsFlog_softmaxfloatrW   argmaxbatch_decoderj   ri   ENGLISH_CTC_MODEL_tokenize_englishshaper
   r   maxvaluesmeanitemroundappend)r   rZ   r^   
raw_arraysinputsr   	log_probspred_idsdecoded_textsrm   btranscriptionref_textresultseg_lptokensTrawnormcergreedy_confr   r   r   rk   v   sV   





zEnglishCTC._infer_batchtext	list[int]c                 C  s   t dd|  }t dd|}|sg S g }|D ]#}|dkr&| jjj}n| jj|}|dur=|| jjjkr=|	| q|S )z
        Convert English text to token IDs for CTC scoring.
        wav2vec2-large uses uppercase character-level vocab (A-Z, space='|').
        Strips non-alpha characters, uppercases, maps to token IDs.
        z[^a-zA-Z\s]r   z\s+ N)
resubupperstripr   r2   word_delimiter_token_idconvert_tokens_to_idsunk_token_idr   )r   r   cleanedr   chtidr   r   r   r      s   
zEnglishCTC._tokenize_englishdictc                   C  s   dd d ddS )Nr   r   r   r   r   r   r   rh      s
   zEnglishCTC._empty_result)r   r   r   r   )r   r   )r   )r    r   )r   r   r   )rZ   r[   r\   r]   r^   r_   r   r`   )rZ   r[   r^   rv   r   r`   )r   r   r   r   )r   r   )__name__
__module____qualname____doc__rW   float16r   propertyr   r6   staticmethodr,   rY   inference_moderu   rk   r   rh   r   r   r   r   r       s     


:r   ) r   
__future__r   loggingrH   r   rO   pathlibr   typingr   rW   torch.nn.functionalnn
functionalr   transformersr   r   configr   r	   	ctc_scorer
   r   	getLoggerr   r*   r   rE   rJ   r   r   r   r   r   <module>   s$    
