o
    %iS>                     @  s   d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	Z
ddlZddlm  mZ ddlmZmZmZ ddlmZ eeZd	Zd
ZdZdZdZdZdZdZ G dd dejj!Z"G dd dZ#dS )a  
AI4Bharat IndicConformer 600M Multilingual: CTC + RNNT ASR on 22 Indic languages.
Uses trust_remote_code=True (custom HF ONNX-based model).
Provides CTC transcription + confidence scoring via greedy path logprobs + CER vs Gemini.

GPU FIX: The original TorchScript preprocessor (preprocessor.ts) conflicts with PyTorch
models (MMS LID, VoxLingua) sharing the same CUDA context. We replace it with a pure
PyTorch mel-spectrogram that runs on GPU alongside everything else.
Parameters extracted from the TorchScript graph: n_fft=512, hop=160, win=400, 80 mels,
preemphasis=0.97, per-channel mean/std normalization. Validated to match within 1e-6.
    )annotationsN)Path)Optional   )CONFORMER_MULTI_MODELCONFORMER_BATCH_SIZECONFORMER_LANG_CODES)character_error_ratei      i  P   g
ףp=
?g      p>gh㈵>   c                      s,   e Zd ZdZd fddZdd
dZ  ZS )_MelPreprocessorz
    Pure PyTorch replacement for the NeMo TorchScript AudioToMelSpectrogramPreprocessor.
    Runs on CUDA without TorchScript context conflicts.
    mel_fbtorch.Tensorstft_windowc                   s&   t    | d| | d| d S )Nr   window)super__init__register_buffer)selfr   r   	__class__ >/home/ubuntu/transcripts/validations/models/conformer_multi.pyr   /   s   
z_MelPreprocessor.__init__wavreturn!tuple[torch.Tensor, torch.Tensor]c                 C  s&  |j d }t|d}|t|ddddf   }tj|ttfdd}g }t|D ]}tj|| tt	t
| jdddd	}|| q)t|}| d
}	t|	dd
| j}
|
dd
}
t|
t }|jddd}|jddd}|| |t  }tj|j d t	 d g| tj|jd}||fS )z
        Args:
            wav: [B, samples] raw audio at 16kHz
        Returns:
            features: [B, n_mels, T] normalized log-mel spectrogram
            lengths:  [B] number of valid frames
        r   )   r   Nreflect)modeFT)n_fft
hop_length
win_lengthr   center
normalizedreturn_complexr   r   )dimkeepdim)dtypedevice)shapeFpad_PREEMPH_PADrangetorchstft_N_FFT_HOP_LENGTH_WIN_LENGTHr   appendstackabspowmatmul	transposer   log_LOG_EPSmeanstd	_NORM_EPStensorlongr*   )r   r   Bpadded
emphasizedspecsbsspecpowermellog_melr>   r?   featureslengthsr   r   r   forward4   s0   


z_MelPreprocessor.forward)r   r   r   r   )r   r   r   r   )__name__
__module____qualname____doc__r   rO   __classcell__r   r   r   r   r   )   s    r   c                   @  s   e Zd ZdZdejfd*ddZd+d,ddZedd Z	dd Z
dd Ze 	d-d.ddZdd Z	d-d/d&d'Zed0d(d)ZdS )1IndicConformerMultiz
    IndicConformer 600M multilingual wrapper.

    Internal model uses ONNX Runtime (encoder + ctc_decoder).
    Preprocessor is a pure PyTorch mel-spectrogram on GPU (replaces TorchScript).
    cudar*   strr)   torch.dtypec                 C  s"   || _ || _d | _d | _t| _d S N)r*   r)   modelpreprocessorr   
batch_size)r   r*   r)   r   r   r   r   h   s
   
zIndicConformerMulti.__init__ hf_tokenc           	      C  s`  t d| j  ddlm} |   |jtddd| _| jj	j
 d}tjj|dd}d	\}}|j D ]6}| d
krkz$|  }t|tjr`|jtd d tfkrX|}n|jtfkr`|}W q5 tyj   Y q5w q5|d u st|d u rt d |   d S t||| j | _~t dt | jj! dd"dd t#| jj!$ d d D  d d S )Nz,Loading IndicConformer 600M multilingual -> r   )	AutoModelT)trust_remote_codelocal_files_only/assets/preprocessor.tscpumap_location)NNzprim::Constantr   r   zFCould not extract mel filterbank from TorchScript, falling back to CPUz*IndicConformer loaded (GPU preprocessor): z languages, vocab sizes: z, c                 s  s&    | ]\}}| d t | V  qdS )=N)len).0kvr   r   r   	<genexpr>   s   $ z+IndicConformerMulti.load.<locals>.<genexpr>   z...)%loggerinfor*   transformersr_   _ensure_model_in_hf_cachefrom_pretrainedr   rZ   config	ts_folderr1   jitloadgraphnodeskindoutputtoIValue
isinstanceTensorr+   r3   _N_MELSr5   	Exceptionwarning_load_cpu_fallbackr   toevalr[   rg   vocabjoinlistitems)	r   r^   r_   ts_pathpp_cpur   r   nodevalr   r   r   ru   o   sD   
&zIndicConformerMulti.loadc                  C  s  ddl } ddl}ddl}d}t d }|d | }|d  r'td dS ddl}| 	dd	}d
}td}	td| d|  |j
d| 	d| 	d| 	ddd}
|
||t|	 |	 jd }td|dd ||	d}|jddd W d   n1 sw   Y  |	jdd td}|jjddd | r|| |t|t| |d  }|jddd |d! | td"|  dS )#zPopulate the HF cache with IndicConformer files so from_pretrained()
        with local_files_only=True works. Downloads from R2 if not already cached.r   N(e9b71b369c048e2c6b634d4c131061c34e441179zK.cache/huggingface/hub/models--ai4bharat--indic-conformer-600m-multilingual	snapshotszconfig.jsonzConformer already in HF cacheR2_VALIDATION_MODEL_BUCKETzvalidation-resultszmodels/indic-conformer-600m.tarz/tmp/indic-conformer-600m.tarz$Downloading conformer from R2: s3:///s3R2_ENDPOINT_URLR2_ACCESS_KEY_IDR2_SECRET_ACCESS_KEYauto)endpoint_urlaws_access_key_idaws_secret_access_keyregion_nameg    .AzDownloaded z.0fzMB, extracting to HF cache...zr:*z/tmpdata)filterT)
missing_okz/tmp/indic-conformer-600m)parentsexist_okrefsmainz!Conformer installed to HF cache: )ostarfileshutilr   homeexistsrm   rn   boto3getenvclientdownload_filerW   statst_sizeopen
extractallunlinkparentmkdirrmtreemove
write_text)r   r   r   HASHhf_rootsnap_dirr   	r2_bucketr2_keytar_pathr   size_mbtf	extractedrefs_dirr   r   r   rp      sD   

z-IndicConformerMulti._ensure_model_in_hf_cachec                   s^   ddl }tjj| jjj ddd| jjd   fdd}||| j| j_	t
d	 dS )
zBFallback: use TorchScript preprocessor on CPU if extraction fails.r   Nrb   rc   rd   encoderc                   sL   |  t|jd gd\}} ddg| | d\}}||fS )Nr   )input_signallengthoutputsencoded_lengthsaudio_signalr   )rc   r1   rA   r+   runnumpy)
self_modelr   r   r   r   enc_lengthsonnx_encoderr   r   r   _encode_safe   s   
z<IndicConformerMulti._load_cpu_fallback.<locals>._encode_safez1IndicConformer loaded (CPU fallback preprocessor))typesr1   rt   ru   rZ   rr   rs   models
MethodTypeencoderm   rn   )r   r   r   r   r   r   r      s   z&IndicConformerMulti._load_cpu_fallbackc                 C  s.   | j r| ` d | _ | jr| `d | _tj  d S rY   )rZ   r[   r1   rV   empty_cache)r   r   r   r   unload   s   zIndicConformerMulti.unloadN	waveformslist[torch.Tensor]
lang_codes	list[str]reference_textsOptional[list[str]]r   
list[dict]c              	     sX  j du r fddtt||D S g } gt| }tt||D ]!\}\}}|tv rG|jjv rG r< | nd}	|||||	f q&|sL|S g }
|D ]\}}}} |	d
j\}}|
|d qPjjj}jjd }jjd }tdt|jD ]"}t|j t|}|
|| }||| }|||||| q|S )z
        Batched CTC inference + scoring.
        Preprocesses all segments on GPU, then runs ONNX encoder in batches
        of self.batch_size for massive throughput improvement (~87 segs/s at bs=16).
        Nc                   s.   g | ]\}\}} || r | nd qS rY   )_infer_single_cpu)rh   iwlr   r   r   r   
<listcomp>   s    
z5IndicConformerMulti.predict_batch.<locals>.<listcomp>r]   r   r   ctc_decoder)r[   	enumeratezip_empty_resultrg   r   rZ   r   r6   	unsqueezer   r*   squeezerr   BLANK_IDr   r0   r\   min_run_onnx_batch)r   r   r   r   r   resultsr   r   langreffeatures_list_featblank_idencoder_sessionctc_sessionbatch_start	batch_endbatch_featsbatch_itemsr   r   r   predict_batch   s:   


z!IndicConformerMulti.predict_batchc           !        s  t |}tdd |D }tj|t|| jd}	g }
t|D ]\}}||	|ddd|jd f< |
|jd  qz$|	ddg|	
  tj|
tjdd	\}}|	d
gd|id }W ng ty } z[dt|v spdt|v r|dkrtd| d W Y d}~dS |d }td| d|  | |d| |d| || | | ||d ||d || | W Y d}~dS  d}~ww t|D ]\}\}}}}z| jj| }t|||d dd|f jdd}t|| }|dd|f }| jj| tj|dd}tj|dd}d fdd|D }|dd }|jddj  ! }|t"|ddd}|rJ|rJt#||} t"d|  d|d< |||< W q tym } ztd| d |  W Y d}~qd}~ww dS )!zKRun ONNX encoder+CTC on a batch. On OOM, halve batch and retry recursively.c                 s  s    | ]}|j d  V  qdS )r   N)r+   )rh   fr   r   r   rk     s    z6IndicConformerMulti._run_onnx_batch.<locals>.<genexpr>)r*   Nr   r   r   )r)   r   logprobsencoder_outputr   zallocate memoryRUNTIME_EXCEPTIONz-Conformer OOM even at batch_size=1, skipping z	 segmentsr   zConformer OOM at batch_size=z, retrying with r   r'   r]   c                   $   g | ]}|   kr|   qS r   itemrh   xr   r   r   r   r   H     $ z7IndicConformerMulti._run_onnx_batch.<locals>.<listcomp>   ▁    conformer_multi_transcriptionconformer_multi_ctc_rawconformer_multi_ctc_normalized      ?r  z!Conformer decode failed for item : )$rg   maxr1   zerosr}   r*   r   r+   r6   r   rc   r   nparrayint64r~   rW   rm   r   r   rZ   language_masks
from_numpylog_softmaxintr   argmaxunique_consecutiver   replacestripvaluesr>   r   roundr	   )!r   r   r   r   r   r   r   bsmax_TrD   frame_lengthsjr   enc_outenc_lenraw_logprobsehalforig_idxr   r   ref_text	lang_maskmaskedTlpindices	collapsedhyptranscriptiongreedy_confrcerr   r   r   r     s   


"z#IndicConformerMulti._run_onnx_batchwaveformr   	lang_codereference_textOptional[str]dictc              
     s  |   }|tvs|| jjvr|S z|d}| j|\}}| jjd dgd|id }| jj| }	t	
|dddd|	f jdd}
t|tjrQt|d nt|}|
dd|f }| jjj | jj| t	j|dd}t	j|dd}d fd	d
|D }|dd }||d< |jddj  }t|d|d< |r|rt||}td| d|d< W |S W |S W |S  ty } ztd| d|  W Y d}~|S d}~ww )uB   CPU fallback path — used when GPU preprocessor extraction fails.r   r   r   r   Nr   r   r]   c                   r   r   r   r   r   r   r   r   p  r   z9IndicConformerMulti._infer_single_cpu.<locals>.<listcomp>r   r  r  r  r  r  r  z#IndicConformer CPU failed for lang=r  ) r   r   rZ   r   r   r   r   r   r  r1   r  r  r{   r  ndarrayr  rr   r   r  r  r   r  r  r	  r  r>   r   r  r	   r~   rm   r   )r   r.  r/  r0  resultr   encoder_outputsr   r  r#  r$  r%  r&  r'  r(  r)  r*  r+  r-  r  r   r   r   r   Z  sJ   

$ 

 z%IndicConformerMulti._infer_single_cpuc                   C  s   dd d dS )Nr]   r  r   r   r   r   r   r   |  s   z!IndicConformerMulti._empty_result)r*   rW   r)   rX   )r]   )r^   rW   rY   )r   r   r   r   r   r   r   r   )r.  r   r/  rW   r0  r1  r   r2  )r   r2  )rP   rQ   rR   rS   r1   float16r   ru   staticmethodrp   r   r   inference_moder   r   r   r   r   r   r   r   rU   `   s     (
.	0E"rU   )$rS   
__future__r   jsonloggingpathlibr   typingr   r   r  r1   torch.nn.functionalnn
functionalr,   rr   r   r   r   	ctc_scorer	   	getLoggerrP   rm   r3   r4   r5   r}   r.   r=   r@   r/   Moduler   rU   r   r   r   r   <module>   s,    
7