o
    %i]                     @  s   d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
Z
ddlm  mZ ddlmZmZmZ ddlmZ eeZed	Zd
ZG dd dZdS )a  
VoxLingua107 ECAPA-TDNN (SpeechBrain): LID on 107 languages + 256-dim speaker embeddings.
Architecture: ECAPA-TDNN, ~14M params.  ~200MB VRAM.
Returns per-segment: top language, confidence, top-3, speaker embedding.

Model loaded from R2 (no HuggingFace network calls at runtime).
    )annotationsN)Path)Optional   )VOXLINGUA_MODELVOXLINGUA_BATCH_SIZEVOXLINGUA_LABEL_MAP)collate_waveformsz/tmp/voxlingua107zmodels/voxlingua107.tarc                   @  sl   e Zd ZdZdd ddZd!d"d	d
Zed#ddZdd Ze	
 	d$d%ddZd%ddZed&ddZdS )'VoxLinguaLIDz;Batch LID + speaker embedding extraction with VoxLingua107.cudadevicestrc                 C  s   || _ d | _t| _d S N)r   modelr   
batch_size)selfr    r   8/home/ubuntu/transcripts/validations/models/voxlingua.py__init__   s   
zVoxLinguaLID.__init__ hf_tokenc                   sz   t d| j  |  }dd l}|j  fdd}||_ddlm} |jt	|t	|d| jid| _
 |_t d d S )	Nu   Loading VoxLingua107 → r   c                    s   | dd   | i |S )Nuse_auth_token)pop)argskwargs_orig_downloadr   r   _patched_download+   s   z,VoxLinguaLID.load.<locals>._patched_download)EncoderClassifierr   )sourcesavedirrun_optsz!VoxLingua107 loaded (~14M params))loggerinfor   _ensure_localhuggingface_hubhf_hub_download!speechbrain.inference.classifiersr   from_hparamsr   r   )r   r   save_dirr%   r   r   r   r   r   load$   s   zVoxLinguaLID.loadreturnr   c                  C  s4  t d  rtd t S ddl} tdd}td}td| d	t  | j	d
tdtdtddd}|
|tt| | jd }td|dd t|d}|jddd W d   n1 skw   Y  |jdd t d }| r| }|ddt  }|| tdt   t S )z=Download VoxLingua107 from R2 if not already present locally.zembedding_model.ckptz#VoxLingua107 already cached locallyr   NR2_VALIDATION_MODEL_BUCKETzvalidation-resultsz/tmp/voxlingua107.tarz'Downloading VoxLingua107 from R2: s3:///s3R2_ENDPOINT_URLR2_ACCESS_KEY_IDR2_SECRET_ACCESS_KEYauto)endpoint_urlaws_access_key_idaws_secret_access_keyregion_nameg    .AzDownloaded z.0fzMB, extracting...zr:*z/tmpdata)filterT)
missing_okzhyperparams.yamlz7pretrained_path: speechbrain/lang-id-voxlingua107-ecapazpretrained_path: zVoxLingua107 extracted to )
_LOCAL_DIRexistsr"   r#   boto3osgetenvr   _R2_KEYclientdownload_filer   statst_sizetarfileopen
extractallunlink	read_textreplace
write_text)r<   buckettar_pathr.   size_mbtfhptextr   r   r   r$   :   s<   

zVoxLinguaLID._ensure_localc                 C  s   | ` d | _ tj  d S r   )r   torchr   empty_cache)r   r   r   r   unloada   s   zVoxLinguaLID.unload>  	waveformslist[torch.Tensor]sample_rateint
list[dict]c                 C  sH   g }t dt|| jD ]}|||| j  }| ||}|| q|S )z
        Run LID + embedding extraction on a list of mono waveforms.
        Returns list of dicts with keys:
          vox_lang, vox_lang_iso1, vox_confidence, vox_top3, vox_speaker_embedding
        r   )rangelenr   _infer_batchextend)r   rU   rW   resultsi
batch_wavsbatch_resultsr   r   r   predict_batchf   s   	zVoxLinguaLID.predict_batchc              	   C  s  t |\}}|| j}| |   }|| j}| j||\}}}}	| j||}
|
d	 }
|
 	 }g }tt|D ]}|| }|td|jd \}}t|	| tra|	| n|	| d }| |}g }ttd|jd D ]*}||  }| jjj|}t|tr|d }|t|t||  dd qx| dkr||  n| }|dkr|nt|
  }|||t|d||
|   d qC|S )N      r      )langconf)vox_langvox_lang_iso1vox_confidencevox_top3vox_speaker_embedding)r	   tor   floatmaxr   classify_batchencode_batchsqueezecpuexprZ   r[   topkminshape
isinstancer   _label_to_iso1itemhparamslabel_encoderdecode_ndimlistappendrounddimrQ   tensornumpytobytes)r   rU   rW   paddedlengthswav_lensout_probscoreindextext_lab
embeddingsprobsr^   b	row_probs	top3_valstop3_idx	raw_labeliso1top3jidxlabraw_conflinear_confr   r   r   r\   v   sD   "


 
zVoxLinguaLID._infer_batchlabelc                 C  s.   | t v rt |  S | dd  }t ||S )u<   Convert VoxLingua label to ISO-1. E.g. 'hi: Hindi' → 'hi'.:r   )r   splitstripget)r   coder   r   r   ry      s   zVoxLinguaLID._label_to_iso1N)r   )r   r   )r   )r   r   )r+   r   )rT   )rU   rV   rW   rX   r+   rY   )r   r   r+   r   )__name__
__module____qualname____doc__r   r*   staticmethodr$   rS   rQ   inference_moderb   r\   ry   r   r   r   r   r
      s    &
/r
   )r   
__future__r   loggingr=   rD   pathlibr   typingr   rQ   torch.nn.functionalnn
functionalFconfigr   r   r   audio_loaderr	   	getLoggerr   r"   r:   r?   r
   r   r   r   r   <module>   s    
