o
    |is                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	m
Z
 ddlmZmZ ddlmZ ddlZddlZddlmZmZ ddlmZ dejd	< dd
dddddddddddZddddddddddd d!d"ZeeZG d#d$ d$ejjZ e!d%Z"e!d&Z#e!d'Z$e!d(Z%e!d)Z&e!d*Z'g d+Z(d,e)d-e)fd.d/Z*d,e)d-e)fd0d1Z+d,e)d-e)fd2d3Z,d,e)d-e)fd4d5Z-d6e)d7e)fd8d9Z.d:e)d;e)fd<d=Z/d>e0d-e1fd?d@Z2d>e0d-e0fdAdBZ3d>e0d-e1fdCdDZ4dEe)dFe)fdGdHZ5dOdJdKZ6dLdM Z7e8dNkr
e7  dS dS )Pa  
Benchmark Maya ASR TDT 1.1B (Hybrid FastConformer RNNT/TDT) on indic-asr-benchmark-6k.

Outputs: metrics.json, sample_analysis.json, error_analysis.json
following BENCHMARK_SCHEMA.md v1 normalization.

Usage:
    python3 benchmark_maya_asr_tdt.py         --checkpoint /home/ubuntu/training/checkpoints/maya-asr-tdt-1.1b-ckpt-60000/model.ckpt         --config /home/ubuntu/training/maya-asr-hybrid-fastconformer-rnnt-stage1/configs/train/stage1_prod_8xh200.yaml         --checkpoint-name ckpt-60000
    N)Counterdefaultdict)datetimetimezone)Path)wercer)	OmegaConf	soundfileHF_AUDIO_BACKEND                        	   
      )hibntatemrguknmlpaorasenr   r   r   r   r   r   r   r   r   r    r!   r"   )hindibengalitamiltelugumarathigujaratikannada	malayalampunjabiodiaassameseenglishc                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	LanguageEmbeddingnum_languages	embed_dimc                    s6   t    tj||| _tjjj| jjddd d S )N        g{Gz?)meanstd)	super__init__torchnn	Embeddingembedinitnormal_weight)selfr0   r1   	__class__ //home/ubuntu/training/benchmark_maya_asr_tdt.pyr6   1   s   
zLanguageEmbedding.__init__lang_idsreturnc                 C   s   |  |dS )Nr   )r:   	unsqueeze)r>   rC   rA   rA   rB   forward6      zLanguageEmbedding.forward)	__name__
__module____qualname__intr6   r7   TensorrF   __classcell__rA   rA   r?   rB   r/   0   s    r/   z([\u200b-\u200f\u2028-\u202f\ufeff\u00ad]z'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]z[\u0964\u0965\u0970\u0971]z)[\u2010-\u2027\u2030-\u205e\u2e00-\u2e4f]z>[\u2018\u2019\u201a\u201b\u201c\u201d\u201e\u201f\u00ab\u00bb]z	\([^)]*\))	if	  i	  if
  i
  if  i  if  i  if  textrD   c                 C   s   t d|  S )NNFC)unicodedata	normalizestriprN   rA   rA   rB   norm_rawH   rG   rT   c                 C   s   t d| } td| } td| } d|  } td| } | dddddd} t	d| } t
d| } td| } |  } d|   } | S )	NNFKC  'u   —-u   –u   ‒)rP   rQ   ZW_CHARSsubANNOTATIONSjoinsplitQUOTESreplacePUNCT_COMMONPUNCT_INDICPUNCT_EXTENDEDlowerrR   rS   rA   rA   rB   norm_standardL   s   re   c                 C   sJ   t | } tdd| } tD ]}tdD ]}| t|| t|} qq| S )Nz	(\d),(\d)z\1\2r   )re   rer[   INDIC_DIGIT_OFFSETSranger`   chrstr)rN   offsetirA   rA   rB   norm_numcanon[   s   rm   c                 C   s   t | ddS )zBMER: norm_standard + remove ALL spaces -> single character stream.rW   rV   )re   r`   rS   rA   rA   rB   norm_merd   s   rn   	ref_chars	hyp_charsc              	      st  t | t |}  fddt|d D }t|d D ]}||| d< qt d D ]}||d |< q+td|d D ]G}td d D ]=}| |d  ||d  kra||d  |d  || |< qDdt||d  |d  ||d  | || |d   || |< qDq;t }| }}|dks|dkr8|dkr|dkr| |d  ||d  kr|d8 }|d8 }n|dkr|dkr|| | ||d  |d  d kr||d  |d8 }|d8 }nU|dkr|| | ||d  | d kr||d  |d8 }n5|dkr-|| | || |d  d kr-|dkr||d  n
|dkr(|d |d8 }n	 |S |dks|dks|S )zCCompute Levenshtein DP and backtrace to find edit positions in ref.c                    s   g | ]	}d g d  qS )r   r   rA   ).0_mrA   rB   
<listcomp>o       z*_levenshtein_backtrace.<locals>.<listcomp>r   r   )lenrh   minsetadd)ro   rp   ndprl   jedited_ref_positionsrA   rs   rB   _levenshtein_backtracek   sB   B
(4(
,



r   ref_normhyp_normc                 C   s   |   }|sdS | dd}|dd}t|}||kr d|fS |s$dS |s*||fS g }t|D ]\}}|D ]}	|| q6q0t||}
t }|
D ]}|t|k rX|||  qIt||fS )zXCompute space_norm_wer for a single sample.
    Returns (error_words, total_words).
    r   r   rW   rV   r   )r^   r`   rw   	enumerateappendr   ry   rz   )r   r   	ref_wordsref_nospacehyp_nospacetotal_wordschar_to_wordwiwordrr   edited_positionstouched_wordsposrA   rA   rB   compute_space_norm_wer_sample   s0   
r   samplesc           /         s  t t}| D ]}||d  | qi  g g }}g g }}g g }}g g }	}
d\}}t| D ]Z}|| }g g }}g g }}g g }}g g }}d\}}d}|D ]}|d }|ddpbd}| sk|d7 }t|t|}}t|t|}}t	|t	|} }!t
|t
|}"}#| sqU|| || r|nd ||r|nd ||r|nd || r| nd ||!r|!nd ||"r|"nd ||#r|#nd t|r|nd|r|nd\}$}%||$7 }||%7 }qUt||d	 }&t||d	 }'t||d	 }(t||d	 })t||d	 }*|dkr || d	 nd
}+t|t|&dt|'dt|(dt|+dt|)dt|*d|t|&|' dt|'|( dt|'|+ dt|'|) ddd	 |< || || || || || || |	| |
| ||7 }||7 }q2|dkr|| d	 nd
},t| tt||d	 dtt||d	 dtt||d	 dt|,dtt|	|
d	 dtt||d	 dd d< dd  D }-t|-}.|.tt fdd|-D |. dtt fdd|-D |. dtt fdd|-D |. dtt fdd|-D |. dtt fdd|-D |. dtt fdd|-D |. dd d<  S )Nlanguager   r   	reference
hypothesisrV   r   <empty>d   r2   r   )raw_to_normnorm_to_numcanonnorm_to_space_normnorm_to_mer)		n_sampleswer_rawwer_normwer_numcanonspace_norm_wermercer_normempty_hypothesesnormalization_delta)r   r   r   r   r   r   r   __overall__c                 S   s   g | ]	}| d s|qS )rr   
startswithrq   krA   rA   rB   ru     rv   z/compute_metrics_for_samples.<locals>.<listcomp>c                 3       | ]	} | d  V  qdS )r   NrA   rq   lresultrA   rB   	<genexpr>      z.compute_metrics_for_samples.<locals>.<genexpr>c                 3   r   )r   NrA   r   r   rA   rB   r     r   c                 3   r   )r   NrA   r   r   rA   rB   r     r   c                 3   r   )r   NrA   r   r   rA   rB   r     r   c                 3   r   )r   NrA   r   r   rA   rB   r     r   c                 3   r   )r   NrA   r   r   rA   rB   r     r   )n_languagesr   r   r   r   r   r   __macro_avg__)r   listr   sortedkeysgetrR   rT   re   rm   rn   r   compute_wercompute_cerrw   roundextendsum)/r   by_langsall_ref_rawall_hyp_rawall_ref_normall_hyp_norm
all_ref_nc
all_hyp_ncall_ref_merall_hyp_merall_snw_errorsall_snw_totallanglang_samplesrefs_rawhyps_raw	refs_norm	hyps_normrefs_nchyps_ncrefs_merhyps_merlang_snw_errorslang_snw_totalempty_countrefhyprrhrrnhnrnchncrmhmewtww_raww_normw_ncw_merc_normw_snwoverall_snw	lang_keysn_langsrA   r   rB   compute_metrics_for_samples   s   







 $$$





r   c                 C   s  g }| D ]}|d }| ddpd}t|}t|}t|}t|}t|}	t|}
t|}t|}zNt|g| r=|ndgd }t|rH|ndg|rN|ndgd }t|	rY|	ndg|
r_|
ndgd }t|rj|nd|ro|nd\}}|dkr~|| d nd}W n t	y   d\}}}}Y nw g }||kr|
d	 ||kr|
d
 ||kr||kr|
d | s|
d td|std|r||kr|
d |dkr|
d ||d kr|
d |
|d |d |||||||	|
t|dt|dt|dt|d|d q|S )Nr   r   rV   r   r   r   r2   )      Y@r   r   r   exact_matchexact_match_normpunctuation_only_diffempty_hypothesis\dnumeric_mismatchP   high_werg      ?spacing_erroridr   r   )r   r   r   r   r   r   ref_numcanonhyp_numcanonref_merhyp_merr   r   r   r   flags)r   re   rm   rn   rT   r   rR   r   r   	Exceptionr   rf   searchr   )r   outr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rA   rA   rB   build_sample_analysis"  sn   
""







r   c           *         s&  t t}| D ]}||d  | qi  i }t| D ]e}|| }t }t }t }d}	d}
d}d}d}d}g }|D ]}|d }|ddpId}t|}t|}| sZ|d7 }zt	|r`|ndg|rf|ndgd }W n t
yx   d	}Y nw ||d
 |f | }| }t|}t|}|D ]}|| ||d }|dkr||  |7  < q|D ]}|| ||d }|dkr||  |7  < qtt|t|}t|D ]}|| || kr||| || f  d7  < qtd|std|r||kr|	d7 }	||kr||kr|
d7 }
q<|jdd d dd |dd  D d d d }dd |d d D }dd |D d d }|rQtdd |D t| nd	||< dd |dD dd |dD dd |dD |	|
||||d|||dd |< qt| dd d} d d | dd  D d d d }!d!d | d d D }"t fd"d D }#t fd#d D }$t| }%|#|% d$krd%}&d&}'n|$|% d'krd(}&d)}'nd*}&d+}'|$|% d,krd-n
|$|% d.krd/nd0}(|#|% d'krd-n
|#|% d.krd/nd0})|&|'|(|)|!|"d1 d2<  S )3Nr   r   r   r   rV   r   r   r   r   r   r   c                 S      | d S Nr   rA   xrA   rA   rB   <lambda>      z&build_error_analysis.<locals>.<lambda>)keyc                 S      g | ]\}}|qS rA   rA   rq   sidrr   rA   rA   rB   ru         z(build_error_analysis.<locals>.<listcomp>c                 S   r  rA   rA   r  rA   rA   rB   ru     r  r   c              	   S   s\   g | ]*}t d |d st d |ddpdr,t|d t|ddp%dkr|d qS )r   r   r   rV   r   )rf   r   r   re   )rq   r   rA   rA   rB   ru     s     c                 s   s    | ]\}}|V  qd S NrA   )rq   rr   wrA   rA   rB   r     s    z'build_error_analysis.<locals>.<genexpr>c                 S   s    g | ]\\}}}|||d qS ))r   r   countrA   )rq   rhcrA   rA   rB   ru     s    

   c                 S      g | ]	\}}||d qS )r   r  rA   rq   r  r  rA   rA   rB   ru         c                 S   r  r  rA   r  rA   rA   rB   ru     r  )numeric_mismatch_countpunctuation_only_countspacing_tokenization_countentity_mismatch_countscript_confusion_countempty_hypothesis_count)worst_samplesbest_samplesnumeric_mismatch_samples)top_substitutionstop_insertionstop_deletionserror_bucketsexamplesc                 S   r   r   rA   r   rA   rA   rB   r    r  c                 S   r  rA   rA   rq   r   rr   rA   rA   rB   ru     r  c                 S   r  rA   rA   r#  rA   rA   rB   ru     r  c                 3   *    | ]}| d s | d d V  qdS )rr   r!  r  Nr   r   r   rA   rB   r        ( c                 3   r$  )rr   r!  r  Nr   r   r   rA   rB   r     r%  g333333?zformatting-limited
formattingg?znumeric-limitednumericzrecognition-limitedrecognitiong333333?highg?moderatelow)model_diagnosisprimary_error_sourcenumeric_verbalization_impactformatting_impactworst_languagesbest_languages__summary__)r   r   r   r   r   r   r   re   rR   r   r   r^   rx   rw   rh   rf   r   sortr   most_commonitems)*r   r   r   lang_wer_normsr   r   subsinsdels
numeric_mm
punct_onlyspacing_tok	entity_mmscript_conf	empty_hypper_sample_werr   r   r   r   swr   	hyp_wordsref_counterhyp_counterr  diffmin_lenrl   worstbestnumeric_samplessorted_langsworst_langs
best_langstotal_punct_onlytotal_numerictotal_samples	diagnosisprimary_sourcenumeric_impactr/  rA   r   rB   build_error_analysis_  s   &
 (	 ((
	rS  config_pathcheckpoint_pathc                    sB  dd l m  m} td|   t| }tt|  j	j	j	}|j
jj}tj|s5tj|||j
j_tj|j
jjsHJ d|j
jj td|j
jj  td}| s_|d t||j
j_t||j
j_d|j_d|j_td	 |jj|j
d d
td|  dd l}dd l}dd lG dd dj dd l m!}	 |	j"d! fdd	}
|
|	_"zt#j|ddd}W |	_"n|	_"w t$t%d}|_&'d| j(|d dd\}}tdt)| dt)| d |rtd|d d   j*j+fdd}|j*_+td ,  -  .t#j/td  S )"Nr   zLoading config from zTokenizer dir not found: zTokenizer dir: z/tmp/dummy_manifest.jsonlzB{"audio_filepath": "/dev/null", "text": "dummy", "duration": 1.0}
r   gpuz!Initializing model from config...)cfgtrainerzLoading checkpoint from c                       s$   e Zd ZdZi Z fddZ  ZS )z"load_model.<locals>._SafeUnpicklerz5Unpickler that creates stubs for any missing classes.c              	      sn   zt  ||W S  ttfy6   | d| }|| jvr/G dd d}||_||_|| j|< | j|  Y S w )N.c                   @   s   e Zd Zdd Zdd ZdS )z;load_model.<locals>._SafeUnpickler.find_class.<locals>.Stubc                 _   s   d S r
  rA   )r>   akwrA   rA   rB   r6     s    zDload_model.<locals>._SafeUnpickler.find_class.<locals>.Stub.__init__c                 S   s$   | j t|tr| d S i  d S r
  )__dict__update
isinstancedict)r>   staterA   rA   rB   __setstate__  s   $zHload_model.<locals>._SafeUnpickler.find_class.<locals>.Stub.__setstate__N)rH   rI   rJ   r6   ra  rA   rA   rA   rB   Stub  s    rb  )r5   
find_classModuleNotFoundErrorAttributeError_stub_cacherH   rJ   )r>   modulenamer  rb  r?   rA   rB   rc    s   

z-load_model.<locals>._SafeUnpickler.find_class)rH   rI   rJ   __doc__rf  rc  rM   rA   rA   r?   rB   _SafeUnpickler  s    rj  data.pklc                    s*   G  fddd}| |||fi |S )Nc                       s"   e Zd Z ZjZjZjZdS )z;load_model.<locals>._patched_load.<locals>.SafePickleModuleN)rH   rI   rJ   	UnpicklerloaddumpsloadsrA   )rj  picklerA   rB   SafePickleModule*  s
    
rq  rA   )zip_filemap_locationpickle_modulepickle_filekwargsrq  )rj  _orig_rebuildrp  rA   rB   _patched_load(  s   z!load_model.<locals>._patched_loadcpuF)rs  weights_onlyi   _lang_embed_module
state_dict)strictz  Loaded state_dict: z
 missing, z unexpected keysz  Missing (first 5): r   c                    sh   | |d\}}t  dr0t  dr0 jd ur0 j}|jd |jd kr- |}|| }d  _||fS )N)audio_signallength_current_lang_ids_lang_embedr   )hasattrr  shaper  )r~  r  encodedencoded_lenrC   	lang_bias)modeloriginal_encoder_forwardrA   rB   encoder_forward_with_langE  s   

z-load_model.<locals>.encoder_forward_with_langz1  Language conditioning enabled (encoder patched)z%Model loaded and ready for inference.)rk  )0nemo.collections.asrcollectionsasrprintr	   rm  rj   r   resolveparentr  	tokenizerdirospathisabsr]   isdirexists
write_texttrain_dsmanifest_filepathvalidation_dsrX  devicesacceleratormodelsEncDecHybridRNNTCTCBPEModelzipfileiorp  rl  torch.serializationserialization_loadr7   r/   NUM_LANGUAGESr  register_moduleload_state_dictrw   encoderrF   evalcudatobfloat16)rT  rU  nemo_asrrW  repo_dirtok_dirdummy_manifestr  r  _tsrx  ckpt
lang_embedmissing
unexpectedr  rA   )rj  rw  r  r  rp  rB   
load_model  s\   

&


r      c              
   C   sH  ddl }ddl}ddl}g }t|}|jdd}ztd||D ]}	t|	| |}
||	|
 }g }tt|d D ]/}|d | }tj|d tj	d}|d	 }t
j|d
|	|  d}|||| || q9g }tt|d D ]}|d | }t|d}|t|d qstj|tjdd| _tjjdtjd | j||d}W d   n1 sw   Y  t|tr|d }t|D ]/\}}t|tr|}nt|dr|j}nt|}||d | |d | |d | |d q|D ]}t
 | q|
}t!d| d| d|| d dd q W |j"|dd |S |j"|dd w )zGTranscribe all samples using NeMo's transcribe API with temp wav files.r   Nmaya_asr_bench_)prefixr   audioarray)dtypesampling_ratesample_z.wavr   r"   r   r  )r  device
batch_sizerN   r   )r   r   r   r   z  Transcribed /z
 samples (r   .1fz%)T)ignore_errors)#r
   tempfileshutilrw   mkdtemprh   rx   npr  float32r  r  r]   writer   LANG_NAME_TO_CODEr   
LANG_TO_IDr7   tensorlongr  ampautocastr  
transcriber^  tupler   rj   r  rN   unlinkr  rmtree)r  datasetr  sfr  r  resultstotaltmpdirbatch_start	batch_endbatch	wav_pathsrl   r  arrsrwav_pathrC   	lang_name	lang_codetranscriptionsr   rN   pelapsed_samplesrA   rA   rB   transcribe_datasetZ  sb   






(5r  c                  C   s  t jdd} | jdtddd | jdtddd | jd	td
d | jdtdd | jdtdd | jdtdd |  }t|j|j |j	 }|j
ddd t|j|j}td ddlm}m} |ddtjddd}|d|ddd}tdt| dtt|d  d  t|d! }td"|d#d$|d% d&d' |d( }| rtd)|  t|}	t|	}
W d    n1 sw   Y  |
d* }|
d+ }tdt| d,|d#d- nMtd. t }t|||jd/}t | }td0|d#d1|| d2d3 t|d4}	tj ||d5|	d6d7 W d    n	1 sw   Y  td8|  td9 t!|}d:}zt"j#$d}W n
 t%yG   Y nw |j|j	|jd;d|jt&|d<t&|d<t&|| d=t'(t)j*+d>|d?d@t,t-dAdBd:dC|dD< |dE }tdFdG  tdH|dI d&dJ|dK d&dL|dM d&dN|dO d&dP	 tdG  tdFdQdRdSdTdUdSdVdWdSdXdYdSdZd[dSd\dW td] t.d^d_ |D D ]-}|| }t|dRdS|d` dadb|dI dcdb|dK dddb|dM dedb|dO dcdP qtdf t/|}tdg t0|}t|dh d4}	tj ||	d<d6di W d    n	1 s'w   Y  t|dj d4}	tj ||	d<d6di W d    n	1 sHw   Y  t|dk d4}	tj ||	d<d6di W d    n	1 siw   Y  tdl| dm tdn tdo tdp d S )qNzBenchmark Maya ASR TDT 1.1B)descriptionz--checkpointTzPath to .ckpt file)typerequiredhelpz--configzPath to training YAML configz--checkpoint-namez
ckpt-60000)r  defaultz
--model-idzparakeet-tdt-1.1b-langz--batch-sizer  z--output-dirz'/home/ubuntu/training/benchmark_outputs)parentsexist_okzLoading benchmark dataset...r   )load_datasetAudioz"BayAreaBoys/indic-asr-benchmark-6ktrainHF_TOKEN)r^   tokenr  i>  )r  decodez	  Loaded z samples across r   z
 languagesdurationz  Total audio: r  zs (i  z.2fzh)z_predictions_cache.jsonz!
Loading cached predictions from r   inference_timez) cached predictions (original inference: zs)z
Starting transcription...r  z
Transcription complete in zs (RTF: z.4f)r  )r   r  F)ensure_asciiz  Cached predictions to z
Computing metrics...unknownzmaya-asr-hybrid-tdt-1.1br   r   z%Y-%m-%dT%H:%M:%SZnemov1jiwer__version__)
checkpointcheckpoint_namemodel_id
model_typer  r  inference_time_sectotal_audio_secrtf	timestamprV  	frameworknormalization_versionjiwer_version__meta__r   
zP================================================================================zOVERALL: wer_norm=r   z%  space_norm=r   z%  mer=r   z%  cer_norm=r   %Languagez<15rW   zWER Rawz>8zWER Normz>9	SpaceNormz>10MERz>7zCER NormzA-----------------------------------------------------------------c                 s   s    | ]
}| d s|V  qdS )rr   Nr   r   rA   rA   rB   r     s    zmain.<locals>.<genexpr>r   z>7.2fz% z>8.2fz>9.2fz>6.2fz
Building sample analysis...zBuilding error analysis...zmetrics.json)indentr  zsample_analysis.jsonzerror_analysis.jsonz
Results written to r  z  - metrics.jsonz  - sample_analysis.jsonz  - error_analysis.json)1argparseArgumentParseradd_argumentrj   rK   
parse_argsr   
output_dirr  r  mkdirr  configr  r  datasetsr  r  r  environr   cast_columnrw   ry   r   r  openjsonrm  timer  r  dumpr   r7   r  get_device_namer   r   r   nowr   utcstrftimegetattr
__import__r   r   rS  )parserargsout_dirr  r  r  dsr  predictions_cachefcacher   r  t0metricsgpu_nameor   rt   sample_analysiserror_analysisrA   rA   rB   main  s   &



8Pr/  __main__)r  )9ri  r  r  r  rf   sysr  rP   r  r   r   r   r   pathlibr   numpyr  r7   r  r   r   r   r   	omegaconfr	   r  r  r  rw   r  r8   Moduler/   compilerZ   ra   rb   rc   r_   r\   rg   rj   rT   re   rm   rn   r   r   r   r_  r   r   rS  r  r  r/  rH   rA   rA   rA   rB   <module>   s`   






	()f= 
nF
s
