o
    }oi
k                     @   s>  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZ d dlZd dlmZ d dlmZ d dlm  mZ d dlmZ d d	lmZmZ d d
lmZmZ d dl m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) 			dMde!de*de*de*de*de+dee, dee+ deej- fddZ.		dNde!de/de*de*dede*deee* e*f de+dee, deej- fddZ0			 	!dOde"dede*deee* e*f de+dee, de/d"e1deej- fd#d$Z2d%ee+ deej- fd&d'Z3d(ed)ej4deee+f fd*d+Z5d(edeee+ e1f fd,d-Z6dPd.e+d/e1dee7 fd0d1Z8d2e+d3e,de,fd4d5Z9d(ed6e+defd7d8Z:d"e7fd9d:Z;		!	!dQd3eeej- eeej-  ee+ f d(ed6e+dee+ d;e1d"e1dee+e+f fd<d=Z<d>d?d@gg dAdfd2e+dBe+dCe+dDee+ dEee+ dFe+de7fdGdHZ=G dIdJ dJZ>eG dKdL dLZ?dS )R    N)	dataclass)Path)NamedTemporaryFile)ListOptionalTupleUnion)
DictConfig)tqdm)word_error_rate)ASRModelEncDecMultiTaskModel)manifest_utils
rnnt_utils)FrameBatchASRFrameBatchMultiTaskAED)OccurancePunctuationErrorRate)get_full_path)loggingmodel_utilscpuasrtokens_per_chunkdelaymodel_stride_in_secs
batch_sizemanifest	filepathsacceleratorreturnc              	   C   s  g }g }	|r|rt d|du r|du rt d|r`g }t|ddd5}
td |
D ]&}| }|s3q*t|}t|d |d	}|| d
|v rP|	|d
  q*W d   n1 s[w   Y  t	  tj
|dkrndnd g }d| _ttt|dt|dD ]7}|||  t||krdd |D }|   | ||| | ||}|| |  |  j|7  _qt|dkrt|| _t|| j_|   dd |D }| ||| | ||}|| |  |  jt|7  _W d   n	1 sw   Y  W d   n	1 sw   Y  tjdddv rMt|	dkr8td |D ]}td| q.nt||	D ]\}}td| td| q=t|}|S )z
    Moved from examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py
    Write all information presented in input manifest to output manifest and removed WER calculation.
    *Please select either filepaths or manifestN.Either filepaths or manifest shoud not be Nonerutf_8encodingzParsing manifest files...audio_filepath
audio_filemanifest_filetextr   cudar   Sample:)desctotalc                 S      g | ]}|qS  r0   .0sampler0   r0   e/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/transcribe_utils.py
<listcomp>Q       z/get_buffered_pred_feat_rnnt.<locals>.<listcomp>c                 S   r/   r0   r0   r1   r0   r0   r4   r5   `   r6   DEBUG01yt#ground-truth text does not present!hyp:ref:)
ValueErroropenprintstripjsonloadsr   appendtorchinference_modeampautocastsample_offsetr
   rangelenresetread_audio_file
transcribeextendclearr   frame_buffererosenvirongetzipwrap_transcription)r   r   r   r   r   r   r   r   hypsrefsmfst_fLrowr(   batchidxaudio_fileshyp_listhyprefwrapped_hypsr0   r0   r4   get_buffered_pred_feat_rnnt%   s|   






re   	frame_lenpreprocessor_cfgdevicec	                 C   s  d|_ tjj|}	|	| g }
g }|r|rtd|du r'|du r'td|rIt|ddD ]}|   | 	||| | 
||}|
| q/nUt|ddd	F}t|ddD ]7}|   | }|sdqWt|}d
|v rt||d
  t|d |d}| 	||| | 
||}|
| qWW d   n1 sw   Y  tjdddv rt|dkrtd |
D ]}td| qnt|
|D ]\}}td| td| qt|
}|S )z
    Moved from examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py
    Write all information presented in input manifest to output manifest and removed WER calculation.
    Noner    Nr!   r,   )r-   r"   r#   r$   r*   r&   r'   r7   r8   r9   r   r=   r>   r?   )	normalizenemo_asrmodelsEncDecCTCModelBPEfrom_config_dicttor@   r
   rN   rO   rP   rF   rA   rC   rD   rE   r   rT   rU   rV   rM   rB   rW   rX   )r   rf   r   r   rg   r   rh   r   r   preprocessorrY   rZ   r\   rb   r[   r]   r(   rc   rd   r0   r0   r4   get_buffered_pred_featv   sT   


rq           F
timestampsc              
   C   s  d|_ t|}|| g }	g }
|r|rtd|d u r%|d u r%td|r_td t|dt|ddD ]'}|dd	d
d	dd|rCdndd}| 	  | j
||||d |  }|	| q6net|dddV}t| }t|dt|ddD ]=}| 	  | }|sqwt|}|rd|d< d|v r|
|d  t|d |d}| j
||||d |  }|	| qwW d    n1 sw   Y  t|	}|S )Nri   r    r!   zDeteced audio files as input, default to English ASR with Punctuation and Capitalization output.                 Please use manifest input for other options.zTranscribing:P   )r-   r.   ncolsi enr   yesnothingno)r&   durationsource_langtasknametarget_langpncanswer	timestamp)	meta_datar"   r#   r$   r   r*   r&   r'   )rj   r   rn   ro   r@   r   infor
   rM   rN   rO   rP   rF   rA   list	readlinesrC   rD   rE   r   rX   )r   rg   r   rh   r   r   r   rs   rp   rY   rZ   r(   metarb   finlinesliner3   rd   r0   r0   r4   #get_buffered_pred_feat_multitaskAED   sb   




r   rY   c                 C   s@   t | d tjr
| S g }| D ]}tjdg |d}|| q|S )zEWrap transcription to the expected format in func write_transcriptionr   rr   )score
y_sequencer*   )
isinstancer   
HypothesisrF   )rY   rd   rb   
hypothesisr0   r0   r4   rX      s   rX   cfgmap_locationc                 C   s   | j dur9| j dkr9tj| j dd}|j}t|}td|j  |j| j |d}t	j
t	j
| j d }ntj| j|d}| j}t| d	rat|d
ra|j| jjdd| jjddd ||fS )zBSetup model from cfg and return model and model name for next stepNri   T)restore_pathreturn_configzRestoring model : )r   r   r   )
model_namer   model_changechange_attention_modelself_attention_modelatt_context_size)r   r   )
model_pathr   restore_fromtargetr   import_class_by_pathr   r   __name__rT   pathsplitextbasenamefrom_pretrainedpretrained_namehasattrr   r   	conformerrV   )r   r   	model_cfg	classpathimported_class	asr_modelr   r0   r0   r4   setup_model  s*   
r   c           	      C   s  d}d}| j dur#| js#ttjtj| j d| j dd}||fS g }t| j	j
dkr:td| j	 d dS | dd	}t| j	d
9}|D ].}| }|sRqIt|}t|| | j	||< |ddu rw| jrwtd| d| j	 dqIW d   n1 sw   Y  tdddd/}t| j	| jdD ]}t|| | j	d}||d	< || |t|d  qW d   n1 sw   Y  |j}||fS )a]  
    Prepare audio data for transcription.
    Args:
        cfg (DictConfig): Configuration dictionary containing the following parameters:
            - audio_dir (str): Path to the directory containing audio files.
            - append_pred (bool): Flag indicating whether to append predictions to an existing dataset.
            - audio_type (str): Type of audio files to consider.
            - dataset_manifest (str): Path to the dataset manifest file.
            - audio_key (str, optional): Key in the manifest file specifying the audio file path.
                Defaults to 'audio_filepath'.
            - presort_manifest (bool, optional): Flag indicating whether to presort the manifest file.
                Defaults to True.
    Returns:
        Tuple[List[str], bool]: A tuple containing the following:
            - filepaths (List[str]): List of filepaths to the audio files if path to the directory
                containing audio files is provided.
            - sorted_manifest_path (bool): Path to the sorted manifest file if path to the dataset
                manifest file is provided.
    Nz**/*.T)	recursiver   zThe input dataset_manifest z is empty. Exiting!	audio_keyr&   rtrz   z*Requested presort_manifest=True, but line z in manifest z6                             lacks a 'duration' field.w.jsonF)modesuffixdelete)try_sortr'   
)	audio_dirappend_predr   globrT   r   join
audio_typestatdataset_manifestst_sizer   errorrV   rA   rC   rD   rE   r   presort_manifestr@   r   read_and_maybe_sort_manifestrF   writedumpsname)	r   r   sorted_manifest_pathr   fhr   itemfr(   r0   r0   r4   prepare_audio_data&  sD   &

r   r   r   c                 C   s6   t | }|rtdd |D rt|ddd d}|S )zDSorts the manifest if duration key is available for every utterance.c                 s   s$    | ]}d |v o|d  duV  qdS )rz   Nr0   r2   r   r0   r0   r4   	<genexpr>c  s   " z/read_and_maybe_sort_manifest.<locals>.<genexpr>Tc                 S   s   | d S )Nrz   r0   )r   r0   r0   r4   <lambda>d  s    z.read_and_maybe_sort_manifest.<locals>.<lambda>reversekey)r   read_manifestallsorted)r   r   itemsr0   r0   r4   r   `  s   
r   manifest_pathtranscriptionsc           	      C   s   t | dd}dd t|D }W d    n1 sw   Y  tdd |D s*|S dd t|dd	d
 dD }~t|d t}|rHtt| }d gt| }t|D ]
\}}|| ||< qS|ritt	tt| }|S )Nutf-8r$   c                 S   s*   g | ]\}}|  d kr|t|fqS  rC   rD   rE   )r2   r_   lr0   r0   r4   r5   j  s   * z/restore_transcription_order.<locals>.<listcomp>c                 s   s,    | ]}d |d v o|d d  duV  qdS )rz      Nr0   r   r0   r0   r4   r   k  s   * z.restore_transcription_order.<locals>.<genexpr>c                 S   s   g | ]}|d  qS )r   r0   r   r0   r0   r4   r5   m      Tc                 S   s   | d d S )Nr   rz   r0   )itr0   r0   r4   r   m  s    z-restore_transcription_order.<locals>.<lambda>r   r   )
rA   	enumerater   r   r   r   rW   rM   tuplemap)	r   r   r   r   new2oldis_list	reorderednewoldr0   r0   r4   restore_transcription_orderh  s    r   r   c                 C   sz   | j du r;| jdurtjtj| jdd | _ | S | jdur/| jdd| j d| _ | S | jdd| d| _ | S )z2Compute filename of output manifest and update cfgN.r   _)	output_filenamer   rT   r   dirnamer   pred_name_postfixr   replace)r   r   r0   r0   r4   compute_output_filenamez  s   


r   c                 C   sF   t t| D ]}t| | d | | d< t| | d | | d< q| S )a  
    Normalize the dictionary of timestamp values to JSON serializable values.
    Expects the following keys to exist -
        "start_offset": int-like object that represents the starting index of the token
            in the full audio after downsampling.
        "end_offset": int-like object that represents the ending index of the token
            in the full audio after downsampling.

    Args:
        timestamps: Nested dict.

    Returns:
        Normalized `timestamps` dictionary (in-place normalized)
    start_offset
end_offset)rL   rM   int)rs   val_idxr0   r0   r4   normalize_timestamp_output  s   r   compute_langsc              	   C   sx  |j rtd|j d |jdur|j}n|}d| }nd}d}t| d tr-| }	d}nct| d tjrA| }	|j	j
js@J d	nOt| d trt| d d tjrg g }	}
| D ]3}|	|d  |j	j
jsg }|D ]}t|jtjr{|j  n|j}||j|f qk|
| qYntt|jjjddd
 t|jdddd	}|jdurt|	D ]`\}}|sd|| ||i}nGd|| ||ji}|r|j}|durt|tr|dd | D ]}t|| }||| < q|r|j |d< |j!|d< |j	j
js|
| |d< |"t#$|d  qnt|j%ddd}t|D ]w\}}|& }|s,qt#'|}|s;|	| ||< nP|	| j||< |rn|	| j}|durnt|trn|dd | D ]}t|| }||| < q_|r|	| j |d< |	| j!|d< |j	j
js|
| |d< |"t#$|d  qW d   n	1 sw   Y  W d   n	1 sw   Y  |j|fS )z-Write generated transcription to output file.z Transcripts will be written in "z" fileN
pred_text_	pred_textTr   Fz+Works only with return_best_hypothesis=true)parentsexist_okr   r   r   )r%   newliner&   timestep	pred_langpred_lang_charsbeamsr"   r$   )(r   r   r   r   r   r   strr   r   decodingbeamreturn_best_hypothesisr   rF   r   rG   Tensornumpyr   r*   	TypeErrorr   parentmkdirrA   r   r   r   dictpopkeysr   langslangs_charsr   rD   r   r   rC   rE   )r   r   r   r   r   rs   pred_by_model_namepred_text_attr_namereturn_hypotheses	best_hypsr   rY   r   rb   r   r   r_   transcriptionr   r   valuesfrr   r0   r0   r4   write_transcription  s   	



"
	




6r  r*   r   wer)r   ,?reference_fieldhypothesis_fieldmetricspunctuation_marksoutput_manifest_pathc                 C   sH  g d}t |dkrtd| d|D ]}||vr%td| d| dqd|v r9t |dkr4tdt|d	}d
|v }	d|v }
d|v }t| d}| }dd |D }g }tdd| d t|D ]u}|| }|| }|	rt	|g|gdd}t
d| d|d
< |
rt	|g|gdd}t
d| d|d< |r|j||d\}}}t
d|j d|d< t
d|j d|d< t
d|j d|d< t
d|j d|d< t
d|j d|d< || qhW d   n1 sw   Y  |dur"t|d}|D ]}t|}|| d qW d   n	1 sw   Y  td |  |S )!a  
    Computes metrics per sample for given manifest

    Args:
        manifest_path: str, Required - path to dataset JSON manifest file (in NeMo format)
        reference_field: str, Optional - name of field in .json manifest with the reference text
            ("text" by default).
        hypothesis_field: str, Optional - name of field in .json manifest with the hypothesis text
            ("pred_text" by default).
        metrics: list[str], Optional - list of metrics to be computed
            (currently supported "wer", "cer", "punct_er")
        punctuation_marks: list[str], Optional - list of punctuation marks for computing
            punctuation error rate ([".", ",", "?"] by default).
        output_manifest_path: str, Optional - path where .json manifest with calculated metrics will be saved.

    Returns:
        samples: dict - Dict of samples with calculated metrics
    )r  cerpunct_err   zL'metrics' list is empty.             Select the metrics from the supported: r   'zK' metric is not supported.                 Currently supported metrics are r  zHpunctuation_marks list can't be empty when 'punct_er' metric is enabled.)r  r  r  r"   c                 S   s"   g | ]}|  d krt|qS r   r   r2   r   r0   r0   r4   r5   6  s   " z.compute_metrics_per_sample.<locals>.<listcomp>z
Computing z, z per sampleF)
hypotheses
referencesuse_cerd      T)	referencer   punct_correct_ratepunct_deletions_ratepunct_insertions_ratepunct_substitutions_ratepunct_error_rateNr   r   zOutput manifest saved: )rM   AssertionErrorr   rA   r   r   r   r   r
   r   roundcomputecorrect_ratedeletions_rateinsertions_ratesubstitutions_rater  rF   rD   r   
writelines)r   r  r  r  r  r  supported_metricsmetricoper_objuse_werr  use_punct_err   r   samplessamples_with_metricsr3   r   r   
sample_wer
sample_ceroperation_amountssubstitution_amountspunctuation_ratesoutputr   r0   r0   r4   compute_metrics_per_sample   sr   


r;  c                   @   sh   e Zd ZdefddZdee dee fddZdee dee fdd	Zdee dee fd
dZdS )PunctuationCapitalizationr  c                 C   s8   |rt dd| d| _t d| _dS d| _dS )a  
        Class for text processing with punctuation and capitalization. Can be used with class TextProcessingConfig.

        Args:
            punctuation_marks (str): String with punctuation marks to process.
        Example: punctuation_marks = '.,?'
        z([r   z])z\s{2,}N)recompiler   regex_punctuationregex_extra_space)selfr  r0   r0   r4   __init__^  s   
z"PunctuationCapitalization.__init__r   r   c                         j d ur fdd|D S |S )Nc              	      s(   g | ]} j d  jd| qS ) z \1 r@  subr?  rC   r  rA  r0   r4   r5   n  s    zBPunctuationCapitalization.separate_punctuation.<locals>.<listcomp>r?  rA  r   r0   rG  r4   separate_punctuationl  s
   

z.PunctuationCapitalization.separate_punctuationc                 C   s   dd |D S )Nc                 S   s   g | ]}|  qS r0   )lowerr  r0   r0   r4   r5   u  r   z:PunctuationCapitalization.do_lowercase.<locals>.<listcomp>r0   rI  r0   r0   r4   do_lowercaset  s   z&PunctuationCapitalization.do_lowercasec                    rC  )Nc              	      s(   g | ]} j d  jd | qS )rD  rE  r  rG  r0   r4   r5   y  s   ( z<PunctuationCapitalization.rm_punctuation.<locals>.<listcomp>rH  rI  r0   rG  r4   rm_punctuationw  s   
z(PunctuationCapitalization.rm_punctuationN)	r   
__module____qualname__r   rB  r   rJ  rL  rM  r0   r0   r0   r4   r<  ]  s
    r<  c                   @   s>   e Zd ZU dZeed< dZeed< dZeed< dZ	eed< dS )	TextProcessingConfigr   r  FrL  rM  TrJ  N)
r   rN  rO  r  r   __annotations__rL  boolrM  rJ  r0   r0   r0   r4   rP  ~  s
   
 rP  )NNr   )NN)NNrr   F)F)NFF)@r   rD   rT   r=  dataclassesr   pathlibr   tempfiler   typingr   r   r   r   rG   	omegaconfr	   	tqdm.autor
   nemo.collections.asrcollectionsr   rk    nemo.collections.asr.metrics.werr   nemo.collections.asr.modelsr   r    nemo.collections.asr.parts.utilsr   r   0nemo.collections.asr.parts.utils.streaming_utilsr   r   (nemo.collections.common.metrics.punct_err   4nemo.collections.common.parts.preprocessing.manifestr   
nemo.utilsr   r   r   r   r   r   re   floatrq   rR  r   rX   rh   r   r   r  r   r   r   r   r  r;  r<  rP  r0   r0   r0   r4   <module>   s  		
Y	

F	
E : 

f
]!