o
    i3                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ dd Zd	d
 Zdd ZedkrSeejdd  dS dS )a  
This program performs CTC segmentation to align utterances within audio files.

Inputs:
    `--data-json`:
        A json containing list of utterances and audio files
    `--model`:
        An already trained ASR model

Output:
    `--output`:
        A plain `segments` file with utterance positions in the audio files.

Selected parameters:
    `--min-window-size`:
        Minimum window size considered for a single utterance. The current default value
        should be OK in most cases. Larger values might give better results; too large
        values cause IndexErrors.
    `--subsampling-factor`:
        If the encoder sub-samples its input, the number of frames at the CTC layer is
        reduced by this factor.
    `--frame-duration`:
        This is the non-overlapping duration of a single frame in milliseconds (the
        inverse of frames per millisecond).
    `--set-blank`:
        In the rare case that the blank token has not the index 0 in the character
        dictionary, this parameter sets the index of the blank token.
    `--gratis-blank`:
        Sets the transition cost for blank tokens to zero. Useful if there are longer
        unrelated segments between segments.
    `--replace-spaces-with-blanks`:
        Spaces are replaced with blanks. Helps to model pauses between words. May
        increase length of ground truth. May lead to misaligned segments when combined
        with the option `--gratis-blank`.
    N)CtcSegmentationParametersctc_segmentationdetermine_utterance_segmentsprepare_text)load_trained_model)ASRInterface)LoadInputsAndTargetsc                  C   s  t jdt jt jd} | jdddd | jdtdd	d
 | jddddd | jdtddgdd | jdtddd
 | jddtddd
 | jdtddd
 | jdtdd | jd td!d | jd"tdd#d$ | jd%tdd&d
 | jd'dtd(d) | jd*tdd+d
 | jd,tdd-d
 | jd.tdd/d
 | jd0tdd1d
 | jd2tdd3d
 | jd4tdd5d
 | jd6tdd7d
 | jd8tdd9d
 | jd:tdd;d
 | jd<t d=dd>d$ | S )?zGet default arguments.zXAlign text to audio using CTC segmentation.using a pre-trained speech recognition model.)descriptionconfig_file_parser_classformatter_classz--configTzDecoding config file path.)is_config_filehelpz--ngpur   z$Number of GPUs (max. 1 is supported))typedefaultr   z--dtype)float16float32float64r   z,Float precision (only available in --api v2))choicesr   r   z	--backendpytorchzBackend library)r   r   r   r   z--debugmode   	Debugmodez	--verbosez-VzVerbose optionz--preprocess-confNz-The configuration file for the pre-processingz--data-jsonz+Json of recognition data for audio and text)r   r   z
--utt-textzText separated into utterancesz--modelzModel file parameters to read)r   requiredr   z--model-confzModel config filez
--num-encsz Number of encoders in the model.)r   r   r   z--subsampling-factorzSubsampling factor. If the encoder sub-samples its input, the number of frames at the CTC layer is reduced by this factor. For example, a BLSTMP with subsampling 1_2_2_1_1 has a subsampling factor of 4.z--frame-durationz;Non-overlapping duration of a single frame in milliseconds.z--min-window-sizez-Minimum window size considered for utterance.z--max-window-sizez-Maximum window size considered for utterance.z--use-dict-blankzDEPRECATED.z--set-blankz7Index of model dictionary for blank token (default: 0).z--gratis-blankzSet the transition cost of the blank token to zero. Audio sections labeled with blank tokens can then be skipped without penalty. Useful if there are unrelated audio segments between utterances.z--replace-spaces-with-blankszFill blanks in between words to better model pauses between words. Segments can be misaligned if this option is combined with --gratis-blank. May increase length of ground truth.z--scoring-lengthzFChanges partitioning length L for calculation of the confidence score.z--outputwzOutput segments file)	configargparseArgumentParserYAMLConfigFileParserArgumentDefaultsHelpFormatteraddadd_argumentintstrFileType)parser r#   H/home/ubuntu/.local/lib/python3.10/site-packages/espnet/bin/asr_align.py
get_parserC   s   	r%   c                 C   s0  t  }|| \} }| jdkrtjtjdd n| jdkr&tjtjdd ntjtjdd td | j	dkrF| j
dkrFtd| j
 d	d
}| j	dkr_d}tjd}|du r^td n| j	dkrntd td tdtjdd  td| j  | jdkrt| | ntdtd dS )zRun the main decoding function.r   z>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformat   zSkip DEBUG/INFO messagesr   r   z--dtype z" does not support the CPU backend.cpucudaCUDA_VISIBLE_DEVICESNz CUDA_VISIBLE_DEVICES is not set.zDecoding only supports ngpu=1.zpython path = 
PYTHONPATHz(None)z
backend = r   zOnly pytorch is supported.)r%   parse_known_argsverboseloggingbasicConfigINFODEBUGWARNwarningngpudtype
ValueErrorosenvirongeterrorsysexitinfobackend	ctc_align)argsr"   extradevicecvdr#   r#   r$   main   sF   








rE   c                  C   st  t | j\}}t|tsJ tddd| jdu r|jn| jddid}td|  t|dr5|j	j
j}nt|d	r@|jj
j}nd
}td|  td|jj
j  d|vr^td |j|d  t| jd}t|d }W d   n1 s~w   Y  t| jddde}| }d}	i }
i }| D ]N}g }g }|	t|k r||	 |r|||	 ||	 dd d  |||	 d||	 d  |	d7 }	|	t|k r||	 |s||
|< |||< qW d   n1 sw   Y  t }d}d}| jdur| j}| jdur| j}t|dr|| d |_n||_||_ | j!dur/| j!|_!| j"dur9| j"|_"|j#|_#| j$durHtd | j%durR| j%|_&| j'durc| j'r`d|_'nd|_'| j(rjd|_)|j)rw| j'rwt*d | j+dur| j+|_,td| d|  t-| dD ]\}}td| |t|  ||| fg}||\}}|d }t./ " |0t.1||2d}|j3|d 4 5 }W d   n	1 sw   Y  t6||
| \}}t7|||\}}}t8d |  t9|||||
| }t-|D ])\}	}|| |	  d| d|d d!d|d d!d|d" d#d$
}| j:;| qqdS )%aw  ESPnet-specific interface for CTC segmentation.

    Parses configuration, infers the CTC posterior probabilities,
    and then aligns start and end of utterances using CTC segmentation.
    Results are written to the output file given in the args.

    :param args: given configuration
    :param device: for inference; one of ['cuda', 'cpu']
    :return:  0 on success
    asrTFNtrain)modeload_outputsort_in_input_lengthpreprocess_confpreprocess_argszDecoding device=encencoderUnknownzEncoder module: zCTC module:     rnnz8No BLSTM model detected; memory consumption may be high.)rC   rbuttsrzutf-8)encodingr    r   
   index_durationi  zNThe option --use-dict-blank is deprecated. If needed, use --set-blank instead.zBlanks are inserted between words, and also the transition cost of blank is zero. This configuration may lead to misalignments!zFrame timings: zms * z(%d/%d) Aligning zstate_list = z.2fr(   z.9f
)<r   model
isinstancer   r   rK   r/   r>   hasattrrM   	__class__
__module__rN   ctcr4   toevalopen	data_jsonjsonloadutt_text	readlineskeyslen
startswithappendfindr   subsampling_factorframe_durationrW   frame_duration_msmin_window_sizemax_window_size	char_listuse_dict_blank	set_blankblankreplace_spaces_with_blanksgratis_blankblank_transition_cost_zeror;   scoring_lengthscore_min_mean_over_L	enumeratetorchno_gradencode	as_tensor	unsqueezelog_softmaxr)   numpyr   r   debugr   outputwrite) rA   rC   rY   
train_argsload_inputs_and_targetsencoder_modulefjslinesitextsegment_namesnametext_per_audiosegment_names_per_audioconfigrl   rn   idxbatchfeatlabel
enc_outputlpzground_truth_matutt_begin_indicestimings
char_probs
state_listsegmentsboundaryutt_segmentr#   r#   r$   r@      s   
	


$ 


 r@   __main__r   )__doc__rc   r/   r8   r<   r   r{   r   r   r   r   #espnet.asr.pytorch_backend.asr_initr   espnet.nets.asr_interfacer   espnet.utils.io_utilsr   r%   rE   r@   __name__argvr#   r#   r#   r$   <module>   s"   $x,}