o
    i0~                     @   s@  d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlZddlmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddl m!Z! G dd dZ"G dd dZ#		d de
e$e%f de%de%dede	de	de&de&fddZ'dd Z(d!ddZ)e*dkre)  dS dS )"z@Perform CTC segmentation to align utterances within audio files.    N)Path)ListOptionalTextIOUnion)CtcSegmentationParametersctc_segmentationdetermine_utterance_segmentsprepare_textprepare_token_list)check_argument_typescheck_return_type)ASRTask)	to_device)config_argparse)str2boolstr_or_none)get_commandline_argsc                   @   s`   e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZdZdZdZdZdZdd Zdd	 Zd
d ZdS )CTCSegmentationTasku  Task object for CTC segmentation.

    When formatted with str(·), this object returns
    results in a kaldi-style segments file formatting.
    The human-readable output can be configured with
    the printing options.

    Properties:
        text: Utterance texts, separated by line. But without the utterance
            name at the beginning of the line (as in kaldi-style text).
        ground_truth_mat: Ground truth matrix (CTC segmentation).
        utt_begin_indices: Utterance separator for the Ground truth matrix.
        timings: Time marks of the corresponding chars.
        state_list: Estimated alignment of chars/tokens.
        segments: Calculated segments as: (start, end, confidence score).
        config: CTC Segmentation configuration object.
        name: Name of aligned audio file (Optional). If given, name is
            considered when generating the text.
        utt_ids: The list of utterance names (Optional). This list should
            have the same length as the number of utterances.
        lpz: CTC posterior log probabilities (Optional).

    Properties for printing:
        print_confidence_score: Includes the confidence score.
        print_utterance_text: Includes utterance text.
    NFuttTc                 K   s   | j di | dS )zInitialize the module.N )setselfkwargsr   r   I/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/asr_align.py__init__P   s   zCTCSegmentationTask.__init__c                 K   s>   |D ]}| dst| |r|| durt| |||  qdS )zUpdate properties.

        Args:
            **kwargs: Key-value dict that contains all properties
                with their new values. Unknown properties are ignored.
        _N)
startswithhasattrsetattr)r   r   keyr   r   r   r   T   s   zCTCSegmentationTask.setc                    s   d}t  j} jdu r fddt|D }n|t  jks!J  j}t jD ]>\}}||  d j d}||d dd|d d7 } jrT|d|d	 d
7 } jra|d j|  7 }||d 7 }q)|S )z0Return a kaldi-style ``segments`` file (string). Nc                    s   g | ]} j  d |dqS )r   04)name).0ir   r   r   
<listcomp>h       z/CTCSegmentationTask.__str__.<locals>.<listcomp> r   z.2f      z3.4f
)	lensegmentsutt_idsrange	enumerater$   print_confidence_scoreprint_utterance_texttext)r   outputnum_utts	utt_namesr&   boundary	utt_entryr   r'   r   __str__c   s   

zCTCSegmentationTask.__str__)__name__
__module____qualname____doc__r5   ground_truth_matutt_begin_indicestimings
char_probs
state_listr/   configdoner$   r0   lpzr3   r4   r   r   r;   r   r   r   r   r   #   s&    r   c                   @   s0  e Zd ZdZdZdZdZddgZdZddgZ	dZ
e Z					
				d.deeef deeef dededededededefddZdd Zd/ddZd0ddZe deejejf fd d!Zd"d# Zd/d$d%Zed&efd'd(Z 		d/deejejf d)ee!e ef de"e d*e"e d+ef
d,d-Z#dS )1CTCSegmentationu
  Align text to audio using CTC segmentation.

    Usage:
        Initialize with given ASR model and parameters.
        If needed, parameters for CTC segmentation can be set with ``set_config(·)``.
        Then call the instance as function to align text within an audio file.

    Example:
        >>> # example file included in the ESPnet repository
        >>> import soundfile
        >>> speech, fs = soundfile.read("test_utils/ctc_align_test.wav")
        >>> # load an ASR model
        >>> from espnet_model_zoo.downloader import ModelDownloader
        >>> d = ModelDownloader()
        >>> wsjmodel = d.download_and_unpack( "kamo-naoyuki/wsj" )
        >>> # Apply CTC segmentation
        >>> aligner = CTCSegmentation( **wsjmodel )
        >>> text=["utt1 THE SALE OF THE HOTELS", "utt2 ON PROPERTY MANAGEMENT"]
        >>> aligner.set_config( gratis_blank=True )
        >>> segments = aligner( speech, text, fs=fs )
        >>> print( segments )
        utt1 utt 0.27 1.72 -0.1663 THE SALE OF THE HOTELS
        utt2 utt 4.54 6.10 -4.9646 ON PROPERTY MANAGEMENT

    On multiprocessing:
        To parallelize the computation with multiprocessing, these three steps
        can be separated:
        (1) ``get_lpz``: obtain the lpz,
        (2) ``prepare_segmentation_task``: prepare the task, and
        (3) ``get_segments``: perform CTC segmentation.
        Note that the function `get_segments` is a staticmethod and therefore
        independent of an already initialized CTCSegmentation object.

    References:
        CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition
        2020, Kürzinger, Winkelbauer, Li, Watzel, Rigoll
        https://arxiv.org/abs/2007.09127

    More parameters are described in https://github.com/lumaku/ctc-segmentation

    >  NautofixedtokenizeclassicFr   r+   float32Tasr_train_configasr_model_filefsngpu
batch_sizedtypekaldi_style_texttext_convertertime_stampsc
                 K   s0  t  sJ |dkrtdd}|dkrd}n|dkr#td tdt|||\}}|jtt|d	  t
|d| _t|d	rI|jjj}nd
}td|  td|jjj  d| vritd || _|| _|| _|| _|j| _|| _|j| _| jd||	||d|
 |jdd | j_dS )aX  Initialize the CTCSegmentation module.

        Args:
            asr_train_config: ASR model config file (yaml).
            asr_model_file: ASR model file (pth).
            fs: Sample rate of audio file.
            ngpu: Number of GPUs. Set 0 for processing on CPU, set to 1 for
                processing on GPU. Multi-GPU aligning is currently not
                implemented. Default: 0.
            batch_size: Currently, only batch size == 1 is implemented.
            dtype: Data type used for inference. Set dtype according to
                the ASR model.
            kaldi_style_text: A kaldi-style text file includes the name of the
                utterance at the start of the line. If True, the utterance name
                is expected as first word at each line. If False, utterance
                names are automatically generated. Set this option according to
                your input data. Default: True.
            text_converter: How CTC segmentation handles text.
                "tokenize": Use ESPnet 2 preprocessing to tokenize the text.
                "classic": The text is preprocessed as in ESPnet 1 which takes
                token length into account. If the ASR model has longer tokens,
                this option may yield better results. Default: "tokenize".
            time_stamps: Choose the method how the time stamps are
                calculated. While "fixed" and "auto" use both the sample rate,
                the ratio of samples to one frame is either automatically
                determined for each inference or fixed at a certain ratio that
                is initially determined by the module, but can be changed via
                the parameter ``samples_to_frames_ratio``. Recommended for
                longer audio files: "auto".
            **ctc_segmentation_args: Parameters for CTC segmentation.
        r+   z!Batch decoding is not implementedcpucudazMulti-GPU not yet implemented.z%Only single GPU decoding is supported)rT   FencoderUnknownzEncoder module: zCTC module:     rnnz6No RNN model detected; memory consumption may be high.)rQ   rW   rU   rV   Nr   )r   NotImplementedErrorloggingerrorr   build_model_from_filetogetattrtorchevalbuild_preprocess_fnpreprocess_fnr   rZ   	__class__r=   infoctclowerwarning	asr_modelasr_train_argsdevicerT   rU   
token_list
set_configrE   	char_list)r   rO   rP   rQ   rR   rS   rT   rU   rV   rW   ctc_segmentation_argsro   rm   rn   encoder_moduler   r   r   r      sJ   
,


zCTCSegmentation.__init__c                 K   s  d|v r|d | j vrtdt| j  |d | _d|v r%t|d | _d|v r0t|d | _d|v rCt|d ts=J |d | j	_
d|v rOt|d | j	_d|v rat|d ts\J |d | _d|v r{|d | jvrvtd	t| j |d | _d
|v rt|d
 tsJ |d
 | j	_d|v rt|d tsJ |d | j	_d|v rt|d | j	_| j	jr| j	jr| jstd d| _d|v rt|d tsJ |d | j	_dS dS )as  Set CTC segmentation parameters.

        Parameters for timing:
            time_stamps: Select method how CTC index duration is estimated, and
                thus how the time stamps are calculated.
            fs: Sample rate.
            samples_to_frames_ratio: If you want to directly determine the
                ratio of samples to CTC frames, set this parameter, and
                set ``time_stamps`` to "fixed".
                Note: If you want to calculate the time stamps as in
                ESPnet 1, set this parameter to:
                ``subsampling_factor * frame_duration / 1000``.

        Parameters for text preparation:
            set_blank: Index of blank in token list. Default: 0.
            replace_spaces_with_blanks: Inserts blanks between words, which is
                useful for handling long pauses between words. Only used in
                ``text_converter="classic"`` preprocessing mode. Default: False.
            kaldi_style_text: Determines whether the utterance name is expected
                as fist word of the utterance. Set at module initialization.
            text_converter: How CTC segmentation handles text.
                Set at module initialization.

        Parameters for alignment:
            min_window_size: Minimum number of frames considered for a single
                utterance. The current default value of 8000 corresponds to
                roughly 4 minutes (depending on ASR model) and should be OK in
                most cases. If your utterances are further apart, increase
                this value, or decrease it for smaller audio files.
            max_window_size: Maximum window size. It should not be necessary
                to change this value.
            gratis_blank: If True, the transition cost of blank is set to zero.
                Useful for long preambles or if there are large unrelated segments
                between utterances. Default: False.

        Parameters for calculation of confidence score:
            scoring_length: Block length to calculate confidence score. The
                default value of 30 should be OK in most cases.
        rW   u+   Parameter ´time_stamps´ has to be one of rQ   samples_to_frames_ratio	set_blankreplace_spaces_with_blanksrU   rV   u.   Parameter ´text_converter´ has to be one of min_window_sizemax_window_sizegratis_blankzBlanks are inserted between words, and also the transition cost of blank is zero. This configuration may lead to misalignments!Tscoring_lengthN)choices_time_stampsr^   listrW   floatrQ   ru   
isinstanceintrE   blankboolrw   rU   choices_text_converterrV   rx   ry   blank_transition_cost_zerowarned_about_misconfigurationr_   r`   score_min_mean_over_Lr   r   r   r   rq     sh   )


zCTCSegmentation.set_configc                 C   sh   d| j ji}| jdkr| jdu r|  }|| _| j| j }n| jdks%J || }|| j }||d< |S )z+Obtain parameters to determine time stamps.index_durationrK   NrJ   )rE   r   rW   ru    estimate_samples_to_frames_ratiorQ   )r   
speech_lenlpz_len
timing_cfgratior   ru   r   r   r   get_timing_configj  s   


z!CTCSegmentation.get_timing_config H c                 C   s2   t |}| |}|jd }|d }|| }|S )a>  Determine the ratio of encoded frames to sample points.

        This method helps to determine the time a single encoded frame occupies.
        As the sample rate already gave the number of samples, only the ratio
        of samples per encoded CTC frame are needed. This function estimates them by
        doing one inference, which is only needed once.

        Args:
            speech_len: Length of randomly generated speech vector for single
                inference. Default: 215040.

        Returns:
            samples_to_frames_ratio: Estimated ratio.
        r   r+   )rd   randget_lpzshape)r   r   random_inputrG   r   ru   r   r   r   r   }  s   


z0CTCSegmentation.estimate_samples_to_frames_ratiospeechc                 C   s   t |tjrt|}|dtt| j}|j	dgtj
|dd}||d}t|| jd}| jjdi |\}}t|dksGJ t|| j| }|d  }|S )zObtain CTC posterior log probabilities for given speech data.

        Args:
            speech: Speech audio input.

        Returns:
            lpz: Numpy vector with CTC log posterior probabilities.
        r   r+   )rT   
fill_value)r   speech_lengths)ro   Nr   )r   npndarrayrd   tensor	unsqueezerb   rc   rT   new_fulllongsizer   ro   rm   encoder.   rj   log_softmaxdetachsqueezerX   numpy)r   r   lengthsbatchencr   rG   r   r   r   r     s   


zCTCSegmentation.get_lpzc                 C   sr   d}t |tr| }ttt|}| jr5dd |D }tdd |}t|}dd |D }dd |D }||fS )z/Convert text to list and extract utterance IDs.Nc                 S      g | ]}| d dqS )r*   r+   )splitr%   r   r   r   r   r(         z/CTCSegmentation._split_text.<locals>.<listcomp>c                 S   s   t | dkS )Nr,   )r.   )uir   r   r   <lambda>  s    z-CTCSegmentation._split_text.<locals>.<lambda>c                 S      g | ]}|d  qS )r   r   r   r   r   r   r(         c                 S   r   )r+   r   r   r   r   r   r(     r   )r   str
splitlinesr}   filterr.   rU   )r   r5   r0   utt_ids_and_textr   r   r   _split_text  s   
zCTCSegmentation._split_textc              	      s    j }|dur|jd } ||}|jdi |  |\}} jdkrF fdd|D }	|jdfdd|	D }	t||	\}
}n' jdksMJ  fd	d|D } fd
d|D }	dd |	D }	t	||	\}
}t
||||
|||d}|S )u	  Preprocess text, and gather text and lpz into a task object.

        Text is pre-processed and tokenized depending on configuration.
        If ``speech_len`` is given, the timing configuration is updated.
        Text, lpz, and configuration is collected in a CTCSegmentationTask
        object. The resulting object can be serialized and passed in a
        multiprocessing computation.

        A minimal amount of text processing is done, i.e., splitting the
        utterances in ``text`` into a list and applying ``text_cleaner``.
        It is recommended that you normalize the text beforehand, e.g.,
        change numbers into their spoken equivalent word, remove special
        characters, and convert UTF-8 characters to chars corresponding to
        your ASR model dictionary.

        The text is tokenized based on the ``text_converter`` setting:

        The "tokenize" method is more efficient and the easiest for models
        based on latin or cyrillic script that only contain the main chars,
        ["a", "b", ...] or for Japanese or Chinese ASR models with ~3000
        short Kanji / Hanzi tokens.

        The "classic" method improves the the accuracy of the alignments
        for models that contain longer tokens, but with a greater complexity
        for computation. The function scans for partial tokens which may
        improve time resolution.
        For example, the word "▁really" will be broken down into
        ``['▁', '▁r', '▁re', '▁real', '▁really']``. The alignment will be
        based on the most probable activation sequence given by the network.

        Args:
            text: List or multiline-string with utterance ground truths.
            lpz: Log CTC posterior probabilities obtained from the CTC-network;
                numpy array shaped as ( <time steps>, <classes> ).
            name: Audio file name. Choose a unique name, or the original audio
                file name, to distinguish multiple audio files. Default: None.
            speech_len: Number of sample points. If given, the timing
                configuration is automatically derived from length of fs, length
                of speech and length of lpz. If None is given, make sure the
                timing parameters are correct, see time_stamps for reference!
                Default: None.

        Returns:
            task: CTCSegmentationTask object that can be passed to
                ``get_segments()`` in order to obtain alignments.
        Nr   rL   c                    s    g | ]}  d d|id qS )z<dummy>r5   )rg   r   r'   r   r   r(         z=CTCSegmentation.prepare_segmentation_task.<locals>.<listcomp><unk>c                    s   g | ]}|| k qS r   r   r   )unkr   r   r(      r   rM   c                    s   g | ]} j |qS r   )rg   text_cleanerr   r'   r   r   r(     r   c                    s    g | ]}d   jj|qS )r"   )joinrg   	tokenizertext2tokensr   r'   r   r   r(     r   c                 S   r   )r   r"   )replacer   r   r   r   r(     r   )rE   r$   r5   r@   rA   r0   rG   r   )rE   r   r   r   r   rV   rr   indexr   r
   r   )r   r5   rG   r$   r   rE   r   r   r0   rp   r@   rA   taskr   )r   r   r   prepare_segmentation_task  s<   /



	z)CTCSegmentation.prepare_segmentation_taskr   c                 C   sp   t  sJ | jdusJ | j}| j}| j}| j}| j}t|||\}}}t|||||}	| j||||	dd}
|
S )az  Obtain segments for given utterance texts and CTC log posteriors.

        Args:
            task: CTCSegmentationTask object that contains ground truth and
                CTC posterior probabilities.

        Returns:
            result: Dictionary with alignments. Combine this with the task
                object to obtain a human-readable segments representation.
        NT)r$   rB   rC   rD   r/   rF   )	r   rE   rG   r@   rA   r5   r   r	   r$   )r   rE   rG   r@   rA   r5   rB   rC   rD   r/   resultr   r   r   get_segments  s*   


zCTCSegmentation.get_segmentsr5   r$   returnc                 C   sh   t  sJ |dur| j|d | |}| ||||jd }| |}|jdi | t|s2J |S )a  Align utterances.

        Args:
            speech: Audio file.
            text: List or multiline-string with utterance ground truths.
            fs: Sample rate in Hz. Optional, as this can be given when
                the module is initialized.
            name: Name of the file. Utterance names are derived from it.

        Returns:
            CTCSegmentationTask object with segments.
        N)rQ   r   r   )r   rq   r   r   r   r   r   r   )r   r   r5   rQ   r$   rG   r   r/   r   r   r   __call__;  s   


zCTCSegmentation.__call__)NrI   r   r+   rN   TrL   rJ   )NN)r   )$r<   r=   r>   r?   rQ   ru   rW   r|   rV   r   r   r   rE   r   r   r   r   r   r   rq   r   r   rd   no_gradTensorr   r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   rH   |   sv    *

	

\
^

T)rH   T	log_levelrO   rP   audior5   r6   print_utt_textprint_utt_scorec                 K   s   t  sJ tj| dd dd | D }||d}	td	i |	|}
|jdks*J |j}tt	|\}}| }|
||||d}||_
||_t	|}|| dS )
z7Provide the scripting interface to align text to audio.z>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatc                 S   s   i | ]\}}|d ur||qS Nr   )r%   kvr   r   r   
<dictcomp>o  r)   zctc_align.<locals>.<dictcomp>)rO   rP   r"   )r   r5   rQ   r$   Nr   )r   r_   basicConfigitemsrH   r$   stem	soundfilereadr   r4   r3   write)r   rO   rP   r   r5   r6   r   r   r   modelalignerr$   r   rQ   transcriptsr/   segments_strr   r   r   	ctc_align\  s&   
r   c                  C   s  t jdtjd} | jddd dddd	 | jd
tddd | jddg ddd | d}|jdtdd |jdtdd | d}|jdtdg ddd	 |jdtddd | d}|jd td!d"d |jd#tdd$d |jd%tdd&d |jd'tdd(d |jd)t	d*d+d |jd,t	d*d-d |jd.tdd/d |jd0tt
jt
jd1d	 |jd2tt
jt
jd3d	 | d4}|jd5t	dd6d |jd7t	dd8d |jd9t	dd:d |jd;d<tdd=d> |jd?d@tdAddBd> |jdCdDtdEdFdGd | S )Hz3Obtain an argument-parser for the script interface.zASR Decoding)descriptionformatter_classz--log_levelc                 S   s   |   S r   )upper)xr   r   r   r     s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)typedefaultchoiceshelpz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r   r   z--dtyperN   )float16rN   float64z	Data type)r   r   r   zModel configuration relatedz--asr_train_configT)r   requiredz--asr_model_filezText converter relatedz--token_typeN)charbpeNzIThe token type for ASR model. If not given, refers from the training argsz
--bpemodelzLThe model path of sentencepiece. If not given, refers from the training argszCTC segmentation relatedz--fsrI   zSampling Frequency. The sampling frequency (in Hz) is needed to correctly determine the starting and ending time of aligned segments.z--min_window_sizez-Minimum window size considered for utterance.z--max_window_sizez-Maximum window size considered for utterance.z--set_blankz*Index of model dictionary for blank token.z--gratis_blankFzSet the transition cost of the blank token to zero. Audio sections labeled with blank tokens can then be skipped without penalty. Useful if there are unrelated audio segments between utterances.z--replace_spaces_with_blankszFill blanks in between words to better model pauses between words. This option is only active for `--text_converter classic`. Segments can be misaligned if this option is combined with --gratis-blank.z--scoring_lengthzFChanges partitioning length L for calculation of the confidence score.z--time_stampsz_Select method how CTC index duration is estimated, and thus how the time stamps are calculated.z--text_converterz"How CTC segmentation handles text.zInput/output argumentsz--kaldi_style_textztAssume that the input text file is kaldi-style formatted, i.e., the utterance name is at the beginning of each line.z--print_utt_textz2Include the utterance text in the segments output.z--print_utt_scorez4Include the confidence score in the segments output.z-az--audiozInput audio file.)r   r   r   z-tz--textrzInput text file. Each line contains the ground truth of a single utterance. Kaldi-style text files include the name of the utterance as the first word in the line.z-oz--outputw-zSOutput in the form of a `segments` file. If not given, output is written to stdout.)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   add_argument_groupr   r   r   rH   rW   r|   rV   r   r   FileType)parsergroupr   r   r   
get_parser  s  


	

r   c                 C   sF   t t tjd t }|| }t|}|dd tdi | dS )u9   Parse arguments and start the alignment in ctc_align(·).)filerE   Nr   )	printr   sysstderrr   
parse_argsvarspopr   )cmdr   argsr   r   r   r   main.  s   
r  __main__)TTr   )+r?   r   r_   r   pathlibr   typingr   r   r   r   r   r   r   rd   r   r   r	   r
   r   	typeguardr   r   espnet2.tasks.asrr    espnet2.torch_utils.device_funcsr   espnet2.utilsr   espnet2.utils.typesr   r   espnet.utils.cli_utilsr   r   rH   r   r   r   r   r   r  r<   r   r   r   r   <module>   s\   Y   j

- 
&

