o
    i?                     @   s   d Z ddlZddlZedZzddlmZ W n ey4   ddl	Z	e	j
de id ddlmZ Y nw G dd	 d	Zd
d ZdddZdd Zdd Zdd ZdS )ai  CTC segmentation.

This file contains the core functions of CTC segmentation.
to extract utterance alignments within an audio file with
a given transcription.
For a description, see:
"CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition"
https://arxiv.org/abs/2007.09127 or
https://link.springer.com/chapter/10.1007%2F978-3-030-60276-5_27
    Nctc_segmentation   )cython_fill_tableinclude_dirs)
setup_argsc                   @   s   e Zd ZdZdZdZdZdZdZdZ	dZ
dZd	Zd	Zd
Zd	ZdZdZdZdZdZdZdZedd Zedd Zdd Zdd Zdd Zdd ZdS )CtcSegmentationParametersu  Default values for CTC segmentation.

    May need adjustment according to localization or ASR settings.
    The character set is taken from the model dict, i.e., usually are generated
    with SentencePiece. An ASR model trained in the corresponding language and
    character set is needed. If the character set contains any punctuation
    characters, "#", the Greek char "ε", or the space placeholder, adapt
    these settings.
        _i@  i g?      ·r   FTu   ε#u   .,»«•❍·u   ▁Nc                 C   s*   | j r| jr| j| j  d }|S | j}|S )aB  Derive index duration from frame duration and subsampling.

        This value can be fixed by setting ctc_index_duration, which causes
        frame_duration_ms and subsampling_factor to be ignored.

        Legacy function. This function will be removed in later versions
        and replaced by index_duration.
        i  )subsampling_factorframe_duration_msindex_duration)selft r   U/home/ubuntu/.local/lib/python3.10/site-packages/ctc_segmentation/ctc_segmentation.pyindex_duration_in_secondsC   s
   
z3CtcSegmentationParameters.index_duration_in_secondsc                 C   s    t | j}|dt | j 7 }|S )z<Get configuration flags to pass to the table_fill operation.   )intblank_transition_cost_zeropreamble_transition_cost_zero)r   flagsr   r   r   r   S   s   
zCtcSegmentationParameters.flagsc                    s2   d  fdd jD  _td j  dS )z9Remove known tokens from the list of excluded characters. c                    s(   g | ] d  fddj D vr qS )Tc                    s   g | ]} |kqS r   r   ).0jcharr   r   
<listcomp>`       zSCtcSegmentationParameters.update_excluded_characters.<locals>.<listcomp>.<listcomp>)	char_list)r   r   r   r   r   ]   s
    zHCtcSegmentationParameters.update_excluded_characters.<locals>.<listcomp>zExcluded characters: N)joinexcluded_charactersloggerdebugr!   r   r!   r   update_excluded_charactersZ   s   
z4CtcSegmentationParameters.update_excluded_charactersc                 K   s   | j di | dS )z(Set all parameters as attribute at init.Nr   )set)r   kwargsr   r   r   __init__e   s   z"CtcSegmentationParameters.__init__c                 K   s>   |D ]}| dst| |r|| durt| |||  qdS )zUpdate CtcSegmentationParameters.

        Args:
            **kwargs: Key-value dict that contains all properties
                with their new values. Unknown properties are ignored.
        _N)
startswithhasattrsetattr)r   r(   keyr   r   r   r'   i   s   zCtcSegmentationParameters.setc                 C   s>   d}| j  D ]}| j | }|| d| d7 }q|d7 }|S )z"Print all attribute as dictionary.zCtcSegmentationParameters( =z, ))__dict__keys)r   output	attributevaluer   r   r   __repr__x   s   
z"CtcSegmentationParameters.__repr__)__name__
__module____qualname____doc__max_prob	skip_probmin_window_sizemax_window_sizer   score_min_mean_over_Lspaceblankreplace_spaces_with_blanksr   r   backtrack_from_max_tself_transitionstart_of_ground_truthr#   tokenized_meta_symbolr    r   r   propertyr   r   r&   r)   r'   r6   r   r   r   r   r   #   s:    


r   c              	   C   s  | j }d}|jd | j }tdt| d|dd|jd  d t||jd kr5| j| jkr5td| j	}	 t
jt||jd t|gt
jd	}|| j t
jt|gt
jd	}t||t
jt
j|t
jd	|| j | j\}	}
| jr}|jd d
 }	td|dd|
f   d|	  t
t|g}t
|jd g}dg|jd  }z|	dks|
dkrd}t
j}| j}t|jd
 D ]^}||
|f dkr!||
 |
| dkr||
d
 |  nd }|
dkr||	||
  ||
|f f n| j}||	|
f ||	d
 | |
d
 | f  }t|| |k rt|| }|}t||}q|	dkr4t||	||
  |f |n| j}||	|
f ||	d
 |
f  }t|| |kr|
dkrtd|d
 D ]}||
 |	 | j ||
| < qZ||||
 |	 < ||
|f }| j| |||
 |	 < |
d
| 8 }
|	d
| 8 }	n||||
 |	 < | j|||
 |	 < |	d
8 }	|	dks|
dksW n- ty   td |d9 }|| jk rtdt|  Y q8t d t d  w 	 |||fS )zExtract character-level utterance alignments.

    :param config: an instance of CtcSegmentationParameters
    :param lpz: probabilities obtained from CTC output
    :param ground_truth:  ground truth text in the form of a label sequence
    :return:
    r   zCTC segmentation of z
 chars to z.2fz	s audio (z
 indices).zAudio is shorter than text!Tdtyper   z/Max. joint probability to align text to audio: Nz at time index r   zPIndexError: Backtracking was not successful, the window size might be too small.r   zIncreasing the window size to: zMaximum window size reached.zCheck data and character list!)!rA   shaper   r$   infolenr<   r;   AssertionErrorr=   npzerosminfloat32fillint64r   astypearrayr   rC   r%   maxinfrangeabsr    rD   
IndexErrorwarningr>   strerror)configlpzground_truthrA   offsetaudio_durationwindow_sizetableoffsetsr   ctimings
char_probs
state_listmin_smin_switch_prob_deltamax_lpz_probsswitch_probest_switch_prob	stay_probest_stay_prob
char_indexr   r   r   r      s   
((



+


c                 C   s  t | jtkr
d| _|dur|| _| j| j }| j}g }|D ]<}|| js+|| j7 }|t|d  |D ]#}|	 rK| j
rK|| jsJ|| j7 }q6|| jv rY|| jvrY||7 }q6q|| jsf|| j7 }td|  |t|d  tdd | jD }tt||gtjd }	tt|D ]2}
t|D ]+}|
| dk rq||
| |
d  }|| j|}|| jv r| j|}||	|
|f< qq|	|fS )a  Prepare the given text for CTC segmentation.

    Creates a matrix of character symbols to represent the given text,
    then creates list of char indices depending on the models char list.

    :param config: an instance of CtcSegmentationParameters
    :param text: iterable of utterance transcriptions
    :param char_list: a set or list that includes all characters/symbols,
                        characters not included in this list are ignored
    :return: label matrix, character index matrix
    r   Nr   ground_truth: c                 S   s   g | ]}t |qS r   )rM   )r   rg   r   r   r   r     r   z prepare_text.<locals>.<listcomp>rJ   )typerA   r]   r    rE   endswithr@   appendrM   isspacerB   r#   r$   r%   rW   rO   onesrT   rY   replaceindex)r_   textr    rA   ra   utt_begin_indicesuttr   max_char_lenground_truth_matirn   spanrs   r   r   r   prepare_text   sL   



r   c           
      C   s2  | j g}g }|D ]8}|d | jks|| jg7 }|t|d  | D ]}|| jv r?| jr:|| js:|| jg7 }||g7 }q$q|d | jksN|| jg7 }t	
d|  |t|d  d}tt||gtjd }tdt|D ]}|| | jkr| j||df< qu| j|| }	|	||df< qu||fS )zPrepare the given tokenized text for CTC segmentation.

    :param config: an instance of CtcSegmentationParameters
    :param text: string with tokens separated by spaces
    :return: label matrix, character index matrix
    rJ   r   rt   r   )rE   r@   rw   rM   splitr    rB   
beginswithrF   r$   r%   rO   ry   rT   rY   rA   r{   )
r_   r|   ra   r}   r~   tokenr   r   r   rs   r   r   r   prepare_tokenized_text$  s6   


r   c                 C   s   dg}g }|D ]}|d | j ks|| j g7 }|t|d  || 7 }q|d | j ks3|| j g7 }td|  |t|d  tj|tjd	dd}||fS )aN  Prepare the given token list for CTC segmentation.

    This function expects the text input in form of a list
    of numpy arrays: [np.array([2, 5]), np.array([7, 9])]

    :param config: an instance of CtcSegmentationParameters
    :param text: list of numpy arrays with tokens
    :return: label matrix, character index matrix
    rJ   r   rt   rH   )
rA   rw   rM   tolistr$   r%   rO   rV   rT   reshape)r_   r|   ra   r}   r~   r   r   r   r   prepare_token_listL  s   
r   c                    s    fdd}g }t d}tt|D ]a}||| d}	|||d  d}
tt|	| j }tt|
| j }| j}||krA|}n+|| |krP|||  }nt d}t||| D ]}t	|||||   }q\|
|	|
|f q|S )a  Utterance-wise alignments from char-wise alignments.

    :param config: an instance of CtcSegmentationParameters
    :param utt_begin_indices: list of time indices of utterance start
    :param char_probs:  character positioned probabilities obtained from backtracking
    :param timings: mapping of time indices to seconds
    :param text: list of utterances
    :return: segments, a list of: utterance start and end [s], and its confidence score
    c                    sX    |   | d   d }|dkrt  | d  d |S |dkr*t | d  d |S dS )zCompute start and end time of utterance.

        :param index:  frame index value
        :param align_type:  one of ["begin", "end"]
        :return: start/end time of utterance in seconds
        r   r   beging      ?endN)rW   rQ   )r{   
align_typemiddlerh   r   r   compute_timew  s   z2determine_utterance_segments.<locals>.compute_timer   r   r   r   g        )rO   float64rY   rM   r   roundr   r?   meanrQ   rw   )r_   r}   ri   rh   r|   r   segmentsmin_probr   startr   start_tend_tnmin_avgr   r   r   r   determine_utterance_segmentsl  s$   

r   )N)r:   loggingnumpyrO   	getLoggerr$   ctc_segmentation_dynr   ImportError	pyximportinstallget_includer   r   r   r   r   r   r   r   r   r   <module>   s"   
_
m5( 