o
    ei%F                     @   s4  d Z ddlZddlmZ ddlZddlmZ ddlZddlm	Z	 e
dZdd Zd:d
dZ					d;ddZdd Zdd Zdd Zdd Zd<ddZd=ddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Z		d:d(d)Zd*d+ Zd,d- ZG d.d/ d/ejZd0d1 Z d2d3 Z!d4d5 Z"d6d7 Z#			d>d8d9Z$dS )?z
Data pipeline elements for the G2P pipeline

Authors
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Artem Ploujnikov 2021 (minor refactoring only)
    N)reduce)nn)expand_to_charsz\s{2,}c                    s0   |   }d fdd|D }td|}|S )aM  
    Cleans incoming text, removing any characters not on the
    accepted list of graphemes and converting to uppercase

    Arguments
    ---------
    txt: str
        the text to clean up
    graphemes: list
        a list of graphemes

    Returns
    -------
    item: DynamicItem
        A wrapped transformation function
     c                 3   s    | ]	}| v r|V  qd S N .0char	graphemesr   a/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/g2p/dataio.py	<genexpr>(   s    z!clean_pipeline.<locals>.<genexpr> )upperjoinRE_MULTI_SPACEsub)txtr   resultr   r   r   clean_pipeline   s   r   Tc                 #   sJ    |r|   }  fdd| D }|V   |}|V  t|}|V  dS )a  Encodes a grapheme sequence

    Arguments
    ---------
    char: str
        A list of characters to encode.
    grapheme_encoder: speechbrain.dataio.encoder.TextEncoder
        a text encoder for graphemes. If not provided,
    uppercase: bool
        whether or not to convert items to uppercase

    Yields
    ------
    grapheme_list: list
        a raw list of graphemes, excluding any non-matching
        labels
    grapheme_encoded_list: list
        a list of graphemes encoded as integers
    grapheme_encoded: torch.Tensor
    c                    s   g | ]	}| j v r|qS r   )lab2ind)r	   graphemegrapheme_encoderr   r   
<listcomp>D   s    z%grapheme_pipeline.<locals>.<listcomp>N)r   encode_sequencetorch
LongTensor)r
   r   	uppercasegrapheme_listgrapheme_encoded_listgrapheme_encodedr   r   r   grapheme_pipeline-   s   



r#   r      c                 #   st     fdd| D }|V  d |durt||n|}|r&t| |||}	n| j|}	|	V  t|	}
|
V  dS )a  A pipeline element that uses a pretrained tokenizer

    Arguments
    ---------
    seq: list
        List of tokens to encode.
    tokenizer: speechbrain.tokenizer.SentencePiece
        a tokenizer instance
    tokens: str
        available tokens
    wordwise: str
        whether tokenization is performed on the whole sequence
        or one word at a time. Tokenization can produce token
        sequences in which a token may span multiple words
    word_separator: str
        The substring to use as a separator between words.
    token_space_index: int
        the index of the space token
    char_map: dict
        a mapping from characters to tokens. This is used when
        tokenizing sequences of phonemes rather than sequences
        of characters. A sequence of phonemes is typically a list
        of one or two-character tokens (e.g. ["DH", "UH", " ", "S", "AW",
        "N", "D"]). The character map makes it possible to map these
        to arbitrarily selected characters

    Yields
    ------
    token_list: list
        a list of raw tokens
    encoded_list: list
        a list of tokens, encoded as a list of integers
    encoded: torch.Tensor
        a list of tokens, encoded as a tensor
    c                    s   g | ]}| v r|qS r   r   r	   tokentokensr   r   r   z       z-tokenizer_encode_pipeline.<locals>.<listcomp>r   N)r   _map_tokens_item_wordwise_tokenizespencode_as_idsr   r   )seq	tokenizerr(   wordwiseword_separatortoken_space_indexchar_map
token_listtokenizer_inputencoded_listencodedr   r'   r   tokenizer_encode_pipelineN   s    ,



r8   c                    sL   ||vr
j |S tt||}fdd|D }|g t fdd|S )a  Tokenizes a sequence wordwise

    Arguments
    ---------
    tokenizer: speechbrain.tokenizers.SentencePiece.SentencePiece
        a tokenizer instance
    sequence: iterable
        the original sequence
    input_separator: str
        the separator used in the input sequence
    token_separator: str
        the token separator used in the output sequence

    Returns
    -------
    result: str
        the resulting tensor
    c                       g | ]} j |qS r   )r,   r-   r	   word_tokensr/   r   r   r          z&_wordwise_tokenize.<locals>.<listcomp>c                    s   |   | S r   r   )leftright)sep_listr   r   <lambda>   s    z$_wordwise_tokenize.<locals>.<lambda>)r,   r-   list_split_listr   )r/   sequenceinput_separatortoken_separatorwordsencoded_wordsr   )r@   r/   r   r+      s   
r+   c                    sj   t |tr|dkrdS ||vr t |tr|n| } j|S tt||} fdd|D }||S )a  Detokenizes a sequence wordwise

    Arguments
    ---------
    tokenizer: speechbrain.tokenizers.SentencePiece.SentencePiece
        a tokenizer instance
    sequence: iterable
        the original sequence
    output_separator: str
        the separator used in the output sequence
    token_separator: str
        the token separator used in the output sequence

    Returns
    -------
    result: torch.Tensor
        the result
    r   c                    r9   r   r,   
decode_idsr:   r<   r   r   r      r=   z(_wordwise_detokenize.<locals>.<listcomp>)
isinstancestrrB   tolistr,   rJ   rC   r   )r/   rD   output_separatorrF   sequence_listrG   rH   r   r<   r   _wordwise_detokenize   s   

rP   c                 c   sh    | dur0d}t | D ]\}}||kr| |d | V  |}q||d k r2| |d d V  dS dS dS )z
    Splits a sequence (such as a tensor) by the specified separator

    Arguments
    ---------
    items: sequence
        any sequence that supports indexing
    separator: str
        the separator token

    Yields
    ------
    item
    N   )	enumerate)items	separatorlast_idxidxitemr   r   r   rC      s   rC   c                 C   sx   |du r
t jj }||krd|jvr|jdd|d nd|jvr*|jdd||d d|jvr3|  |j| dd	 |S )
a'  
    Initializes the phoneme encoder with EOS/BOS sequences

    Arguments
    ---------
    tokens: list
        a list of tokens
    encoder: speechbrain.dataio.encoder.TextEncoder.
        a text encoder instance. If none is provided, a new one
        will be instantiated
    bos_index: int
        the position corresponding to the Beginning-of-Sentence
        token
    eos_index: int
        the position corresponding to the End-of-Sentence

    Returns
    -------
    encoder: speechbrain.dataio.encoder.TextEncoder
        an encoder
    Nz	<eos-bos>)	bos_label	eos_label	bos_indexz<bos>z<eos>)rY   rZ   r[   	eos_indexz<unk>F)sequence_input)sbdataioencoderTextEncoderr   insert_bos_eosadd_unkupdate_from_iterable)r(   r`   r[   r\   r   r   r   enable_eos_bos   s*   


re   c                 c   s,    | V  | | }|V  t|}|V  dS )a  Encodes a sequence of phonemes using the encoder
    provided

    Arguments
    ---------
    phn: list
        List of phonemes
    phoneme_encoder: speechbrain.datio.encoder.TextEncoder
        a text encoder instance (optional, if not provided, a new one
        will be created)

    Yields
    ------
    phn: list
        the original list of phonemes
    phn_encoded_list: list
        encoded phonemes, as a list
    phn_encoded: torch.Tensor
        encoded phonemes, as a tensor
    N)r   r   r   )phnphoneme_encoderphn_encoded_listphn_encodedr   r   r   phoneme_pipeline  s   


rj   c                 c   sv    | | }t|st|}| V  tt|V  || }t|s,t|}| V  tt|V  dS )a}  Adds BOS and EOS tokens to the sequence provided

    Arguments
    ---------
    seq: torch.Tensor
        the source sequence
    encoder: speechbrain.dataio.encoder.TextEncoder
        an encoder instance

    Yields
    ------
    seq_eos: torch.Tensor
        the sequence, with the EOS token added
    seq_bos: torch.Tensor
        the sequence, with the BOS token added
    N)prepend_bos_indexr   	is_tensortensorlonglenappend_eos_index)r.   r`   seq_bosseq_eosr   r   r   add_bos_eos1  s   







rs   c                 C   s
   ||| S )a  Performs a Beam Search on the phonemes. This function is
    meant to be used as a component in a decoding pipeline

    Arguments
    ---------
    char_lens: torch.Tensor
        the length of character inputs
    encoder_out: torch.Tensor
        Raw encoder outputs
    beam_searcher: speechbrain.decoders.seq2seq.S2SBeamSearcher
        a SpeechBrain beam searcher instance

    Returns
    -------
    hyps: list
        hypotheses
    scores: list
        confidence scores associated with each hypotheses
    r   )	char_lensencoder_outbeam_searcherr   r   r   beam_search_pipelineN  s   
rw   c                 C   
   | | S )a#  Decodes a sequence of phonemes

    Arguments
    ---------
    hyps: list
        hypotheses, the output of a beam search
    phoneme_encoder: speechbrain.datio.encoder.TextEncoder
        a text encoder instance

    Returns
    -------
    phonemes: list
        the phoneme sequence
    decode_ndim)hypsrg   r   r   r   phoneme_decoder_pipelinee  s   
r|   c                 C   s    dd t t| t|d D S )zProduces a list of consecutive characters

    Arguments
    ---------
    start_char: str
        the starting character
    end_char: str
        the ending characters

    Returns
    -------
    char_range: str
        the character range
    c                 S   s   g | ]}t |qS r   chr)r	   rW   r   r   r   r         zchar_range.<locals>.<listcomp>rR   )rangeord)
start_charend_charr   r   r   
char_rangew  s    r   c                 C   sL   t ddt dd }ttdd | }tt||dt| }d|d< |S )	aw  Builds a map that maps arbitrary tokens to arbitrarily chosen characters.
    This is required to overcome the limitations of SentencePiece.

    Arguments
    ---------
    tokens: list
        a list of tokens for which to produce the map

    Returns
    -------
    token_map: dict
        a dictionary with original tokens as keys and
        new mappings as values
    AZazc                 S   s   | dkS )Nr   r   r}   r   r   r   rA     s    z&build_token_char_map.<locals>.<lambda>Nr   )r   rB   filterdictzipro   )r(   charsvalues	token_mapr   r   r   build_token_char_map  s
   r   c                 C   s   dd |   D S )zExchanges keys and values in a dictionary

    Arguments
    ---------
    map_dict: dict
        a dictionary

    Returns
    -------
    reverse_map_dict: dict
        a dictionary with keys and values flipped
    c                 S   s   i | ]\}}||qS r   r   )r	   keyvaluer   r   r   
<dictcomp>  s    zflip_map.<locals>.<dictcomp>)rT   )map_dictr   r   r   flip_map  s   r   c                 C   rx   )aD  Decodes a sequence using a tokenizer.
    This function is meant to be used in hparam files

    Arguments
    ---------
    seq: torch.Tensor
        token indexes
    encoder: sb.dataio.encoder.TextEncoder
        a text encoder instance

    Returns
    -------
    output_seq: list
        a list of lists of tokens
    ry   )r.   r`   r   r   r   text_decode  s   
r   c                    s8   fdd}fdd}|r|n| fdd}|S )a  Returns a function that recovers the original sequence from one that has been
    tokenized using a character map

    Arguments
    ---------
    char_map: dict
        a character-to-output-token-map
    tokenizer: speechbrain.tokenizers.SentencePiece.SentencePiece
        a tokenizer instance
    token_space_index: int
        the index of the "space" token
    wordwise: bool
        Whether to apply detokenize per word.

    Returns
    -------
    f: callable
        the tokenizer function
    c                    s   t  | d S )z+Detokenizes the sequence one word at a timer   )rP   rX   )r2   r/   r   r   detokenize_wordwise  s   z0char_map_detokenize.<locals>.detokenize_wordwisec                    s     j | S )zDetokenizes the entire sequencerI   r   r<   r   r   detokenize_regular  s   z/char_map_detokenize.<locals>.detokenize_regularc                    s    fdd| D }t | }|S )zThe tokenizer functionc                    s   g | ]} |qS r   r   r	   rX   )
detokenizer   r   r     r   z2char_map_detokenize.<locals>.f.<locals>.<listcomp>)_map_tokens_batch)r(   decoded_tokensmapped_tokens)r3   r   r   r   f  s   
zchar_map_detokenize.<locals>.fr   )r3   r/   r2   r0   r   r   r   r   )r3   r   r2   r/   r   char_map_detokenize  s
   r   c                        fdd| D S )a  Performs token mapping, in batch mode

    Arguments
    ---------
    tokens: iterable
        a list of token sequences
    char_map: dict
        a token-to-character mapping

    Returns
    -------
    result: list
        a list of lists of characters
    c                    s   g | ]} fd d|D qS )c                       g | ]} | qS r   r   r   r3   r   r   r     r   z0_map_tokens_batch.<locals>.<listcomp>.<listcomp>r   r   r   r   r   r     s    z%_map_tokens_batch.<locals>.<listcomp>r   r(   r3   r   r   r   r        r   c                    r   )zMaps tokens to characters, for a single item

    Arguments
    ---------
    tokens: iterable
        a single token sequence
    char_map: dict
        a token-to-character mapping

    Returns
    -------
    result: list
        a list of tokens
    c                    r   r   r   r   r   r   r   r     r   z$_map_tokens_item.<locals>.<listcomp>r   r   r   r   r   r*     r   r*   c                       s4   e Zd ZdZ fddZdd Z fddZ  ZS )LazyInitzA lazy initialization wrapper

    Arguments
    ---------
    init : callable
        The function to initialize the underlying object
    c                    s    t    d | _|| _d | _d S r   )super__init__instanceinitdevice)selfr   	__class__r   r   r     s   

zLazyInit.__init__c                 C   s   | j du r
|  | _ | j S )zEInitializes the object instance, if necessary
        and returns it.N)r   r   )r   r   r   r   __call__  s   

zLazyInit.__call__c                    s>   t  | | jdu r|  | _t| jdr| j|| _| S )zMoves the underlying object to the specified device

        Arguments
        ---------
        device : str | torch.device
            the device

        Returns
        -------
        self
        Nto)r   r   r   r   hasattr)r   r   r   r   r   r   &  s   

zLazyInit.to)__name__
__module____qualname____doc__r   r   r   __classcell__r   r   r   r   r     s
    r   c                 C   s   t | S )aL  A wrapper to ensure that the specified object is initialized
    only once (used mainly for tokenizers that train when the
    constructor is called

    Arguments
    ---------
    init: callable
        a constructor or function that creates an object

    Returns
    -------
    instance: object
        the object instance
    )r   )r   r   r   r   	lazy_init:  s   r   c                 C   s   |dkr| S |  d| S )aH  Determines the key to be used for sequences (e.g. graphemes/phonemes)
    based on the naming convention

    Arguments
    ---------
    key: str
        the key (e.g. "graphemes", "phonemes")
    mode: str
        the mode/suffix (raw, eos/bos)

    Returns
    -------
    key if ``mode=="raw"`` else ``f"{key}_{mode}"``
    raw_r   )r   moder   r   r   get_sequence_keyL  s   r   c                 C   s   || }dd |D S )a  Converts a batch of phoneme sequences (a single tensor)
    to a list of space-separated phoneme label strings,
    (e.g. ["T AY B L", "B UH K"]), removing any special tokens

    Arguments
    ---------
    phns: torch.Tensor
        a batch of phoneme sequences
    decoder: Callable
        Converts tensor to phoneme label strings.

    Returns
    -------
    result: list
        a list of strings corresponding to the phonemes provided
    c                 S   s   g | ]	}d  t|qS )r   )r   remove_specialr   r   r   r   r   q  s    z%phonemes_to_label.<locals>.<listcomp>r   )phnsdecoderphn_decodedr   r   r   phonemes_to_label^  s   r   c                 C   s   dd | D S )a  Removes any special tokens from the sequence. Special tokens are delimited
    by angle brackets.

    Arguments
    ---------
    phn: list
        a list of phoneme labels

    Returns
    -------
    result: list
        the original list, without any special tokens
    c                 S   s   g | ]}d |vr|qS )<r   r%   r   r   r   r     r)   z"remove_special.<locals>.<listcomp>r   )rf   r   r   r   r   t  s   r   c           	      C   sJ   d}|r#|  | }|jd }t|d|d|d|dd}|S )a  Applies word embeddings, if applicable. This function is meant
    to be used as part of the encoding pipeline

    Arguments
    ---------
    txt: str
        the raw text
    grapheme_encoded: torch.Tensor
        the encoded graphemes
    grapheme_encoded_len: torch.Tensor
        encoded grapheme lengths
    grapheme_encoder: speechbrain.dataio.encoder.TextEncoder
        the text encoder used for graphemes
    word_emb: callable
        the model that produces word embeddings
    use_word_emb: bool
        a flag indicated if word embeddings are to be applied

    Returns
    -------
    char_word_emb: torch.Tensor
        Word embeddings, expanded to the character dimension
    Nr   r   )embr.   seq_lenr1   )
embeddingsr   r   	unsqueezesqueeze)	r   r"   grapheme_encoded_lenr   word_embuse_word_embchar_word_embraw_word_embword_separator_idxr   r   r   word_emb_pipeline  s   
r   )NT)Tr   r$   Nr   )NN)NNN)%r   re	functoolsr   r   r   speechbrainr^   speechbrain.wordemb.utilr   compiler   r   r#   r8   r+   rP   rC   re   rj   rs   rw   r|   r   r   r   r   r   r   r*   Moduler   r   r   r   r   r   r   r   r   r   <module>   sP    	

%
?#
-

**