o
    pioY                  	   @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ ddlmZ ddlmZ eeZG d	d
 d
Zdd ZeG dd dZde
jdee dedee fddZdS )ziLibrary for Byte-pair-encoding (BPE) tokenization.
Authors
 * Abdelwahab Heba 2020
 * Loren Lugosch 2020
    N)	dataclass)List)
merge_char)edit_distance)run_on_main)
get_loggerc                   @   sj   e Zd ZdZ																
			dddZdd Zdd Zdd Zg fddZdddZ	dS )SentencePiecea  BPE class call the SentencePiece unsupervised text tokenizer from Google.
    Reference: https://github.com/google/sentencepiece
    SentencePiece lib is an unsupervised text tokenizer and detokenizer.
    It implements subword units like Byte-pair-encoding (BPE),
    Unigram language model and char/word tokenizer.
    Arguments
    ---------
    model_dir : str
        The directory where the model will be saved (or already stored).
    vocab_size : int, None, optional
        Vocab size for the chosen tokenizer type (BPE, Unigram).
        The vocab_size is optional for char, and mandatory for BPE & unigram
        tokenization.
    annotation_train : str
        Path of the annotation file which is used to learn the tokenizer. It
        can be in JSON or csv format.
    annotation_read : str
        The data entry which contains the word sequence in the annotation file.
    model_type : str
        (bpe, char, unigram).
        If "bpe", train unsupervised tokenization of piece of words. see:
        https://www.aclweb.org/anthology/P16-1162/
        If "word" take the vocabulary from the input text.
        If "unigram" do piece of word tokenization using unigram language
        model, see: https://arxiv.org/abs/1804.10959
    char_format_input : bool
        Whether the read entry contains characters format input.
        (default: False)
        (e.g., a p p l e _ i s _ g o o d)
    character_coverage : int
        Amount of characters covered by the model, good defaults
        are: 0.9995 for languages with a rich character set like Japanese or
        Chinese and 1.0 for other languages with small character set.
        (default: 1.0)
    user_defined_symbols : string
        String contained a list of symbols separated by a comma.
        User-defined symbols are handled as one piece in any context.
        (default: None)
    max_sentencepiece_length : int
        Maximum number of characters for the tokens. (default: 10)
    bos_id : int
        If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
    eos_id : int
        If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
    pad_id : int
        If -1 the pad_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
    unk_id : int
        The token corresponding to an unknown symbol (not in token set).
    split_by_whitespace : bool
        If False, allow the sentencepiece to extract piece crossing multiple
        words. This feature is important for : Chinese/Japanese/Korean.
        (default: True)
    num_sequences : int
        If not none, use at most this many sequences to train the tokenizer
        (for large datasets). (default: None)
    annotation_list_to_check : list,
        List of the annotation file which is used for checking the accuracy of
        recovering words from the tokenizer.
    annotation_format : str
        The format of the annotation file. JSON or csv are the formats supported.
    text_file: str
        An alternate path to the text file (needed when multiple models are trained on
        the same data file)
    add_dummy_prefix : bool
        If True the tokenizer adds dummy whitespace at the beginning of text. (default: True)

    Example
    -------
    >>> import torch
    >>> dict_int2lab = {1: "HELLO", 2: "MORNING"}
    >>> model_dir = getfixture('tmpdir') / "tokenizer_data"
    >>> # Example with csv
    >>> annotation_train = "tests/samples/annotation/dev-clean.csv"
    >>> annotation_read = "wrd"
    >>> model_type = "bpe"
    >>> bpe = SentencePiece(str(model_dir), 100, annotation_train, annotation_read, model_type)
    >>> batch_seq = torch.Tensor([[1, 2, 2, 1],[1, 2, 1, 0]])
    >>> batch_lens = torch.Tensor([1.0, 0.75])
    >>> encoded_seq_ids, encoded_seq_pieces = bpe(
    ...     batch_seq, batch_lens, dict_int2lab, task="encode"
    ... )
    >>> # Example using JSON
    >>> annotation_train = str(model_dir + "/dev-clean.json")
    >>> annotation_read = "wrd"
    >>> bpe = SentencePiece(model_dir, 100, annotation_train, annotation_read, model_type, annotation_format = 'json')
    >>> encoded_seq_ids, encoded_seq_pieces = bpe(
    ...     batch_seq, batch_lens, dict_int2lab, task="encode"
    ... )
    NunigramF      ?
   r   Tcsvc                 C   s"  |dvrt dtj|st| t|tst d|| _|| _|| _	| jd urJtj
| jd }|d u rGtj|tj| j|d}|| _tj|t|d | | _t|| _|| _|| _t|| _t|	| _t|
| _t|| _t|| _t|| _|| _|| _|| _t|| _tj| jd st| j  nt!"d t!"d	 t!"d
| j d  t!"dt| j  t!"d| j  t#$ | _%| j%&| jd  t| j| j% krd| j d| j%  d}| jdkrt!'| d nt!'| d |d urt| j(d|id d S d S )N)r	   bpecharz0model_type must be one of : [unigram, bpe, char]zvocab_size must be integer.   z.txt_z.modelzTokenizer is already trained.z==== Loading Tokenizer ===zTokenizer path: zTokenizer vocab_size: zTokenizer type: zSentencePiece vocab size `z'` requested, but the loaded model has `zQ`! This can cause decoding errors or weird model training behavior in some cases.r   z@ The model type is 'char', for which `vocab_size` has no impact.z7 Are you loading a tokenizer with the wrong parameters?list_annotation_files)kwargs))
ValueErrorospathisdirmakedirs
isinstanceintannotation_trainannotation_readannotation_formatsplitextjoinbasenamereplace	text_filestrprefix_model_file
vocab_size
model_typechar_format_inputcharacter_coveragemax_sentencepiece_lengthbos_ideos_idpad_idunk_idnum_sequencessplit_by_whitespaceuser_defined_symbolsadd_dummy_prefixisfiler   
_train_BPEloggerinfospmSentencePieceProcessorsploadwarning_check_coverage_from_bpe)self	model_dirr%   r   r   r&   r'   r(   r0   r)   r*   r+   r,   r-   r/   r.   annotation_list_to_checkr   r"   r1   extbase_msg rA   b/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/speechbrain/tokenizers/SentencePiece.py__init__s   sv   
















zSentencePiece.__init__c           	      C   s*  t jt j| jst| jd td| j d | j  t	| jd}t
|}t|d}| j|vr>t| jd | j || j}t	| jd}d}|D ]4}| jdurc|| jkrctd	| j   n |d
7 }|| }| jr{t| g\}d|}||d  qN|  |  td| j  dS )z?Read CSV file and convert specific data entries into text file.< is not a file. please provide annotation file for training.Extract  sequences from:rN must exist in:w+r   *Using %d sequences to train the tokenizer.r    
Text file created at: )r   r   r2   abspathr   r   r4   r5   r   openr   readernextindexr"   r.   printr'   r   splitr   writeclose)	r<   annotation_filerP   headersindex_labelr"   row_idxrowsentrA   rA   rB   	_csv2text   sT   



zSentencePiece._csv2textc                 C   s  t jt j| jst| jd td| j d | j  t	| jd}t
|}W d   n1 s5w   Y  t	| jd}d}| D ]7}| jdur[|| jkr[td| j   n#|d	7 }|| | j }| jrvt| g\}d
|}||d  qF|  td| j  dS )z@Read JSON file and convert specific data entries into text file.rD   rE   rF   rG   NrI   r   rJ   r   rK   rL   rM   )r   r   r2   rN   r   r   r4   r5   r   rO   jsonr9   r"   keysr.   rS   r'   r   rT   r   rU   rV   )r<   fout_jsonr"   rZ   snt_idr\   rA   rA   rB   
_json2text   sH   
zSentencePiece._json2textc                 C   s
  t d| j  tj| js*| jdkr|   n| jdkr#| 	  nt
d| j d| j d | j d | j d | j d	 | j d
 | j d | j d | j d | j d | j }| jdvrj|dt| j 7 }| jdurv|d| j 7 }| js}|d7 }tj| dS )zTrain tokenizer with unsupervised techniques (BPE, Unigram) using
        SentencePiece Library. If you use "char" mode, the SentencePiece
        creates a char dict so the vocab_size attribute is not needed.
        zTrain tokenizer with type:r   r^   zIAnnotation format not supported. Supported formats are csv and json. Got z--input=z --model_prefix=z --model_type=z
 --bos_id=z
 --eos_id=z
 --pad_id=z
 --unk_id=z --max_sentencepiece_length=z --character_coverage=z --add_dummy_prefix=)r   z --vocab_size=Nz --user_defined_symbols=z --split_by_whitespace=false)r4   r5   r&   r   r   r2   r"   r   r]   rc   r   r$   r*   r+   r,   r-   r)   r(   r1   r#   r%   r0   r/   r6   SentencePieceTrainertrain)r<   queryrA   rA   rB   r3     st   



	


zSentencePiece._train_BPEc              	   C   sV  |D ]%}t jt j|r!td | jdkr>t|d}t	|}t
|d}| j|vr7t| jd | || j}nt| jd}t|}| j}W d   n1 sWw   Y  g }|D ]n}	| jdkrl|	| }	n||	 | }	| jrt|	 g\}	d|	}	|	dd }	| j|	}
| j|
}tjd	g|	dg|dgd
d\}|d dkr|d D ]}|d dkr|d dur|d |vr||d  qq`| jdkr|  td|  t|dkrtdtt|  tdt| j   tdtdt t|| j     qtd tdtd  qtd|  qdS )a  Logging the accuracy of the BPE model to recover words from the training text.

        Arguments
        ---------
        list_annotation_files : list,
            List of the annotation file which is used for checking the accuracy of recovering words from the tokenizer.
        z===== Accuracy checking for recovering text from tokenizer ===r   rG   NrH   rK   rL   r   utt1T)compute_alignmentsWER	alignment=r   zrecover words from: zWrong recover words: zTokenizer vocab size: zaccuracy recovering words: zWrong recover words: 0r
   z No accuracy recover checking for)!r   r   r2   rN   r4   r5   r   rO   r   rP   rQ   r   r   rR   r   r^   r9   r'   r   rT   r   r8   encode_as_ids
decode_idsr   wer_details_for_batchappendrV   lenr:   r#   r%   float)r<   r   rW   fannotation_filerP   rX   rY   r`   wrong_recover_listr[   
encoded_iddecode_textdetailsalignrA   rA   rB   r;   O  s   














	z&SentencePiece._check_coverage_from_bpeencodec                    s  |dkrdu rt d|dkrg }d} |jd     t|D ]<\}}fdd|d |  D }	jrEt|	g\}
d|
}nd|	}j	|}|
| t||kr_t|}q#tj|jd |f|jd	}tj|jd |jd	}t|D ]\}}t|||dt|f< t|| ||< q|||fS |d
krfdd|D S |dkr |jd      fddt|D S dS )ao  This __call__ function implements the tokenizer encoder and decoder
        (restoring the string of word) for BPE, Regularized BPE (with unigram),
        and char (speechbrain/nnet/RNN.py).
        Arguments
        ----------
        batch : tensor.IntTensor or list
            List if ( batch_lens = None and task = "decode_from_list")
            Contains the original labels. Shape: [batch_size, max_length]
        batch_lens : tensor.LongTensor
            Containing the relative length of each label sequences. Must be 1D
            tensor of shape: [batch_size]. (default: None)
        ind2lab : dict
            Dictionary which maps the index from label sequences
            (batch tensor) to string label.
        task : str
            ("encode", "decode", "decode_from_list)
            "encode": convert the batch tensor into sequence of tokens.
                the output contain a list of (tokens_seq, tokens_lens)
            "decode": convert a tensor of tokens to a list of word sequences.
            "decode_from_list": convert a list of token sequences to a list
                of word sequences.
        rx   Nz0Tokenizer encoder must have the ind2lab functionr   r   c                    s   g | ]} t | qS rA   )r   ).0rR   )ind2labrA   rB   
<listcomp>  s    z*SentencePiece.__call__.<locals>.<listcomp>rK   )devicedecode_from_listc                    s   g | ]} j |d qS )rK   )r8   rm   rT   )ry   utt_seq)r<   rA   rB   r{     s    decodec                    s6   g | ]\}}j |d  |    dqS )NrK   )r8   rm   r   tolistrT   )ry   ir~   )
batch_lensr<   rA   rB   r{     s    )r   shaperoundr   	enumerater'   r   r   r8   rl   ro   rp   torchzerosr|   Tensor)r<   batchr   rz   taskr   max_bpe_lenr   r~   tokens
words_listr\   
bpe_encode
bpe_tensorbpe_lensbpe_uttrA   )r   rz   r<   rB   __call__  sF   


zSentencePiece.__call__)NNr	   Fr
   Nr   r   r   r   r   TNNr   NT)NNrx   )
__name__
__module____qualname____doc__rC   r]   rc   r3   r;   r   rA   rA   rA   rB   r      s2    ^
Z((2Pr   c                    s2   t    |   fddt j D }|S )a  Fetch list of tokens, can be indexed by token id

    The resulting list can be used to map id to token.

    Arguments
    ---------
    model_path : str
        Path to SentencePiece model

    Returns
    -------
    list
        Tokens in order by id (can be indexed by id)
    c                    s   g | ]} j |qS rA   )r8   id_to_piece)ry   r   modelrA   rB   r{     s    z"get_spm_tokens.<locals>.<listcomp>)r6   r7   r9   ranger8   r%   )
model_pathmappingrA   r   rB   get_spm_tokens  s   
r   c                   @   s   e Zd ZU dZdZeed< dS )$SentencePieceDecoderStreamingContextzGMutable streaming context for a single SentencePiece streaming session.r   emitted_symbol_countN)r   r   r   r   r   r   __annotations__rA   rA   rA   rB   r     s   
 r   	tokenizerhypscontextreturnc                 C   sh   | j |gddd }|j}t|jdkr2|jdk}|r(|jd jdr(d| }| jt|j7  _|S )u  Assuming the tokenizer is sentencepiece, decodes the input hypothesis
    but avoids incorrectly stripping leading spaces when streaming.
    Operates on a single hypothesis, not a batch of hypotheses.

    Normally, the tokenizer always decodes full sentences at a time, with the
    consequence that the first space in decoding will get removed.
    However, when streaming, we might be decoding mid-utterance where spaces
    must not be removed mid-sentence. This function handles this case.

    e.g. if within the same streaming context, you decode `["▁how", "▁are"]`
    then `["▁you"]`, the decoder would normally return `"how areyou"` instead of
    `"how are you"` like this function does.

    Arguments
    ---------
    tokenizer : sentencepiece.SentencePieceProcessor
        The SentencePiece processor to use for decoding.
    hyps : list of output token hypotheses
        List of tokens to decode of any length `>=0`.
    context : SentencePieceDecoderStreamingContext
        Mutable streaming context for the sentencepiece decoder, which should be
        reused across calls for the same decoding stream.

    Returns
    -------
    str
        Decoded text. Leading spaces are preserved, except at the start of a
        transcription.
    immutable_proto)out_typer   r   u   ▁rK   )r   textrp   piecesr   piece
startswith)r   r   r   protor   should_preserve_spacerA   rA   rB   !spm_decode_preserve_leading_space  s   "
r   )r   r   r^   os.pathr   dataclassesr   typingr   sentencepiecer6   r   speechbrain.dataio.dataior   speechbrain.utilsr   speechbrain.utils.distributedr   speechbrain.utils.loggerr   r   r4   r   r   r   r7   r   r#   r   rA   rA   rA   rB   <module>   s:       Q