o
    8wi7Z                  	   @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ ddlmZ ddlmZ eeZG d	d
 d
Zdd ZeG dd dZde
jdee dedee fddZdS )ziLibrary for Byte-pair-encoding (BPE) tokenization.
Authors
 * Abdelwahab Heba 2020
 * Loren Lugosch 2020
    N)	dataclass)List)
merge_char)edit_distance)run_on_main)
get_loggerc                   @   sj   e Zd ZdZ																
			dddZdd Zdd Zdd Zg fddZdddZ	dS )SentencePiecea  BPE class call the SentencePiece unsupervised text tokenizer from Google.
    Reference: https://github.com/google/sentencepiece
    SentencePiece lib is an unsupervised text tokenizer and detokenizer.
    It implements subword units like Byte-pair-encoding (BPE),
    Unigram language model and char/word tokenizer.
    Arguments
    ---------
    model_dir : str
        The directory where the model will be saved (or already stored).
    vocab_size : int, None, optional
        Vocab size for the chosen tokenizer type (BPE, Unigram).
        The vocab_size is optional for char, and mandatory for BPE & unigram
        tokenization.
    annotation_train : str
        Path of the annotation file which is used to learn the tokenizer. It
        can be in JSON or csv format.
    annotation_read : str
        The data entry which contains the word sequence in the annotation file.
    model_type : str
        (bpe, char, unigram).
        If "bpe", train unsupervised tokenization of piece of words. see:
        https://www.aclweb.org/anthology/P16-1162/
        If "word" take the vocabulary from the input text.
        If "unigram" do piece of word tokenization using unigram language
        model, see: https://arxiv.org/abs/1804.10959
    char_format_input : bool
        Whether the read entry contains characters format input.
        (default: False)
        (e.g., a p p l e _ i s _ g o o d)
    character_coverage : int
        Amount of characters covered by the model, good defaults
        are: 0.9995 for languages with a rich character set like Japanese or
        Chinese and 1.0 for other languages with small character set.
        (default: 1.0)
    user_defined_symbols : string
        String contained a list of symbols separated by a comma.
        User-defined symbols are handled as one piece in any context.
        (default: None)
    max_sentencepiece_length : int
        Maximum number of characters for the tokens. (default: 10)
    bos_id : int
        If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
    eos_id : int
        If -1 the bos_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
    pad_id : int
        If -1 the pad_id = unk_id = 0. otherwise, bos_id = int. (default: -1)
    unk_id : int
        The token corresponding to an unknown symbol (not in token set).
    split_by_whitespace : bool
        If False, allow the sentencepiece to extract piece crossing multiple
        words. This feature is important for : Chinese/Japanese/Korean.
        (default: True)
    num_sequences : int
        If not none, use at most this many sequences to train the tokenizer
        (for large datasets). (default: None)
    annotation_list_to_check : list,
        List of the annotation file which is used for checking the accuracy of
        recovering words from the tokenizer.
    annotation_format : str
        The format of the annotation file. JSON or csv are the formats supported.
    text_file: str
        An alternate path to the text file (needed when multiple models are trained on
        the same data file)
    add_dummy_prefix : bool
        If True the tokenizer adds dummy whitespace at the beginning of text. (default: True)

    Example
    -------
    >>> import torch
    >>> dict_int2lab = {1: "HELLO", 2: "MORNING"}
    >>> model_dir = getfixture('tmpdir') / "tokenizer_data"
    >>> # Example with csv
    >>> annotation_train = "tests/samples/annotation/dev-clean.csv"
    >>> annotation_read = "wrd"
    >>> model_type = "bpe"
    >>> bpe = SentencePiece(str(model_dir), 100, annotation_train, annotation_read, model_type)
    >>> batch_seq = torch.Tensor([[1, 2, 2, 1],[1, 2, 1, 0]])
    >>> batch_lens = torch.Tensor([1.0, 0.75])
    >>> encoded_seq_ids, encoded_seq_pieces = bpe(
    ...     batch_seq, batch_lens, dict_int2lab, task="encode"
    ... )
    >>> # Example using JSON
    >>> annotation_train = str(model_dir + "/dev-clean.json")
    >>> annotation_read = "wrd"
    >>> bpe = SentencePiece(model_dir, 100, annotation_train, annotation_read, model_type, annotation_format = 'json')
    >>> encoded_seq_ids, encoded_seq_pieces = bpe(
    ...     batch_seq, batch_lens, dict_int2lab, task="encode"
    ... )
    NunigramF      ?
   r   Tcsvc                 C   s"  |dvrt dtj|st| t|tst d|| _|| _|| _	| jd urJtj
| jd }|d u rGtj|tj| j|d}|| _tj|t|d | | _t|| _|| _|| _t|| _t|	| _t|
| _t|| _t|| _t|| _|| _|| _|| _t|| _tj| jd st| j  nt!"d t!"d	 t!"d
| j d  t!"dt| j  t!"d| j  t#$ | _%| j%&| jd  t| j| j% krd| j d| j%  d}| jdkrt!'| d nt!'| d |d urt| j(d|id d S d S )N)r	   bpecharz0model_type must be one of : [unigram, bpe, char]zvocab_size must be integer.   z.txt_z.modelzTokenizer is already trained.z==== Loading Tokenizer ===zTokenizer path: zTokenizer vocab_size: zTokenizer type: zSentencePiece vocab size `z'` requested, but the loaded model has `zQ`! This can cause decoding errors or weird model training behavior in some cases.r   z@ The model type is 'char', for which `vocab_size` has no impact.z7 Are you loading a tokenizer with the wrong parameters?list_annotation_files)kwargs))
ValueErrorospathisdirmakedirs
isinstanceintannotation_trainannotation_readannotation_formatsplitextjoinbasenamereplace	text_filestrprefix_model_file
vocab_size
model_typechar_format_inputcharacter_coveragemax_sentencepiece_lengthbos_ideos_idpad_idunk_idnum_sequencessplit_by_whitespaceuser_defined_symbolsadd_dummy_prefixisfiler   
_train_BPEloggerinfospmSentencePieceProcessorsploadwarning_check_coverage_from_bpe)self	model_dirr%   r   r   r&   r'   r(   r0   r)   r*   r+   r,   r-   r/   r.   annotation_list_to_checkr   r"   r1   extbase_msg rA   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/tokenizers/SentencePiece.py__init__s   sv   
















zSentencePiece.__init__c           	      C   s2  t jt j| jst| jd td| j d | j  t	| jddd}t
|}t|d}| j|vr@t| jd | j || j}t	| jd	dd}d
}|D ]4}| jdurg|| jkrgtd| j   n |d7 }|| }| jrt| g\}d|}||d  qR|  |  td| j  dS )z?Read CSV file and convert specific data entries into text file.< is not a file. please provide annotation file for training.Extract  sequences from:rutf-8encodingN must exist in:w+r   *Using %d sequences to train the tokenizer.r    
Text file created at: )r   r   r2   abspathr   r   r4   r5   r   openr   readernextindexr"   r.   printr'   r   splitr   writeclose)	r<   annotation_filerS   headersindex_labelr"   row_idxrowsentrA   rA   rB   	_csv2text   sT   



zSentencePiece._csv2textc                 C   s   t jt j| jst| jd td| j d | j  t	| jddd}t
|}W d   n1 s7w   Y  t	| jddd}d	}| D ]7}| jdur_|| jkr_td
| j   n#|d7 }|| | j }| jrzt| g\}d|}||d  qJ|  td| j  dS )z@Read JSON file and convert specific data entries into text file.rD   rE   rF   rG   rH   rI   NrL   r   rM   r   rN   rO   rP   )r   r   r2   rQ   r   r   r4   r5   r   rR   jsonr9   r"   keysr.   rV   r'   r   rW   r   rX   rY   )r<   fout_jsonr"   r]   snt_idr_   rA   rA   rB   
_json2text   sH   
zSentencePiece._json2textc                 C   s
  t d| j  tj| js*| jdkr|   n| jdkr#| 	  nt
d| j d| j d | j d | j d | j d	 | j d
 | j d | j d | j d | j d | j }| jdvrj|dt| j 7 }| jdurv|d| j 7 }| js}|d7 }tj| dS )zTrain tokenizer with unsupervised techniques (BPE, Unigram) using
        SentencePiece Library. If you use "char" mode, the SentencePiece
        creates a char dict so the vocab_size attribute is not needed.
        zTrain tokenizer with type:r   ra   zIAnnotation format not supported. Supported formats are csv and json. Got z--input=z --model_prefix=z --model_type=z
 --bos_id=z
 --eos_id=z
 --pad_id=z
 --unk_id=z --max_sentencepiece_length=z --character_coverage=z --add_dummy_prefix=)r   z --vocab_size=Nz --user_defined_symbols=z --split_by_whitespace=false)r4   r5   r&   r   r   r2   r"   r   r`   rf   r   r$   r*   r+   r,   r-   r)   r(   r1   r#   r%   r0   r/   r6   SentencePieceTrainertrain)r<   queryrA   rA   rB   r3     st   



	


zSentencePiece._train_BPEc              	   C   s^  |D ])}t jt j|r%td | jdkr@t|ddd}t	|}t
|d}| j|vr9t| jd | || j}n t| jddd}t|}| j}W d   n1 s[w   Y  g }|D ]n}	| jdkrp|	| }	n||	 | }	| jrt|	 g\}	d|	}	|	d	d
 }	| j|	}
| j|
}tjdg|	dg|dgdd\}|d d
kr|d D ]}|d
 dkr|d dur|d |vr||d  qqd| jdkr|  td|  t|d
krtdtt|  tdt| j   tdtdt t|| j     qtd tdtd  qtd|  qdS )a  Logging the accuracy of the BPE model to recover words from the training text.

        Arguments
        ---------
        list_annotation_files : list,
            List of the annotation file which is used for checking the accuracy of recovering words from the tokenizer.
        z===== Accuracy checking for recovering text from tokenizer ===r   rG   rH   rI   NrK   rN   rO   r   utt1T)compute_alignmentsWER	alignment=r   zrecover words from: zWrong recover words: zTokenizer vocab size: zaccuracy recovering words: zWrong recover words: 0r
   z No accuracy recover checking for)!r   r   r2   rQ   r4   r5   r   rR   r   rS   rT   r   r   rU   r   ra   r9   r'   r   rW   r   r8   encode_as_ids
decode_idsr   wer_details_for_batchappendrY   lenr:   r#   r%   float)r<   r   rZ   fannotation_filerS   r[   r\   rc   wrong_recover_listr^   
encoded_iddecode_textdetailsalignrA   rA   rB   r;   O  s   













	z&SentencePiece._check_coverage_from_bpeencodec                    s  |dkrdu rt d|dkrg }d} |jd     t|D ]<\}}fdd|d |  D }	jrEt|	g\}
d|
}nd|	}j	|}|
| t||kr_t|}q#tj|jd |f|jd	}tj|jd |jd	}t|D ]\}}t|||dt|f< t|| ||< q|||fS |d
krfdd|D S |dkr |jd      fddt|D S dS )ao  This __call__ function implements the tokenizer encoder and decoder
        (restoring the string of word) for BPE, Regularized BPE (with unigram),
        and char (speechbrain/nnet/RNN.py).
        Arguments
        ----------
        batch : tensor.IntTensor or list
            List if ( batch_lens = None and task = "decode_from_list")
            Contains the original labels. Shape: [batch_size, max_length]
        batch_lens : tensor.LongTensor
            Containing the relative length of each label sequences. Must be 1D
            tensor of shape: [batch_size]. (default: None)
        ind2lab : dict
            Dictionary which maps the index from label sequences
            (batch tensor) to string label.
        task : str
            ("encode", "decode", "decode_from_list)
            "encode": convert the batch tensor into sequence of tokens.
                the output contain a list of (tokens_seq, tokens_lens)
            "decode": convert a tensor of tokens to a list of word sequences.
            "decode_from_list": convert a list of token sequences to a list
                of word sequences.
        r{   Nz0Tokenizer encoder must have the ind2lab functionr   r   c                    s   g | ]} t | qS rA   )r   ).0rU   )ind2labrA   rB   
<listcomp>  s    z*SentencePiece.__call__.<locals>.<listcomp>rN   )devicedecode_from_listc                    s   g | ]} j |d qS )rN   )r8   rp   rW   )r|   utt_seq)r<   rA   rB   r~     s    decodec                    s6   g | ]\}}j |d  |    dqS )NrN   )r8   rp   r   tolistrW   )r|   ir   )
batch_lensr<   rA   rB   r~     s    )r   shaperoundr   	enumerater'   r   r   r8   ro   rr   rs   torchzerosr   Tensor)r<   batchr   r}   taskr   max_bpe_lenr   r   tokens
words_listr_   
bpe_encode
bpe_tensorbpe_lensbpe_uttrA   )r   r}   r<   rB   __call__  sF   


zSentencePiece.__call__)NNr	   Fr
   Nr   r   r   r   r   TNNr   NT)NNr{   )
__name__
__module____qualname____doc__rC   r`   rf   r3   r;   r   rA   rA   rA   rB   r      s2    ^
Z((2Tr   c                    s2   t    |   fddt j D }|S )a  Fetch list of tokens, can be indexed by token id

    The resulting list can be used to map id to token.

    Arguments
    ---------
    model_path : str
        Path to SentencePiece model

    Returns
    -------
    list
        Tokens in order by id (can be indexed by id)
    c                    s   g | ]} j |qS rA   )r8   id_to_piece)r|   r   modelrA   rB   r~     s    z"get_spm_tokens.<locals>.<listcomp>)r6   r7   r9   ranger8   r%   )
model_pathmappingrA   r   rB   get_spm_tokens  s   
r   c                   @   s   e Zd ZU dZdZeed< dS )$SentencePieceDecoderStreamingContextzGMutable streaming context for a single SentencePiece streaming session.r   emitted_symbol_countN)r   r   r   r   r   r   __annotations__rA   rA   rA   rB   r     s   
 r   	tokenizerhypscontextreturnc                 C   sh   | j |gddd }|j}t|jdkr2|jdk}|r(|jd jdr(d| }| jt|j7  _|S )u  Assuming the tokenizer is sentencepiece, decodes the input hypothesis
    but avoids incorrectly stripping leading spaces when streaming.
    Operates on a single hypothesis, not a batch of hypotheses.

    Normally, the tokenizer always decodes full sentences at a time, with the
    consequence that the first space in decoding will get removed.
    However, when streaming, we might be decoding mid-utterance where spaces
    must not be removed mid-sentence. This function handles this case.

    e.g. if within the same streaming context, you decode `["▁how", "▁are"]`
    then `["▁you"]`, the decoder would normally return `"how areyou"` instead of
    `"how are you"` like this function does.

    Arguments
    ---------
    tokenizer : sentencepiece.SentencePieceProcessor
        The SentencePiece processor to use for decoding.
    hyps : list of output token hypotheses
        List of tokens to decode of any length `>=0`.
    context : SentencePieceDecoderStreamingContext
        Mutable streaming context for the sentencepiece decoder, which should be
        reused across calls for the same decoding stream.

    Returns
    -------
    str
        Decoded text. Leading spaces are preserved, except at the start of a
        transcription.
    immutable_proto)out_typer   r   u   ▁rN   )r   textrs   piecesr   piece
startswith)r   r   r   protor   should_preserve_spacerA   rA   rB   !spm_decode_preserve_leading_space  s   "
r   )r   r   ra   os.pathr   dataclassesr   typingr   sentencepiecer6   r   speechbrain.dataio.dataior   speechbrain.utilsr   speechbrain.utils.distributedr   speechbrain.utils.loggerr   r   r4   r   r   r   r7   r   r#   r   rA   rA   rA   rB   <module>   s:       U