o
    %ݫij                     @   s   d Z ddlZddlZddlZddlZddlZddlmZm	Z	m
Z
 ddlmZ eeZdZdZdZdZe
G d	d
 d
ZG dd deZG dd deZdd ZdS )z[Encoding categorical data as integers

Authors
  * Samuele Cornell 2020
  * Aku Rouhe 2020
    N)mark_as_loadermark_as_saverregister_checkpoint_hooks)
get_loggerz<unk><bos><eos>z<blank>c                   @   sJ  e Zd ZdZdZdZdGddZdd Zd	d
 Ze	dd Z
dHddZ	dHddZ	dIddZg g ddi fddZdd Zdd Zdd Zdd Zefd d!Zd"d# Zd$d% ZdJd'd(ZdJd)d*ZdJd+d,ZdJd-d.Zd/d0 Zd1d2 Zed3d4 Zd5d6 Ze dHd7d8Z!d9d: Z"d;d< Z#d=d> Z$d?d@ Z%dAdB Z&e'dCdD Z(e'dEdF Z)dS )KCategoricalEncoderaK  Encode labels of a discrete set.

    Used for encoding, e.g., speaker identities in speaker recognition.
    Given a collection of hashables (e.g a strings) it encodes
    every unique item to an integer value: ["spk0", "spk1"] --> [0, 1]
    Internally the correspondence between each label to its index is handled by
    two dictionaries: lab2ind and ind2lab.

    The label integer encoding can be generated automatically from a SpeechBrain
    DynamicItemDataset by specifying the desired entry (e.g., spkid) in the annotation
    and calling update_from_didataset method:

    >>> from speechbrain.dataio.encoder import CategoricalEncoder
    >>> from speechbrain.dataio.dataset import DynamicItemDataset
    >>> dataset = {"ex_{}".format(x) : {"spkid" : "spk{}".format(x)} for x in range(20)}
    >>> dataset = DynamicItemDataset(dataset)
    >>> encoder = CategoricalEncoder()
    >>> encoder.update_from_didataset(dataset, "spkid")
    >>> assert len(encoder) == len(dataset) # different speaker for each utterance

    However can also be updated from an iterable:

    >>> from speechbrain.dataio.encoder import CategoricalEncoder
    >>> from speechbrain.dataio.dataset import DynamicItemDataset
    >>> dataset = ["spk{}".format(x) for x in range(20)]
    >>> encoder = CategoricalEncoder()
    >>> encoder.update_from_iterable(dataset)
    >>> assert len(encoder) == len(dataset)

    Note
    ----
    In both methods it can be specified it the single element in the iterable
    or in the dataset should be treated as a sequence or not (default False).
    If it is a sequence each element in the sequence will be encoded.


    >>> from speechbrain.dataio.encoder import CategoricalEncoder
    >>> from speechbrain.dataio.dataset import DynamicItemDataset
    >>> dataset = [[x+1, x+2] for x in range(20)]
    >>> encoder = CategoricalEncoder()
    >>> encoder.ignore_len()
    >>> encoder.update_from_iterable(dataset, sequence_input=True)
    >>> assert len(encoder) == 21 # there are only 21 unique elements 1-21

    This class offers 4 different methods to explicitly add a label in the internal
    dicts: add_label, ensure_label, insert_label, enforce_label.
    add_label and insert_label will raise an error if it is already present in the
    internal dicts. insert_label, enforce_label allow also to specify the integer value
    to which the desired label is encoded.

    Encoding can be performed using 4 different methods:
    encode_label, encode_sequence, encode_label_torch and encode_sequence_torch.
    encode_label operate on single labels and simply returns the corresponding
    integer encoding:

    >>> from speechbrain.dataio.encoder import CategoricalEncoder
    >>> from speechbrain.dataio.dataset import DynamicItemDataset
    >>> dataset = ["spk{}".format(x) for x in range(20)]
    >>> encoder.update_from_iterable(dataset)
    >>>
    22
    >>>
    encode_sequence on sequences of labels:
    >>> encoder.encode_sequence(["spk1", "spk19"])
    [22, 40]
    >>>
    encode_label_torch and encode_sequence_torch return torch tensors
    >>> encoder.encode_sequence_torch(["spk1", "spk19"])
    tensor([22, 40])
    >>>
    Decoding can be performed using decode_torch and decode_ndim methods.
    >>> encoded = encoder.encode_sequence_torch(["spk1", "spk19"])
    >>> encoder.decode_torch(encoded)
    ['spk1', 'spk19']
    >>>
    decode_ndim is used for multidimensional list or pytorch tensors
    >>> encoded = encoded.unsqueeze(0).repeat(3, 1)
    >>> encoder.decode_torch(encoded)
    [['spk1', 'spk19'], ['spk1', 'spk19'], ['spk1', 'spk19']]
    >>>

    In some applications, it can happen that during testing a label which has not
    been encountered during training is encountered. To handle this out-of-vocabulary
    problem add_unk can be used. Every out-of-vocab label is mapped to this special
    <unk> label and its corresponding integer encoding.

    >>> import torch
    >>> try:
    ...     encoder.encode_label("spk42")
    ... except KeyError:
    ...        print("spk42 is not in the encoder this raises an error!")
    spk42 is not in the encoder this raises an error!
    >>> encoder.add_unk()
    41
    >>> encoder.encode_label("spk42")
    41
    >>>
    returns the <unk> encoding

    This class offers also methods to save and load the internal mappings between
    labels and tokens using: save and load methods as well as load_or_create.
    z => z================
r   c                 K   s    i | _ i | _|| _| | d S N)lab2indind2labstarting_indexhandle_special_labels)selfr   special_labels r   N/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/dataio/encoder.py__init__   s   zCategoricalEncoder.__init__c                 C   s   d|v r|  |d  dS dS )z)Handles special labels such as unk_label.	unk_labelN)add_unkr   r   r   r   r   r      s   z(CategoricalEncoder.handle_special_labelsc                 C   s
   t | jS r	   )lenr
   r   r   r   r   __len__   s   
zCategoricalEncoder.__len__c                 C   s   |  }| | |S )z,Recreate a previously saved encoder directly)load)clspathobjr   r   r   
from_saved   s   
zCategoricalEncoder.from_savedFc                 C   s2   |r	t j|}nt|}|D ]}| | qdS )a  Update from iterator

        Arguments
        ---------
        iterable : iterable
            Input sequence on which to operate.
        sequence_input : bool
            Whether iterable yields sequences of labels or individual labels
            directly. (default False)
        N)	itertoolschainfrom_iterableiterensure_label)r   iterablesequence_inputlabel_iteratorlabelr   r   r   update_from_iterable   s   z'CategoricalEncoder.update_from_iterablec                    sN   |  g | j fdd|D |d W d   dS 1 s w   Y  dS )a  Update from DynamicItemDataset.

        Arguments
        ---------
        didataset : DynamicItemDataset
            Dataset on which to operate.
        output_key : str
            Key in the dataset (in data or a dynamic item) to encode.
        sequence_input : bool
            Whether the data yielded with the specified key consists of
            sequences of labels or individual labels directly.
        c                 3   s    | ]}|  V  qd S r	   r   ).0
data_point
output_keyr   r   	<genexpr>   s    z;CategoricalEncoder.update_from_didataset.<locals>.<genexpr>)r$   N)output_keys_asr'   r   	didatasetr+   r$   r   r*   r   update_from_didataset   s   "z(CategoricalEncoder.update_from_didatasetN   c           
      C   st   | j r| jj}td| d |rtj|}nt|}t	
|}||D ]\}}	|	|k r2 |S | | q'|S )a  Produce label mapping from iterable based on label counts

        Used to limit label set size.

        Arguments
        ---------
        iterable : iterable
            Input sequence on which to operate.
        sequence_input : bool
            Whether iterable yields sequences of labels or individual labels
            directly. False by default.
        n_most_common : int, None
            Take at most this many labels as the label set, keeping the most
            common ones. If None (as by default), take all.
        min_count : int
            Don't take labels if they appear less than this many times.

        Returns
        -------
        collections.Counter
            The counts of the different labels (unfiltered).
        z+Limited_labelset_from_iterable called, but zx is not empty. The new labels will be added, i.e. won't overwrite. This is normal if there is e.g. an unk label already.)r
   	__class____name__loggerinfor   r   r    r!   collectionsCountermost_common	add_label)
r   r#   r$   n_most_common	min_countclsnamer%   countsr&   countr   r   r   limited_labelset_from_iterable   s"   
z1CategoricalEncoder.limited_labelset_from_iterablec           	   	   C   s   z^t jj rC| |sQ|D ]}| || q|D ]}|du r#td| ||| q| | | 	| W t jj
  | | dS W t jj
  | | dS W t jj
  | | dS t jj
  | | w )zConvenient syntax for creating the encoder conditionally

        This pattern would be repeated in so many experiments that
        we decided to add a convenient shortcut for it here. The
        current version is multi-gpu (DDP) safe.
        Nz,Provide an output_key for DynamicItemDataset)sbutilsdistributedif_main_processload_if_possibler'   
ValueErrorr0   r   saveddp_barrierr   )	r   r   from_iterablesfrom_didatasetsr$   r+   r   r#   r/   r   r   r   load_or_create   s2   

z!CategoricalEncoder.load_or_createc                 C   s@   || j v r| jj}td| |  }|| j |< || j|< |S )a  Add new label to the encoder, at the next free position.

        Arguments
        ---------
        label : hashable
            Most often labels are str, but anything that can act as dict key is
            supported. Note that default save/load only supports Python
            literals.

        Returns
        -------
        int
            The index that was used to encode this label.
        Label already present in )r
   r2   r3   KeyError_next_indexr   )r   r&   r<   indexr   r   r   r9     s   


zCategoricalEncoder.add_labelc                 C   s   || j v r
| j | S | |S )a  Add a label if it is not already present.

        Arguments
        ---------
        label : hashable
            Most often labels are str, but anything that can act as dict key is
            supported. Note that default save/load only supports Python
            literals.

        Returns
        -------
        int
            The index that was used to encode this label.
        )r
   r9   )r   r&   r   r   r   r"   4  s   


zCategoricalEncoder.ensure_labelc                 C   s0   || j v r| jj}td| | || dS )a  Add a new label, forcing its index to a specific value.

        If a label already has the specified index, it is moved to the end
        of the mapping.

        Arguments
        ---------
        label : hashable
            Most often labels are str, but anything that can act as dict key is
            supported. Note that default save/load only supports Python
            literals.
        index : int
            The specific index to use.
        rK   N)r
   r2   r3   rL   enforce_label)r   r&   rN   r<   r   r   r   insert_labelH  s   
zCategoricalEncoder.insert_labelc              	   C   s   t |}|| jv r|| j| krdS | j| j| = || jv r&| j| }d}nd}|| j|< || j|< |rWtdt| d| dt| d |  }|| j|< || j|< dS dS )a&  Make sure label is present and encoded to a particular index.

        If the label is present but encoded to some other index, it is
        moved to the given index.

        If there is already another label at the
        given index, that label is moved to the next free position.
        NTFzMoving label z from index z
, because z was put at its place.)intr
   r   r4   r5   reprrM   )r   r&   rN   saved_labelmoving_other	new_indexr   r   r   rO   ]  s.   	





z CategoricalEncoder.enforce_labelc                 C   s   || _ | |S )a  Add label for unknown tokens (out-of-vocab).

        When asked to encode unknown labels, they can be mapped to this.

        Arguments
        ---------
        unk_label : hashable, optional
            Most often labels are str, but anything that can act as dict key is
            supported. Note that default save/load only supports Python
            literals. Default: <unk>. This can be None, as well!

        Returns
        -------
        int
            The index that was used to encode this.
        )r   r9   )r   r   r   r   r   r     s   
zCategoricalEncoder.add_unkc                 C   s&   | j }|| jv r|d7 }|| jv s|S )z'The index to use for the next new labelr1   )r   r   )r   rN   r   r   r   rM     s
   

zCategoricalEncoder._next_indexc                 C   s@   t | j }| j|v otdd t|dd |dd D S )a4  Check that the set of indices doesn't have gaps

        For example:
        If starting index = 1
        Continuous: [1,2,3,4]
        Continuous: [0,1,2]
        Non-continuous: [2,3,4]
        Non-continuous: [1,2,4]

        Returns
        -------
        bool
            True if continuous.
        c                 s   s     | ]\}}|| d kV  qdS )r1   Nr   )r(   ijr   r   r   r,     s    
z3CategoricalEncoder.is_continuous.<locals>.<genexpr>Nr1   )sortedr   keysr   allzip)r   indicesr   r   r   is_continuous  s   z CategoricalEncoder.is_continuousTc                 C   s   |    z| j| W S  tyF   t| dr |r | j| j  Y S t| dr/|s/td| dt| ds>|r>td| dtd| dw )a  Encode label to int

        Arguments
        ---------
        label : hashable
            Label to encode, must exist in the mapping.
        allow_unk : bool
            If given, that label is not in the label set
            AND unk_label has been added with add_unk(),
            allows encoding to unk_label's index.

        Returns
        -------
        int
            Corresponding encoded int value.
        r   zUnknown label z=, and explicitly disallowed the use of the existing unk-labelzCannot encode unknown label zN. You have not called add_unk() to add a special unk-label for unknown labels.z+Couldn't and wouldn't encode unknown label .)_assert_lenr
   rL   hasattrr   r   r&   	allow_unkr   r   r   encode_label  s$   


zCategoricalEncoder.encode_labelc                 C   s   t | ||gS )a  Encode label to torch.LongTensor.

        Arguments
        ---------
        label : hashable
            Label to encode, must exist in the mapping.
        allow_unk : bool
            If given, that label is not in the label set
            AND unk_label has been added with add_unk(),
            allows encoding to unk_label's index.

        Returns
        -------
        torch.LongTensor
            Corresponding encoded int value.
            Tensor shape [1].
        )torch
LongTensorrd   rb   r   r   r   encode_label_torch  s   z%CategoricalEncoder.encode_label_torchc                    s       fdd|D S )a  Encode a sequence of labels to list

        Arguments
        ---------
        sequence : iterable
            Labels to encode, must exist in the mapping.
        allow_unk : bool
            If given, that label is not in the label set
            AND unk_label has been added with add_unk(),
            allows encoding to unk_label's index.

        Returns
        -------
        list
            Corresponding integer labels.
        c                       g | ]} | qS r   rd   r(   r&   rc   r   r   r   
<listcomp>      z6CategoricalEncoder.encode_sequence.<locals>.<listcomp>)r`   r   sequencerc   r   rk   r   encode_sequence  s   z"CategoricalEncoder.encode_sequencec                    s   t  fdd|D S )a  Encode a sequence of labels to torch.LongTensor

        Arguments
        ---------
        sequence : iterable
            Labels to encode, must exist in the mapping.
        allow_unk : bool
            If given, that label is not in the label set
            AND unk_label has been added with add_unk(),
            allows encoding to unk_label's index.

        Returns
        -------
        torch.LongTensor
            Corresponding integer labels.
            Tensor shape [len(sequence)].
        c                    rh   r   ri   rj   rk   r   r   rl     rm   z<CategoricalEncoder.encode_sequence_torch.<locals>.<listcomp>)re   rf   rn   r   rk   r   encode_sequence_torch   s   z(CategoricalEncoder.encode_sequence_torchc                 C   sV   |    g }|jdkr|D ]}|| jt|  q|S |D ]
}|| | q|S )a  Decodes an arbitrarily nested torch.Tensor to a list of labels.

        Provided separately because Torch provides clearer introspection,
        and so doesn't require try-except.

        Arguments
        ---------
        x : torch.Tensor
            Torch tensor of some integer dtype (Long, int) and any shape to
            decode.

        Returns
        -------
        list
            list of original labels
        r1   )r`   ndimappendr   rQ   decode_torch)r   xdecodedelement	subtensorr   r   r   rt     s   
zCategoricalEncoder.decode_torchc                 C   sN   |    zg }|D ]
}|| | q	|W S  ty&   | jt|  Y S w )a  Decodes an arbitrarily nested iterable to a list of labels.

        This works for essentially any pythonic iterable (including torch), and
        also single elements.

        Arguments
        ---------
        x : Any
            Python list or other iterable or torch.Tensor or a single integer element

        Returns
        -------
        list, Any
            ndim list of original labels, or if input was single element,
            output will be, too.
        )r`   rs   decode_ndim	TypeErrorr   rQ   )r   ru   rv   rx   r   r   r   ry   2  s   zCategoricalEncoder.decode_ndimc                 C   s   |   }| || j| dS )a>  Save the categorical encoding for later use and recovery

        Saving uses a Python literal format, which supports things like
        tuple labels, but is considered safe to load (unlike e.g. pickle).

        Arguments
        ---------
        path : str, Path
            Where to save. Will overwrite.
        N)_get_extras_save_literalr
   )r   r   extrasr   r   r   rF   M  s   zCategoricalEncoder.savec                 C   sZ   | j r| jj}td| d | |\}}}|| _ || _| | td|  dS )a!  Loads from the given path.

        CategoricalEncoder uses a Python literal format, which supports things
        like tuple labels, but is considered safe to load (unlike e.g. pickle).

        Arguments
        ---------
        path : str, Path
            Where to load from.
        zLoad called, but zs is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.z!Loaded categorical encoding from N)	r
   r2   r3   r4   r5   _load_literalr   _set_extrasdebug)r   r   r<   r
   r   r}   r   r   r   r   \  s   

zCategoricalEncoder.loadc              	   C   sb   ~z|  | W dS  ty   td| d Y dS  ttfy0   td| d Y dS w )aR  Loads if possible, returns a bool indicating if loaded or not.

        Arguments
        ---------
        path : str, Path
            Where to load from.
        end_of_epoch : bool
            Whether the checkpoint was end-of-epoch or not.

        Returns
        -------
        bool :
            If load was successful.

        Example
        -------
        >>> encoding_file = getfixture('tmpdir') / "encoding.txt"
        >>> encoder = CategoricalEncoder()
        >>> # The idea is in an experiment script to have something like this:
        >>> if not encoder.load_if_possible(encoding_file):
        ...     encoder.update_from_iterable("abcd")
        ...     encoder.save(encoding_file)
        >>> # So the first time you run the experiment, the encoding is created.
        >>> # However, later, the encoding exists:
        >>> encoder = CategoricalEncoder()
        >>> encoder.expect_len(4)
        >>> if not encoder.load_if_possible(encoding_file):
        ...     assert False  # We won't get here!
        >>> encoder.decode_ndim(range(4))
        ['a', 'b', 'c', 'd']
        z%Would load categorical encoding from z, but file doesn't exist yet.FzH, and file existed but seems to be corrupted or otherwise couldn't load.T)r   FileNotFoundErrorr4   r   rE   SyntaxError)r   r   end_of_epochr   r   r   rD   u  s   !

z#CategoricalEncoder.load_if_possiblec                 C   s
   || _ dS )a  Specify the expected category count. If the category count observed
        during encoding/decoding does NOT match this, an error will be raised.

        This can prove useful to detect bugs in scenarios where the encoder is
        dynamically built using a dataset, but downstream code expects a
        specific category count (and may silently break otherwise).

        This can be called anytime and the category count check will only be
        performed during an actual encoding/decoding task.

        Arguments
        ---------
        expected_len : int
            The expected final category count, i.e. `len(encoder)`.

        Example
        -------
        >>> encoder = CategoricalEncoder()
        >>> encoder.update_from_iterable("abcd")
        >>> encoder.expect_len(3)
        >>> encoder.encode_label("a")
        Traceback (most recent call last):
          ...
        RuntimeError: .expect_len(3) was called, but 4 categories found
        >>> encoder.expect_len(4)
        >>> encoder.encode_label("a")
        0
        Nexpected_len)r   r   r   r   r   
expect_len  s   
zCategoricalEncoder.expect_lenc                 C   s
   d| _ dS )zSpecifies that category count shall be ignored at encoding/decoding
        time.

        Effectively inhibits the ".expect_len was never called" warning.
        Prefer :py:meth:`~CategoricalEncoder.expect_len` when the category count
        is known.Nr   r   r   r   r   
ignore_len  s   
zCategoricalEncoder.ignore_lenc                 C   sp   t | dr#| jdu rdS t| }|| jkr!td| j d| ddS t| jj dt|  d |   dS )zIf `expect_len` was called, then check if len(self) matches the
        expected value. If it does not, raise a RuntimeError.
        If neither `expect_len` or `ignore_len` were ever called, warn once.r   Nz.expect_len(z) was called, but z categories foundz9.expect_len was never called: assuming category count of z to be correct! Sanity check your encoder using `.expect_len`. Ensure that downstream code also uses the correct size. If you are sure this does not apply to you, use `.ignore_len`.)	ra   r   r   RuntimeErrorr4   warning_oncer2   r3   r   )r   real_lenr   r   r   r`     s$   




zCategoricalEncoder._assert_lenc                 C   s"   d| j i}t| dr| j|d< |S )zzOverride this to provide any additional things to save

        Call super()._get_extras() to get the base extras
        r   r   )r   ra   r   r   r}   r   r   r   r{     s   


zCategoricalEncoder._get_extrasc                 C   s    d|v r	|d | _ |d | _dS )zvOverride this to e.g. load any extras needed

        Call super()._set_extras(extras) to set the base extras
        r   r   N)r   r   r   r   r   r   r     s   
zCategoricalEncoder._set_extrasc                 C   s   t | dddE}| D ]\}}|t|tj t| d  q|tj | D ]\}}|t|tj t| d  q+|  W d   dS 1 sOw   Y  dS )z+Save which is compatible with _load_literalwutf-8encoding
N)	openitemswriterR   r   VALUE_SEPARATORstrEXTRAS_SEPARATORflush)r   r
   r}   fr&   indkeyvaluer   r   r   r|     s0   
"z CategoricalEncoder._save_literalc                 C   s   i }i }i }t | ddP}|D ]&}|tjkr n| jtjdd\}}t|}t|}|||< |||< q|D ]}| jtjdd\}	}
t|	}t|
}|||< q8W d   n1 s_w   Y  |||fS )zLoad which supports Python literals as keys.

        This is considered safe for user input, as well (unlike e.g. pickle).
        r   r   r1   )maxsplitN)	r   r   r   stripsplitr   rQ   astliteral_eval)r   r
   r   r}   r   lineliteralr   r&   literal_keyliteral_valuer   r   r   r   r   r~     s0   








z CategoricalEncoder._load_literal)r   )F)FNr1   T)*r3   
__module____qualname____doc__r   r   r   r   r   classmethodr   r'   r0   r?   rJ   r9   r"   rP   rO   DEFAULT_UNKr   rM   r^   rd   rg   rp   rq   rt   ry   r   rF   r   r   rD   r   r   r`   r{   r   staticmethodr|   r~   r   r   r   r   r       s^    g





1
##

'


2	
	
r   c                       s   e Zd ZdZ fddZd" fdd	Zd" fdd	Z	
d# fdd	Zee	fddZ
ee	dd	fddZdd Zdd Zdd Zdd Zdd Zdd Z fddZ fd d!Z  ZS )$TextEncodera
	  CategoricalEncoder subclass which offers specific methods for encoding text and handle
    special tokens for training of sequence to sequence models.
    In detail, aside special <unk> token already present in CategoricalEncoder
    for handling out-of-vocab tokens here special methods to handle
    <bos> beginning of sequence and <eos> tokens are defined.

    Note: update_from_iterable and update_from_didataset here have as default
    sequence_input=True because it is assumed that this encoder is used on
    iterables of strings: e.g.

    >>> from speechbrain.dataio.encoder import TextEncoder
    >>> dataset = [["encode", "this", "textencoder"], ["foo", "bar"]]
    >>> encoder = TextEncoder()
    >>> encoder.update_from_iterable(dataset)
    >>> encoder.expect_len(5)
    >>> encoder.encode_label("this")
    1
    >>> encoder.add_unk()
    5
    >>> encoder.expect_len(6)
    >>> encoder.encode_sequence(["this", "out-of-vocab"])
    [1, 5]
    >>>

    Two methods can be used to add <bos> and <eos> to the internal dicts:
    insert_bos_eos, add_bos_eos.

    >>> encoder.add_bos_eos()
    >>> encoder.expect_len(8)
    >>> encoder.lab2ind[encoder.eos_label]
    7
    >>>
    add_bos_eos adds the special tokens at the end of the dict indexes
    >>> encoder = TextEncoder()
    >>> encoder.update_from_iterable(dataset)
    >>> encoder.insert_bos_eos(bos_index=0, eos_index=1)
    >>> encoder.expect_len(7)
    >>> encoder.lab2ind[encoder.eos_label]
    1
    >>>
    insert_bos_eos allows to specify whose index will correspond to each of them.
    Note that you can also specify the same integer encoding for both.

    Four methods can be used to prepend <bos> and append <eos>.
    prepend_bos_label and append_eos_label add respectively the <bos> and <eos>
    string tokens to the input sequence

    >>> words = ["foo", "bar"]
    >>> encoder.prepend_bos_label(words)
    ['<bos>', 'foo', 'bar']
    >>> encoder.append_eos_label(words)
    ['foo', 'bar', '<eos>']

    prepend_bos_index and append_eos_index add respectively the <bos> and <eos>
    indexes to the input encoded sequence.

    >>> words = ["foo", "bar"]
    >>> encoded = encoder.encode_sequence(words)
    >>> encoder.prepend_bos_index(encoded)
    [0, 3, 4]
    >>> encoder.append_eos_index(encoded)
    [3, 4, 1]

    c                    sV   t  | d|v rd|v r| jdd|d |d d dS d|v s%d|v r)tddS )z+Handles special labels such as bos and eos.	bos_label	eos_labelr   r   )r   r   	bos_index	eos_indexz.Only BOS or EOS specified. Need both for init.N)superr   insert_bos_eosrz   r   r2   r   r   r   u  s   
z!TextEncoder.handle_special_labelsTc                    s   t  ||S *Change default for sequence_input to True.)r   r'   )r   r#   r$   r   r   r   r'     s   z TextEncoder.update_from_iterablec                    s   t  |||S r   )r   r0   r.   r   r   r   r0     s   z!TextEncoder.update_from_didatasetNr1   c                    s   t  j|ddddS )r   TNr1   )r$   r:   r;   )r   r?   )r   r#   r$   r:   r;   r   r   r   r?     s   z*TextEncoder.limited_labelset_from_iterablec                 C   sB   ||krt d | | n
| | | | || _|| _dS )aH  Add sentence boundary markers in the label set.

        If the beginning-of-sentence and end-of-sentence markers
        are the same, will just use one sentence-boundary label.

        This method adds to the end of the index, rather than at the beginning,
        like insert_bos_eos.

        Arguments
        ---------
        bos_label : hashable
            Beginning-of-sentence label, any label.
        eos_label : hashable
            End-of-sentence label, any label. If set to the same label as
            bos_label, will just use one sentence-boundary label.
        IBOS and EOS labels are the same so using just one sentence boundary labelN)r4   r   r9   r   r   )r   r   r   r   r   r   add_bos_eos  s   


zTextEncoder.add_bos_eosr   c                 C   sl   ||krt d | || n| || |du r(t d | ||d  n| || || _|| _dS )a  Insert sentence boundary markers in the label set.

        If the beginning-of-sentence and end-of-sentence markers
        are the same, will just use one sentence-boundary label.

        Arguments
        ---------
        bos_label : hashable
            Beginning-of-sentence label, any label
        eos_label : hashable
            End-of-sentence label, any label. If set to the same label as
            bos_label, will just use one sentence-boundary label.
        bos_index : int
            Where to insert bos_label. eos_index = bos_index + 1
        eos_index : optional, int
            Where to insert eos_label. Default: eos_index = bos_index + 1
        r   Nz,EOS label not specified, using BOS label + 1r1   )r4   r   rP   r   r   )r   r   r   r   r   r   r   r   r     s   

zTextEncoder.insert_bos_eosc                 C      t | ds	td| | jS )(Returns the index to which blank encodesr   zBOS label is not set!)ra   r   rd   r   r   r   r   r   get_bos_index     
zTextEncoder.get_bos_indexc                 C   r   )r   r   zEOS label is not set!)ra   r   rd   r   r   r   r   r   get_eos_index  r   zTextEncoder.get_eos_indexc                 C   s"   t | ds	td| jgt| S )z/Returns a list version of x, with BOS prependedr   *BOS label has not been added to label set!)ra   rL   r   listr   ru   r   r   r   prepend_bos_label     
zTextEncoder.prepend_bos_labelc                 C   sT   t | ds	tdt|rt| j| j g}t||gS | j| j gt| S )znReturns a list version of x, with BOS index prepended.
        If the input is a tensor, a tensor is returned.r   r   )	ra   rL   re   	is_tensorTensorr
   r   catr   )r   ru   bos_indr   r   r   prepend_bos_index     

zTextEncoder.prepend_bos_indexc                 C   s"   t | ds	tdt|| jg S )z/Returns a list version of x, with EOS appended.r   *EOS label has not been added to label set!)ra   rL   r   r   r   r   r   r   append_eos_label  r   zTextEncoder.append_eos_labelc                 C   sT   t | ds	tdt|rt| j| j g}t||gS t|| j| j g S )zmReturns a list version of x, with EOS index appended.
        If the input is a tensor, a tensor is returned.r   r   )	ra   rL   re   r   r   r
   r   r   r   )r   ru   eos_indr   r   r   append_eos_index  r   zTextEncoder.append_eos_indexc                    s6   t   }t| dr| j|d< t| dr| j|d< |S Nr   r   )r   r{   ra   r   r   r   r   r   r   r{     s   




zTextEncoder._get_extrasc                    s8   t  | d|v r|d | _d|v r|d | _d S d S r   )r   r   r   r   r   r   r   r   r     s   
zTextEncoder._set_extrasr   )TNr1   )r3   r   r   r   r   r'   r0   r?   DEFAULT_BOSDEFAULT_EOSr   r   r   r   r   r   r   r   r{   r   __classcell__r   r   r   r   r   3  s.    A

#
(

r   c                       sn   e Zd ZdZ fddZefddZedfddZd	d
 ZdddZ	dddZ
 fddZ fddZ  ZS )CTCTextEncoderaf  Subclass of TextEncoder which also provides methods to handle CTC blank token.

    add_blank and insert_blank can be used to add <blank> special token to the encoder
    state.

    >>> from speechbrain.dataio.encoder import CTCTextEncoder
    >>> chars = ["a", "b", "c", "d"]
    >>> encoder = CTCTextEncoder()
    >>> encoder.update_from_iterable(chars)
    >>> encoder.add_blank()
    >>> encoder.expect_len(5)
    >>> encoder.encode_sequence(chars)
    [0, 1, 2, 3]
    >>> encoder.get_blank_index()
    4
    >>> encoder.decode_ndim([0, 1, 2, 3, 4])
    ['a', 'b', 'c', 'd', '<blank>']

    collapse_labels and collapse_indices_ndim can be used to apply CTC collapsing
    rules:
    >>> encoder.collapse_labels(["a", "a", "b", "c", "d"])
    ['a', 'b', 'c', 'd']
    >>> encoder.collapse_indices_ndim([4, 4, 0, 1, 2, 3, 4, 4]) # 4 is <blank>
    [0, 1, 2, 3]
    c                    s(   d|v r| j |d d t | dS )z&Handles special labels such as blanks.blank_label)rN   N)insert_blankr   r   r   r   r   r   r   9  s   z$CTCTextEncoder.handle_special_labelsc                 C   s   |  | || _dS )zAdd blank symbol to labelset.N)r9   r   )r   r   r   r   r   	add_blankF  s   

zCTCTextEncoder.add_blankr   c                 C   s   |  || || _dS )z(Insert blank symbol at a given labelset.N)rP   r   )r   r   rN   r   r   r   r   K  s   
zCTCTextEncoder.insert_blankc                 C   r   )z)Returns the index to which blank encodes.r   zBlank label is not set!)ra   r   rd   r   r   r   r   r   get_blank_indexP  r   zCTCTextEncoder.get_blank_indexTc                    s@   t  ds	td|r fddtD S  fddD S )a  Applies the CTC collapsing rules on one label sequence.

        Arguments
        ---------
        x : iterable
            Label sequence on which to operate.
        merge_repeats : bool
            Whether to merge repeated labels before removing blanks.
            In the basic CTC label topology, repeated labels are merged.
            However, in RNN-T, they are not.

        Returns
        -------
        list
            List of labels with collapsing rules applied.
        r   Blank label has not been addedc                    s6   g | ]\}}|d ks||d  kr| j kr|qS r   r1   r   )r(   rV   r&   r   r   r   rl   l  s
    "z2CTCTextEncoder.collapse_labels.<locals>.<listcomp>c                    s   g | ]	}| j kr|qS r   r   rj   r   r   r   rl   r  s    )ra   rL   	enumerate)r   ru   merge_repeatsr   r   r   collapse_labelsV  s   
zCTCTextEncoder.collapse_labelsc              	      s   t | ds	tdg }D ]}z|| || W q ty$   Y  nw |S | j| j  |r; fddtD S  fddD S )a  Applies the CTC collapsing rules on arbitrarily label sequence.

        Arguments
        ---------
        x : iterable
            Label sequence on which to operate.
        merge_repeats : bool
            Whether to merge repeated labels before removing blanks.
            In the basic CTC label topology, repeated labels are merged.
            However, in RNN-T, they are not.

        Returns
        -------
        list
            List of labels with collapsing rules applied.
        r   r   c                    s4   g | ]\}}|d ks||d  kr| kr|qS r   r   )r(   rV   rN   blank_indexru   r   r   rl     s
     z8CTCTextEncoder.collapse_indices_ndim.<locals>.<listcomp>c                    s   g | ]}| kr|qS r   r   )r(   rN   )r   r   r   rl     rm   )ra   rL   rs   collapse_indices_ndimrz   r
   r   r   )r   ru   r   	collapsedrx   r   r   r   r   t  s$   

z$CTCTextEncoder.collapse_indices_ndimc                    s"   t   }t| dr| j|d< |S Nr   )r   r{   ra   r   r   r   r   r   r{     s   


zCTCTextEncoder._get_extrasc                    s&   t  | d|v r|d | _d S d S r   )r   r   r   r   r   r   r   r     s   zCTCTextEncoder._set_extrasr   )r3   r   r   r   r   DEFAULT_BLANKr   r   r   r   r   r{   r   r   r   r   r   r   r     s    

*r   c                 C   s   t  }||  t|j S )a  Loads the encoder tokens from a pretrained model.

    This method is useful when you used with a pretrained HF model.
    It will load the tokens in the yaml and then you will be able
    to instantiate any CTCBaseSearcher directly in the YAML file.

    Arguments
    ---------
    model_path : str, Path
        Path to the pretrained model.

    Returns
    -------
    list
        List of tokens.
    )r   r   r   r
   rZ   )
model_pathlabel_encoderr   r   r   load_text_encoder_tokens  s   
r   )r   r   r6   r   re   speechbrainr@   speechbrain.utils.checkpointsr   r   r   speechbrain.utils.loggerr   r3   r4   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s4           l 