o
    ½e¦i#W  ã                   @   s<   d Z ddlZddlmZ G dd„ dejƒZG dd„ dƒZdS )zDTools for homograph disambiguation
Authors
 * Artem Ploujnikov 2021
é    N)Únnc                       sd   e Zd ZdZd‡ fdd„	Zedd„ ƒZejdd„ ƒZedd	„ ƒZejd
d	„ ƒZ		ddd„Z	‡  Z
S )ÚSubsequenceLossa	  
    A loss function for a specific word in the output, used in
    the homograph disambiguation task
    The approach is as follows:
    1. Arrange only the target words from the original batch into a
    single tensor
    2. Find the word index of each target word
    3. Compute the beginnings and endings of words in the predicted
    sequences. The assumption is that the model has been trained well
    enough to identify word boundaries with a simple argmax without
    having to perform a beam search.
    Important! This loss can be used for fine-tuning only
    The model is expected to be able to already be able
    to correctly predict word boundaries

    Arguments
    ---------
    seq_cost: callable
        the loss to be used on the extracted subsequences
    word_separator: int
        the index of the "space" character (in phonemes)
    word_separator_base: str
        the index of word separators used in unprocessed
        targets (if different, used with tokenizations)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.g2p.homograph import SubsequenceLoss
    >>> from speechbrain.nnet.losses import nll_loss
    >>> loss = SubsequenceLoss(
    ...     seq_cost=nll_loss
    ... )
    >>> phns = torch.Tensor(
    ...     [[1, 2, 0, 1, 3, 0, 2, 1, 0],
    ...      [2, 1, 3, 0, 1, 2, 0, 3, 2]]
    ... )
    >>> phn_lens = torch.IntTensor([8, 9])
    >>> subsequence_phn_start = torch.IntTensor([3, 4])
    >>> subsequence_phn_end = torch.IntTensor([5, 7])
    >>> p_seq = torch.Tensor([
    ...     [[0., 1., 0., 0.],
    ...      [0., 0., 1., 0.],
    ...      [1., 0., 0., 0.],
    ...      [0., 1., 0., 0.],
    ...      [0., 0., 0., 1.],
    ...      [1., 0., 0., 0.],
    ...      [0., 0., 1., 0.],
    ...      [0., 1., 0., 0.],
    ...      [1., 0., 0., 0.]],
    ...     [[0., 0., 1., 0.],
    ...      [0., 1., 0., 0.],
    ...      [0., 0., 0., 1.],
    ...      [1., 0., 0., 0.],
    ...      [0., 1., 0., 0.],
    ...      [0., 0., 1., 0.],
    ...      [1., 0., 0., 0.],
    ...      [0., 0., 0., 1.],
    ...      [0., 0., 1., 0.]]
    ... ])
    >>> loss_value = loss(
    ...    phns,
    ...    phn_lens,
    ...    p_seq,
    ...    subsequence_phn_start,
    ...    subsequence_phn_end
    ... )
    >>> loss_value
    tensor(-0.8000)
    r   c                    s    t ƒ  ¡  || _t||ƒ| _d S ©N)ÚsuperÚ__init__Úseq_costÚSubsequenceExtractorÚ_subsequence_extractor)Úselfr   Úword_separatorÚword_separator_base©Ú	__class__© úd/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/lobes/models/g2p/homograph.pyr   R   s
   

ÿzSubsequenceLoss.__init__c                 C   ó   | j jS ©z/
        The word separator being used
        ©r	   r   ©r
   r   r   r   r   Y   ó   zSubsequenceLoss.word_separatorc                 C   ó   || j _dS )z)
        Sets the word separator
        Nr   ©r
   Úvaluer   r   r   r   `   ó   c                 C   r   r   ©r	   r   r   r   r   r   r   g   r   z#SubsequenceLoss.word_separator_basec                 C   r   )z.
        Sets the base word separator
        Nr   r   r   r   r   r   n   r   Nc              	   C   s*   |   |||||||¡\}}	}
|  ||	|
¡S )aR  
        Evaluates the subsequence loss

        Arguments
        ---------
        phns: torch.Tensor
            the phoneme tensor (batch x length)
        phn_lens: torch.Tensor
            the phoneme length tensor
        p_seq: torch.Tensor
            the output phoneme probability tensor
            (batch x length x phns)
        subsequence_phn_start: torch.Tensor
            the beginning of the target subsequence
            (i.e. the homograph)
        subsequence_phn_end: torch.Tensor
            the end of the target subsequence
            (i.e. the homograph)
        phns_base: torch.Tensor
            the phoneme tensor (not preprocessed)
        phn_lens_base: torch.Tensor
            the phoneme lengths (not preprocessed)

        Returns
        -------
        loss: torch.Tensor
            the loss tensor
        )r	   r   )r
   ÚphnsÚphn_lensÚp_seqÚsubsequence_phn_startÚsubsequence_phn_endÚ	phns_baseÚphn_lens_baseÚp_seq_subsequenceÚphns_subsequenceÚsubsequence_lengthsr   r   r   Úforwardu   s    *ùü
ÿzSubsequenceLoss.forward)r   r   ©NN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   Úpropertyr   Úsetterr   r%   Ú__classcell__r   r   r   r   r   
   s    G



ør   c                   @   s|   e Zd ZdZddd„Zdd„ Z		ddd	„Zd
d„ Zdd„ Zdd„ Z		ddd„Z
	ddd„Zdd„ Z	ddd„Zdd„ ZdS )r   a  
    A utility class to help extract subsequences out of a batch
    of sequences

    Arguments
    ---------
    word_separator: int
        the index of the word separator (used in p_seq)
    word_separator_base: int
        the index of word separators used in unprocessed
        targets (if different)

    Example
    -------
    >>> import torch
    >>> from speechbrain.lobes.models.g2p.homograph import SubsequenceExtractor
    >>> extractor = SubsequenceExtractor()
    >>> phns = torch.Tensor(
    ...     [[1, 2, 0, 1, 3, 0, 2, 1, 0],
    ...      [2, 1, 3, 0, 1, 2, 0, 3, 2]]
    ... )
    >>> phn_lens = torch.IntTensor([8, 9])
    >>> subsequence_phn_start = torch.IntTensor([3, 4])
    >>> subsequence_phn_end = torch.IntTensor([5, 7])
    >>> p_seq = torch.Tensor([
    ...     [[0., 1., 0., 0.],
    ...      [0., 0., 1., 0.],
    ...      [1., 0., 0., 0.],
    ...      [0., 1., 0., 0.],
    ...      [0., 0., 0., 1.],
    ...      [1., 0., 0., 0.],
    ...      [0., 0., 1., 0.],
    ...      [0., 1., 0., 0.],
    ...      [1., 0., 0., 0.]],
    ...     [[0., 0., 1., 0.],
    ...      [0., 1., 0., 0.],
    ...      [0., 0., 0., 1.],
    ...      [1., 0., 0., 0.],
    ...      [0., 1., 0., 0.],
    ...      [0., 0., 1., 0.],
    ...      [1., 0., 0., 0.],
    ...      [0., 0., 0., 1.],
    ...      [0., 0., 1., 0.]]
    ... ])
    >>> extractor.extract_seq(
    ...    phns,
    ...    phn_lens,
    ...    p_seq,
    ...    subsequence_phn_start,
    ...    subsequence_phn_end
    ... )
    (tensor([[[0., 1., 0., 0.],
             [0., 0., 0., 1.],
             [0., 0., 0., 0.]],
    <BLANKLINE>
            [[0., 1., 0., 0.],
             [0., 0., 1., 0.],
             [0., 0., 0., 0.]]]), tensor([[1., 3., 0.],
            [1., 2., 0.]]), tensor([0.6667, 1.0000]))
    r   Nc                 C   s   || _ |d u r	|}|| _d S r   )r   r   )r
   r   r   r   r   r   r   ë   s   
zSubsequenceExtractor.__init__c                 O   s   | j |i |¤ŽS r   )Úextract_seq)r
   ÚargsÚkwargsr   r   r   Ú__call__ñ   s   zSubsequenceExtractor.__call__c                 C   sn  d}|du r|du r|}|}n|du s|du rt dƒ‚d}| d¡}	| d¡|  ¡  d¡}
|| }| ¡ }|  ||¡}|  ||¡}| d¡}tjj 	|ddd|f¡}| d¡}tj
| d¡|jd |¡}tj
||jd | d¡|¡}| j|||| j|d	}|r|  ||||
¡\}}n||k||| k @ }||  |j¡}d
||| d¡k< |  ||||	¡}|||| fS )ag  
        Extracts the subsequence from the complete sequence

        Arguments
        ---------
        phns: torch.Tensor
            the phoneme tensor (batch x length)
        phn_lens: torch.Tensor
            the phoneme length tensor
        p_seq: torch.Tensor
            the output phoneme probability tensor
            (batch x length x phns)
        subsequence_phn_start: torch.Tensor
            the beginning of the target subsequence
            (i.e. the homograph)
        subsequence_phn_end: torch.Tensor
            the end of the target subsequence
            (i.e. the homograph)
        phns_base: torch.Tensor
            the phoneme tensor (not preprocessed)
        phn_base_lens: torch.Tensor
            the phoneme lengths (not preprocessed)

        Returns
        -------
        p_seq_subsequence: torch.Tensor
            the output subsequence (of probabilities)
        phns_subsequence: torch.Tensor
            the target subsequence
        subsequence_lengths: torch.Tensor
            subsequence lengths, expressed as a fraction
            of the tensor's last dimension

        FNzDphn_base and phn_lens_base, if provided, should be provided togetherTé   éÿÿÿÿr   ©Údevice)r   ç        )Ú
ValueErrorÚsizeÚlongÚ	unsqueezeÚmaxÚ_pad_subsequenceÚtorchr   Ú
functionalÚpadÚaranger5   Ú	expand_asÚexpandÚ_get_target_word_indexesr   Ú_get_phns_subsequenceÚreshapeÚshapeÚ_get_p_seq_subsequence)r
   r   r   r   r   r   r    Úphn_base_lensÚhas_baseÚ
p_seq_edgeÚ	phns_edger$   Úlongest_subsequenceÚ	p_seq_padÚsubsequence_phn_start_unsqÚrange_phns_baseÚrange_phns_subsequenceÚtarget_word_indexesr#   Úmatchr"   r   r   r   r.   ô   sp   ,ÿ


ÿþÿþû
ÿÿÿþÿÿýz SubsequenceExtractor.extract_seqc                 C   s    |dkrt jj |d|f¡}|S )aa  Pads a subsequence to the length of the longest subsequence

        Arguments
        ---------
        sequence: torch.Tensor
            the sequence to be padded
        longest_subsequence: int
            the length of the longest subsequence

        Returns
        -------
        sequence: torch.Tensor
            The padded sequence
        r   )r=   r   r>   r?   )r
   ÚsequencerL   r   r   r   r<   g  s
   ÿz%SubsequenceExtractor._pad_subsequencec                 C   sÈ   |   |||¡\}}| d¡}| d¡}tj| d¡|jd d¡ |¡}	|	|k|	|| k @ }
||
  | d¡|¡}tj| d¡|jd d¡ |¡}d|||| k< t || t 	| d¡¡¡}||fS )aû  Extracts a subsequence

        Arguments
        ---------
        phns: torch.Tensor
            a tensor of phoneme indexes
        target_word_indexes: torch.Tensor
            a tensor of word indexes to extract, zero-based
            (e.g.) torch.IntTensor([2, 3])  means extracting
            the third word from the first sample and the
            fourth word from the second sample
        longest_subsequence: int
            the length of the longest subsequence
        edge: int
            the index of the "edge" of the sequence

        Returns
        -------
        phn_subsequence: torch.Tensor
            a tensor with only the target words
        subsequence_lengths: torch.Tensor
            the lengths of the extracted words
        r3   r2   r4   r   r6   )
Ú_get_word_boundariesr:   r=   r@   r8   r5   rA   ÚviewÚminimumÚtensor)r
   r   rQ   rL   ÚedgeÚ
word_startÚword_endÚword_start_unsqÚword_end_unsqÚ
phns_rangeÚ	phn_matchr#   Úphns_subsequence_ranger$   r   r   r   rD   |  s:   ÿ

ý
ÿ
ÿÿû	þ
ÿÿz*SubsequenceExtractor._get_phns_subsequencec                 C   sÈ   |   |||¡\}}tj| d¡|jd d¡ d¡ |¡}| d¡ d¡}| d¡ d¡}	||k||| k @ }
||
  | d¡|| d¡¡}tj| d¡|jd d¡ d¡ |¡}d|||	| k< |S )a:  Extracts a subsequence out of a tensor of probabilities

        Arguments
        ---------
        p_seq: torch.Tensor
            a tensor of phoneme probabilities
            (batch x sequence index x phoneme index)
        target_word_indexes: torch.Tensor
            a tensor of word indexes to extract, zero-based
            (e.g.) torch.IntTensor([2, 3])  means extracting
            the third word from the first sample and the
            fourth word from the second sample
        longest_subsequence: int
            the length of the longest subsequence
        edge: int
            the index of the "edge" of the sequence

        Returns
        -------
        p_seq_subsequence: torch.Tensor
            a probability tensor composed of the phoneme
            probabilities for target words only
        r2   r4   r   r3   r6   )rT   r=   r@   r8   r5   r:   rA   rU   )r
   r   rQ   rL   rX   rY   rZ   Úp_seq_ranger[   r\   r^   r"   Úp_seq_subsequence_ranger   r   r   rG   ¶  s8   ÿü
ÿÿÿú
þ
ÿz+SubsequenceExtractor._get_p_seq_subsequencec           	      C   sL   |dur||  d¡| d¡  ¡ knd}||k ||k|B @ }|jdd}|S )a%  Computes the target word indexes

        Arguments
        ---------
        phns: torch.Tensor
            a phoneme batch tensor
        range_phns: torch.Tensor
            a range tensor over thephoneme sequence
        start: torch.Tensor
            the beginning of the subsequence
        word_separator: int
            the word separator being used
        phn_lens: torch.Tensor
            Lengths corresponding to input phns

        Returns
        -------
        word_indexes: torch.Tensor
            the word index tensor
        Nr3   r2   F©Údim)r:   r8   r9   Úsum)	r
   r   Ú
range_phnsÚstartr   r   Úend_of_sequenceÚword_boundariesÚword_indexesr   r   r   rC   ï  s   ÿý
ÿz-SubsequenceExtractor._get_target_word_indexesc                 C   sš   |du r| j }| ¡ dkr| d¡n|}tj| d¡|jd |¡}||k||kB }|jdd}|| 	d¡k}	|  
|	|tj|¡}
|  
|	|tjd¡}|
|fS )a&  Determines the word boundaries for the specified
        word indexes within a sequence

        Arguments
        ---------
        seq: torch.Tensor
            a sequence (phonemes or graphemes)
        word_indexes: torch.Tensor
            the word indexes
        edge: int
            a tensor indicating the last position
        word_separator: int
            the word separator token

        Returns
        -------
        start: torch.Tensor
            word start indexes
        end: torch.Tensor
            word end indexes
        Né   r3   r4   rb   r   )r   rc   Úargmaxr=   r@   r8   r5   rA   Úcumsumr:   Ú_get_positionsÚminr;   )r
   Úseqri   rX   r   ÚtokensÚwords_rangerh   ÚwordsÚindex_matchrf   Úendr   r   r   rT     s   ÿþz)SubsequenceExtractor._get_word_boundariesc                 C   s2   t  |||¡}||ddj}t  |dkd|d ¡S )aÛ  A helper method to calculate start or end positions corresponding
        to specific words

        Arguments
        ---------
        index_match: torch.Tensor
            a mask where positions matching the word index are
            indicated as a 1 and the remaining positions are 0
        words_range: torch.Tensor
            a range tensor over the tokens
        aggregation: callable
            the aggregation to use (torch.min or torch.max)
        no_match_value: int
            the value to output if no match is found (this could
            happen when searching in model outputs rather than
            in source data)

        Returns
        -------
        Start or end positions of specific words.
        r3   rb   r   r2   )r=   ÚwhereÚvalues)r
   rs   rq   ÚaggregationÚno_match_valueÚ	positionsr   r   r   rm   ?  s   z#SubsequenceExtractor._get_positionsFc           	         sn   t j| d¡|jd |¡}ˆ  ||| d¡|rˆ jnˆ j¡}‡ fdd„|D ƒ}‡ fdd„t	|||ƒD ƒ}|S )aà  Extracts a subsequence from hypotheses (e.g. the result of a beam
        search) based on a reference sequence, which can be either a sequence of phonemes (the target during training)

        Arguments
        ---------
        ref_seq: torch.Tensor
            a reference sequence (e.g. phoneme targets)
        hyps: list
            a batch of hypotheses, a list of list of
            integer indices (usually of phonemes)
        subsequence_phn_start: torch.Tensor
            the index of the beginning of the subsequence to
        use_base: bool
            whether to use the raw (token) space for word separators

        Returns
        -------
        result: torch.Tensor
            The extracted subsequence.
        r2   r4   r3   c                    s.   g | ]}d g‡ fdd„t |ƒD ƒ dg ‘qS )r3   c                    s   g | ]\}}|ˆ j kr|‘qS r   )r   )Ú.0ÚidxÚphnr   r   r   Ú
<listcomp>}  s
    
þz@SubsequenceExtractor.extract_hyps.<locals>.<listcomp>.<listcomp>N)Ú	enumerate)rz   Ú	item_hypsr   r   r   r}   {  s    ù
þÿúÿz5SubsequenceExtractor.extract_hyps.<locals>.<listcomp>c                    s    g | ]\}}}ˆ   |||¡‘qS r   )Ú_extract_hyp_word)rz   r   Úitem_separator_indexesÚ
word_indexr   r   r   r}   …  s    ýÿÿ)
r=   r@   r8   r5   rA   rC   r:   r   r   Úzip)	r
   Úref_seqÚhypsr   Úuse_basere   rQ   Úseparator_indexesÚresultr   r   r   Úextract_hyps[  s(   ÿþü
ø

ÿüz!SubsequenceExtractor.extract_hypsc                 C   sL   |t |ƒk r"|| }|du rdS |d7 }||d  }|||… }|S g }|S )a‘  Extracts a single word out of a hypothesis sequence

        Arguments
        ---------
        hyps: list
            a hypotheses list (or tensor)
        separator_indexes: torch.Tensor
            a tensor of word separators
        word_index: int
            the index of the word to eb retrieved

        Returns
        -------
        result: list|str
            the extracted word
        NÚ r2   )Úlen)r
   r…   r‡   r‚   ÚleftÚrightrˆ   r   r   r   r€     s   ÿz&SubsequenceExtractor._extract_hyp_word)r   Nr&   r   )F)r'   r(   r)   r*   r   r1   r.   r<   rD   rG   rC   rT   rm   r‰   r€   r   r   r   r   r   ­   s$    
=

øs::
ÿ#
ÿ.
ÿ4r   )r*   r=   r   ÚModuler   r   r   r   r   r   Ú<module>   s     $