o
    GÿÆiôû  ã                   @   s<  d dl Z d dlZd dlmZ d dlmZmZmZ ddlmZ d?dedee defd	d
„Z			d@dededede
dedee deeef fdd„Z					dAdededede
dee dededee dedeeeeeeef f f fdd„Z		d@dedede
dedee deeef fdd„Z				dBdedede
dee dededee defdd„Zd ejdejfd!d"„Zd#ejd$e
dejfd%d&„Zd'ejd(ejdejd$e
dejf
d)d*„Zd'ejd(ejdejd$e
dejf
d+d,„Zdejdejd-ejdeejejf fd.d/„Zd0ejd1ejfd2d3„Z	dCdeded-ede
dededeeef fd4d5„Z				dBdeded-ede
dedededee defd6d7„Z	8	8		dDdededede
d9ed:edee dedeeef fd;d<„Z	8	8					dEdededede
d9ed:edee dededee dedeeeeeef f ef fd=d>„ZdS )Fé    N)ÚTensor)ÚOptionalÚTupleÚUnioné   )Úmutual_information_recursionÚpxÚboundaryÚreturnc                 C   sP   |du r| S | j \}}}|dd…df  |dd¡ |||¡}| jd|tdƒdS )aV  
    Insert -inf's into `px` in appropriate places if `boundary` is not
    None.  If boundary == None and rnnt_type == "regular", px[:,:,-1] will
    be -infinity, but if boundary is specified, we need px[b,:,boundary[b,3]]
    to be -infinity.

     Args:
          px: a Tensor of of shape [B][S][T+1] (this function is only
              called if rnnt_type == "regular", see other docs for `rnnt_type`)
              px is modified in-place and returned.
           boundary: None, or a Tensor of shape [B][3] containing
              [s_begin, t_begin, s_end, t_end]; we need only t_end.
    Né   r   é   ú-inf)ÚdimÚindexÚvalue)ÚshapeÚreshapeÚexpandÚscatter_Úfloat)r   r	   ÚBÚSÚT1© r   ú@/home/ubuntu/.local/lib/python3.10/site-packages/k2/rnnt_loss.pyÚfix_for_boundary   s
   $r   ÚregularÚlmÚamÚsymbolsÚtermination_symbolÚ	rnnt_typec              	   C   sÂ  | j dks
J | jƒ‚|j dksJ |jƒ‚| jd |jd ks&J | j|jfƒ‚| jd |jd ks8J | j|jfƒ‚|j\}}}| jd d }	|j||	fksTJ |j||	fƒ‚|	dks\J |	ƒ‚||	ksfJ ||	fƒ‚|dv snJ |ƒ‚tj|ddd\}
}tj| ddd\}}||
  ¡ }| |  ¡ }t || dd¡¡t |j¡j	  
¡ }|| |
 dd¡ }tj| d¡ ||	||¡d| ||	dd¡ ||	|d¡d d	¡}|d
krætj|tj||	dftdƒ|j|jdfdd}tj| dd…d|	…f d| d	¡d}|| }|dd…dd…d|…f  |dd…d|	…dd…f 8  < |dd…dd…|f  d¡}| dd…dd…|f  d¡}|| | }|d
krIt||ƒ}||fS |dkr]||dd…dd…dd…f 7 }||fS )aû  
    Reduces RNN-T problem (the simple case, where joiner network is just
    addition), to a compact, standard form that can then be given
    (with boundaries) to mutual_information_recursion().
    This function is called from rnnt_loss_simple(), but may be useful for
    other purposes.

    Args:
      lm:
        Language model part of un-normalized logprobs of symbols, to be added to
        acoustic model part before normalizing.  Of shape::

           [B][S+1][C]

        where B is the batch size, S is the maximum sequence length of
        the symbol sequence, possibly including the EOS symbol; and
        C is size of the symbol vocabulary, including the termination/next-frame
        symbol.
        Conceptually, lm[b][s] is a vector of length [C] representing the
        "language model" part of the un-normalized logprobs of symbols,
        given all symbols *earlier than* s in the sequence.  The reason
        we still need this for position S is that we may still be emitting
        the termination/next-frame symbol at this point.
      am:
        Acoustic-model part of un-normalized logprobs of symbols, to be added
        to language-model part before normalizing.  Of shape::

           [B][T][C]

        where B is the batch size, T is the maximum sequence length of
        the acoustic sequences (in frames); and C is size of the symbol
        vocabulary, including the termination/next-frame symbol.  It reflects
        the "acoustic" part of the probability of any given symbol appearing
        next on this frame.
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
        (px, py) (the names are quite arbitrary).
           px: logprobs, of shape [B][S][T+1] if rnnt_type is regular,
                                  [B][S][T] if rnnt_type is not regular.
           py: logprobs, of shape [B][S+1][T]

      in the recursion::

          p[b,0,0] = 0.0
          if rnnt_type == "regular":
             p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                                p[b,s,t-1] + py[b,s,t-1])
          if rnnt_type != "regular":
             p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                                p[b,s,t-1] + py[b,s,t-1])
          .. where p[b][s][t] is the "joint score" of the pair of subsequences
          of length s and t respectively.  px[b][s][t] represents the
          probability of extending the subsequences of length (s,t) by one in
          the s direction, given the particular symbol, and py[b][s][t]
          represents the probability of extending the subsequences of length
          (s,t) by one in the t direction,
          i.e. of emitting the termination/next-frame symbol.

          if rnnt_type == "regular", px[:,:,T] equals -infinity, meaning on the
          "one-past-the-last" frame we cannot emit any symbols.
          This is simply a way of incorporating
          the probability of the termination symbol on the last frame.
    r   r   r   r   ©r   ÚmodifiedÚconstrainedT©r   Úkeepdim©r   r   éÿÿÿÿr   r   ©ÚdeviceÚdtype©r   Nr$   )Úndimr   ÚtorchÚmaxÚexpÚmatmulÚ	transposeÚfinfor+   ÚtinyÚlogÚgatherÚ	unsqueezer   r   ÚsqueezeÚcatÚfullr   r*   r   )r   r   r   r    r!   r	   r   ÚTÚCr   Úam_maxÚ_Úlm_maxÚam_probsÚlm_probsÚnormalizersÚpx_amÚpx_lmr   Úpy_amÚpy_lmÚpyr   r   r   Úget_rnnt_logprobs.   sn   ^$$ÿýýÿüüþ	öÿ8


ýrH   ç        ÚmeanFÚdelay_penaltyÚ	reductionÚreturn_gradc	                 C   sF  t | |||||d\}	}
|dkrd|	j\}}}|dkr|n|d }|du r8tj|d d |	j|	jd |dd¡}n|dd…df d d }| |dd¡tj||	jd	 dd|¡ }|| }|	| 	|	j¡7 }	t
|	|
||d
}|rr|d n|}|dkr|| }n|dkr‡t |¡ }n|dkr’t |¡ }ntd|› ƒ‚|r¡||d fS |S )a¤  A simple case of the RNN-T loss, where the 'joiner' network is just
    addition.

    Args:
      lm:
        language-model part of unnormalized log-probs of symbols, with shape
        (B, S+1, C), i.e. batch, symbol_seq_len+1, num_classes
      am:
        acoustic-model part of unnormalized log-probs of symbols, with shape
        (B, T, C), i.e. batch, frame, num_classes
      symbols:
        the symbol sequences, a LongTensor of shape [B][S], and elements in
        {0..C-1}.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`
      return_grad:
        Whether to return grads of px and py, this grad standing for the
        occupation probability is the output of the backward with a
        `fake gradient`, the `fake gradient` is the same as the gradient you'd
        get if you did `torch.autograd.grad((-loss.sum()), [px, py])`, note, the
        loss here is the loss with reduction "none".
        This is useful to implement the pruned version of rnnt loss.
    Returns:
       If return_grad is False, returns a tensor of shape (B,), containing the
       total RNN-T loss values for each element of the batch if reduction equals
       to "none", otherwise a scalar with the reduction applied.
       If return_grad is True, the grads of px and py, which is the output of
       backward with a `fake gradient`(see above), will be returned too. And the
       returned value will be a tuple like (loss, (px_grad, py_grad)).
    )r   r   r   r    r	   r!   rI   r   r   Nr   ©r+   r*   r   ©r*   ©r   rG   r	   rM   r   ÚnonerJ   Úsumú5reduction should be ('none' | 'mean' | 'sum'), given )rH   r   r.   Útensorr+   r*   r   r   ÚarangeÚtor   rJ   rR   Ú
ValueError)r   r   r   r    r	   r!   rK   rL   rM   r   rG   r   r   ÚT0r;   ÚoffsetÚpenaltyÚscores_and_gradsÚnegated_lossÚlossr   r   r   Úrnnt_loss_simpleÔ   sR   F
ú	
ý
üÿ
þÿÿr^   Úlogitsc              	   C   s°  | j dks
J | jƒ‚| j\}}}}|d }	|j||	fks$J |j||	fƒ‚|	dks,J |	ƒ‚||	ks6J ||	fƒ‚|dv s>J |ƒ‚tj| dd}
|
 d¡}
tj| d| |d|	d¡ |||	d¡d d¡}| d¡}|d	krtj	|tj
||	dftd
ƒ|j|jdfdd}|dd…dd…d|…f  |
dd…d|	…dd…f 8  < | dd…dd…dd…|f  d¡ ¡ }||
8 }|d	krÁt||ƒ}||fS |dkrÔ||dd…dd…dd…f 7 }||fS )añ  Reduces RNN-T problem to a compact, standard form that can then be given
    (with boundaries) to mutual_information_recursion().
    This function is called from rnnt_loss().

    Args:
      logits:
        The output of joiner network, with shape (B, T, S + 1, C),
        i.e. batch, time_seq_len, symbol_seq_len+1, num_classes
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
      (px, py) (the names are quite arbitrary)::

          px: logprobs, of shape [B][S][T+1] if rnnt_type is regular,
                                 [B][S][T] if rnnt_type is not regular.
          py: logprobs, of shape [B][S+1][T]

      in the recursion::

         p[b,0,0] = 0.0
         if rnnt_type == "regular":
            p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                               p[b,s,t-1] + py[b,s,t-1])
         if rnnt_type != "regular":
            p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                               p[b,s,t-1] + py[b,s,t-1])

      .. where p[b][s][t] is the "joint score" of the pair of subsequences of
      length s and t respectively.  px[b][s][t] represents the probability of
      extending the subsequences of length (s,t) by one in the s direction,
      given the particular symbol, and py[b][s][t] represents the probability
      of extending the subsequences of length (s,t) by one in the t direction,
      i.e. of emitting the termination/next-frame symbol.

      if `rnnt_type == "regular"`, px[:,:,T] equals -infinity, meaning on the
      "one-past-the-last" frame we cannot emit any symbols.
      This is simply a way of incorporating
      the probability of the termination symbol on the last frame.
    é   r   r"   r   r,   ©r   r   r   r'   r(   r   r   r)   r   Nr$   )r-   r   r.   Ú	logsumexpÚpermuter6   r   r   r8   r9   r:   r   r*   r+   Úcloner   )r_   r   r    r!   r	   r   r;   ÚS1r<   r   rB   r   rG   r   r   r   Úget_rnnt_logprobs_jointE  sD   D
ÿþ
ÿþù8
$ÿ
ýrf   c                 C   s  t | ||||d\}}|dkrc|j\}	}
}|dkr|n|d }|du r7tj|d d |j|jd |	dd¡}n|dd…df d d }| |	dd¡tj||jd	 dd|¡ }|| }|| 	|j¡7 }t
|||d
}|dkrq| S |dkr{t |¡ S |dkr…t |¡ S td|› ƒ‚)a_	  A normal RNN-T loss, which uses a 'joiner' network output as input,
    i.e. a 4 dimensions tensor.

    Args:
      logits:
        The output of joiner network, with shape (B, T, S + 1, C),
        i.e. batch, time_seq_len, symbol_seq_len+1, num_classes
      symbols:
        The symbol sequences, a LongTensor of shape [B][S], and elements
        in {0..C-1}.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T] if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`

    Returns:
      If recursion is `none`, returns a tensor of shape (B,), containing the
      total RNN-T loss values for each element of the batch, otherwise a scalar
      with the reduction applied.
    )r_   r   r    r	   r!   rI   r   r   Nr   rN   r   rO   ©r   rG   r	   rQ   rJ   rR   rS   )rf   r   r.   rT   r+   r*   r   r   rU   rV   r   rJ   rR   rW   )r_   r   r    r	   r!   rK   rL   r   rG   r   r   rX   r;   rY   rZ   r\   r   r   r   Ú	rnnt_loss³  sH   7
û
ý
üÿ
þÿrh   Úxc                 C   s2   t j| dd} t j| dd\} }t j| dd} | S )a   Compute a monotonically increasing lower bound of the tensor `x` on the
    last dimension. The basic idea is: we traverse the tensor in reverse order,
    and update current element with the following statement,

        min_value = min(x[i], min_value)
        x[i] = min_value

    >>> import torch
    >>> x = torch.tensor([0, 2, 1, 3, 6, 5, 8], dtype=torch.int32)
    >>> _monotonic_lower_bound(x)
    tensor([0, 1, 1, 3, 5, 5, 8], dtype=torch.int32)
    >>> x
    tensor([0, 2, 1, 3, 6, 5, 8], dtype=torch.int32)
    >>> x = torch.randint(20, (3, 6), dtype=torch.int32)
    >>> x
    tensor([[12, 18,  5,  4, 18, 17],
            [11, 14, 14,  3, 10,  4],
            [19,  3,  8, 13,  7, 19]], dtype=torch.int32)
    >>> _monotonic_lower_bound(x)
    tensor([[ 4,  4,  4,  4, 17, 17],
            [ 3,  3,  3,  3,  4,  4],
            [ 3,  3,  7,  7,  7, 19]], dtype=torch.int32)
    Args:
      x:
        The source tensor.

    Returns:
      Returns a tensor which is monotonic on the last dimension
      (i.e. satisfiy `x[i] <= x[i+1]`).
    )r(   )Údimsr(   r,   )r.   ÚflipÚcummin)ri   r>   r   r   r   Ú_monotonic_lower_bound  s   rm   Ús_beginÚs_rangec                 C   sl   | j \}}t| ƒ} | |d tjd|| jd   } t| ƒ} tj| dd} | |d tjd|| jd   } | S )a|  Adjust s_begin (pruning lower bounds) to make it satisfy the following
    constraints

      - monotonic increasing, i.e. s_begin[i] <= s_begin[i + 1]
      - start with symbol 0 at first frame.
      - s_begin[i + 1] - s_begin[i] < s_range, which means that we can't skip
        any symbols.

    To make it monotonic increasing, we can use `_monotonic_lower_bound` above,
    which guarantees `s_begin[i] <= s_begin[i + 1]`. The main idea is:
    traverse the array in reverse order and update the elements by
    `min_value = min(a_begin[i], min_value)`.

    The method we used to realize `s_begin[i + 1] - s_begin[i] < s_range`
    constraint is a little tricky. We first transform `s_begin` with
    `s_begin = -(s_begin - (s_range - 1) * torch.arange(0,T))`
    then we make the transformed `s_begin` monotonic increasing, after that,
    we transform back `s_begin` with the same formula as the previous
    transformation. The idea is: if we want to make
    `s_begin[i + 1] - s_begin[i] < s_range` we only need to make
    `-(s_begin[i] - i * (s_range - 1))` a non-decreasing array. Proof:

      -(s_begin[i] - i * (s_range - 1)) <= -(s_begin[i + 1] - (i + 1) * (s_range - 1))
                            -s_begin[i] <= -s_begin[i + 1] + (i + 1) * (s_range - 1) - i * (s_range - 1)
                            -s_begin[i] <= -s_begin[i + 1] + s_range - 1
            s_begin[i + 1] - s_begin[i] <= s_range - 1
            s_begin[i + 1] - s_begin[i] < s_range

    The above transformation can not guarantee the start symbol to be 0, so we
    have to make all the elements that less than 0 to be 0 before transforming
    back the `s_begin`.
    r   r   rO   ©Úmin)r   rm   r.   rU   r*   Úclamp)rn   ro   r   r;   r   r   r   Ú_adjust_pruning_lower_bound5  s   
$ÿÿrs   Úpx_gradÚpy_gradc                 C   sJ  | j \}}}|j d }|||d fv sJ ||fƒ‚|d }|j |||fks.J |j |||fƒ‚|j |dfks<J |j |fƒ‚|dksDJ |ƒ‚||ksNJ ||fƒ‚||krV|d }||krf|dkseJ d|› ƒ‚n|dksqJ d|› ƒ‚| ¡ \}	}
}t |||| d ||f|	|
|
|f¡}tj|dd}tj|d|f| j| jd}tj|| fdd	}||d
d
…d
|| d …d
|…f  }tj	|dd}tj
d|| jd d|¡ ||¡}||d
d
…df  |d¡d k }|d
d
…df  |d¡| d }tj|dd}t |||¡}t|||kr
dn|ƒ}| ||df¡ |||f¡tj
|| jd }|S )á	  Get the pruning ranges of normal rnnt loss according to the grads
    of px and py returned by mutual_information_recursion.

    For each sequence with T frames, we will generate a tensor with the shape of
    (T, s_range) containing the information that which symbols will be token
    into consideration for each frame. For example, here is a sequence with 10
    frames and the corresponding symbols are `[A B C D E F]`, if the s_range
    equals 3, one possible ranges tensor will be::

      [[0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2], [1, 2, 3],
       [1, 2, 3], [1, 2, 3], [3, 4, 5], [3, 4, 5], [3, 4, 5]]

    which means we only consider `[A B C]` at frame 0, 1, 2, 3, and `[B C D]`
    at frame 4, 5, 6, `[D E F]` at frame 7, 8, 9.

    We can only consider limited number of symbols because frames and symbols
    are monotonic aligned, theoretically it can only generate particular range
    of symbols given a particular frame.

    Note:
      For the generated tensor ranges (assuming batch size is 1), ranges[:, 0]
      is a monotonic increasing tensor from 0 to `len(symbols) - s_range` and
      it satisfies `ranges[t+1, 0] - ranges[t, 0] < s_range` which means we
      won't skip any symbols.

    Args:
      px_grad:
        The gradient of px, see docs in `mutual_information_recursion` for more
        details of px.
      py_grad:
        The gradient of py, see docs in `mutual_information_recursion` for more
        details of py.
      boundary:
        a LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame]
      s_range:
        How many symbols to keep for each frame.
    Returns:
      A tensor with the shape of (B, T, s_range) containing the indexes of the
      kept symbols for each frame.
    r(   r   r`   ú~Pruning range for modified RNN-T should be equal to or greater
        than 1, or no valid paths could survive pruning. Given r   ú~Pruning range for standard RNN-T should be equal to or greater
        than 2, or no valid paths could survive pruning. Given )ÚaxisrN   r,   Nr   rO   r   rp   )r   Ústrider.   Ú
as_stridedrR   Úzerosr+   r*   r9   ÚargmaxrU   r   r   rr   Úwherers   )rt   ru   r	   ro   r   r   r   r;   re   ÚB_strideÚS_strideÚT_strideÚblk_gradÚblk_sum_gradÚpx_padÚpx_grad_padÚ
final_gradrn   ÚmaskÚs_begin_paddingÚrangesr   r   r   Úget_rnnt_prune_rangesm  sT   /
"
ÿÿ
ÿÿ
ý&"  ÿrŠ   c                 C   sž  | j \}}}|j d }|||d fv sJ ||fƒ‚|j ||d |fks,J |j |||fƒ‚|j |dfks:J |j |fƒ‚|dksBJ |ƒ‚||ksLJ ||fƒ‚||krT|d }||krd|dkscJ d|› ƒ‚n|dksoJ d|› ƒ‚tj|d|f| j| jd}tj||d df|j|jd}	||kr‘|ntj||	fdd}
tj| |fdd|
 }tjtj|d|f|j|jd|fdd}tj|dd}|d	d	…|d	…d	d	…f |d	d	…d
| …d	d	…f  }tj|dd}|d	d	…d	|…f }tjd
|| jd 	d|¡ 
||¡}||d	d	…df  	|d¡d k }|d	d	…df  	|d¡| d }tj|d
d}t |||¡}t|||kr4dn|ƒ}| 	||df¡ 
|||f¡tj|| jd }|S )rv   r(   r   r`   rw   r   rx   rN   r,   Nr   rO   r   rp   )r   r.   r|   r+   r*   r9   Úcumsumr}   rU   r   r   rr   r~   rs   )rt   ru   r	   ro   r   r   r   r;   r„   Úpy_padÚpy_grad_paddedÚtot_gradÚ	diff_gradrn   r‡   rˆ   r‰   r   r   r   Ú get_rnnt_prune_ranges_deprecatedî  sb   /
&
ÿÿ
ÿÿÿÿÿüù	6"  ÿr   r‰   c              	   C   s  |j d | j d ksJ |j | j fƒ‚|j d |j d ks$J |j |j fƒ‚| j d |j d ks6J | j |j fƒ‚|j \}}}|j \}}}| j d }| j |||fksXJ | j |||fƒ‚|d }	|  d¡ ||||f¡}
tj| d¡ |||	d |f¡d| |||df¡ ||||f¡d}|
|fS )a@  Prune the output of encoder(am) and prediction network(lm) with ranges
    generated by `get_rnnt_prune_ranges`.

    Args:
      am:
        The encoder output, with shape (B, T, encoder_dim)
      lm:
        The prediction network output, with shape (B, S + 1, decoder_dim)
      ranges:
        A tensor containing the symbol indexes for each frame that we want to
        keep. Its shape is (B, T, s_range), see the docs in
        `get_rnnt_prune_ranges` for more details of this tensor.

    Returns:
      Return the pruned am and lm with shape (B, T, s_range, C)
    r   r   r(   r   r'   )r   r7   r   r.   r6   r   )r   r   r‰   r   r;   ro   re   Údecoder_dimÚencoder_dimr   Ú	am_prunedÚ	lm_prunedr   r   r   Údo_rnnt_pruningn  s"   $$$
"
ÿýr•   ÚsrcÚshiftsc                 C   s”   |   ¡ dksJ | jƒ‚| j\}}}|j||fks J |j||fƒ‚tj|| jd d|f¡ |df¡ |ddf¡}|| ||d¡ | }t | d|¡S )a:  Roll tensor with different shifts for each row.

    Note:
      We assume the src is a 3 dimensions tensor and roll the last dimension.

    Example:

      >>> src = torch.arange(15).reshape((1,3,5))
      >>> src
      tensor([[[ 0,  1,  2,  3,  4],
               [ 5,  6,  7,  8,  9],
               [10, 11, 12, 13, 14]]])
      >>> shift = torch.tensor([[1, 2, 3]])
      >>> shift
      tensor([[1, 2, 3]])
      >>> _roll_by_shifts(src, shift)
      tensor([[[ 4,  0,  1,  2,  3],
               [ 8,  9,  5,  6,  7],
               [12, 13, 14, 10, 11]]])
    r   rO   r   r   )	r   r   r.   rU   r*   ÚviewÚrepeatr   r6   )r–   r—   r   r;   r   r   r   r   r   Ú_roll_by_shifts›  s   

ürš   c              	   C   sŠ  | j dks
J | jƒ‚| j\}}}}	|j|||fks"J |j|||fƒ‚|j\}}
|
dks/J |
ƒ‚||
ks9J ||
fƒ‚|dv sAJ |ƒ‚tj| dd}tj|tj|g| tj|jd |df¡fdd}tj	| 
d¡ |||
d f¡d|d}tj	| d| |||d¡d d	¡}|| }tj|tj|||
d | ftd
ƒ|j|jdfdd}t||dd…dd…df ƒdd…dd…d|
…f }| d¡}|dkrÝtj|tj||
dftd
ƒ|j|jdfdd}| dd…dd…dd…|f  ¡ }|| }tj|tj|||
d | ftd
ƒ|j|jdfdd}t||dd…dd…df ƒ}| d¡}|dkr-t||ƒ}||fS |dkrA||dd…dd…dd…f 7 }||fS )a  Construct px, py for mutual_information_recursion with pruned output.

    Args:
      logits:
        The pruned output of joiner network, with shape (B, T, s_range, C)
      symbols:
        The symbol sequences, a LongTensor of shape [B][S], and elements in
        {0..C-1}.
      ranges:
        A tensor containing the symbol ids for each frame that we want to keep.
        It is a LongTensor of shape ``[B][T][s_range]``, where ``ranges[b,t,0]``
        contains the begin symbol ``0 <= s <= S - s_range + 1``, such that
        ``logits[b,t,:,:]`` represents the logits with positions
        ``s, s + 1, ... s + s_range - 1``.
        See docs in :func:`get_rnnt_prune_ranges` for more details of what
        ranges contains.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame whether emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
      (px, py) (the names are quite arbitrary)::

          px: logprobs, of shape [B][S][T+1] if rnnt_type is regular,
                                 [B][S][T] if rnnt_type is not regular.
          py: logprobs, of shape [B][S+1][T]

      in the recursion::

         p[b,0,0] = 0.0
         if rnnt_type == "regular":
            p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                               p[b,s,t-1] + py[b,s,t-1])
         if rnnt_type != "regular":
            p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                               p[b,s,t-1] + py[b,s,t-1])

      .. where p[b][s][t] is the "joint score" of the pair of subsequences of
      length s and t respectively.  px[b][s][t] represents the probability of
      extending the subsequences of length (s,t) by one in the s direction,
      given the particular symbol, and py[b][s][t] represents the probability
      of extending the subsequences of length (s,t) by one in the t direction,
      i.e. of emitting the termination/next-frame symbol.

      if `rnnt_type == "regular"`, px[:,:,T] equals -infinity, meaning on the
      "one-past-the-last" frame we cannot emit any symbols.
      This is simply a way of incorporating
      the probability of the termination symbol on the last frame.
    r`   r   r"   r   r,   rN   r   r'   r(   r   r)   Nr   ra   r   r$   )r-   r   r.   rb   r9   rT   Úint64r*   r   r6   r7   r   r8   r:   r   r+   rš   rc   rd   r   )r_   r   r‰   r    r	   r!   r   r;   ro   r<   r   rB   Úsymbols_with_terminalÚpruned_symbolsr   rG   r   r   r   Úget_rnnt_logprobs_pruned¾  s’   M"
ý
ú÷ýÿþüþ	ö2
ÿþù 
üþ	ö



ýrž   c                 C   s  t | |||||d\}}	|dkrd|j\}
}}|dkr|n|d }|du r8tj|d d |j|jd |
dd¡}n|dd…df d d }| |
dd¡tj||jd	 dd|¡ }|| }|| 	|j¡7 }t
||	|d
}|dkrr| S |dkr|t |¡ S |dkr†t |¡ S td|› ƒ‚)až  A RNN-T loss with pruning, which uses the output of a pruned 'joiner'
    network as input, i.e. a 4 dimensions tensor with shape (B, T, s_range, C),
    s_range means the number of symbols kept for each frame.

    Args:
      logits:
        The pruned output of joiner network, with shape (B, T, s_range, C),
        i.e. batch, time_seq_len, prune_range, num_classes
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      ranges:
        A tensor containing the symbol ids for each frame that we want to keep.
        It is a LongTensor of shape ``[B][T][s_range]``, where ``ranges[b,t,0]``
        contains the begin symbol ``0 <= s <= S - s_range +1``, such that
        ``logits[b,t,:,:]`` represents the logits with positions
        ``s, s + 1, ... s + s_range - 1``.
        See docs in :func:`get_rnnt_prune_ranges` for more details of what ranges
        contains.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      boundary:
        a LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T] if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`
    Returns:
      If reduction is `none`, returns a tensor of shape (B,), containing the
      total RNN-T loss values for each sequence of the batch, otherwise a scalar
      with the reduction applied.
    )r_   r   r‰   r    r	   r!   rI   r   r   Nr   rN   r   rO   rg   rQ   rJ   rR   rS   )rž   r   r.   rT   r+   r*   r   r   rU   rV   r   rJ   rR   rW   )r_   r   r‰   r    r	   r!   rK   rL   r   rG   r   r   rX   r;   rY   rZ   r\   r   r   r   Úrnnt_loss_prunedj  sJ   @
ú	
ý
üÿ
þÿrŸ   çš™™™™™¹?Úlm_only_scaleÚam_only_scalec           $   	   C   s$  | j dks
J | jƒ‚|j dksJ |jƒ‚| jd |jd ks&J | j|jfƒ‚| jd |jd ks8J | j|jfƒ‚|j\}}	}
| jd d }|j||fksTJ |j||fƒ‚|dks\J |ƒ‚|	|ksfJ |	|fƒ‚|dv snJ |ƒ‚tj|ddd\}}tj| ddd\}}||  ¡ }| |  ¡ }t || dd¡¡t |j¡j	  
¡ }|jddd}tj|| dddt |j¡j	 }t | d	|
¡| |
¡¡ ||	d¡ 
¡ | }| dd¡}| 
¡ }| 
¡ | }|| | dd¡ }tj| d¡ |||	|
¡d| ||dd¡ |||	d¡d
 d	¡}|dkr%tj|tj||dftdƒ|j|jdfdd}tj| dd…d|…f d| d	¡d
}tj| |||
¡d| d	¡d
}|| }|dd…dd…d|	…f  |dd…d|…dd…f 8  < || }|dd…dd…d|	…f  |8  < ||dd…d|…dd…f  }|dd…dd…|f  d¡}| dd…dd…|f  d¡}|| | }|d d | }|| | }|| } d| | }!|dkrÍd}|dkrÔd}||! ||  ||  }"||! | |  ||  }#|dkrút|"|ƒ}"|"|#fS |dkr|"|#dd…dd…dd…f 7 }"|"|#fS )a  
    Reduces RNN-T problem (the simple case, where joiner network is just
    addition), to a compact, standard form that can then be given
    (with boundaries) to mutual_information_recursion().
    This version allows you to make the loss-function one of the form::

          lm_only_scale * lm_probs +
          am_only_scale * am_probs +
          (1-lm_only_scale-am_only_scale) * combined_probs

    where lm_probs and am_probs are the probabilities given the lm and acoustic
    model independently.

    This function is called from
    :func:`rnnt_loss_smoothed`, but may be useful for other purposes.

    Args:
      lm:
        Language model part of un-normalized logprobs of symbols, to be added to
        acoustic model part before normalizing.  Of shape::

           [B][S+1][C]

        where B is the batch size, S is the maximum sequence length of
        the symbol sequence, possibly including the EOS symbol; and
        C is size of the symbol vocabulary, including the termination/next-frame
        symbol.
        Conceptually, lm[b][s] is a vector of length [C] representing the
        "language model" part of the un-normalized logprobs of symbols,
        given all symbols *earlier than* s in the sequence.  The reason
        we still need this for position S is that we may still be emitting
        the termination/next-frame symbol at this point.
      am:
        Acoustic-model part of un-normalized logprobs of symbols, to be added
        to language-model part before normalizing.  Of shape::

           [B][T][C]

        where B is the batch size, T is the maximum sequence length of
        the acoustic sequences (in frames); and C is size of the symbol
        vocabulary, including the termination/next-frame symbol.  It reflects
        the "acoustic" part of the probability of any given symbol appearing
        next on this frame.
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      lm_only_scale:
        the scale on the "LM-only" part of the loss.
      am_only_scale:
        the scale on the "AM-only" part of the loss, for which we use
        an "averaged" LM (averaged over all histories, so effectively unigram).
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
        (px, py) (the names are quite arbitrary).
           px: logprobs, of shape [B][S][T+1] if rnnt_type == "regular",
                                  [B][S][T] if rnnt_type != "regular".
           py: logprobs, of shape [B][S+1][T]

        in the recursion::

          p[b,0,0] = 0.0
          if rnnt_type == "regular":
             p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                                p[b,s,t-1] + py[b,s,t-1])
          if rnnt_type != "regular":
             p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                                p[b,s,t-1] + py[b,s,t-1])
          .. where p[b][s][t] is the "joint score" of the pair of subsequences
          of length s and t respectively.  px[b][s][t] represents the
          probability of extending the subsequences of length (s,t) by one in
          the s direction, given the particular symbol, and py[b][s][t]
          represents the probability of extending the subsequences of length
          (s,t) by one in the t direction,
          i.e. of emitting the termination/next-frame symbol.

          px[:,:,T] equals -infinity, meaning on the "one-past-the-last" frame
          we cannot emit any symbols.  This is simply a way of incorporating
          the probability of the termination symbol on the last frame.
    r   r   r   r   r"   Tr%   )r   r   r(   r'   r   r   r)   r,   Ng      ð?rI   g#B’¡œÇ;r$   )r-   r   r.   r/   r0   r1   r2   r3   r+   r4   r5   rR   rJ   Úmvr   r6   r7   r   r8   r9   r:   r   r*   r   )$r   r   r   r    r¡   r¢   r	   r!   r   r;   r<   r   r=   r>   r?   r@   rA   rB   Úlmonly_normalizersÚ
unigram_lmÚamonly_normalizersrC   rD   Úpx_lm_unigramr   Ú	px_amonlyÚ	px_lmonlyrE   rF   rG   Úpy_lm_unigramÚ	py_amonlyÚ	py_lmonlyÚcombined_scaleÚ	px_interpÚ	py_interpr   r   r   Úget_rnnt_logprobs_smoothedÑ  sÆ   m$$	ÿýÿÿÿ
ýÿ
ÿýÿü
üþ	öÿÿ8ÿ"

ÿþÿÿþÿ


ýr°   c              
   C   sJ  t | |||||||d\}}|dkrf|j\}}}|dkr|n|d }|du r:tj|d d |j|jd |dd¡}n|dd…df d d }| |dd¡tj||jd	 dd|¡ }|| }|| 	|j¡7 }t
||||
d
}|
rt|d n|}|	dkr~| }n|	dkr‰t |¡ }n|	dkr”t |¡ }ntd|	› ƒ‚|
r£||d fS |S )a  A simple case of the RNN-T loss, where the 'joiner' network is just
    addition.

    Args:
      lm:
        language-model part of unnormalized log-probs of symbols, with shape
        (B, S+1, C), i.e. batch, symbol_seq_len+1, num_classes.
        These are assumed to be well-normalized, in the sense that we could
        use them as probabilities separately from the am scores
      am:
        acoustic-model part of unnormalized log-probs of symbols, with shape
        (B, T, C), i.e. batch, frame, num_classes
      symbols:
        the symbol sequences, a LongTensor of shape [B][S], and elements in
        {0..C-1}.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      lm_only_scale:
        the scale on the "LM-only" part of the loss.
      am_only_scale:
        the scale on the "AM-only" part of the loss, for which we use
        an "averaged" LM (averaged over all histories, so effectively unigram).
      boundary:
        a LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame whether emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`
      return_grad:
        Whether to return grads of px and py, this grad standing for the
        occupation probability is the output of the backward with a
        `fake gradient`, the `fake gradient` is the same as the gradient you'd
        get if you did `torch.autograd.grad((-loss.sum()), [px, py])`, note, the
        loss here is the loss with reduction "none".
        This is useful to implement the pruned version of rnnt loss.

    Returns:
       If return_grad is False, returns a tensor of shape (B,), containing the
       total RNN-T loss values for each element of the batch if reduction equals
       to "none", otherwise a scalar with the reduction applied.
       If return_grad is True, the grads of px and py, which is the output of
       backward with a `fake gradient`(see above), will be returned too. And the
       returned value will be a tuple like (loss, (px_grad, py_grad)).
    )r   r   r   r    r¡   r¢   r	   r!   rI   r   r   Nr   rN   r   rO   rP   r   rQ   rJ   rR   rS   )r°   r   r.   rT   r+   r*   r   r   rU   rV   r   rJ   rR   rW   )r   r   r   r    r¡   r¢   r	   r!   rK   rL   rM   r   rG   r   r   rX   r;   rY   rZ   r[   r\   r]   r   r   r   Úrnnt_loss_smoothed¾  sV   P
ø
ý
üÿ
þÿÿr±   )N)r   N)Nr   rI   rJ   F)Nr   rI   rJ   )r   )r    r    Nr   )r    r    Nr   rI   rJ   F)Úosr.   r   Útypingr   r   r   Úmutual_informationr   r   ÚintÚstrrH   r   Úboolr^   rf   rh   rm   rs   rŠ   r   r•   Ú
LongTensorrš   rž   rŸ   r°   r±   r   r   r   r   Ú<module>   sÀ  úÿþýüûú

ù ,÷ÿþýüûúùø	÷

öuûÿþýüû

úrùÿþýüûúù
ø]%ÿÿ
þ8ÿþýü
û ÿþýü
û ÿÿÿ
þ-)úÿþýüûú

ù 2øÿþýüûúùø	
÷løÿþýüûúùø
	
÷ sõÿþýüûúùø	÷
öõô