o
    Gi                     @   s<  d dl Z d dlZd dlmZ d dlmZmZmZ ddlmZ d?dedee defd	d
Z			d@dededede
dedee deeef fddZ					dAdededede
dee dededee dedeeeeeeef f f fddZ		d@dedede
dedee deeef fddZ				dBdedede
dee dededee defddZd ejdejfd!d"Zd#ejd$e
dejfd%d&Zd'ejd(ejdejd$e
dejf
d)d*Zd'ejd(ejdejd$e
dejf
d+d,Zdejdejd-ejdeejejf fd.d/Zd0ejd1ejfd2d3Z	dCdeded-ede
dededeeef fd4d5Z				dBdeded-ede
dedededee defd6d7Z	8	8		dDdededede
d9ed:edee dedeeef fd;d<Z	8	8					dEdededede
d9ed:edee dededee dedeeeeeef f ef fd=d>ZdS )F    N)Tensor)OptionalTupleUnion   )mutual_information_recursionpxboundaryreturnc                 C   sP   |du r| S | j \}}}|dddf |dd|||}| jd|tddS )aV  
    Insert -inf's into `px` in appropriate places if `boundary` is not
    None.  If boundary == None and rnnt_type == "regular", px[:,:,-1] will
    be -infinity, but if boundary is specified, we need px[b,:,boundary[b,3]]
    to be -infinity.

     Args:
          px: a Tensor of of shape [B][S][T+1] (this function is only
              called if rnnt_type == "regular", see other docs for `rnnt_type`)
              px is modified in-place and returned.
           boundary: None, or a Tensor of shape [B][3] containing
              [s_begin, t_begin, s_end, t_end]; we need only t_end.
    N   r      -inf)dimindexvalue)shapereshapeexpandscatter_float)r   r	   BST1 r   @/home/ubuntu/.local/lib/python3.10/site-packages/k2/rnnt_loss.pyfix_for_boundary   s
   $r   regularlmamsymbolstermination_symbol	rnnt_typec              	   C   s  | j dks
J | j|j dksJ |j| jd |jd ks&J | j|jf| jd |jd ks8J | j|jf|j\}}}| jd d }	|j||	fksTJ |j||	f|	dks\J |	||	ksfJ ||	f|dv snJ |tj|ddd\}
}tj| ddd\}}||
  }| |  }t||ddt|jj	 
 }|| |
dd }tj|d||	||d|||	dd||	|ddd	}|d
krtj|tj||	dftd|j|jdfdd}tj| ddd|	f d|d	d}|| }|ddddd|f  |ddd|	ddf 8  < |dddd|f d}| dddd|f d}|| | }|d
krIt||}||fS |dkr]||ddddddf 7 }||fS )a  
    Reduces RNN-T problem (the simple case, where joiner network is just
    addition), to a compact, standard form that can then be given
    (with boundaries) to mutual_information_recursion().
    This function is called from rnnt_loss_simple(), but may be useful for
    other purposes.

    Args:
      lm:
        Language model part of un-normalized logprobs of symbols, to be added to
        acoustic model part before normalizing.  Of shape::

           [B][S+1][C]

        where B is the batch size, S is the maximum sequence length of
        the symbol sequence, possibly including the EOS symbol; and
        C is size of the symbol vocabulary, including the termination/next-frame
        symbol.
        Conceptually, lm[b][s] is a vector of length [C] representing the
        "language model" part of the un-normalized logprobs of symbols,
        given all symbols *earlier than* s in the sequence.  The reason
        we still need this for position S is that we may still be emitting
        the termination/next-frame symbol at this point.
      am:
        Acoustic-model part of un-normalized logprobs of symbols, to be added
        to language-model part before normalizing.  Of shape::

           [B][T][C]

        where B is the batch size, T is the maximum sequence length of
        the acoustic sequences (in frames); and C is size of the symbol
        vocabulary, including the termination/next-frame symbol.  It reflects
        the "acoustic" part of the probability of any given symbol appearing
        next on this frame.
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
        (px, py) (the names are quite arbitrary).
           px: logprobs, of shape [B][S][T+1] if rnnt_type is regular,
                                  [B][S][T] if rnnt_type is not regular.
           py: logprobs, of shape [B][S+1][T]

      in the recursion::

          p[b,0,0] = 0.0
          if rnnt_type == "regular":
             p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                                p[b,s,t-1] + py[b,s,t-1])
          if rnnt_type != "regular":
             p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                                p[b,s,t-1] + py[b,s,t-1])
          .. where p[b][s][t] is the "joint score" of the pair of subsequences
          of length s and t respectively.  px[b][s][t] represents the
          probability of extending the subsequences of length (s,t) by one in
          the s direction, given the particular symbol, and py[b][s][t]
          represents the probability of extending the subsequences of length
          (s,t) by one in the t direction,
          i.e. of emitting the termination/next-frame symbol.

          if rnnt_type == "regular", px[:,:,T] equals -infinity, meaning on the
          "one-past-the-last" frame we cannot emit any symbols.
          This is simply a way of incorporating
          the probability of the termination symbol on the last frame.
    r   r   r   r   r   modifiedconstrainedTr   keepdimr   r   r   r   devicedtyper   Nr$   )ndimr   torchmaxexpmatmul	transposefinfor+   tinyloggather	unsqueezer   r   squeezecatfullr   r*   r   )r   r   r   r    r!   r	   r   TCr   am_max_lm_maxam_probslm_probsnormalizerspx_ampx_lmr   py_ampy_lmpyr   r   r   get_rnnt_logprobs.   sn   ^$$	8


rH           meanFdelay_penalty	reductionreturn_gradc	                 C   sF  t | |||||d\}	}
|dkrd|	j\}}}|dkr|n|d }|du r8tj|d d |	j|	jd|dd}n|dddf d d }||ddtj||	jd	dd| }|| }|	|	|	j7 }	t
|	|
||d
}|rr|d n|}|dkr|| }n|dkrt| }n|dkrt| }ntd| |r||d fS |S )a  A simple case of the RNN-T loss, where the 'joiner' network is just
    addition.

    Args:
      lm:
        language-model part of unnormalized log-probs of symbols, with shape
        (B, S+1, C), i.e. batch, symbol_seq_len+1, num_classes
      am:
        acoustic-model part of unnormalized log-probs of symbols, with shape
        (B, T, C), i.e. batch, frame, num_classes
      symbols:
        the symbol sequences, a LongTensor of shape [B][S], and elements in
        {0..C-1}.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`
      return_grad:
        Whether to return grads of px and py, this grad standing for the
        occupation probability is the output of the backward with a
        `fake gradient`, the `fake gradient` is the same as the gradient you'd
        get if you did `torch.autograd.grad((-loss.sum()), [px, py])`, note, the
        loss here is the loss with reduction "none".
        This is useful to implement the pruned version of rnnt loss.
    Returns:
       If return_grad is False, returns a tensor of shape (B,), containing the
       total RNN-T loss values for each element of the batch if reduction equals
       to "none", otherwise a scalar with the reduction applied.
       If return_grad is True, the grads of px and py, which is the output of
       backward with a `fake gradient`(see above), will be returned too. And the
       returned value will be a tuple like (loss, (px_grad, py_grad)).
    )r   r   r   r    r	   r!   rI   r   r   Nr   r+   r*   r   r*   r   rG   r	   rM   r   nonerJ   sum5reduction should be ('none' | 'mean' | 'sum'), given )rH   r   r.   tensorr+   r*   r   r   arangetor   rJ   rR   
ValueError)r   r   r   r    r	   r!   rK   rL   rM   r   rG   r   r   T0r;   offsetpenaltyscores_and_gradsnegated_losslossr   r   r   rnnt_loss_simple   sR   F
	


r^   logitsc              	   C   s  | j dks
J | j| j\}}}}|d }	|j||	fks$J |j||	f|	dks,J |	||	ks6J ||	f|dv s>J |tj| dd}
|
d}
tj| d||d|	d|||	ddd}|d}|d	krtj	|tj
||	dftd
|j|jdfdd}|ddddd|f  |
ddd|	ddf 8  < | dddddd|f d }||
8 }|d	krt||}||fS |dkr||ddddddf 7 }||fS )a  Reduces RNN-T problem to a compact, standard form that can then be given
    (with boundaries) to mutual_information_recursion().
    This function is called from rnnt_loss().

    Args:
      logits:
        The output of joiner network, with shape (B, T, S + 1, C),
        i.e. batch, time_seq_len, symbol_seq_len+1, num_classes
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
      (px, py) (the names are quite arbitrary)::

          px: logprobs, of shape [B][S][T+1] if rnnt_type is regular,
                                 [B][S][T] if rnnt_type is not regular.
          py: logprobs, of shape [B][S+1][T]

      in the recursion::

         p[b,0,0] = 0.0
         if rnnt_type == "regular":
            p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                               p[b,s,t-1] + py[b,s,t-1])
         if rnnt_type != "regular":
            p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                               p[b,s,t-1] + py[b,s,t-1])

      .. where p[b][s][t] is the "joint score" of the pair of subsequences of
      length s and t respectively.  px[b][s][t] represents the probability of
      extending the subsequences of length (s,t) by one in the s direction,
      given the particular symbol, and py[b][s][t] represents the probability
      of extending the subsequences of length (s,t) by one in the t direction,
      i.e. of emitting the termination/next-frame symbol.

      if `rnnt_type == "regular"`, px[:,:,T] equals -infinity, meaning on the
      "one-past-the-last" frame we cannot emit any symbols.
      This is simply a way of incorporating
      the probability of the termination symbol on the last frame.
       r   r"   r   r,   r   r   r   r'   r(   r   r   r)   r   Nr$   )r-   r   r.   	logsumexppermuter6   r   r   r8   r9   r:   r   r*   r+   cloner   )r_   r   r    r!   r	   r   r;   S1r<   r   rB   r   rG   r   r   r   get_rnnt_logprobs_jointE  sD   D

8
$
rf   c                 C   s  t | ||||d\}}|dkrc|j\}	}
}|dkr|n|d }|du r7tj|d d |j|jd|	dd}n|dddf d d }||	ddtj||jd	dd| }|| }||	|j7 }t
|||d
}|dkrq| S |dkr{t| S |dkrt| S td| )a_	  A normal RNN-T loss, which uses a 'joiner' network output as input,
    i.e. a 4 dimensions tensor.

    Args:
      logits:
        The output of joiner network, with shape (B, T, S + 1, C),
        i.e. batch, time_seq_len, symbol_seq_len+1, num_classes
      symbols:
        The symbol sequences, a LongTensor of shape [B][S], and elements
        in {0..C-1}.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T] if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`

    Returns:
      If recursion is `none`, returns a tensor of shape (B,), containing the
      total RNN-T loss values for each element of the batch, otherwise a scalar
      with the reduction applied.
    )r_   r   r    r	   r!   rI   r   r   Nr   rN   r   rO   r   rG   r	   rQ   rJ   rR   rS   )rf   r   r.   rT   r+   r*   r   r   rU   rV   r   rJ   rR   rW   )r_   r   r    r	   r!   rK   rL   r   rG   r   r   rX   r;   rY   rZ   r\   r   r   r   	rnnt_loss  sH   7



rh   xc                 C   s2   t j| dd} t j| dd\} }t j| dd} | S )a   Compute a monotonically increasing lower bound of the tensor `x` on the
    last dimension. The basic idea is: we traverse the tensor in reverse order,
    and update current element with the following statement,

        min_value = min(x[i], min_value)
        x[i] = min_value

    >>> import torch
    >>> x = torch.tensor([0, 2, 1, 3, 6, 5, 8], dtype=torch.int32)
    >>> _monotonic_lower_bound(x)
    tensor([0, 1, 1, 3, 5, 5, 8], dtype=torch.int32)
    >>> x
    tensor([0, 2, 1, 3, 6, 5, 8], dtype=torch.int32)
    >>> x = torch.randint(20, (3, 6), dtype=torch.int32)
    >>> x
    tensor([[12, 18,  5,  4, 18, 17],
            [11, 14, 14,  3, 10,  4],
            [19,  3,  8, 13,  7, 19]], dtype=torch.int32)
    >>> _monotonic_lower_bound(x)
    tensor([[ 4,  4,  4,  4, 17, 17],
            [ 3,  3,  3,  3,  4,  4],
            [ 3,  3,  7,  7,  7, 19]], dtype=torch.int32)
    Args:
      x:
        The source tensor.

    Returns:
      Returns a tensor which is monotonic on the last dimension
      (i.e. satisfiy `x[i] <= x[i+1]`).
    )r(   )dimsr(   r,   )r.   flipcummin)ri   r>   r   r   r   _monotonic_lower_bound  s   rm   s_begins_rangec                 C   sl   | j \}}t| } | |d tjd|| jd   } t| } tj| dd} | |d tjd|| jd   } | S )a|  Adjust s_begin (pruning lower bounds) to make it satisfy the following
    constraints

      - monotonic increasing, i.e. s_begin[i] <= s_begin[i + 1]
      - start with symbol 0 at first frame.
      - s_begin[i + 1] - s_begin[i] < s_range, which means that we can't skip
        any symbols.

    To make it monotonic increasing, we can use `_monotonic_lower_bound` above,
    which guarantees `s_begin[i] <= s_begin[i + 1]`. The main idea is:
    traverse the array in reverse order and update the elements by
    `min_value = min(a_begin[i], min_value)`.

    The method we used to realize `s_begin[i + 1] - s_begin[i] < s_range`
    constraint is a little tricky. We first transform `s_begin` with
    `s_begin = -(s_begin - (s_range - 1) * torch.arange(0,T))`
    then we make the transformed `s_begin` monotonic increasing, after that,
    we transform back `s_begin` with the same formula as the previous
    transformation. The idea is: if we want to make
    `s_begin[i + 1] - s_begin[i] < s_range` we only need to make
    `-(s_begin[i] - i * (s_range - 1))` a non-decreasing array. Proof:

      -(s_begin[i] - i * (s_range - 1)) <= -(s_begin[i + 1] - (i + 1) * (s_range - 1))
                            -s_begin[i] <= -s_begin[i + 1] + (i + 1) * (s_range - 1) - i * (s_range - 1)
                            -s_begin[i] <= -s_begin[i + 1] + s_range - 1
            s_begin[i + 1] - s_begin[i] <= s_range - 1
            s_begin[i + 1] - s_begin[i] < s_range

    The above transformation can not guarantee the start symbol to be 0, so we
    have to make all the elements that less than 0 to be 0 before transforming
    back the `s_begin`.
    r   r   rO   min)r   rm   r.   rU   r*   clamp)rn   ro   r   r;   r   r   r   _adjust_pruning_lower_bound5  s   
$rs   px_gradpy_gradc                 C   sJ  | j \}}}|j d }|||d fv sJ ||f|d }|j |||fks.J |j |||f|j |dfks<J |j |f|dksDJ |||ksNJ ||f||krV|d }||krf|dkseJ d| n|dksqJ d| | \}	}
}t|||| d ||f|	|
|
|f}tj|dd}tj|d|f| j| jd}tj|| fdd	}||d
d
d
|| d d
|f  }tj	|dd}tj
d|| jdd|||}||d
d
df |dd k }|d
d
df |d| d }tj|dd}t|||}t|||kr
dn|}|||df|||ftj
|| jd }|S )	  Get the pruning ranges of normal rnnt loss according to the grads
    of px and py returned by mutual_information_recursion.

    For each sequence with T frames, we will generate a tensor with the shape of
    (T, s_range) containing the information that which symbols will be token
    into consideration for each frame. For example, here is a sequence with 10
    frames and the corresponding symbols are `[A B C D E F]`, if the s_range
    equals 3, one possible ranges tensor will be::

      [[0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2], [1, 2, 3],
       [1, 2, 3], [1, 2, 3], [3, 4, 5], [3, 4, 5], [3, 4, 5]]

    which means we only consider `[A B C]` at frame 0, 1, 2, 3, and `[B C D]`
    at frame 4, 5, 6, `[D E F]` at frame 7, 8, 9.

    We can only consider limited number of symbols because frames and symbols
    are monotonic aligned, theoretically it can only generate particular range
    of symbols given a particular frame.

    Note:
      For the generated tensor ranges (assuming batch size is 1), ranges[:, 0]
      is a monotonic increasing tensor from 0 to `len(symbols) - s_range` and
      it satisfies `ranges[t+1, 0] - ranges[t, 0] < s_range` which means we
      won't skip any symbols.

    Args:
      px_grad:
        The gradient of px, see docs in `mutual_information_recursion` for more
        details of px.
      py_grad:
        The gradient of py, see docs in `mutual_information_recursion` for more
        details of py.
      boundary:
        a LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame]
      s_range:
        How many symbols to keep for each frame.
    Returns:
      A tensor with the shape of (B, T, s_range) containing the indexes of the
      kept symbols for each frame.
    r(   r   r`   ~Pruning range for modified RNN-T should be equal to or greater
        than 1, or no valid paths could survive pruning. Given r   ~Pruning range for standard RNN-T should be equal to or greater
        than 2, or no valid paths could survive pruning. Given )axisrN   r,   Nr   rO   r   rp   )r   strider.   
as_stridedrR   zerosr+   r*   r9   argmaxrU   r   r   rr   wherers   )rt   ru   r	   ro   r   r   r   r;   re   B_strideS_strideT_strideblk_gradblk_sum_gradpx_padpx_grad_pad
final_gradrn   masks_begin_paddingrangesr   r   r   get_rnnt_prune_rangesm  sT   /
"


&"  r   c                 C   s  | j \}}}|j d }|||d fv sJ ||f|j ||d |fks,J |j |||f|j |dfks:J |j |f|dksBJ |||ksLJ ||f||krT|d }||krd|dkscJ d| n|dksoJ d| tj|d|f| j| jd}tj||d df|j|jd}	||kr|ntj||	fdd}
tj| |fdd|
 }tjtj|d|f|j|jd|fdd}tj|dd}|d	d	|d	d	d	f |d	d	d
| d	d	f  }tj|dd}|d	d	d	|f }tjd
|| jd	d|
||}||d	d	df 	|dd k }|d	d	df 	|d| d }tj|d
d}t|||}t|||kr4dn|}|	||df
|||ftj|| jd }|S )rv   r(   r   r`   rw   r   rx   rN   r,   Nr   rO   r   rp   )r   r.   r|   r+   r*   r9   cumsumr}   rU   r   r   rr   r~   rs   )rt   ru   r	   ro   r   r   r   r;   r   py_padpy_grad_paddedtot_grad	diff_gradrn   r   r   r   r   r   r    get_rnnt_prune_ranges_deprecated  sb   /
&

	6"  r   r   c              	   C   s  |j d | j d ksJ |j | j f|j d |j d ks$J |j |j f| j d |j d ks6J | j |j f|j \}}}|j \}}}| j d }| j |||fksXJ | j |||f|d }	| d||||f}
tj|d|||	d |fd||||df||||fd}|
|fS )a@  Prune the output of encoder(am) and prediction network(lm) with ranges
    generated by `get_rnnt_prune_ranges`.

    Args:
      am:
        The encoder output, with shape (B, T, encoder_dim)
      lm:
        The prediction network output, with shape (B, S + 1, decoder_dim)
      ranges:
        A tensor containing the symbol indexes for each frame that we want to
        keep. Its shape is (B, T, s_range), see the docs in
        `get_rnnt_prune_ranges` for more details of this tensor.

    Returns:
      Return the pruned am and lm with shape (B, T, s_range, C)
    r   r   r(   r   r'   )r   r7   r   r.   r6   r   )r   r   r   r   r;   ro   re   decoder_dimencoder_dimr   	am_pruned	lm_prunedr   r   r   do_rnnt_pruningn  s"   $$$
"
r   srcshiftsc                 C   s   |   dksJ | j| j\}}}|j||fks J |j||ftj|| jdd|f|df|ddf}||||d | }t| d|S )a:  Roll tensor with different shifts for each row.

    Note:
      We assume the src is a 3 dimensions tensor and roll the last dimension.

    Example:

      >>> src = torch.arange(15).reshape((1,3,5))
      >>> src
      tensor([[[ 0,  1,  2,  3,  4],
               [ 5,  6,  7,  8,  9],
               [10, 11, 12, 13, 14]]])
      >>> shift = torch.tensor([[1, 2, 3]])
      >>> shift
      tensor([[1, 2, 3]])
      >>> _roll_by_shifts(src, shift)
      tensor([[[ 4,  0,  1,  2,  3],
               [ 8,  9,  5,  6,  7],
               [12, 13, 14, 10, 11]]])
    r   rO   r   r   )	r   r   r.   rU   r*   viewrepeatr   r6   )r   r   r   r;   r   r   r   r   r   _roll_by_shifts  s   

r   c              	   C   s  | j dks
J | j| j\}}}}	|j|||fks"J |j|||f|j\}}
|
dks/J |
||
ks9J ||
f|dv sAJ |tj| dd}tj|tj|g| tj|jd|dffdd}tj	|
d|||
d fd|d}tj	| d||||ddd	}|| }tj|tj|||
d | ftd
|j|jdfdd}t||dddddf ddddd|
f }|d}|dkrtj|tj||
dftd
|j|jdfdd}| dddddd|f  }|| }tj|tj|||
d | ftd
|j|jdfdd}t||dddddf }|d}|dkr-t||}||fS |dkrA||ddddddf 7 }||fS )a  Construct px, py for mutual_information_recursion with pruned output.

    Args:
      logits:
        The pruned output of joiner network, with shape (B, T, s_range, C)
      symbols:
        The symbol sequences, a LongTensor of shape [B][S], and elements in
        {0..C-1}.
      ranges:
        A tensor containing the symbol ids for each frame that we want to keep.
        It is a LongTensor of shape ``[B][T][s_range]``, where ``ranges[b,t,0]``
        contains the begin symbol ``0 <= s <= S - s_range + 1``, such that
        ``logits[b,t,:,:]`` represents the logits with positions
        ``s, s + 1, ... s + s_range - 1``.
        See docs in :func:`get_rnnt_prune_ranges` for more details of what
        ranges contains.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame whether emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
      (px, py) (the names are quite arbitrary)::

          px: logprobs, of shape [B][S][T+1] if rnnt_type is regular,
                                 [B][S][T] if rnnt_type is not regular.
          py: logprobs, of shape [B][S+1][T]

      in the recursion::

         p[b,0,0] = 0.0
         if rnnt_type == "regular":
            p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                               p[b,s,t-1] + py[b,s,t-1])
         if rnnt_type != "regular":
            p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                               p[b,s,t-1] + py[b,s,t-1])

      .. where p[b][s][t] is the "joint score" of the pair of subsequences of
      length s and t respectively.  px[b][s][t] represents the probability of
      extending the subsequences of length (s,t) by one in the s direction,
      given the particular symbol, and py[b][s][t] represents the probability
      of extending the subsequences of length (s,t) by one in the t direction,
      i.e. of emitting the termination/next-frame symbol.

      if `rnnt_type == "regular"`, px[:,:,T] equals -infinity, meaning on the
      "one-past-the-last" frame we cannot emit any symbols.
      This is simply a way of incorporating
      the probability of the termination symbol on the last frame.
    r`   r   r"   r   r,   rN   r   r'   r(   r   r)   Nr   ra   r   r$   )r-   r   r.   rb   r9   rT   int64r*   r   r6   r7   r   r8   r:   r   r+   r   rc   rd   r   )r_   r   r   r    r	   r!   r   r;   ro   r<   r   rB   symbols_with_terminalpruned_symbolsr   rG   r   r   r   get_rnnt_logprobs_pruned  s   M"

	2
 
	



r   c                 C   s  t | |||||d\}}	|dkrd|j\}
}}|dkr|n|d }|du r8tj|d d |j|jd|
dd}n|dddf d d }||
ddtj||jd	dd| }|| }||	|j7 }t
||	|d
}|dkrr| S |dkr|t| S |dkrt| S td| )a  A RNN-T loss with pruning, which uses the output of a pruned 'joiner'
    network as input, i.e. a 4 dimensions tensor with shape (B, T, s_range, C),
    s_range means the number of symbols kept for each frame.

    Args:
      logits:
        The pruned output of joiner network, with shape (B, T, s_range, C),
        i.e. batch, time_seq_len, prune_range, num_classes
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      ranges:
        A tensor containing the symbol ids for each frame that we want to keep.
        It is a LongTensor of shape ``[B][T][s_range]``, where ``ranges[b,t,0]``
        contains the begin symbol ``0 <= s <= S - s_range +1``, such that
        ``logits[b,t,:,:]`` represents the logits with positions
        ``s, s + 1, ... s + s_range - 1``.
        See docs in :func:`get_rnnt_prune_ranges` for more details of what ranges
        contains.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      boundary:
        a LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T] if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`
    Returns:
      If reduction is `none`, returns a tensor of shape (B,), containing the
      total RNN-T loss values for each sequence of the batch, otherwise a scalar
      with the reduction applied.
    )r_   r   r   r    r	   r!   rI   r   r   Nr   rN   r   rO   rg   rQ   rJ   rR   rS   )r   r   r.   rT   r+   r*   r   r   rU   rV   r   rJ   rR   rW   )r_   r   r   r    r	   r!   rK   rL   r   rG   r   r   rX   r;   rY   rZ   r\   r   r   r   rnnt_loss_prunedj  sJ   @
	


r   皙?lm_only_scaleam_only_scalec           $   	   C   s$  | j dks
J | j|j dksJ |j| jd |jd ks&J | j|jf| jd |jd ks8J | j|jf|j\}}	}
| jd d }|j||fksTJ |j||f|dks\J ||	|ksfJ |	|f|dv snJ |tj|ddd\}}tj| ddd\}}||  }| |  }t||ddt|jj	 
 }|jddd}tj|| dddt|jj	 }t|d	|
||
||	d
 | }|dd}|
 }|
 | }|| |dd }tj|d|||	|
d|||dd|||	dd
d	}|dkr%tj|tj||dftd|j|jdfdd}tj| ddd|f d|d	d
}tj||||
d|d	d
}|| }|ddddd|	f  |ddd|ddf 8  < || }|ddddd|	f  |8  < ||ddd|ddf  }|dddd|f d}| dddd|f d}|| | }|d d | }|| | }|| } d| | }!|dkrd}|dkrd}||! ||  ||  }"||! | |  ||  }#|dkrt|"|}"|"|#fS |dkr|"|#ddddddf 7 }"|"|#fS )a  
    Reduces RNN-T problem (the simple case, where joiner network is just
    addition), to a compact, standard form that can then be given
    (with boundaries) to mutual_information_recursion().
    This version allows you to make the loss-function one of the form::

          lm_only_scale * lm_probs +
          am_only_scale * am_probs +
          (1-lm_only_scale-am_only_scale) * combined_probs

    where lm_probs and am_probs are the probabilities given the lm and acoustic
    model independently.

    This function is called from
    :func:`rnnt_loss_smoothed`, but may be useful for other purposes.

    Args:
      lm:
        Language model part of un-normalized logprobs of symbols, to be added to
        acoustic model part before normalizing.  Of shape::

           [B][S+1][C]

        where B is the batch size, S is the maximum sequence length of
        the symbol sequence, possibly including the EOS symbol; and
        C is size of the symbol vocabulary, including the termination/next-frame
        symbol.
        Conceptually, lm[b][s] is a vector of length [C] representing the
        "language model" part of the un-normalized logprobs of symbols,
        given all symbols *earlier than* s in the sequence.  The reason
        we still need this for position S is that we may still be emitting
        the termination/next-frame symbol at this point.
      am:
        Acoustic-model part of un-normalized logprobs of symbols, to be added
        to language-model part before normalizing.  Of shape::

           [B][T][C]

        where B is the batch size, T is the maximum sequence length of
        the acoustic sequences (in frames); and C is size of the symbol
        vocabulary, including the termination/next-frame symbol.  It reflects
        the "acoustic" part of the probability of any given symbol appearing
        next on this frame.
      symbols:
        A LongTensor of shape [B][S], containing the symbols at each position
        of the sequence.
      termination_symbol:
        The identity of the termination symbol, must be in {0..C-1}
      lm_only_scale:
        the scale on the "LM-only" part of the loss.
      am_only_scale:
        the scale on the "AM-only" part of the loss, for which we use
        an "averaged" LM (averaged over all histories, so effectively unigram).
      boundary:
        a optional LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame either emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
    Returns:
        (px, py) (the names are quite arbitrary).
           px: logprobs, of shape [B][S][T+1] if rnnt_type == "regular",
                                  [B][S][T] if rnnt_type != "regular".
           py: logprobs, of shape [B][S+1][T]

        in the recursion::

          p[b,0,0] = 0.0
          if rnnt_type == "regular":
             p[b,s,t] = log_add(p[b,s-1,t] + px[b,s-1,t],
                                p[b,s,t-1] + py[b,s,t-1])
          if rnnt_type != "regular":
             p[b,s,t] = log_add(p[b,s-1,t-1] + px[b,s-1,t-1],
                                p[b,s,t-1] + py[b,s,t-1])
          .. where p[b][s][t] is the "joint score" of the pair of subsequences
          of length s and t respectively.  px[b][s][t] represents the
          probability of extending the subsequences of length (s,t) by one in
          the s direction, given the particular symbol, and py[b][s][t]
          represents the probability of extending the subsequences of length
          (s,t) by one in the t direction,
          i.e. of emitting the termination/next-frame symbol.

          px[:,:,T] equals -infinity, meaning on the "one-past-the-last" frame
          we cannot emit any symbols.  This is simply a way of incorporating
          the probability of the termination symbol on the last frame.
    r   r   r   r   r"   Tr%   )r   r   r(   r'   r   r   r)   r,   Ng      ?rI   g#B;r$   )r-   r   r.   r/   r0   r1   r2   r3   r+   r4   r5   rR   rJ   mvr   r6   r7   r   r8   r9   r:   r   r*   r   )$r   r   r   r    r   r   r	   r!   r   r;   r<   r   r=   r>   r?   r@   rA   rB   lmonly_normalizers
unigram_lmamonly_normalizersrC   rD   px_lm_unigramr   	px_amonly	px_lmonlyrE   rF   rG   py_lm_unigram	py_amonly	py_lmonlycombined_scale	px_interp	py_interpr   r   r   get_rnnt_logprobs_smoothed  s   m$$	


	8"




r   c              
   C   sJ  t | |||||||d\}}|dkrf|j\}}}|dkr|n|d }|du r:tj|d d |j|jd|dd}n|dddf d d }||ddtj||jd	dd| }|| }||	|j7 }t
||||
d
}|
rt|d n|}|	dkr~| }n|	dkrt| }n|	dkrt| }ntd|	 |
r||d fS |S )a  A simple case of the RNN-T loss, where the 'joiner' network is just
    addition.

    Args:
      lm:
        language-model part of unnormalized log-probs of symbols, with shape
        (B, S+1, C), i.e. batch, symbol_seq_len+1, num_classes.
        These are assumed to be well-normalized, in the sense that we could
        use them as probabilities separately from the am scores
      am:
        acoustic-model part of unnormalized log-probs of symbols, with shape
        (B, T, C), i.e. batch, frame, num_classes
      symbols:
        the symbol sequences, a LongTensor of shape [B][S], and elements in
        {0..C-1}.
      termination_symbol:
        the termination symbol, with 0 <= termination_symbol < C
      lm_only_scale:
        the scale on the "LM-only" part of the loss.
      am_only_scale:
        the scale on the "AM-only" part of the loss, for which we use
        an "averaged" LM (averaged over all histories, so effectively unigram).
      boundary:
        a LongTensor of shape [B, 4] with elements interpreted as
        [begin_symbol, begin_frame, end_symbol, end_frame] that is treated as
        [0, 0, S, T]
        if boundary is not supplied.
        Most likely you will want begin_symbol and begin_frame to be zero.
      rnnt_type:
        Specifies the type of rnnt paths: `regular`, `modified` or `constrained`.
        `regular`: The regular rnnt that taking you to the next frame only if
                   emitting a blank (i.e., emitting a symbol does not take you
                   to the next frame).
        `modified`: A modified version of rnnt that will take you to the next
                    frame whether emitting a blank or a non-blank symbol.
        `constrained`: A version likes the modified one that will go to the next
                       frame when you emit a non-blank symbol, but this is done
                       by "forcing" you to take the blank transition from the
                       *next* context on the *current* frame, e.g. if we emit
                       c given "a b" context, we are forced to emit "blank"
                       given "b c" context on the current frame.
      delay_penalty: A constant value to penalize symbol delay, this may be
         needed when training with time masking, to avoid the time-masking
         encouraging the network to delay symbols.
         See https://github.com/k2-fsa/k2/issues/955 for more details.
      reduction:
        Specifies the reduction to apply to the output: `none`, `mean` or `sum`.
        `none`: no reduction will be applied.
        `mean`: apply `torch.mean` over the batches.
        `sum`: the output will be summed.
        Default: `mean`
      return_grad:
        Whether to return grads of px and py, this grad standing for the
        occupation probability is the output of the backward with a
        `fake gradient`, the `fake gradient` is the same as the gradient you'd
        get if you did `torch.autograd.grad((-loss.sum()), [px, py])`, note, the
        loss here is the loss with reduction "none".
        This is useful to implement the pruned version of rnnt loss.

    Returns:
       If return_grad is False, returns a tensor of shape (B,), containing the
       total RNN-T loss values for each element of the batch if reduction equals
       to "none", otherwise a scalar with the reduction applied.
       If return_grad is True, the grads of px and py, which is the output of
       backward with a `fake gradient`(see above), will be returned too. And the
       returned value will be a tuple like (loss, (px_grad, py_grad)).
    )r   r   r   r    r   r   r	   r!   rI   r   r   Nr   rN   r   rO   rP   r   rQ   rJ   rR   rS   )r   r   r.   rT   r+   r*   r   r   rU   rV   r   rJ   rR   rW   )r   r   r   r    r   r   r	   r!   rK   rL   rM   r   rG   r   r   rX   r;   rY   rZ   r[   r\   r]   r   r   r   rnnt_loss_smoothed  sV   P



r   )N)r   N)Nr   rI   rJ   F)Nr   rI   rJ   )r   )r   r   Nr   )r   r   Nr   rI   rJ   F)osr.   r   typingr   r   r   mutual_informationr   r   intstrrH   r   boolr^   rf   rh   rm   rs   r   r   r   
LongTensorr   r   r   r   r   r   r   r   r   <module>   s  

 ,	

u

r
]%
8
 
 
-)

 2	
l
	
 s	
