o
    }oi_                 	   @   sh  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	Z	d dl
Zd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% dZ&dd Z'dJddZ(dJddZ)dKde*de+fddZ,dd Z-G dd dZ.G d d! d!e!Z/d"d# Z0G d$d% d%e!Z1G d&d' d'Z2G d(d) d)Z3G d*d+ d+e2Z4G d,d- d-e3Z5G d.d/ d/e5Z6G d0d1 d1e5Z7G d2d3 d3Z8G d4d5 d5e3Z9G d6d7 d7e3Z:G d8d9 d9e3Z;eG d:d; d;Z<eG d<d= d=Z=G d>d? d?Z>dLdAe+eB dBe*dCe?ej@e*f fdDdEZAG dFdG dGeZBG dHdI dIeZCdS )M    N)	dataclass)Path)
NamedTupleOptional)	OmegaConf)pad_sequence)
DataLoaderDataset)PromptedAudioToTextMiniBatch)ASRModel)StreamingEncoder)normalize_batch)get_samples)
rnnt_utils)IterableDataset)LengthsTypeMelSpectrogramType
NeuralType   c                 C   sx   t | }|dkr8t | d }t|D ]'}t|D ]}|dkr(t|dddd t| | |  dd qt  qdS dS )z
    Print an alignment matrix of the shape (m + 1, n + 1)

    Args:
        alignment: An integer alignment matrix of shape (m + 1, n + 1)
    r   4dz | )endN)lenrangeprint)	alignmentmnij r    d/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/streaming_utils.pyprint_alignment*   s   r"   c                 C   s$   |du ri }| |d< t || dS )z
    Writes out the LCS alignment to a file, along with any extras provided.

    Args:
        alignment: An alignment matrix of shape [m + 1, n + 1]
        filepath: str filepath
        extras: Optional dictionary of items to preserve.
    Nr   )torchsave)r   filepathextrasr    r    r!   write_lcs_alignment_to_pickle<   s   	r'   c                    s  t | }t |  fddt|d D }d}g d}t|d D ]P}t d D ]G}|dks3|dkr:d|| |< q)| |d  ||d  krj||d  |d  d || |< ||| | kri|| | }|||g}q)d|| |< q)q!|dd \}}||k}	|	r|d }
|
dkr|dkr|dkr||d  |d  dkr|
d8 }
|d |d }}n|d |d |
d }}}
n|
dkr|dkr|dksnd} }|}d}d}d}t|ddD ]$}td d D ]}|| | |kr||kr|| | }|}|}|}qq|tkr|}d}d|d< n|d }|d }d}d}t||d D ]4}d}t||| d D ]}| d k rG|| | dkrAd}q+d| | }q+||7 }|d7 }qd}||7 }|dkr|dkr|| | dkrt|d8 }|d7 }|dkr|d7 }|d8 }|d8 }|dkr|dksctd|}td|}|| |d< ||d< ||d< |dur|	| ||d	}t|||d
 td| ||fS )a  
    Longest Common Subsequence merge algorithm for aligning two consecutive buffers.

    Base alignment construction algorithm is Longest Common Subsequence (reffered to as LCS hear after)

    LCS Merge algorithm looks at two chunks i-1 and i, determins the aligned overlap at the
    end of i-1 and beginning of ith chunk, and then clips the subsegment of the ith chunk.

    Assumption is that the two chunks are consecutive chunks, and there exists at least small overlap acoustically.

    It is a sub-word token merge algorithm, operating on the abstract notion of integer ids representing
    the subword ids. It is independent of text or character encoding.

    Since the algorithm is merge based, and depends on consecutive buffers, the very first buffer is processes using
    the "middle tokens" algorithm.

    It requires a delay of some number of tokens such that:
        lcs_delay = math.floor(((total_buffer_in_secs - chunk_len_in_sec)) / model_stride_in_secs)

    Total cost of the model is O(m_{i-1} * n_{i}) where (m, n) represents the number of subword ids of the buffer.

    Args:
        X: The subset of the previous chunk i-1, sliced such X = X[-(lcs_delay * max_steps_per_timestep):]
            Therefore there can be at most lcs_delay * max_steps_per_timestep symbols for X, preserving computation.
        Y: The entire current chunk i.
        filepath: Optional filepath to save the LCS alignment matrix for later introspection.

    Returns:
        A tuple containing -
            - i: Start index of alignment along the i-1 chunk.
            - j: Start index of alignment along the ith chunk.
            - slice_len: number of tokens to slice off from the ith chunk.
        The LCS alignment matrix itself (shape m + 1, n + 1)
    c                    s"   g | ]}d d t  d D qS )c                 S      g | ]}d qS r   r    .0_r    r    r!   
<listcomp>s       z?longest_common_subsequence_merge.<locals>.<listcomp>.<listcomp>r   )r   r*   r   r    r!   r-   s   s   " z4longest_common_subsequence_merge.<locals>.<listcomp>r   r   )r   r   r      N)is_complete_mergeXY	slice_idx)r%   r&   zWrote alignemnt to :)r   r   MIN_MERGE_SUBSEQUENCE_LENmaxr'   r   )r3   r4   r%   r   LCSuffresult
result_idxr   r   r2   lengthmax_j	max_j_idx	i_partial	j_partialj_skipslice_counti_idxj_idxi_tempj_tempj_exp
j_any_skipr&   r    r/   r!    longest_common_subsequence_mergeL   s   % 







rH      max_steps_per_timestepr%   c                 C   s   |dk r
| |7 } | S t | dkr| |7 } | S t|| }| | d }t|||d\}}	|d |d  }
||
d }| |7 } | S )a  
    Merges the new text from the current frame with the previous text contained in the buffer.

    The alignment is based on a Longest Common Subsequence algorithm, with some additional heuristics leveraging
    the notion that the chunk size is >= the context window. In case this assumptio is violated, the results of the
    merge will be incorrect (or at least obtain worse WER overall).
    r   r   N)r%   r1   )r   intrH   )bufferdatadelaymodelrJ   r%   search_sizebuffer_slicelcs_idxlcs_alignmentr5   r    r    r!   lcs_alignment_merge_buffer'  s   	rT   c                 C   s8   |dk r
| |7 } | S t | dkr| |7 } | S | |7 } | S )a  
    Merges the new text from the current frame with the previous text contained in the buffer.

    The alignment is based on a Longest Common Subsequence algorithm, with some additional heuristics leveraging
    the notion that the chunk size is >= the context window. In case this assumptio is violated, the results of
    the merge will be incorrect (or at least obtain worse WER overall).
    r   r   )r   )rL   rM   	timestepsrO   r    r    r!   inplace_buffer_mergeJ  s   	rV   c                   @   sP   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd ZdS )StreamingFeatureBuffererz
    Class to append each feature frame to a buffer and return an array of buffers.
    This class is designed to perform a real-life streaming decoding where only a single chunk
    is provided at each step of a streaming pipeline.
    c                 C   s6  d| _ t|jdr|jjrd| _nd| _|| _|jj| _|jjj	| _
|| _|jjj}t|| j | _t|| j | _|| _t|| }|jjj| _tt| j| j | _tj| j|gtjd| j | _t|| | _|| _|   t|j}t|jd d|j_ d|j_!d|j_	t"#|j| _$| j$%|j& d	S )
aL  
        Args:
            asr_model:
                Reference to the asr model instance for which the feature needs to be created
            chunk_size (float):
                Duration of the new chunk of audio
            buffer_size (float):
                Size of the total audio in seconds maintained in the buffer
        h㈵>log(\0        dtypeFr   NoneN)'NORM_CONSTANThasattrpreprocessorrY   ZERO_LEVEL_SPEC_DB_VAL	asr_modelcfgsample_ratesr	normalizemodel_normalize_type
chunk_sizewindow_striderK   n_chunk_look_backn_chunk_samplesbuffer_sizefeaturesn_featr#   zerossample_bufferonesfloat32rL   feature_chunk_lenfeature_buffer_lenresetcopydeepcopyr   
set_structditherpad_tor   from_config_dictraw_preprocessortodevice)selfrc   ri   rm   timestep_durationtotal_buffer_lenrd   r    r    r!   __init__h  s4   

z!StreamingFeatureBufferer.__init__c                 C   s^   t j| jjt jd| j | _g | _t t| j	| j
 | _t j| j| jgt jd| j | _dS )9
        Reset frame_history and decoder's state
        r\   N)r#   rr   rL   shapers   rb   frame_buffersrp   rK   rm   rf   rq   ro   ru   feature_bufferr   r    r    r!   rv     s
   zStreamingFeatureBufferer.resetc                 C   s<   | j | jd  | j d| j < | | j | j d< dS )z
        Add time-series audio signal to `sample_buffer`

        Args:
            chunk (Tensor):
                Tensor filled with time-series audio signal
        N)rq   rl   clone)r   chunkr    r    r!   _add_chunk_to_buffer  s   "z-StreamingFeatureBufferer._add_chunk_to_bufferc                 C   sT   | j dd| jdf  | j ddd| j f< | | j dd| j df< dS )z>
        Add an extracted feature to `feature_buffer`
        N)r   rt   r   )r   
feat_chunkr    r    r!   _update_feature_buffer  s   2"z/StreamingFeatureBufferer._update_feature_bufferc                 C   s   | j S N)r   r   r    r    r!   get_raw_feature_buffer  s   z/StreamingFeatureBufferer.get_raw_feature_bufferc                 C   s6   t | jdtt| jg| jd\}}}|dS )Nr   xseq_lennormalize_type)r   r   	unsqueezer#   tensorr   rh   squeeze)r   normalized_bufferr,   r    r    r!   get_normalized_feature_buffer  s   

z6StreamingFeatureBufferer.get_normalized_feature_bufferc                 C   s   | j d| j| j   }| jj}|d|}t|j	d g|}| j
||d\}}| }| |dd| j df  dS )zU
        Extract features from the time-series audio buffer `sample_buffer`.
        Nr   r   input_signalr;   )rq   rl   rk   rc   r   
unsqueeze_r~   r#   Tensorr   r}   r   r   rt   )r   samplesr   audio_signalaudio_signal_lenrn   features_lenr    r    r!   _convert_buffer_to_features  s   
"z4StreamingFeatureBufferer._convert_buffer_to_featuresc                 C   sl   t || jkrtd| j dt || jk r+tj| jtjd}||d|jd < |}| | |   dS )z
        Update time-series signal `chunk` to the buffer then generate features out of the
        signal in the audio buffer.

        Args:
            chunk (Tensor):
                Tensor filled with time-series audio signal
        zchunk should be of length z or lessr\   Nr   )	r   rl   
ValueErrorr#   rp   rs   r   r   r   )r   r   
temp_chunkr    r    r!   update_feature_buffer  s   	
z.StreamingFeatureBufferer.update_feature_bufferN)__name__
__module____qualname____doc__r   rv   r   r   r   r   r   r   r    r    r    r!   rW   a  s    *rW   c                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
AudioFeatureIteratorTc           	      C   s   || _ || _d| _d| _d| _|| _|jd }|| | _t	| j 
d|}t| j jd g|}|||d\| _| _| j | _d S )Nr   Trj   r   )_samples
_frame_len_startoutputcountpad_to_frame_len_cfg_feature_frame_lenr#   
from_numpyr   r~   r   r   	_features_features_lenr   )	r   r   	frame_lenra   r   r   r   r   r   r    r    r!   r     s   

zAudioFeatureIterator.__init__c                 C      | S r   r    r   r    r    r!   __iter__     zAudioFeatureIterator.__iter__c                 C   s   | j stt| j| j }|| jd kr&| jd d | j|f  }|| _nF| js;| jd d | j| jd f  }n.t	j
| jjd t| jgdd}| jd d | j| jd f  }||d d d |jd f< d| _ |  jd7  _|S )Nr   rs   r\   r   F)r   StopIterationrK   r   r   r   r   cpur   nprp   r   r   )r   lastframesegmentr    r    r!   __next__  s   $ "zAudioFeatureIterator.__next__N)T)r   r   r   r   r   r   r    r    r    r!   r     s    
r   c           	      C   s   t |  \}}d}|d du}|rt| }g }| D ]!\}}|r;| }||k r6d|| f}tjj||}|| q|rLt|}t|}||fS d\}}||fS )ar  collate batch of audio sig, audio len, tokens, tokens len
    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
               LongTensor):  A tuple of tuples of signal, signal lengths,
               encoded tokens, and encoded tokens length.  This collate func
               assumes the signals are 1d torch tensors (i.e. mono audio).
    r   N)NN)	zipr7   itemr#   nn
functionalpadappendstack)	batchr,   audio_lengthsmax_audio_len	has_audior   sigsig_lenr   r    r    r!   speech_collate_fn  s(   


r   c                       sH   e Zd Zedd Z fddZdd Zdd Zd	d
 Zdd Z	  Z
S )AudioBuffersDataLayerc                 C   s   t dt t tdt dS )N)BDTr   )processed_signalprocessed_length)r   r   tupler   r   r    r    r!   output_types(  s   
z"AudioBuffersDataLayer.output_typesc                    s   t    d S r   superr   r   	__class__r    r!   r   /  s   zAudioBuffersDataLayer.__init__c                 C   r   r   r    r   r    r    r!   r   2  r   zAudioBuffersDataLayer.__iter__c                 C   s`   | j t| jkr
t|  j d7  _ tj| j| j d  tjdtj| j| j d  jd tjdfS )Nr   r\   )	
_buf_countr   signalr   r#   	as_tensorrs   r   int64r   r    r    r!   r   5  s    zAudioBuffersDataLayer.__next__c                 C   s   || _ | j d j| _d| _d S Nr   )r   r   signal_shaper   )r   signalsr    r    r!   
set_signal>  s   
z AudioBuffersDataLayer.set_signalc                 C   s   dS Nr   r    r   r    r    r!   __len__C  r   zAudioBuffersDataLayer.__len__)r   r   r   propertyr   r   r   r   r   r   __classcell__r    r    r   r!   r   '  s    
	r   c                   @   sZ   e Zd ZdZdddZdd	 Zd
d Zdd Zdd Zdd Z	dd Z
dd Zdd ZdS )FeatureFrameBuffererz\
    Class to append each feature frame to a buffer and return
    an array of buffers.
    皙?         @Tc                 C   s   t |jdr|jjrd| _nd| _|| _|jj| _|| _|jjj	}t
|| | _t
|| }|jjj| _tj| j|gtjd| j | _|| _|| _d| _d| _|| _tj| j| jgtjd| j | _g | _d| _|   d| _dS )
        Args:
          frame_len: frame's duration, seconds
          frame_overlap: duration of overlaps before and after current frame, seconds
          offset: number of symbols to drop for smooth streaming
        rY   rZ   r[   r\   FNr   )r`   ra   rY   rb   rc   r   re   rf   r   rj   rK   n_frame_lenrn   ro   r   rr   rs   rL   pad_to_buffer_len
batch_size
signal_endframe_readerru   r   r   buffered_features_sizerv   buffered_len)r   rc   r   r   total_bufferr   r   r   r    r    r!   r   M  s,   


zFeatureFrameBufferer.__init__c                 C   sX   t j| jjt jd| j | _d| _g | _g | _d| _	t j| j
| jgt jd| j | _dS )r   )r   r]    r   r\   N)r   rr   rL   r   rs   rb   	prev_charunmergedr   r   ro   ru   r   r   r    r    r!   rv   p  s   zFeatureFrameBufferer.resetc                 C   sJ   | j rg S g }| jD ]}|t| t|| jkr|  S q
d| _ |S )NT)r   r   r   r   rw   r   r   )r   batch_framesr   r    r    r!   get_batch_frames}  s   
z%FeatureFrameBufferer.get_batch_framesc                 C   s   g | _ |D ]L}|jd }|  j|7  _|| jk r%| js%| j t| q| jd d |d f | jd d d | f< || jd d | j	 d f< | j t| j q| j S r   )
r   r   r   ru   r   r   r   rw   rL   r   )r   framesr   curr_frame_lenr    r    r!   get_frame_buffers  s   
*z&FeatureFrameBufferer.get_frame_buffersc                 C   s   || _ d| _d S NF)r   r   r   r   r    r    r!   set_frame_reader     
z%FeatureFrameBufferer.set_frame_readerc                 C   s   |j d }|| jk r| jst|| _n*| jd d |j d d f | jd d d |j d  f< || jd d |j d  d f< |  j|j d 7  _d S r   )r   ru   r   r   rw   r   r   )r   
feat_framer   r    r    r!   r     s   
6z+FeatureFrameBufferer._update_feature_bufferc                 C   sf   g }t |D ]*\}}| | tj| jdd}tj| jdd}||| jd|| jdf q|S )Nr   )axis)		enumerater   r   meanr   stdr   reshapero   )r   r   norm_constsr   r   mean_from_bufferstdev_from_bufferr    r    r!   get_norm_consts_per_frame  s   
$z.FeatureFrameBufferer.get_norm_consts_per_framec                 C   s>   d}t |D ]\}}||| d  || d |  ||< qd S )NrX   r   r   )r  )r   r   r  CONSTANTr   frame_bufferr    r    r!   normalize_frame_buffers  s   &z,FeatureFrameBufferer.normalize_frame_buffersc                 C   sJ   |   }t|dkr#| |}| |}t|dkrq| || |S g S r   r   r   r   r  r  r   r   r   r  r    r    r!   get_buffers_batch  s   

z&FeatureFrameBufferer.get_buffers_batchN)r   r   r   T)r   r   r   r   r   rv   r   r   r   r   r  r  r  r    r    r    r!   r   G  s    
#		r   c                   @   s   e Zd ZdZ				dddZdd	 Zd
efddZdd Ze	
 dddZe	
 dddZddededefddZdd ZdS )FrameBatchASRz
    class for streaming frame-based ASR use reset() method to reset FrameASR's
    state call transcribe(frame) to do ASR on contiguous signal's frames
    r   r   r   Tc                 C   s  t |||||d| _|| _t|dd| _|| _g | _g | _g | _| jdu r-t	|j
j| _nt|jdr;t	|jj| _nt	|jj| _|j
| _
g | _g | _|   t|j}|| _|| _t|jd d|j_d|j_d|j_t|j| _| j |j! | j| _dS )	r   )rc   r   r   r   r   decoderN
vocabularyFr[   r   r^   )"r   frame_buffererrc   getattrr  r   
all_logits	all_predsr   r   	tokenizerr  blank_idr`   jointtoks_unmergedr   rv   rw   rx   r   rd   r   r   ry   ra   rz   r{   rg   r   r|   r}   r~   r   )r   rc   r   r   r   r   rd   r    r    r!   r     s@   
zFrameBatchASR.__init__c                 C   sN   d| _ g | _t | _t| j| jtd| _g | _g | _	g | _
g | _| j  dS )r   r   r   
collate_fnN)r   r   r   
data_layerr   r   r   data_loaderr  r  r  r   r  rv   r   r    r    r!   rv      s   zFrameBatchASR.resetaudio_filepathc                 C   sN   t |}t|dt|| | jjj f}t|| j| j	| jj
}| | d S r   r   r   r   rK   rc   r   re   r   r   r}   r   r   r   r  rN   model_stride_in_secsr   r   r    r    r!   read_audio_file  s   "zFrameBatchASR.read_audio_filec                 C   s   | j | d S r   r  r   r   r    r    r!   r     s   zFrameBatchASR.set_frame_readerFc                 C   sh   | j  }t|dkr2|  j|d d  7  _| j|d d   | | | j  }t|dksd S d S r   )r  r  r   r   r  r   _get_batch_preds)r   keep_logitsr   r    r    r!   infer_logits  s   


zFrameBatchASR.infer_logitsc                 C   s   | j j}t| jD ]b}|\}}||||}}| j ||d}t|dkr:|\}}| j j|d}	|	jddd}
n|\}	}}
t	|
}|D ]}| j
|   qF|rht	|	}	|	D ]
}| j|  q\n~	~~
q	d S )Nr   processed_signal_lengthr0   encoder_outputr1   F)dimkeepdim)rc   r   iterr  r~   r   ctc_decoderargmaxr#   unbindr  r   r   numpyr  )r   r%  r   r   feat_signalfeat_signal_lenforward_outsencodedencoded_len	log_probspredictionspredspredlog_probr    r    r!   r$  !  s,   


zFrameBatchASR._get_batch_predstokens_per_chunkrN   r%  c           
      C   s   |  | g | _| jD ]}| }|  j|t|d | t|d | |  7  _q| | j}|s5|S g }| jD ] }|jd }	||	d | |	d | | d d f }|| q:t	
|d}||fS )Nr   r   )r&  r   r  tolistr   greedy_merger  r   r   r#   concat)
r   r<  rN   r%  r:  decoded
hypothesisr  r;  r   r    r    r!   
transcribe=  s   

4

(zFrameBatchASR.transcribec                 C   sN   g }| j }|D ]}||ks|| j kr|| j kr|| |}q| j|}|S r   )r  r   r  ids_to_text)r   r9  decoded_predictionpreviousprA  r    r    r!   r>  O  s   
zFrameBatchASR.greedy_mergeN)r   r   r   TF)r   r   r   r   r   rv   strr"  r   r#   no_gradr&  r$  rK   boolrB  r>  r    r    r    r!   r    s     
6	r  c                       sf   e Zd ZdZd fdd	Z fddZd	d
 Zdd Zdd Zdd Z	dd Z
dd Zdd Z  ZS )BatchedFeatureFrameBuffererzi
    Batched variant of FeatureFrameBufferer where batch dimension is the independent audio samples.
    r   r   r   c                    s   t  j||||d |jjj}t|| }tj|| j|gtj	d| j
 | _dd t| jD | _dd t| jD | _dd t| jD | _d| _|   | `| `dS )	r   )r   r   r   r\   c                 S   r(   r   r    r*   r    r    r!   r-   p  r.   z8BatchedFeatureFrameBufferer.__init__.<locals>.<listcomp>c                 S   r(   rG  r    r*   r    r    r!   r-   q  r.   c                 S   r(   r   r    r*   r    r    r!   r-   r  r.   r   N)r   r   r   ra   rj   rK   r   rr   ro   rs   rb   rL   r   r   all_frame_readerr   signal_end_indexbuffer_numberrv   r   r   )r   rc   r   r   r   r   r   r   r    r!   r   _  s   
z$BatchedFeatureFrameBufferer.__init__c                    sz   t    tj| j| j| jgtjd| j | _	dd t
| jD | _dd t
| jD | _dd t
| jD | _d| _dS )r   r\   c                 S   r(   r   r    r*   r    r    r!   r-     r.   z5BatchedFeatureFrameBufferer.reset.<locals>.<listcomp>c                 S   r(   rG  r    r*   r    r    r!   r-     r.   c                 S   r(   r   r    r*   r    r    r!   r-     r.   r   N)r   rv   r   rr   r   ro   ru   rs   rb   r   r   rL  r   rM  rN  r   r   r    r!   rv   y  s   

z!BatchedFeatureFrameBufferer.resetc              	   C   s   t | jrg S g }t| jD ]4\}}zt|}t|}|| W q tyB   |d  d| j|< | j	| d u r@| j
| j	|< Y qw |  j
d7  _
|S )NTr   )allr   r  rL  nextr   rw   r   r   rM  rN  )r   r   idxr   r   r    r    r!   r     s"   



z,BatchedFeatureFrameBufferer.get_batch_framesc                 C   s   g | _ t| jD ]\}|| }|d urG| j|d d | jd f | j|d d d | j f< || j|d d | j d f< | j t| j| g q| j|d d d d f  d9  < | j t| j| g q| j S )Nr[   )r   r   r   rL   r   r   r   rw   )r   r   rQ  r   r    r    r!   r     s   2 z-BatchedFeatureFrameBufferer.get_frame_buffersc                 C   s"   || j |< d| j|< d | j|< d S r   )rL  r   rM  r   r   rQ  r    r    r!   r     s   

z,BatchedFeatureFrameBufferer.set_frame_readerc                 C   s   |d ur3| j |d d |jd d f | j |d d d |jd  f< || j |d d |jd  d f< d S | j |d d d d f  d9  < d S )Nr   r[   )r   r   )r   r   rQ  r    r    r!   r     s   :$$z2BatchedFeatureFrameBufferer._update_feature_bufferc                 C   sJ   t |D ]
\}}| || qtj| jddd}tj| jddd}||fS )Nr0   T)r   keepdims)r  r   r   r  r   r  )r   r   rQ  r   r  r  r    r    r!   r    s
   z5BatchedFeatureFrameBufferer.get_norm_consts_per_framec                 C   sB   d}t t|D ]}|| |d |  |d | |  ||< qd S )Ng:0yE>r   r   )r   r   )r   r   r  r	  r   r    r    r!   r    s   *z3BatchedFeatureFrameBufferer.normalize_frame_buffersc                 C   s<   |   }t|dkr| |}| |}| || |S g S r   r  r  r    r    r!   r    s   

z-BatchedFeatureFrameBufferer.get_buffers_batch)r   r   r   )r   r   r   r   r   rv   r   r   r   r   r  r  r  r   r    r    r   r!   rK  Z  s    		rK  c                       s   e Zd ZdZ					ddedef fd	d
Z fddZdefddZ	dd Z
e dd Ze dd ZdedefddZdd Zdd Z  ZS )BatchedFrameASRRNNTzz
    Batched implementation of FrameBatchASR for RNNT models, where the batch dimension is independent audio samples.
    r   r       rI   FrJ   stateful_decodingc                    s   t  j||||d || _|| _dd t| jD | _dd t| jD | _dd t| jD | _d| _	dd t| jD | _
z| jjj| _W n tyS   d	| _Y nw td
| j t||||d| _|   dS )a  
        Args:
            asr_model: An RNNT model.
            frame_len: frame's duration, seconds.
            total_buffer: duration of total audio chunk size, in seconds.
            batch_size: Number of independent audio samples to process at each step.
            max_steps_per_timestep: Maximum number of tokens (u) to process per acoustic timestep (t).
            stateful_decoding: Boolean whether to enable stateful decoding for preservation of state across buffers.
        r   r   r   c                 S      g | ]}g qS r    r    r*   r    r    r!   r-     r.   z0BatchedFrameASRRNNT.__init__.<locals>.<listcomp>c                 S   rX  r    r    r*   r    r    r!   r-     r.   c                 S   rX  r    r    r*   r    r    r!   r-      r.   Nc                 S      i | ]}||qS r    r    r+   rQ  r    r    r!   
<dictcomp>  s    z0BatchedFrameASRRNNT.__init__.<locals>.<dictcomp>r1   zPerforming Stateful decoding :)rc   r   r   r   )r   r   rJ   rV  r   r   all_alignmentsr  all_timestampsprevious_hypothesesbatch_index_maprc   r  eos_id	Exceptionr   rK  r  rv   )r   rc   r   r   r   rJ   rV  r   r    r!   r     s(   
zBatchedFrameASRRNNT.__init__c                    s   t    dd t jD  _dd t jD  _dd t jD  _d _dd t jD  _dd t jD  _	 fd	dt jD  _
dS )
r   c                 S   rX  r    r    r*   r    r    r!   r-     r.   z-BatchedFrameASRRNNT.reset.<locals>.<listcomp>c                 S   rX  r    r    r*   r    r    r!   r-     r.   c                 S   rX  r    r    r*   r    r    r!   r-     r.   Nc                 S   rY  r    r    rZ  r    r    r!   r[        z-BatchedFrameASRRNNT.reset.<locals>.<dictcomp>c                 S   s   g | ]}t  qS r    )r   r*   r    r    r!   r-      rb  c                    s    g | ]}t  j| d tdqS )r   r  )r   r  r   rZ  r   r    r!   r-   !  s    )r   rv   r   r   r\  r  r]  r^  r_  r  r  r   r   r   r!   rv     s   

zBatchedFrameASRRNNT.resetr  c                 C   sv   t || jks	J t| jD ]*}t|| }t|dt|| | jjj	 f}t
|| j| j| jj}| || qd S r   )r   r   r   r   r   r   rK   rc   r   re   r   r   r}   r   r   )r   r  rN   r!  rQ  r   r   r    r    r!   r"  &  s   "z#BatchedFrameASRRNNT.read_audio_filec                 C   s   | j || d S r   r#  rR  r    r    r!   r   0  s   z$BatchedFrameASRRNNT.set_frame_readerc                 C   s|   | j  }t|dkr<|  j|d d  7  _t|D ]\}}| j| |d d   q|   | j  }t|dksd S d S r   )r  r  r   r   r  r  r   r$  )r   r   rQ  rL   r    r    r!   r&  3  s   

z BatchedFrameASRRNNT.infer_logitsc                 C   s  | j j}dd | jD }g }g }g }t| jD ]-}| jj| r qt|| }|\}}	|||	|}}	|	| |	|	 |	| qt
|dkrMdS t|d}t|d}	~~| j ||	d\}
}| jr| jdurg }t|D ]\}}| j| }|	| j|  qr|| _| j jj|
|d| jd}| jr|| _t|D ]\}}|| }| jj| }|s| j| 	|j qdd |D }t|D ]\}}|| }| jj| }|s| j| 	|   qd	d |D }t|D ]\}}|| }| jj| }|s| j| 	| q| jrB| j j|
}t|D ].\}}t
|dkr@|d
 | jkr@| j| jdd
 | j| _| j j||| j| _qt
|t
| jkr[t|D ]
\}}|| j|< qP~
~~~dS )a  
        Perform dynamic batch size decoding of frame buffers of all samples.

        Steps:
            -   Load all data loaders of every sample
            -   For all samples, determine if signal has finished.
                -   If so, skip calculation of mel-specs.
                -   If not, compute mel spec and length
            -   Perform Encoder forward over this sub-batch of samples. Maintain the indices of samples that
                were processed.
            -   If performing stateful decoding, prior to decoder forward, remove the states of samples that
                were not processed.
            -   Perform Decoder + Joint forward for samples that were processed.
            -   For all output RNNT alignment matrix of the joint do:
                -   If signal has ended previously (this was last buffer of padding), skip alignment
                -   Otherwise, recalculate global index of this sample from the sub-batch index, and preserve
                alignment.
            -   Same for preds
            -   Update indices of sub-batch with global index map.
            - Redo steps until all samples were processed (sub-batch size == 0).
        c                 S   s   g | ]}t |qS r    )r-  )r+   r  r    r    r!   r-   Z  s    z8BatchedFrameASRRNNT._get_batch_preds.<locals>.<listcomp>r   Nr'  T)return_hypothesespartial_hypothesesc                 S      g | ]}|j qS r    
y_sequencer+   hypr    r    r!   r-     rb  c                 S   re  r    )	timestamprh  r    r    r!   r-     rb  r1   )rc   r   r  r   r   r  r   rP  r~   r   r   r#   catrV  r^  r  r_  decodingrnnt_decoder_predictions_tensorr\  
alignmentsr  r   r1  r]  r  initialize_stater`  rg  batch_select_state	dec_state)r   r   
data_itersfeat_signalsfeat_signal_lensnew_batch_keysrQ  r   r2  r3  r5  r6  new_prev_hypothesisnew_batch_idxglobal_index_keyold_posbest_hypri  has_signal_endedr9  r:  
timestampsrj  reset_statesr    r    r!   r$  A  s   



z$BatchedFrameASRRNNT._get_batch_predsr<  rN   c                 C   s  |    dd t| jD | _t| jD ]_\}}| jj| }|du r&tdt|D ]H\}}|t	|kr7d}nd}|t	|| | t	|| | |  }| 
|| jj| j\}	}
t	|	dkrr||k rrt| j| |	|| jd| j|< q*qg }t| jD ]}|| | j|  q{|S )^
        Performs "middle token" alignment prediction using the buffered audio chunk.
        c                 S   rX  r    r    r*   r    r    r!   r-     r.   z2BatchedFrameASRRNNT.transcribe.<locals>.<listcomp>NSignal did not endr   r   rO   )r&  r   r   r   r  r\  r  rM  r   r   _alignment_decoderrc   r  r  rV   r   r>  )r   r<  rN   rQ  rn  signal_end_idxa_idxr   offsetidstoksr   r    r    r!   rB    s6   "zBatchedFrameASRRNNT.transcribec                 C   s   g }g }t t|D ]1}t t|| D ]&}|| | \}}	t|	}	|	|kr9||	gd }
||
 ||	 q	 qq
||fS r   )r   r   rK   ids_to_tokensr   )r   rn  r  r  sr  tur,   token_idtokenr    r    r!   r    s   
z&BatchedFrameASRRNNT._alignment_decoderc                 C   s    dd |D }| j j|}|S )Nc                 S      g | ]}|qS r    r    )r+   rF  r    r    r!   r-     r.   z4BatchedFrameASRRNNT.greedy_merge.<locals>.<listcomp>)rc   r  rC  )r   r9  rD  rA  r    r    r!   r>    s   z BatchedFrameASRRNNT.greedy_merge)r   r   rU  rI   F)r   r   r   r   rK   rJ  r   rv   listr"  r   r#   rI  r&  r$  rB  r  r>  r   r    r    r   r!   rT    s4    .


r
*rT  c                       sN   e Zd ZdZ						dded	ed
ef fddZdedefddZ  ZS )BatchedFrameASRTDTa  
    Batched implementation of FrameBatchASR for TDT models, where the batch dimension is independent audio samples.
    It's mostly similar to BatchedFrameASRRNNT with special handling of boundary cases due to the frame-skipping
    resulted by TDT models.
    r   r   rU  rI   Fr   rJ   rV  tdt_search_boundaryc                    s   t  j||||d || _dS )ab  
        Args:
            asr_model: An RNNT model.
            frame_len: frame's duration, seconds.
            total_buffer: duration of total audio chunk size, in seconds.
            batch_size: Number of independent audio samples to process at each step.
            max_steps_per_timestep: Maximum number of tokens (u) to process per acoustic timestep (t).
            stateful_decoding: Boolean whether to enable stateful decoding for preservation of state across buffers.
            tdt_search_boundary: The max number of frames that we search between chunks to match the token at boundary.
        rW  N)r   r   r  )r   rc   r   r   r   rJ   rV  r  r   r    r!   r     s   
zBatchedFrameASRTDT.__init__r<  rN   c                 C   s  |    dd t| jD | _t| jD ]\}}| jj| }|du r&tdt|D ]\}}|t	|kr7d}nd}|t	|| | | j
 t	|| | |  }	|t	|| | t	|| | |  }| |	| jj| j\}
}| || jj| j\}}t	|
dkr||k r|dkst	| j| dkrt| j| ||| jd| j|< q*t	| j| dkrt	|dkr| j| d }tt	|
t	| t	|
d }d}t||dD ]}|
| |kr|
|d d } nqt| j| ||| jd| j|< q*qg }t| jD ]}|| | j|  q|S )	r~  c                 S   rX  r    r    r*   r    r    r!   r-     r.   z1BatchedFrameASRTDT.transcribe.<locals>.<listcomp>Nr  r   r   r  r1   )r&  r   r   r   r  r\  r  rM  r   r   r  r  rc   r  r  rV   minr   r>  )r   r<  rN   rQ  rn  r  r  r   r  longer_alignment
longer_idslonger_toksr  r,   id_to_matchstartr   r   r   r    r    r!   rB    s~   
"1zBatchedFrameASRTDT.transcribe)r   r   rU  rI   Fr   )	r   r   r   r   rK   rJ  r   rB  r   r    r    r   r!   r    s(    	r  c                       sN   e Zd ZdZ						dded	ed
ef fddZdedefddZ  Z	S )+LongestCommonSubsequenceBatchedFrameASRRNNTz
    Implements a token alignment algorithm for text alignment instead of middle token alignment.

    For more detail, read the docstring of longest_common_subsequence_merge().
    r   r   r   rI   FNrJ   rV  alignment_basepathc                    s,   t  |||||| d| _d| _|| _dS )ab  
        Args:
            asr_model: An RNNT model.
            frame_len: frame's duration, seconds.
            total_buffer: duration of total audio chunk size, in seconds.
            batch_size: Number of independent audio samples to process at each step.
            max_steps_per_timestep: Maximum number of tokens (u) to process per acoustic timestep (t).
            stateful_decoding: Boolean whether to enable stateful decoding for preservation of state across buffers.
            alignment_basepath: Str path to a directory where alignments from LCS will be preserved for later analysis.
        r   r1   N)r   r   sample_offset	lcs_delayr  )r   rc   r   r   r   rJ   rV  r  r   r    r!   r   d  s   
z4LongestCommonSubsequenceBatchedFrameASRRNNT.__init__r<  rN   c              
   C   s  | j dk r	td|   dd t| jD | _t| jD ]\}}| jj	| }|d u r/tdt|D ]\}}|dkri|t
|d | d  }| || jj| j\}}	t
|dkrht| j| ||| jd| j|< q3| || jj| j\}}	t
|dkr||k r| jd ur| j}
| j| }|}tj|
t|}tj|dd	 tj|d
t| d }|}nd }t| j| || j | j| j|d| j|< q3qg }t| jD ]}|| | j|  q|S )Nr   zYPlease set LCS Delay valus as `(buffer_duration - chunk_duration) / model_stride_in_secs`c                 S   rX  r    r    r*   r    r    r!   r-     r.   zJLongestCommonSubsequenceBatchedFrameASRRNNT.transcribe.<locals>.<listcomp>r  r   r  T)exist_ok
alignment_z.pt)rO   rJ   r%   )r  r   r&  r   r   r   r  r\  r  rM  r   r  rc   r  r  rV   r  r  ospathjoinrH  makedirsrT   rJ   r   r>  )r   r<  rN   rQ  rn  r  r  r   r  r  basepathr  alignment_offsetr  r%   r   r    r    r!   rB  ~  s^   


*z6LongestCommonSubsequenceBatchedFrameASRRNNT.transcribe)r   r   r   rI   FN)
r   r   r   r   rK   rJ  rH  r   rB  r   r    r    r   r!   r  ]  s(    	r  c                   @   s   e Zd ZdZdddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
d ddZd ddZd ddZdd Zd!ddZdd ZdS )"CacheAwareStreamingAudioBufferz
    A buffer to be used for cache-aware streaming. It can load a single or multiple audio
    files/processed signals, split them in chunks and return one on one. It can be used to
    simulate streaming audio or audios.
    NFc                 C   s   || _ d| _d| _d| _d| _|| _|| _t|jt	st
d|jjdu r*|j  |jj| _|jj| _|  | _t|jdrOt|jjdrO|jj | _dS d| _dS )a1  
        Args:
            model: An ASR model.
            online_normalization (bool): whether to perform online normalization per chunk or
            normalize the whole audio before chunking
            pad_and_drop_preencoded (bool): if true pad first audio chunk and always drop preencoded
        Nr   z`The model's encoder is not inherited from StreamingEncoder, and likely not to support streaming!
pre_encodeget_sampling_frames)rO   rL   
buffer_idxstreams_lengthsteppad_and_drop_preencodedonline_normalization
isinstanceencoderr   r   streaming_cfgsetup_streaming_params_feat_ininput_featuresextract_preprocessorra   r`   r  r  sampling_frames)r   rO   r  r  r    r    r!   r     s&   




z'CacheAwareStreamingAudioBuffer.__init__c                 c   s   	 | j | jdkrd S | j dkr*t| jjtr*| jr#| jjd }n| jjd }nt| jjtr7| jjd n| jj}| j dkrXt| jjtrX| jrQ| jjd }n| jjd }nt| jjtre| jjd n| jj}| jd d d d | j | j | f }| j	d ur| j dkrt| j	tr| j	d }nt| j	tr| j	d n| j	}|d|k rd S d }| j dkrt| jj
tr| jr| jj
d }n| jj
d }tj|d| j|f|j|jd}nLt| jj
tr| jj
d }n| jj
}| j | }	|	dk rd}	| jd d d d |	| j f }|d|k r$tj|d|d||d f|j|jd}|d}
tj||fdd}| jrMt|t|dg|d | jd\}}}|d urbtj||fdd}|
|d7 }
| j| j  }||
 }tj|d|dd	}|  j |7  _ |  jd7  _||fV  q)
NTr1   r   r   )r   r]   r+  r   )r  r7   )r  rL   sizer  r  ri   r  r  
shift_sizer  pre_encode_cache_sizer#   rp   r  r   r]   rk  r  r   r   rh   r  clampr  )r   ri   r  audio_chunkcur_sampling_frames
zeros_padscache_pre_encode_num_framescache_pre_encoder  start_pre_encode_cache	added_lenx_meanx_stdmax_chunk_lengthschunk_lengthsr    r    r!   r     s   $





z'CacheAwareStreamingAudioBuffer.__iter__c                 C   s   | j | jdkrdS dS )Nr1   TF)r  rL   r  r   r    r    r!   is_buffer_emptyM  s   z.CacheAwareStreamingAudioBuffer.is_buffer_emptyc                 C   
   t | jS r   )r   rL   r   r    r    r!   r   S     
z&CacheAwareStreamingAudioBuffer.__len__c                 C   s   d | _ d| _d | _d| _d S r   )rL   r  r  r  r   r    r    r!   reset_bufferV  s   
z+CacheAwareStreamingAudioBuffer.reset_bufferc                 C   s   d| _ d| _d S r   )r  r  r   r    r    r!   reset_buffer_pointer\  r   z3CacheAwareStreamingAudioBuffer.reset_buffer_pointerc                 C   s`   t | jj}|jj| _t|jd d|j_	d|j_
| jr"d|j_| j|j}||  S )NFr[   r   r^   )rw   rx   rO   r   ra   rg   rh   r   ry   rz   r{   r  r|   r~   get_model_device)r   rd   ra   r    r    r!   r  `  s   
z3CacheAwareStreamingAudioBuffer.extract_preprocessorr1   c                 C   s$   t |}| ||\}}}|||fS r   )r   append_audio)r   r  	stream_idaudior   r(  r    r    r!   append_audio_filel  s   
z0CacheAwareStreamingAudioBuffer.append_audio_filec                 C   s*   |  |\}}| ||\}}}|||fS r   )preprocess_audioappend_processed_signal)r   r  r  r   r(  r    r    r!   r  q  s
   

z+CacheAwareStreamingAudioBuffer.append_audioc                 C   s  t j|d|jd}|dkr| jd ur|t| jkrtd| jd u r:|dkr,td|| _t j|g|jd| _n{| jd|dkrItd|dk rrt jj	j
| jdd	| _t j| jt jdg| jjdfdd
| _t| jd }| j| | }|| jdkrt jj	j
| jd|| jd fd	| _|| j|d d | j| | j| | f< | j| |d | j|< | jrt|t |g| jd\}}}|||fS )Nr1   r   r   zNot valid stream_id!z7stream_id can not be specified when there is no stream.r   z:Buffer and the processed signal have different dimensions!)r   r   r   r   r   r   )r   r  r   )r#   r   r  r   r  r   r   rL   r   r   r   rk  r  r   rh   )r   r   r  r(  
needed_lenr  r  r    r    r!   r  x  s>    
& 

z6CacheAwareStreamingAudioBuffer.append_processed_signalc                 C   s   | j jS r   )rO   r   r   r    r    r!   r    s   z/CacheAwareStreamingAudioBuffer.get_model_devicec                 C   sX   |d u r|   }t|d|}t|jd g|}| j||d\}}||fS )Nr   r   )r  r#   r   r   r~   r   r   ra   )r   r  r   r   r   r   r(  r    r    r!   r    s   
z/CacheAwareStreamingAudioBuffer.preprocess_audioc                 C   s4   | j }| jrt|t| j| jd\}}}|| jfS )Nr   )rL   r  r   r#   r   r  rh   )r   r   r  r  r    r    r!   get_all_audios  s   

z-CacheAwareStreamingAudioBuffer.get_all_audiosr   )r1   r   )r   r   r   r   r   r   r  r   r  r  r  r  r  r  r  r  r  r    r    r    r!   r    s    
!d


#

r  c                       s   e Zd Zd fdd	Z fddZdefddZd	efd
dZe	
 dddZ	ddee dee defddZdd Zdd Zdd Zdd Z  ZS ) FrameBatchMultiTaskAEDr   c                    s:   t  j||||dd |jjj| _|jjj| _dg| _d S )NFr   r   )r   r   r   ra   rj   r  subsampling_factorchunk_offsetsr   rc   r   r   r   r   r    r!   r     s
   
zFrameBatchMultiTaskAED.__init__c                    s   t    dg| _d S r   )r   rv   r  r   r   r    r!   rv     s   

zFrameBatchMultiTaskAED.resetsamplec                    s   | j jdkrh d}i }n| j jdkr!ddh}dddd	d
dd}n	td| j j  fdd|D }|r?td| d  | D ]\}} || |< qC| j jjdi  | j jjdidgdd }t	j
|t	j| j jddS )Ncanary>   pnctasknamesource_langtarget_langcanary2r  r  r   z<|emo:undefined|>z	<|noitn|>z<|notimestamp|>z<|nodiarize|>z<|pnc|>)decodercontextemotionitnrj  diarizer  zUnknown prompt format: c                    s   g | ]}| vr|qS r    r    )r+   kr  r    r!   r-     s    z;FrameBatchMultiTaskAED.get_input_tokens.<locals>.<listcomp>z4We found sample that is missing the following keys: zWPlease ensure that every utterance in the input manifests contains these keys. Sample: user
spl_tokens)roleslots)turnscontext_idsr]   r   r   )rc   prompt_formatr   RuntimeErroritemsgetpromptencode_dialogPROMPT_LANGUAGE_SLOTr#   r   longr   r   )r   r  expected_slotsdefault_slot_valuesmissing_keysr  vtokensr    r  r!   get_input_tokens  sF   	

z'FrameBatchMultiTaskAED.get_input_tokensr  c                 C   s^   |  || _t|}t|dt|| | jjj f}t	|| j
| j| jjdd}| | d S Nr   F)r   )r  input_tokensr   r   r   rK   rc   r   re   r   r   r}   r   r   )r   r  rN   r!  	meta_datar   r   r    r    r!   r"    s   "z&FrameBatchMultiTaskAED.read_audio_fileFc           
      C   s   | j j}t| jD ]S}|\}}| j|  ||||}}| j|	|
dd}tj|
dg|
d |d }t||d d ||d d d}| j j|dd}	| j|	 ~	q	d S )Nr   r   r  )r  
audio_lens
transcripttranscript_lensr  prompt_lensprompted_transcriptprompted_transcript_lensT)has_processed_signal)rc   r   r-  r  r  extendr=  r~   r  repeatr  r#   r   r  r
   predict_stepr  )
r   r%  r   r   r2  r3  r  
tokens_lenbatch_inputr8  r    r    r!   r$    s*   $
z'FrameBatchMultiTaskAED._get_batch_predsNr<  rN   r%  c                 C   s.   |  | | | j}|s|S td |g fS )V
        unsued params are for keeping the same signature as the parent class
        zZkeep_logits=True is not supported for MultiTaskAEDFrameBatchInfer. Returning empty logits.)r&  _join_hypothesesr  r   r   r<  rN   r%  rA  r    r    r!   rB    s   
z!FrameBatchMultiTaskAED.transcribec                 C   sZ   t |dkr
|d S tjdtg g g g dd}| ||}| ||}| ||}|S )Nr   r   r[   )charwordr   )scorerg  rj  )r   r   
Hypothesisr#   r   
_join_text_join_y_sequence_join_timestamp)r   
hypothesesmerged_hypthesisr    r    r!   r
    s   z'FrameBatchMultiTaskAED._join_hypothesesc                 C   s   d dd |D |_|S )Nr   c                 S   re  r    )textr+   hr    r    r!   r-   5  rb  z5FrameBatchMultiTaskAED._join_text.<locals>.<listcomp>)r  r  r   merged_hypothesisr  r    r    r!   r  4     z!FrameBatchMultiTaskAED._join_textc                 C   s   t dd |D |_|S )Nc                 S   re  r    rf  r  r    r    r!   r-   9  rb  z;FrameBatchMultiTaskAED._join_y_sequence.<locals>.<listcomp>)r#   rk  rg  r  r    r    r!   r  8  r  z'FrameBatchMultiTaskAED._join_y_sequencec                    s   d t |D ])\}} j| 7   fdd|jd D }fdd|D }|jd | qd t |D ])\}} j| 7   fdd|jd D }fdd|D }|jd | q6|S )	Nr   c                    :   g | ]}i ||d   j   |d  j   dqS start_offset
end_offset)r  r  r  r+   r  cumulative_offsetr   r    r!   r-   C  s    z:FrameBatchMultiTaskAED._join_timestamp.<locals>.<listcomp>r  c                    >   g | ]}i ||d   j   j |d  j   j dqS r  r  )r  r   rj   r  r   r   r    r!   r-   O      c                    r  r  r  r+   r   r!  r    r!   r-   `  s    r   c                    r#  r$  r%  r'  r   r    r!   r-   j  r&  )r  r  rj  r  )r   r  r  r   r  updated_timestampsr    r!  r!   r  <  s*   
	

	z&FrameBatchMultiTaskAED._join_timestampr   r   r   rG  NNF)r   r   r   r   rv   dictr  rH  r"  r#   rI  r$  r   rK   rJ  rB  r
  r  r  r  r   r    r    r   r!   r    s&    )	
r  c                       `   e Zd Zd fdd	ZdefddZe ddd	Z	dde	e
 de	e
 defddZ  ZS )FrameBatchChunkedRNNTr   c                       t  j||||dd d S NFr  r   r  r   r    r!   r   y     zFrameBatchChunkedRNNT.__init__r  c                 C   R   t |}t|dt|| | jjj f}t|| j| j	| jj
dd}| | d S r  r  r   r    r    r!   r"  |     "z%FrameBatchChunkedRNNT.read_audio_fileFc           
      C   sx   | j j}t| jD ]0}|\}}||||}}| j ||d\}}| j jj||dd\}}	| j| ~~	~~q	d S )Nr'  F)r*  encoded_lengthsrc  )	rc   r   r-  r  r~   rl  rm  r  r  )
r   r%  r   r   r2  r3  r5  r6  best_hyp_textall_hyp_textr    r    r!   r$    s    

z&FrameBatchChunkedRNNT._get_batch_predsNr<  rN   r%  c                 C   .   |  | d| j}|s|S td |g fS )r	  r   zTkeep_logits=True is not supported for FrameBatchChunkedRNNT. Returning empty logits.r&  r  r  r   r  r    r    r!   rB       
z FrameBatchChunkedRNNT.transcriber)  rG  r*  r   r   r   r   rH  r"  r#   rI  r$  r   rK   rJ  rB  r   r    r    r   r!   r-  x  s    r-  c                       r,  )FrameBatchChunkedCTCr   c                    r.  r/  r   r  r   r    r!   r     r0  zFrameBatchChunkedCTC.__init__r  c                 C   r1  r  r  r   r    r    r!   r"    r2  z$FrameBatchChunkedCTC.read_audio_fileFc                 C   s   | j j}t| jD ]P}|\}}||||}}| j ||d}t|dkr?|\}}| j j|d}	| j jj|	|dd\}
}n|\}	}}| j j	j|	|dd\}
}| j
|
 ~	~~q	d S )Nr'  r0   r)  F)decoder_outputsdecoder_lengthsrc  )rc   r   r-  r  r~   r   r.  ctc_decodingctc_decoder_predictions_tensorrl  r  r  )r   r%  r   r   r2  r3  resultsr5  r6  r7  transcribed_textsr,   r8  r    r    r!   r$    s0   

z%FrameBatchChunkedCTC._get_batch_predsNr<  rN   r%  c                 C   r6  )r	  r   zSkeep_logits=True is not supported for FrameBatchChunkedCTC. Returning empty logits.r7  r  r    r    r!   rB    r8  zFrameBatchChunkedCTC.transcriber)  rG  r*  r9  r    r    r   r!   r:    s    r:  c                   @   sh   e Zd ZU eed< eed< eed< defddZdedd fdd	Zd
ededd defddZdd Z	dS )ContextSizeleftr   rightreturnc                 C      | j | j | j S zTotal context sizerB  r   rC  r   r    r    r!   total     zContextSize.totalfactorc                 C   s    t | j| | j| | j| dS )h
        Subsample context size by factor

        Args:
            factor: subsampling factor
        rG  )rA  rB  r   rC  r   rJ  r    r    r!   	subsample  s
   zContextSize.subsample
num_framesis_last_chunkexpected_contextc                 C   s   ||j |j krtd| d| |  j| j 7  _d| _ |  j|7  _|r.| j| _ d| _n|j | _ |  j|j 8  _t|  |  d}|  j|8  _|sV| j|jksVJ |S )z
        Add frames to context size
        Args:
            num_frames: number of frames to add
            is_last_chunk: if last chunk

        Returns:
            number of frames removed from the left side
        zAdded chunk length z2 is larger than expected chunk with right context r   )r   rC  r   rB  r7   rH  )r   rN  rO  rP  extra_samplesr    r    r!   add_frames_get_removed_  s&   
z#ContextSize.add_frames_get_removed_c                 C   s   d| j  d| j d| j S )NzLeft z	 - Chunk z	 - Right rG  r   r    r    r!   __str__  s   zContextSize.__str__N)
r   r   r   rK   __annotations__rH  rM  rJ  rR  rS  r    r    r    r!   rA    s   
  rA  c                   @   sl   e Zd ZU dZejed< ejed< ejed< dejfddZdedd fd	d
Z	dejdejddfddZ
dS )ContextSizeBatchzBatched context sizerB  r   rC  rD  c                 C   rE  rF  rG  r   r    r    r!   rH  $  rI  zContextSizeBatch.totalrJ  c                 C   s8   t tj| j|ddtj| j|ddtj| j|dddS )rK  floor)rounding_moderG  )rU  r#   divrB  r   rC  rL  r    r    r!   rM  (  s
   zContextSizeBatch.subsamplenum_frames_batchis_last_chunk_batchrP  rA  c                 C   s   |  j | j7  _ | jd |  j|7  _t|| j|j| _t|d| j|j | _t| jdk| j d| _ t|  |  t| j }|  j |8  _ t| j dk t| j | j | _ dS )z
        Add frames to context size
        Args:
            num_frames_batch: number of frames to add
            is_last_chunk_batch: if last chunk

        Returns:
            number of frames removed from the left side
        r   N)	rB  r   fill_rC  r#   wheremaximumrH  
zeros_like)r   rY  rZ  rP  rQ  r    r    r!   add_frames_5  s    $zContextSizeBatch.add_frames_N)r   r   r   r   r#   r   rT  rH  rK   rM  r_  r    r    r    r!   rU    s   
 


rU  c                	   @   sR   e Zd ZdZdededejdeje	B fddZ
dejd	ejd
edejfddZdS )StreamingBatchedAudioBufferzaBatched audio buffer with strict context management for streaming inference without left padding.r   context_samplesr]   r   c                 C   sv   || _ || _tj|dg||d| _tdddd| _ttj|gtj|dtj|gtj|dtj|gtj|dd| _	dS )z
        Init batched audio buffer for streaming inference
        Args:
            batch_size: batch size
            context_samples: context size
            dtype: buffer dtype
            device: device for buffer
        r   r  rG  N)
r   rP  r#   rp   r   rA  context_sizerU  r  context_size_batch)r   r   ra  r]   r   r    r    r!   r   S  s   	z$StreamingBatchedAudioBuffer.__init__audio_batchr   rO  rZ  c                 C   st   |j d }tj| j|fdd| _| jj||| jd}| jj||| jd |dkr8| jdd|df 	 | _dS dS )a   
        Add audio batch to buffer

        Args:
            audio_batch: chunk with audio
            audio_lengths: length of audio
            is_last_chunk: if last chunk
            is_last_chunk_batch: if last chunk for each audio utterance
        r   r  )rO  rP  )rY  rZ  rP  r   N)
r   r#   rk  r   rb  rR  rP  rc  r_  r   )r   rd  r   rO  rZ  added_chunk_lengthextra_samples_in_bufferr    r    r!   add_audio_batch_f  s   
 z,StreamingBatchedAudioBuffer.add_audio_batch_N)r   r   r   r   rK   rA  r#   r]   r   rH  r   r   rJ  rg  r    r    r    r!   r`  P  s    "r`  >  	file_pathre   rD  c                 C   s&   t j| |d\}}tj|tjd|fS )zLoad audio from file)rf   r\   )librosaloadr#   r   rs   )ri  re   r  rf   r    r    r!   
load_audio  s   rl  c                   @   s>   e Zd ZU ejed< ejed< edeej dd fddZdS )
AudioBatchaudio_signalsaudio_signal_lengthsrd  rD  c                 C   s<   t dd | D ddd}tdd | D  }t||dS )z0
        Collate audio signals to batch
        c                 S   r  r    r    r+   audio_tensorr    r    r!   r-     r.   z)AudioBatch.collate_fn.<locals>.<listcomp>Tr[   )batch_firstpadding_valuec                 S   s   g | ]}|j d  qS r)   )r   rp  r    r    r!   r-     s    )rn  ro  )r   r#   r   r  rm  )rd  rn  ro  r    r    r!   r    s   zAudioBatch.collate_fnN)	r   r   r   r#   r   rT  staticmethodr  r  r    r    r    r!   rm    s   
 

rm  c                       sP   e Zd ZdZddeeeB  def fddZdede	j
fd	d
Zdd Z  ZS )SimpleAudioDatasetz;Dataset constructed from audio filenames. Each item - audiorh  audio_filenamesre   c                    s   t    || _|| _d S r   )r   r   rv  re   )r   rv  re   r   r    r!   r     s   

zSimpleAudioDataset.__init__r   rD  c                 C   s   t | j| \}}|S r   )rl  rv  )r   r   r  r,   r    r    r!   __getitem__  s   zSimpleAudioDataset.__getitem__c                 C   r  r   )r   rv  r   r    r    r!   r     r  zSimpleAudioDataset.__len__rh  )r   r   r   r   r  rH  r   rK   r   r#   r   rw  r   r   r    r    r   r!   ru    s
     ru  r   )rI   Nrx  )Drw   r  dataclassesr   pathlibr   typingr   r   rj  r1  r   r#   	omegaconfr   torch.nn.utils.rnnr   torch.utils.datar   r	   7nemo.collections.asr.data.audio_to_text_lhotse_promptedr
   nemo.collections.asr.modelsr   +nemo.collections.asr.parts.mixins.streamingr   1nemo.collections.asr.parts.preprocessing.featuresr   0nemo.collections.asr.parts.preprocessing.segmentr    nemo.collections.asr.parts.utilsr   nemo.core.classesr   nemo.core.neural_typesr   r   r   r6   r"   r'   rH   rK   rH  rT   rV   rW   r   r   r   r   r  rK  rT  r  r  r  r  r-  r:  rA  rU  r`  r   r   rl  rm  ru  r    r    r    r!   <module>   sl   

 \#}(! }    fd s F09:3&7