o
    wi                  	   @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' eee( ee" eee(  eee"  f Z)eee eee  ee eee  ee(ee f f Z*eG dd dZ+eG dd dZ,dd Z-G dd deZ.G dd deZ/G dd de/Z0dS )    N)ABCabstractmethod)	dataclasspartial)AnyDictListOptionalTupleUnion)
DictConfig)
DataLoaderDataset)tqdm)process_augmentations)AudioSegmentChannelSelectorType)manifest_utils)
Hypothesis)move_data_to_device)logginglogging_modec                   @   s   e Zd ZU dZeej ed< dZeej ed< dZe	ed< dZ
ee ed< dZeed< d	Zeed
< dZee ed< dZee ed< dS )InternalTranscribeConfigNdevicedtypeFtraining_modelogging_level        dither_valuer   pad_to_valuetemp_dirmanifest_filepath)__name__
__module____qualname__r   r
   torch__annotations__r   r   boolr   r   r   floatr    intr!   strr"    r,   r,   l/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/parts/mixins/transcription.pyr   (   s   
 r   c                   @   s   e Zd ZU dZeed< dZeed< dZe	e ed< dZ
eed< dZe	e ed< dZe	e ed	< d
Zeed< dZe	ee  ed< dZe	e ed< dS )TranscribeConfig   
batch_sizeFreturn_hypothesesNnum_workerschannel_selector	augmentor
timestampsTverbosepartial_hypothesis	_internal)r#   r$   r%   r0   r*   r'   r1   r(   r2   r
   r3   r   r4   r   r5   r6   r7   r	   r   r8   r   r,   r,   r,   r-   r.   9   s   
 r.   c              	   C   s6   t | |r
t| |S td| d| d|  d |S )a  
    Utility function to get a value from the transcription config.
    If the value is not present in the transcription config, the default value is returned.

    Args:
        trcfg: A dataclass that represents the transcription config.
        key: The name of the arg to retrieve.
        default: The default value to return if the key is not present in the transcription config.

    Returns:
        The value of the key in the transcription config or the default value.
    zUsing default value of z for zG because it is not present                 in the transcription config .)hasattrgetattrr   debug)trcfgkeydefaultr,   r,   r-   #get_value_from_transcription_configI   s   

r@   c                       sB   e Zd Zdeeef f fddZdd Zdd Zdd	 Z	  Z
S )
TranscriptionTensorDatasetconfigc                    sj   t    |d | _|d | _|dd | _|d | _| jd ur*t| jddd| _nd | _t	| j| _
d S )Naudio_tensorsr3   r4   sample_rater      )global_rank
world_size)super__init__rC   r3   getaugmentor_cfgrD   r   r4   lenlengthselfrB   	__class__r,   r-   rI   a   s   




z#TranscriptionTensorDataset.__init__c                 C   s*   || j krtd| d| j  | |S )NzIndex z" out of range for dataset of size )rM   
IndexErrorget_item)rO   indexr,   r,   r-   __getitem__o   s   

z&TranscriptionTensorDataset.__getitem__c                 C   s   | j S )N)rM   )rO   r,   r,   r-   __len__u   s   z"TranscriptionTensorDataset.__len__c                 C   s   | j | }| jd ur8tjdtjd |j}|jdtj	d
 }t|| j| j| jd}| j|}tj|j|d}tj|jd tjd}||d d fS )NzAudio Augmentations are being applied during inference by moving the tensor onto CPU. This is highly inefficient and therefore not recommended.modecpu)r   r   )	target_srr3   )r   r   )rC   r4   r   warningr   ONCEr   tor&   float32numpyr   rD   r3   perturbtensorsamplesshapelong)rO   rT   rb   original_dtypesegmentseq_lenr,   r,   r-   rS   x   s   

z#TranscriptionTensorDataset.get_item)r#   r$   r%   r   r+   r   rI   rU   rV   rS   __classcell__r,   r,   rP   r-   rA   `   s
    rA   c                   @   s`  e Zd ZdZe 								d-deeee e	j
ef ded	ed
edee dededee dee defddZdee fddZ	 defddZdefddZdeee	j
ejf  dedefddZedee dededeeef fdd Zed!edefd"d#Zed$edefd%d&Zededefd'd(Zdefd)d*Z d!ededefd+d,Z!dS ).TranscriptionMixina  
    An abstract class for transcribe-able models.

    Creates a template function `transcribe()` that provides an interface to perform transcription of audio tensors or
    filepaths.

    The following abstract classes must be implemented by the subclass:

        - `_transcribe_input_manifest_processing()`:
            Process the provided input arguments (filepaths only) and return a
            config dict for the dataloader. The data loader is should generally operate on NeMo manifests.

        - `_setup_transcribe_dataloader()`:
            Setup the dataloader for transcription. Receives the output from
            `_transcribe_input_manifest_processing()`.

        - `_transcribe_forward()`:
            Implements the model's custom forward pass to return outputs that are processed by
            `_transcribe_output_processing()`.

        - `_transcribe_output_processing()`:
            Implements the post processing of the model's outputs to return the results to
            the user. The result can be a list of objects, list of list of objects, tuple of objects, tuple of list of
            objects, or a dict of list of objects.

    r/   Fr   NTaudior0   r1   r2   r3   r4   r6   r5   override_configreturnc
              
   K   s  |	du rt d|||||||d|
}nt|	dstd|	jdu r&t |	_|	}|jdu r2t |_n
t|jts<tdd}z| j||d}|D ]}t|tr[|du rUg }|| qHt|t	rx|du rg|}qH|
 D ]\}}|| | qkqHt|tr|du rtdd |D }t|d	 trt|D ]\}}|| | qqHt|t|krtd
t| dt| dt|D ]\}}|| | qqHtdW |S  ty   Y |S w )a"
  
        Template function that defines the execution strategy for transcribing audio.

        Args:
            audio: (a single or list) of paths to audio files or a np.ndarray audio array.
                Can also be a dataloader object that provides values that can be consumed by the model.
                Recommended length per file is between 5 and 25 seconds.
                But it is possible to pass a few hours long file if enough GPU memory is available.
            batch_size: (int) batch size to use during inference.
                Bigger will result in better throughput performance but would use more memory.
            return_hypotheses: (bool) Either return hypotheses or text
                With hypotheses can do some postprocessing like getting timestamp or rescoring
            num_workers: (int) number of workers for DataLoader
            channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from
                multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set
                to `None`. Defaults to `None`. Uses zero-based indexing.
            augmentor: (DictConfig): Augment audio samples during transcription if augmentor is applied.
            verbose: (bool) whether to display tqdm progress bar
            timestamps: Optional(Bool): timestamps will be returned if set to True as part of hypothesis object
                (output.timestep['segment']/output.timestep['word']). Refer to `Hypothesis` class for more details.
                Default is None and would retain the previous state set by using self.change_decoding_strategy().
            override_config: (Optional[TranscribeConfig]) override transcription config pre-defined by the user.
                **Note**: All other arguments in the function will be ignored if override_config is passed.
                You should call this argument as `model.transcribe(audio, override_config=TranscribeConfig(...))`.
            **config_kwargs: (Optional[Dict]) additional arguments to override the default TranscribeConfig.
                Note: If override_config is passed, these arguments will be ignored.

        Returns:
            Output is defined by the subclass implementation of `TranscriptionMixin._transcribe_output_processing()`.
            It can be:

                - List[str/Hypothesis]

                - List[List[str/Hypothesis]]

                - Tuple[str/Hypothesis]

                - Tuple[List[str/Hypothesis]]

                - Dict[str, List[str/Hypothesis]]
        N)r0   r1   r2   r3   r4   r6   r5   r8   `transcribe_cfg must have an `_internal` argument, which must be of an object of type InternalTranscribeConfig or its subclass.``transcribe_cfg._internal` must be of an object of type InternalTranscribeConfig or its subclass)rk   c                 S   s   g | ]}g qS r,   r,   ).0_r,   r,   r-   
<listcomp>"  s    z1TranscriptionMixin.transcribe.<locals>.<listcomp>r   z&The number of elements in the result (z3) does not match the results of the current batch (z).zGiven output result for transcription is not supported. Please return a list of results, list of list of results, a dict of list of results, or a tuple of list of results.r,   )r.   r:   
ValueErrorr8   r   
isinstancetranscribe_generatorlistextenddictitemstuple	enumeraterL   RuntimeErrorappendNotImplementedErrorStopIteration)rO   rj   r0   r1   r2   r3   r4   r6   r5   rk   config_kwargstranscribe_cfgresults	generatorprocessed_outputskviprocessed_outputr,   r,   r-   
transcribe   s   8






/zTranscriptionMixin.transcribec           
   	   c   s2   |du rt  }t|dstd|jdu rt |_n
t|jts%td|}zk| || t I}||j_	t|t
sC| ||}n|}t|drN|j}nd}t|d| dD ]}t||jj}| ||}| ||}	~~|	V  ~	qXW d   n1 s~w   Y  W | | dS W | | dS | | w )	z?
        A generator version of `transcribe` function.
        Nr8   rm   rn   r6   TTranscribing)descdisable)r.   r:   rr   r8   r   rs   _transcribe_on_begintempfileTemporaryDirectoryr!   r   _transcribe_input_processingr6   r   r   r   _transcribe_forward_transcribe_output_processing_transcribe_on_end)
rO   rj   rk   r   tmpdir
dataloaderr6   
test_batchmodel_outputsr   r,   r,   r-   rt   ?  sH   





z'TranscriptionMixin.transcribe_generatorr=   c                 C   sZ  |du ri S t |ttjtjfr|g}t |tr t|dkr i S t| 	 }|j
jdu r1|j|j
_|j
jdu r<|j|j
_t|ddd}|du rWt|ddd}t|t d }t|dr_||_| j|j
_t| drt| jd	rt| jjd
r| jjj|j
_d| jj_t| jd	rt| jjdr| jjj|j
_d| jj_|   t |j
_ttj  dS )+  
        Internal function to setup the model for transcription. Perform all setup and pre-checks here.

        Args:
            audio: Of type `GenericTranscriptionType`
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.
        Nr   r2   )r?   r0   r/   rE   preprocessor
featurizerditherr   pad_to)!rs   r+   npndarrayr&   Tensorru   rL   next
parametersr8   r   r   r@   minos	cpu_countr:   r2   trainingr   r   r   r   r   r   r    evalr   get_verbosityr   set_verbosityWARNING)rO   rj   r=   _paramsr2   _batch_sizer,   r,   r-   r     s8   






z'TranscriptionMixin._transcribe_on_beginc                 C   s"  t |ttfrt|dkrtdn|g}t |d trSt|dkr)|d ds0|d dr=|d |j_t	
|d }t|}|jj}| |||}| |}|S t |d tjtjfrt|}tdd |D rrdd |D }|jj}| |||}| ||}|S td	t|d  d
)a  
        Internal function to process the input audio data and return a DataLoader. This function is called by
        `transcribe()` and `transcribe_generator()` to setup the input data for transcription.

        Args:
            audio: Of type `GenericTranscriptionType`
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A DataLoader object that is used to iterate over the input audio data.
        r   zInput `audio` is emptyrE   z.jsonz.jsonlc                 S   s   g | ]}t |tjqS r,   )rs   r   r   )ro   _tensorr,   r,   r-   rq     s    zCTranscriptionMixin._transcribe_input_processing.<locals>.<listcomp>c                 S   s&   g | ]}t |tjrt|n|qS r,   )rs   r   r   r&   	as_tensor)ro   audio_tensorr,   r,   r-   rq     s    Input `audio` is of type z[. Only `str` (path to audio file), `np.ndarray`, and `torch.Tensor` are supported as input.)rs   ru   ry   rL   rr   r+   endswithr8   r"   r   read_manifestr!   %_transcribe_input_manifest_processing_setup_transcribe_dataloaderr   r   r&   r   any#_transcribe_input_tensor_processing#_setup_transcribe_tensor_dataloadertype)rO   rj   r=   audio_filestmp_dir	ds_configtemp_dataloaderrC   r,   r,   r-   r     s6   (
z/TranscriptionMixin._transcribe_input_processingrC   r!   c                 C   s   d}t | drd| jv r| jj}nt | dr| j}|du r!td|t|dd|t|ddt|d	d|d
}t|dd}|rB||d< |S )aE  
        Internal function to process the input audio tensors and return a config dict for the dataloader.

        Args:
            audio_tensors: A list of numpy or torch tensors. The user must ensure that they satisfy the correct
                sample rate and channel format.
            temp_dir: A temporary directory to store intermediate files.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A config dict that is used to setup the dataloader for transcription.
        NcfgrD   zProvided `audio` data contains numpy or torch tensors, however the class does not have `sample_rate` attribute. Please set `sample_rate` attribute to the model explicitly.r0   r/   r2   r   r3   )rC   r0   r!   r2   r3   rD   r4   )r:   r   rD   r{   r@   )rO   rC   r!   r=   rD   r   r4   r,   r,   r-   r     s(   




	z6TranscriptionMixin._transcribe_input_tensor_processingr   c                 C      dS )a  
        Internal function to process the input audio filepaths and return a config dict for the dataloader.

        Args:
            audio_files: A list of string filepaths for audio files, or a single string filepath for a manifest file.
            temp_dir: A temporary directory to store intermediate files.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A config dict that is used to setup the dataloader for transcription.
        Nr,   )rO   r   r!   r=   r,   r,   r-   r        z8TranscriptionMixin._transcribe_input_manifest_processingrB   c                 C   r   )a  
        Internal function to setup the dataloader for transcription. This function is called by
        `transcribe()` and `transcribe_generator()` to setup the input data for transcription.

        Args:
            config: A config dict that is used to setup the dataloader for transcription. It can be generated either
                by `_transcribe_input_manifest_processing()` or `_transcribe_input_tensor_processing()`.

        Returns:
            A DataLoader object that is used to iterate over the input audio data.
        Nr,   rN   r,   r,   r-   r   0  s   z/TranscriptionMixin._setup_transcribe_dataloaderbatchc                 C   r   )au  
        Internal function to perform the model's custom forward pass to return outputs that are processed by
        `_transcribe_output_processing()`.
        This function is called by `transcribe()` and `transcribe_generator()` to perform the model's forward pass.

        Args:
            batch: A batch of input data from the data loader that is used to perform the model's forward pass.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            The model's outputs that are processed by `_transcribe_output_processing()`.
        Nr,   )rO   r   r=   r,   r,   r-   r   ?  s   z&TranscriptionMixin._transcribe_forwardc                 C   r   )a  
        Internal function to process the model's outputs to return the results to the user. This function is called by
        `transcribe()` and `transcribe_generator()` to process the model's outputs.

        Args:
            outputs: The model's outputs that are processed by `_transcribe_forward()`.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            The output can be a list of
            objects, list of list of objects, tuple of objects, tuple of list of objects, or a dict of list of objects.
            Its type is defined in `TranscriptionReturnType`.
        Nr,   )rO   outputsr=   r,   r,   r-   r   O  r   z0TranscriptionMixin._transcribe_output_processingc                 C   s   | j |jjd t| dr5t| jdr!t| jjdr!|jj| jj_t| jdr5t| jjdr5|jj| jj_	|jj
durDt|jj
 dS dS )
        Internal function to teardown the model after transcription. Perform all teardown and post-checks here.

        Args:
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.
        rW   r   r   r   r   N)trainr8   r   r:   r   r   r   r   r    r   r   r   r   rO   r=   r,   r,   r-   r   `  s   
z%TranscriptionMixin._transcribe_on_endc                 C   s   t |}ddlm} t| drt| jdr| jj}nt| dr,td| j	 | j	}ntd d}t
|d|d	 |d
 ddt||ddS )a  
        Internal function to setup the dataloader for transcription. This function is called by
        `transcribe()` and `transcribe_generator()` to setup the input data for transcription.

        Args:
            config: A config dict that is used to setup the dataloader for transcription. It can be generated either
                by `_transcribe_input_manifest_processing()` or `_transcribe_input_tensor_processing()`.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A DataLoader object that is used to iterate over the input audio data.
        r   )_speech_collate_fn	tokenizerpad_idtranscribe_pad_idz:Pad id is explicitly set to `model.transcribe_pad_id` = {}zPad id is being set to 0 because it could not be resolved from the tokenizer. This can happen for various reasons, especially for character based models. If pad id is incorrect, please provide the pad id explicitly by setting `model.transcribe_pad_id`.Fr0   r2   )r   )datasetshuffler0   r2   
pin_memory	drop_last
collate_fn)rA   'nemo.collections.asr.data.audio_to_textr   r:   r   r   r   infoformatr   r   r   )rO   rB   r=   r   r   r   r,   r,   r-   r   t  s(   


z6TranscriptionMixin._setup_transcribe_tensor_dataloader)r/   Fr   NNTNN)"r#   r$   r%   __doc__r&   inference_moder   r+   r	   r   r   r   r*   r(   r
   r   r   r.   GenericTranscriptionTyper   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r,   r,   r,   r-   ri      s|    	
 A79
+
ri   c                
       sp   e Zd ZdZdee dededeeef fddZ	def fdd	Z
def fd
dZedefddZ  ZS )ASRTranscriptionMixina  
    An abstract class for ASR models that can transcribe audio. This class is a subclass of `TranscriptionMixin` that
    implements the default implementation of common abstract methods among the speech recognition model classes.

    The following abstract classes must be implemented by the subclass:

        - _transcribe_forward():
            Implements the model's custom forward pass to return outputs that are processed by
            `_transcribe_output_processing()`.

        - _transcribe_output_processing():
            Implements the post processing of the model's outputs to return the results to
            the user. The result can be a list of objects, list of list of objects, tuple of objects, tuple of list of
    r   r!   r=   rl   c           	   
   C   s   t tj|dddd<}|D ]1}t|tr'|ddd}|t|d  qt|t	r7|t|d  qt
d	t| d
W d   n1 sKw   Y  |t|dd|t|ddt|ddt|ddt|ddd}t|dd}|rz||d< |S )aB  
        Internal function to process the input audio filepaths and return a config dict for the dataloader.
        Specializes to ASR models which can have Encoder-Decoder-Joint architectures.

        Args:
            audio_files: A list of string filepaths for audio files.
            temp_dir: A temporary directory to store intermediate files.
            trcfg: The transcription config dataclass. Subclasses can change this to a different dataclass if needed.

        Returns:
            A config dict that is used to setup the dataloader for transcription.
        zmanifest.jsonwzutf-8)encodingi  )audio_filepathdurationtext
r   zC. Only `str` (path to audio file) or `dict` are supported as input.Nr0   r/   r2   r   r3   
text_fieldr   
lang_fieldlang)paths2audio_filesr0   r!   r2   r3   r   r   r4   )openr   pathjoinrs   r+   writejsondumpsrw   rr   r   r@   )	rO   r   r!   r=   fp
audio_fileentryr   r4   r,   r,   r-   r     s0   







z;ASRTranscriptionMixin._transcribe_input_manifest_processingc                    sR   t  || t| dr| j  t| dr| j  t| dr'| j  dS dS )r   encoderdecoderjointN)rH   r   r:   r   freezer   r   )rO   rj   r=   rP   r,   r-   r     s   




z*ASRTranscriptionMixin._transcribe_on_beginc                    s\   t  | t| dr| jjdd t| dr| jjdd t| dr,| jjdd dS dS )r   r   Tr   r   r   N)rH   r   r:   r   unfreezer   r   r   rP   r,   r-   r     s   


z(ASRTranscriptionMixin._transcribe_on_endc                 C   s   t  S )z
        Utility method that returns the default config for transcribe() function.

        Returns:
            A dataclass
        )r.   )clsr,   r,   r-   get_transcribe_config  s   z+ASRTranscriptionMixin.get_transcribe_config)r#   r$   r%   r   r	   r+   r.   r   r   r   r   r   classmethodr   rh   r,   r,   rP   r-   r     s    

,r   )1r   r   r   abcr   r   dataclassesr   	functoolsr   typingr   r   r	   r
   r   r   r_   r   r&   	omegaconfr   torch.utils.datar   r   r   0nemo.collections.asr.parts.preprocessing.perturbr   0nemo.collections.asr.parts.preprocessing.segmentr   r    nemo.collections.asr.parts.utilsr   +nemo.collections.asr.parts.utils.rnnt_utilsr   "nemo.collections.common.data.utilsr   
nemo.utilsr   r   r+   TranscriptionReturnTyper   r   r.   r@   rA   ri   r   r,   r,   r,   r-   <module>   s@    (62    