o
    ei                     @   s8   d Z ddlZddlmZ ddlmZ G dd deZdS )aa   Specifies the inference interfaces for Spoken Language Understanding (SLU) modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)EncoderDecoderASR)
Pretrainedc                       sP   e Zd ZdZddgZddgZ fddZdd	 Zd
d Zdd Z	dd Z
  ZS )EndToEndSLUa)  An end-to-end SLU model.

    The class can be used either to run only the encoder (encode()) to extract
    features or to run the entire model (decode()) to map the speech to its semantics.

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> from speechbrain.inference.SLU import EndToEndSLU
    >>> tmpdir = getfixture("tmpdir")
    >>> slu_model = EndToEndSLU.from_hparams(
    ...     source="speechbrain/slu-timers-and-such-direct-librispeech-asr",
    ...     savedir=tmpdir,
    ... )  # doctest: +SKIP
    >>> slu_model.decode_file("tests/samples/single-mic/example6.wav") # doctest: +SKIP
    "{'intent': 'SimpleMath', 'slots': {'number1': 37.67, 'number2': 75.7, 'op': ' minus '}}"
    	tokenizerasr_model_sourceslu_encbeam_searcherc                    s:   t  j|i | | jj| _tj| jjd| jid| _d S )Ndevice)sourcerun_opts)	super__init__hparamsr   r   from_hparamsr   r	   	asr_model)selfargskwargs	__class__ W/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/inference/SLU.pyr   2   s   
zEndToEndSLU.__init__c                 K   sL   | j |fi |}|| j}|d}tdg}| ||\}}|d S )am  Maps the given audio file to a string representing the
        semantic dictionary for the utterance.

        Arguments
        ---------
        path : str
            Path to audio file to decode.
        **kwargs : dict
            Arguments forwarded to ``load_audio``.

        Returns
        -------
        str
            The predicted semantics.
        r   g      ?)
load_audiotor	   	unsqueezetorchtensordecode_batch)r   pathr   waveformbatch
rel_lengthpredicted_wordspredicted_tokensr   r   r   decode_file:   s   
zEndToEndSLU.decode_filec                 C   sD   |  }|| j|| j}}| j| |}| j|}|S )a`  Encodes the input audio into a sequence of hidden states

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model.
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        torch.Tensor
            The encoded batch
        )floatr   r	   r   encode_batchdetachmodsr   )r   wavswav_lensASR_encoder_outencoder_outr   r   r   r&   R   s
   zEndToEndSLU.encode_batchc                    s   t  2 | j| j}} ||} j||\}}}} fdd|D }W d   ||fS 1 s9w   Y  ||fS )a  Maps the input audio to its semantics

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels] or [batch, time]
            depending on the model.
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        list
            Each waveform in the batch decoded.
        tensor
            Each predicted token id.
        c                    s   g | ]} j |qS r   )r   
decode_ids).0	token_seqr   r   r   
<listcomp>   s    
z,EndToEndSLU.decode_batch.<locals>.<listcomp>N)r   no_gradr   r	   r&   r(   r   )r   r)   r*   r,   r#   scores_r"   r   r0   r   r   k   s   




zEndToEndSLU.decode_batchc                 C   s   |  ||S )z8Runs full decoding - note: no gradients through decoding)r   )r   r)   r*   r   r   r   forward   s   zEndToEndSLU.forward)__name__
__module____qualname____doc__HPARAMS_NEEDEDMODULES_NEEDEDr   r$   r&   r   r5   __classcell__r   r   r   r   r      s    !r   )r9   r   speechbrain.inference.ASRr    speechbrain.inference.interfacesr   r   r   r   r   r   <module>   s
    