o
    7wi[                     @   s,   d Z ddlZddlmZ G dd deZdS )aU   Specifies the inference interfaces for Speech Translation (ST) modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)
Pretrainedc                       sN   e Zd ZdZdgZddgZ fddZdd Zd	d
 Zdd Z	dd Z
  ZS )EncoderDecoderS2UTa"  A ready-to-use Encoder Decoder for speech-to-unit translation model

    The class can be used  to  run the entire encoder-decoder S2UT model
    (translate_file()) to translate speech. The given YAML must contains the fields
    specified in the *_NEEDED[] lists.

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> from speechbrain.inference.ST import EncoderDecoderS2UT
    >>> tmpdir = getfixture("tmpdir")
    >>> s2ut_model = EncoderDecoderS2UT.from_hparams(source="speechbrain/s2st-transformer-fr-en-hubert-l6-k100-cvss", savedir=tmpdir) # doctest: +SKIP
    >>> s2ut_model.translate_file("speechbrain/s2st-transformer-fr-en-hubert-l6-k100-cvss/example-fr.wav") # doctest: +SKIP
    sample_rateencoderdecoderc                    s    t  j|i | | jj| _d S )N)super__init__hparamsr   )selfargskwargs	__class__ U/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/speechbrain/inference/ST.pyr   .   s   zEncoderDecoderS2UT.__init__c                 C   s@   |  |}|| j}|d}tdg}| ||}|d S )a6  Translates the given audiofile into a sequence speech unit.

        Arguments
        ---------
        path : str
            Path to audio file which to translate.

        Returns
        -------
        int[]
            The audiofile translation produced by this speech-to-unit translationmodel.
        r   g      ?)
load_audiotodevice	unsqueezetorchtensortranslate_batch)r
   pathaudiobatch
rel_lengthpredicted_tokensr   r   r   translate_file2   s   

z!EncoderDecoderS2UT.translate_filec                 C   s4   |  }|| j|| j}}| j||}|S )a  Encodes the input audio into a sequence of hidden states

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = EncoderDecoderS2UT.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        torch.tensor
            The encoded batch
        )floatr   r   modsr   )r
   wavswav_lensencoder_outr   r   r   encode_batchH   s   zEncoderDecoderS2UT.encode_batchc                 C   s\   t    || j}| ||}| j||\}}}}W d   |S 1 s'w   Y  |S )aM  Translates the input audio into a sequence of words

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = EncoderDecoderS2UT.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        list
            Each waveform in the batch translated.
        tensor
            Each predicted token id.
        N)r   no_gradr   r   r#   r   r   )r
   r    r!   r"   r   _r   r   r   r   d   s   

z"EncoderDecoderS2UT.translate_batchc                 C   s   |  ||S )zRuns full translation)r#   )r
   r    r!   r   r   r   forward   s   zEncoderDecoderS2UT.forward)__name__
__module____qualname____doc__HPARAMS_NEEDEDMODULES_NEEDEDr   r   r#   r   r&   __classcell__r   r   r   r   r      s    r   )r*   r    speechbrain.inference.interfacesr   r   r   r   r   r   <module>   s    