o
    %Ý«i[  ã                   @   s,   d Z ddlZddlmZ G dd„ deƒZdS )aU   Specifies the inference interfaces for Speech Translation (ST) modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
é    N)Ú
Pretrainedc                       sN   e Zd ZdZdgZddgZ‡ fdd„Zdd„ Zd	d
„ Zdd„ Z	dd„ Z
‡  ZS )ÚEncoderDecoderS2UTa"  A ready-to-use Encoder Decoder for speech-to-unit translation model

    The class can be used  to  run the entire encoder-decoder S2UT model
    (translate_file()) to translate speech. The given YAML must contains the fields
    specified in the *_NEEDED[] lists.

    Arguments
    ---------
    *args : tuple
    **kwargs : dict
        Arguments are forwarded to ``Pretrained`` parent class.

    Example
    -------
    >>> from speechbrain.inference.ST import EncoderDecoderS2UT
    >>> tmpdir = getfixture("tmpdir")
    >>> s2ut_model = EncoderDecoderS2UT.from_hparams(source="speechbrain/s2st-transformer-fr-en-hubert-l6-k100-cvss", savedir=tmpdir) # doctest: +SKIP
    >>> s2ut_model.translate_file("speechbrain/s2st-transformer-fr-en-hubert-l6-k100-cvss/example-fr.wav") # doctest: +SKIP
    Úsample_rateÚencoderÚdecoderc                    s    t ƒ j|i |¤Ž | jj| _d S )N)ÚsuperÚ__init__Úhparamsr   )ÚselfÚargsÚkwargs©Ú	__class__© úL/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/inference/ST.pyr   .   s   zEncoderDecoderS2UT.__init__c                 C   s@   |   |¡}| | j¡}| d¡}t dg¡}|  ||¡}|d S )a6  Translates the given audiofile into a sequence speech unit.

        Arguments
        ---------
        path : str
            Path to audio file which to translate.

        Returns
        -------
        int[]
            The audiofile translation produced by this speech-to-unit translationmodel.
        r   g      ð?)Ú
load_audioÚtoÚdeviceÚ	unsqueezeÚtorchÚtensorÚtranslate_batch)r
   ÚpathÚaudioÚbatchÚ
rel_lengthÚpredicted_tokensr   r   r   Útranslate_file2   s   

z!EncoderDecoderS2UT.translate_filec                 C   s4   |  ¡ }| | j¡| | j¡}}| j ||¡}|S )a  Encodes the input audio into a sequence of hidden states

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = EncoderDecoderS2UT.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        torch.tensor
            The encoded batch
        )Úfloatr   r   Úmodsr   )r
   ÚwavsÚwav_lensÚencoder_outr   r   r   Úencode_batchH   s   zEncoderDecoderS2UT.encode_batchc                 C   s\   t  ¡   | | j¡}|  ||¡}| j ||¡\}}}}W d  ƒ |S 1 s'w   Y  |S )aM  Translates the input audio into a sequence of words

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = EncoderDecoderS2UT.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        list
            Each waveform in the batch translated.
        tensor
            Each predicted token id.
        N)r   Úno_gradr   r   r#   r   r   )r
   r    r!   r"   r   Ú_r   r   r   r   d   s   

ýüz"EncoderDecoderS2UT.translate_batchc                 C   s   |   ||¡S )zRuns full translation)r#   )r
   r    r!   r   r   r   Úforwardƒ   s   zEncoderDecoderS2UT.forward)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚHPARAMS_NEEDEDÚMODULES_NEEDEDr   r   r#   r   r&   Ú__classcell__r   r   r   r   r      s    r   )r*   r   Ú speechbrain.inference.interfacesr   r   r   r   r   r   Ú<module>   s    