o
    %ݫiX                     @   sx   d Z ddlmZ ddlZddlZddlmZ ddlm	Z	 ddl
mZ dZdZdZd	Zee ZeeZG d
d de	ZdS )a  This lobe enables the integration of huggingface pretrained whisper model.

Transformer from HuggingFace needs to be installed:
https://huggingface.co/transformers/installation.html

Authors
 * Adel Moumen 2022, 2024
 * Titouan Parcollet 2022
 * Luca Della Libera 2022
 * Ha Nguyen 2023
    )cached_propertyN)HFTransformersInterface)
get_logger>  i        c                       s  e Zd ZdZ								dB fdd	Zdd	 ZdCd
dZdd Z	dDdefddZ	e
dfdefddZdd Z		dEddZedd Zedd Zedd  Zed!efd"d#Zed!efd$d%Zed!efd&d'Zed!efd(d)Zed!efd*d+Zed!efd,d-Zed!efd.d/Zed!efd0d1Zed!efd2d3Zed!efd4d5Zd6d7 Zd8d9 Zd:d; Zed<d= Zed>d? Z e!" d@dA Z#  Z$S )FWhispera
  This lobe enables the integration of HuggingFace pretrained Whisper model.

    Source paper whisper:
        https://cdn.openai.com/papers/whisper.pdf
    Transformer from HuggingFace needs to be installed:
    https://huggingface.co/transformers/installation.html

    Some part of the code also cis adapted from the official OpenAI repository:
    https://github.com/openai/whisper

    The model can be finetuned. It will download automatically the model from
    HuggingFace or use a local path.

    Arguments
    ---------
    source : str
        HuggingFace hub name: e.g "openai/whisper-tiny"
    save_path : str
        Path (dir) of the downloaded model.
    sampling_rate : int (default: 16000)
        Sampling rate of the audio signal.
    encoder_only : bool (default: False)
        If True, the forward function outputs the hidden states from the last transformer layer of the encoder.
        If False, one step of the decoder is performed and returned.
    freeze : bool (default: False)
        If True, the model is frozen.
    freeze_encoder : bool (default: False)
        If True, the encoder is frozen.
    output_attentions : bool (default: False)
        If ``True``, the forward function outputs the attention weights. By default, it is ``False`` because
        flash attention requires having ``output_attentions=False``. In case ``output_attentions`` is ``True``,
        a from-scratch attention implementation is being used, which can make the code slower and can increase the
        VRAM memory usage.
    output_all_hiddens: bool (default: False)
        If True, the forward function outputs the hidden states from all transformer layers of the encoder.
        For example whisper-base has 6 transformer layers and the output is of shape (7, B, T, C),
        where the output of the CNN output is added to the beginning.
        If False, the forward function outputs the hidden states only from the last transformer layer of the encoder.
    language: str (default: "en")
        Language token to use for the decoder.
    task: str (default: "transcribe")
        Task token to use for the decoder. It must be one of the following:
        - "transcribe"
        - "translate"

    Example
    -------
    >>> model_hub = "openai/whisper-tiny"
    >>> save_path = "savedir"
    >>> sampling_rate = 16000
    >>> model = Whisper(model_hub, save_path, sampling_rate)
    >>> tokens = torch.tensor([[1, 1]]) * model.model.config.decoder_start_token_id
    >>> inputs = torch.randn([1, 93680])
    >>> outputs = model(inputs, tokens)
    r   FN
transcribec                    sb  t  j|||d || _|| _|| _|| _|| _|	| _|
| _|r>d | _	| j
j  | j
`d | j
_dd l}|  tj  n| j|dd | jrV| jpLd}	| j	j|	| jd | j|||d | jj| _| jj| _| jj| _| jj}|jd | jjkr}|j }|jd | jjksJ | !dtj"|tj#d	 | j$s| jrt%&d
 | j
j'( D ]	}d|_)qd S d S d S )N)source	save_pathfreezer   <|startoftranscript|>)	bos_tokenen)languagetask)sampling_rate_mel_filtersdtypezVspeechbrain.lobes.models.huggingface_transformers.whisper - whisper encoder is frozen.F)*super__init__r   encoder_onlyfreeze_encoderoutput_attentionsoutput_all_hiddensr   r   	tokenizermodeldecodercpugccollecttorchcudaempty_cacheload_tokenizeris_multilingualset_prefix_tokensload_feature_extractorfeature_extractorn_fft_n_fft
hop_length_hop_length	n_samples
_n_samplesmel_filtersshapefeature_sizeTregister_buffer	as_tensorfloat32r   loggerwarningencoder
parametersrequires_grad)selfr
   r   r   r   r   r   r   r   r   r   r    r0   param	__class__ m/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/lobes/models/huggingface_transformers/whisper.pyr   Z   s\   



zWhisper.__init__c                 C   s*   t d |  | D ]}d|_qdS )z
        Freezes parameters of a model.

        Arguments
        ---------
        model : from AutoModel.from_config
            Valid HuggingFace transformers model object.
        z^speechbrain.lobes.models.huggingface_transformers.whisper - whisper encoder-decoder is frozen.FN)r7   r8   trainr:   r;   )r<   r   r=   r@   r@   rA   freeze_model   s   
zWhisper.freeze_modelc                    sN    fdd}j r$t  | W  d   S 1 sw   Y  dS | S )a1  Perform mel transformation and one step of the whisper (encoder-decoder).

        Arguments
        ---------
        wav : torch.Tensor
            A batch of audio signals to transform to features.
        decoder_input_ids : torch.Tensor
            Input tokens for the decoder. This can be language, task, etc.
            Please refer to the whisper paper for more details or go to the
            seq2seq2.py file in SpeechBrain to see how to generate the tokens
            with Greedy Search and/or Beam Search.

        Returns
        -------
        out_encoder : torch.Tensor
            The output of the encoder model.
        decoder_logits : torch.Tensor
            The output of the decoder model.
        decoder_attn : torch.Tensor
            The attention values of the decoder model.
        c                     sX    } | }jr|S jr|d  \}}}n	| \}}}|||fS )zForward pass of the model)_get_melforward_encoderr   r   forward_decoder)melout_encoderdecoder_logitsdecoder_attn_decoder_input_idsr<   wavr@   rA   _forward   s   



z!Whisper.forward.<locals>._forwardN)r   r"   no_grad)r<   rO   rN   rP   r@   rM   rA   forward   s   
$zWhisper.forwardc                 C   s   |  |}| |}|S )ag  
        Compute the mel spectrogram features from the input audio waveform.

        Arguments
        ---------
        wav : torch.Tensor
            A batch of audio signals to compute mel spectrogram features from.

        Returns
        -------
        torch.Tensor
            Mel spectrogram features computed from the input audio waveform.
        )pad_or_trimlog_mel_spectrogram)r<   rO   melsr@   r@   rA   rE      s   

zWhisper._get_melr   paddingc           	      C   s   |dkrt j|d|f}tj| j|jd}tj|| j| j|dd}|dddf 	 d }| j
}|| }tj|d	d
 }t|| d }|d d }|S )a*  Compute the Mel spectrogram of a batch of input waveforms.

        Reference: adapted from
        https://github.com/openai/whisper/blob/eff383b27b783e280c089475852ba83f20f64998/whisper/audio.py#L92

        Arguments
        ---------
        audio : torch.Tensor
            A batch of audio waveforms in 16 kHz.
        padding : int
            The number of samples to append to the end of the audio tensor.

        Returns
        -------
        log_spec : torch.Tensor
            A tensor that contains the batch of Mel spectrograms.
        r   deviceT)windowreturn_complex.NrD      g|=)ming       @g      @)nn
functionalpadr"   hann_windowr+   rX   stftr-   absr   clamplog10maximummax)	r<   audiorV   rY   ra   
magnitudesfiltersmel_speclog_specr@   r@   rA   rT      s"   zWhisper.log_mel_spectrogramrD   lengthc                 C   s~   |j | |kr|j|tj||jdd}|j | |k r=dg|j }d||j |  f||< tj|dd |ddd D }|S )	aM  Pad or trim the Mel spectrograms as expected by the encoder.

        Reference: adapted from
        https://github.com/openai/whisper/blob/eff383b27b783e280c089475852ba83f20f64998/whisper/audio.py#L52

        Arguments
        ---------
        array : torch.Tensor
            A tensor that contains the batch of Mel spectrograms.
        length : int
            Input tensor will be coerced to `length` number of samples.
        axis : int
            The axis along which to pad.

        Returns
        -------
        array : torch.Tensor
            The padded tensor.
        rW   )dimindex)r   r   r   c                 S   s   g | ]	}|D ]}|qqS r@   r@   ).0sizesr_   r@   r@   rA   
<listcomp>C  s    z'Whisper.pad_or_trim.<locals>.<listcomp>NrD   )	r1   index_selectr"   arangerX   ndimr]   r^   r_   )r<   arrayrl   axis
pad_widthsr@   r@   rA   rS   "  s   zWhisper.pad_or_trimc                 C   s*   | j j|| jd}| jrt|jS |jS )a  Takes an input mel and return its corresponding encoder states.
        Returns the last hidden state of the encoder or all hidden states if
        output_all_hiddens is True.

        Arguments
        ---------
        mel : torch.Tensor (signal)
            A batch of audio mel to transform to features.

        Returns
        -------
        torch.Tensor
            The last hidden state of the encoder or all hidden states if
            output_all_hiddens is True.
        )output_hidden_states)r   r9   r   r"   stackhidden_stateslast_hidden_state)r<   rH   encoder_statesr@   r@   rA   rF   H  s   zWhisper.forward_encoderTc           	      C   s   |dur|dddf  d}| jj|||| j|d}| jr:|jd }|j|jd |jd  g|jdd R  }nd}|j}|t	| jjj
j|jdd  }|||jfS )a|  Perform one step of the whisper decoder.

        Arguments
        ---------
        encoder_states : torch.Tensor
            A batch of encoder_states features (mel + whisper feature extractor).
        decoder_input_ids : torch.Tensor
            Input tokens for the decoder. This can be language, task, etc.
            Please refer to the whisper paper for more details or go to the
            seq2seq2.py file in SpeechBrain to see how to generate the tokens
            with Greedy Search and/or Beam Search.
        use_cache : bool
            If True, keys and values are returned as output for KV caching.
        past_key_values : torch.Tensor (default: None)
            If not None, the past key values are used for KV caching and
            avoid recomputing the attention weights.

        Returns
        -------
        logits : torch.Tensor
            The logits of the decoder.
        attn : torch.Tensor | None
            If ``output_attentions`` is True, the attention weights are returned. Otherwise, ``None`` is returned.
        past_key_values : torch.Tensor
            The past key values of the decoder.
        NrD   )encoder_hidden_states	input_idspast_key_valuesr   	use_cacher      r[   )	unsqueezer   r   r   
attentionsviewr1   r{   r"   	transposeembed_tokensweighttor   floatr   )	r<   r|   rN   r   r   output_statesattnxlogitsr@   r@   rA   rG   `  s,   !
.zWhisper.forward_decoderc                 C   sV   ddl m} t| }| j| jj}g }|D ]}||d ||  qt	|S )z@Returns the list of tokens corresponding to the language tokens.r   	LANGUAGESr   )
0transformers.models.whisper.tokenization_whisperr   listkeysr   convert_tokens_to_idsr   appendrn   tuple)r<   r   langsbos_token_idresultlangr@   r@   rA   all_language_tokens  s   zWhisper.all_language_tokensc                 C   s    ddl m} t| }t|S )zHReturns the list of language codes corresponding to the language tokens.r   r   )r   r   r   r   r   )r<   r   r   r@   r@   rA   all_language_codes  s   zWhisper.all_language_codesc                 C   s   t d}|d 7 }td}tdd |D sJ | jjdddd	 | jjd
ddd	 h}|t | D ](}| jj|dd| jjd| ddfD ]}t|dksS||v rZ||d	  qGq3tt	|S )u  
        Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
        annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.

        - ♪♪♪
        - ( SPEAKING FOREIGN LANGUAGE )
        - [DAVID] Hey there,

        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.

        Taken from: openai/whisper GitHub
        u#   "#()*+/:;<=>@[\]^_`{|}~「」『』uK   << >> <<< >>> -- --- -( -[ (' (" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪u   ♩♪♫♬♭♮♯c                 s   s,    | ]}d t |  kodkn  V  qdS )i@&  i&  N)ord)ro   cr@   r@   rA   	<genexpr>  s   * z,Whisper.non_speech_tokens.<locals>.<genexpr>z -F)add_special_tokensr   z ' r   )
r   splitsetallr   encodelenaddr   sorted)r<   symbolsmiscellaneousr   symboltokensr@   r@   rA   non_speech_tokens  s    zWhisper.non_speech_tokensreturnc                 C      | j dS )zIReturns the token id corresponding to the value of the `transcribe` fieldz<|transcribe|>r   r   r<   r@   r@   rA   r	        zWhisper.transcribec                 C   r   )zHReturns the token id corresponding to the value of the `translate` fieldz<|translate|>r   r   r@   r@   rA   	translate  r   zWhisper.translatec                 C   r   )zBReturns the token id corresponding to the value of the `bos` fieldr   r   r   r@   r@   rA   bos  r   zWhisper.bosc                 C   r   )zBReturns the token id corresponding to the value of the `eos` fieldz<|endoftext|>r   r   r@   r@   rA   eos  r   zWhisper.eosc                 C   r   )zEReturns the token id corresponding to the value of the `bos_lm` fieldz<|startoflm|>r   r   r@   r@   rA   bos_lm  r   zWhisper.bos_lmc                 C   r   )zGReturns the token id corresponding to the value of the `bos_prev` fieldz<|startofprev|>r   r   r@   r@   rA   bos_prev  r   zWhisper.bos_prevc                 C   r   )zLReturns the token id corresponding to the value of the `no_timestamps` fieldz<|notimestamps|>r   r   r@   r@   rA   no_timestamps  r   zWhisper.no_timestampsc                 C   r   )zNReturns the token id corresponding to the value of the `timestamp_begin` fieldz<|0.00|>r   r   r@   r@   rA   timestamp_begin  r   zWhisper.timestamp_beginc                 C   s
   | j d S )zHReturns the token id corresponding to the value of the `no_speech` fieldr   )r   r   r@   r@   rA   	no_speech  s   
zWhisper.no_speechc                 C   s   | j du r	td| | j S )zGReturns the token id corresponding to the value of the `language` fieldNz6This tokenizer does not have language token configured)r   
ValueErrorto_language_tokenr   r@   r@   rA   language_token  s
   
zWhisper.language_tokenc                 C   s0   | j jd| dd}|r|S td| d)a  Returns the token id corresponding to the given language.

        Arguments
        ---------
        language : str
            The language to convert to a token.

        Returns
        -------
        token
            The token id corresponding to the given language.

        Raises
        ------
        KeyError
            If the language is not found in the tokenizer.
        z<|z|>Nz	Language z not found in tokenizer.)r   r   getKeyError)r<   r   tokenr@   r@   rA   r     s   zWhisper.to_language_tokenc                 C      || _ | jj| j d dS )zSet the language token to the given language.

        Arguments
        ---------
        language : str
            The language to set the token to.
        )r   N)r   r   r'   )r<   r   r@   r@   rA   set_language_token*     zWhisper.set_language_tokenc                 C   r   )zSet the task token to the given task.

        Arguments
        ---------
        task : str
            The task to set the token to.
        )r   N)r   r   r'   )r<   r   r@   r@   rA   set_task5  r   zWhisper.set_taskc                 C   s   | j jdkS )z;Returns True if the model is multilingual, False otherwise.i  )config
vocab_sizer   r@   r@   rA   r&   @  r   zWhisper.is_multilingualc                 C   s   t t| jjS )z&Returns the list of tokens to suppress)r   r   r   suppress_tokensr   r@   r@   rA   get_suppress_tokensE  s   zWhisper.get_suppress_tokensc           	         s   j jdu r
td|jd }j|j}tj	gg| 
|j}||d dddf }tj|jd tjd}d|tj< tj |dd|f< |jdd}|jdd   fdd	t|D }||fS )
a  Detect the language of the given mel spectrogram features.

        Arguments
        ---------
        mel : torch.Tensor
            Mel spectrogram features to detect the language of.

        Returns
        -------
        language_tokens : torch.Tensor of shape (batch_size,)
            ids of the most probable language tokens, which appears after the startoftranscript token.
        language_probs : List[Dict[str, float]]
            list of dictionaries containing the probability distribution over all languages.

        Raises
        ------
        ValueError
            If the model doesn't have language tokens.
        NzCThis model doesn't have language tokens so it can't perform lang idr   rD   r   F)rm   c                    s*   g | ]  fd dt jjD qS )c                    s"   i | ]\}}| |f   qS r@   )item)ro   jr   )ilanguage_token_probsr@   rA   
<dictcomp>r  s    z6Whisper.detect_language.<locals>.<listcomp>.<dictcomp>)zipr   r   )ro   r   r<   )r   rA   rq   q  s    z+Whisper.detect_language.<locals>.<listcomp>)r   r   r   r1   r   r9   r{   r"   tensorr   r   rX   rG   onesboolr   r   npinfargmaxsoftmaxr   range)	r<   rH   
batch_size
enc_statesrN   r   masklanguage_tokenslanguage_probsr@   r   rA   detect_languageJ  s&   

zWhisper.detect_language)r   FFFFFNr	   )N)r   )TN)%__name__
__module____qualname____doc__r   rC   rR   rE   intrT   	N_SAMPLESrS   rF   rG   r   r   r   r   r	   r   r   r   r   r   r   r   r   r   r   r   r   r&   r   r"   rQ   r   __classcell__r@   r@   r>   rA   r   !   st    <M
.
*&
=


&

r   )r   	functoolsr   numpyr   r"   torch.nnr]   =speechbrain.lobes.models.huggingface_transformers.huggingfacer   speechbrain.utils.loggerr   SAMPLE_RATEN_FFT
HOP_LENGTHCHUNK_LENGTHr   r   r7   r   r@   r@   r@   rA   <module>   s    