o
    zi                     @   s   d dl Z G dd dZdS )    Nc                   @   sj   e Zd ZdZdeej rdndfdedefddZ	d	ej
fd
dZd	efddZd	ej
fddZdS )WhisperMixinFzopenai/whisper-base.encudacpupretrained_model_name_or_pathdevicec                 C   sH   ddl m} ddl m} || _||| _||| j| _d| _d S )Nr   )WhisperForConditionalGeneration)WhisperProcessorT)	transformersr   r   whisper_devicefrom_pretrainedwhisper_processortowhisper_modelis_initialized)selfr   r   r   r    r   K/home/ubuntu/.local/lib/python3.10/site-packages/audiotools/core/whisper.pysetup_whisper   s   
zWhisperMixin.setup_whisperreturnc                 C   s   ddl }| js|   | | j}t| | jj	j
jdddddf  }|  | j|| jj	j
ddj}W d   |S 1 sEw   Y  |S )zPreprocess audio signal as per the whisper model's training config.

        Returns
        -------
        torch.Tensor
            The prepinput features of the audio signal. Shape: (1, channels, seq_len)
        r   Npt)sampling_ratereturn_tensors)torchr   r   r   r   listcloneresampler   feature_extractorr   
audio_datanumpyinference_modeinput_features)r   r   signal
raw_speechr    r   r   r   get_whisper_features   s.   
	
z!WhisperMixin.get_whisper_featuresc                 C   sl   | j s|   |  }t  || j}| jj|d}W d   n1 s'w   Y  | j	
|}|d S )zGet the transcript of the audio signal using the whisper model.

        Returns
        -------
        str
            The transcript of the audio signal, including special tokens such as <|startoftranscript|> and <|endoftext|>.
        )inputsNr   )r   r   r#   r   r   r   r
   r   generater   batch_decode)r   r    generated_idstranscriptionr   r   r   get_whisper_transcript8   s   	
z#WhisperMixin.get_whisper_transcriptc                 C   sn   ddl }| js|   |  }| j }|  || j}||}W d   |j	S 1 s/w   Y  |j	S )zGet the last hidden state embeddings of the audio signal using the whisper model.

        Returns
        -------
        torch.Tensor
            The Whisper embeddings of the audio signal. Shape: (1, seq_len, hidden_size)
        r   N)
r   r   r   r#   r   get_encoderr   r   r
   last_hidden_state)r   r   r    encoder
embeddingsr   r   r   get_whisper_embeddingsM   s   



z#WhisperMixin.get_whisper_embeddingsN)__name__
__module____qualname__r   r   r   r   is_availablestrr   Tensorr#   r)   r.   r   r   r   r   r      s    
 r   )r   r   r   r   r   r   <module>   s    