o
    Si                     @   s   d dl mZ d dlmZmZmZmZ d dlZd dl	Z	d dl
mZmZ d dlmZmZmZmZmZ 							dd
eeje	jf de	jdedede	jdededeeee	jf  fddZeG dd dZeG dd deZdS )    )	dataclass)AnyDictOptionalUnionN)FeatureExtractorregister_extractor)EPSILONSecondsasdict_nonullcompute_num_frames_from_samplesis_module_availableP        >  audiofiltersn_melsn_fftwindow
hop_lengthsampling_ratedevicec                 C   s.  t | s
t | } |dur| |} t| jdkr)| jd dkr%td| d } t| jdks9J d| j dt j| |||dd	}|d
ddf  d }	||	 }
t j	|
dd
 }t || d }|d d }tt| || |d}||jd krt jjj|d||jd  fdd}|dd}|S )ar  
    From https://github.com/openai/whisper/blob/main/whisper/audio.py

    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_frames, 80)
        A Tensor that contains the Mel spectrogram
    N   r      z8Whisper Fbank works only with single-channel recordings.z@Whisper Fbank works only with single-channel recordings (shape: )T)r   return_complex.g|=)ming       @g      @)num_samplesframe_shiftr   constant)mode)torch	is_tensor
from_numpytolenshape
ValueErrorstftabsclamplog10maximummaxr   nn
functionalpad	transpose)r   r   r   r   r   r   r   r   r+   
magnitudesmel_speclog_specpadding r9   Q/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/features/whisper_fbank.pylog_mel_spectrogram   s8   


r;   c                   @   sZ   e Zd ZU dZeed< dZeed< deee	f fddZ
edeee	f dd fd	d
ZdS )WhisperFbankConfigr   num_filterscpur   returnc                 C   s   t | S N)r   selfr9   r9   r:   to_dict\   s   zWhisperFbankConfig.to_dictdatac                 C   s   t di | S )Nr9   )r<   )rD   r9   r9   r:   	from_dict_   s   zWhisperFbankConfig.from_dictN)__name__
__module____qualname__r=   int__annotations__r   strr   r   rC   staticmethodrE   r9   r9   r9   r:   r<   W   s   
  r<   c                	       s   e Zd ZdZeZddee f fddZede	e
ejf fddZedefd	d
Zde
fddZdedefddZde	ejejf dede	ejejf fddZedejdejdedejfddZedejdefddZ  ZS )WhisperFbankzwhisper-fbankNconfigc                    s   t  j|d d| _d| _d| _| jj| _tdrdd l}nt	dt
| j| jj}|jj| j| j| jd}t
|| jj| _t
| j| jj| _d S )	N)rN   r   r   r   librosar   zULibrosa is not installed. Please install librosa before using LibrosaFbank extractor.)srr   r   )super__init__r   r   r   rN   r=   r   rO   ImportErrorr$   hann_windowr'   r   r   melr&   r   )rB   rN   rO   r   r   	__class__r9   r:   rR   i   s    

zWhisperFbank.__init__r?   c                 C   s   | j jS r@   rN   r   rA   r9   r9   r:   r   |   s   zWhisperFbank.devicec                 C   s   | j | j S r@   )r   r   rA   r9   r9   r:   r!      s   zWhisperFbank.frame_shiftr   c                 C   s   || j _d S r@   rX   )rB   r   r9   r9   r:   r'      s   zWhisperFbank.tor   c                 C   s   | j S r@   )r=   )rB   r   r9   r9   r:   feature_dim   s   zWhisperFbank.feature_dimsamplesc                 C   sr   || j ksJ d| j  d| dd}t|tjs t|}d}t|| j| j| j| j	| j
d}|r7|  S |S )Nz)Fbank was instantiated for sampling_rate z, but sampling_rate=zl was passed to extract(). Note you can use CutSet/RecordingSet.resample() to change the audio sampling rate.FT)r   r   r   r   r   )r   
isinstancer$   Tensorr&   r;   r   r   r   r=   r   r>   numpy)rB   rZ   r   is_numpyfeatsr9   r9   r:   extract   s,   
	zWhisperFbank.extract
features_a
features_benergy_scaling_factor_bc              
   C   s&   t t tt | |t |  S r@   )nplogr/   r	   exp)ra   rb   rc   r9   r9   r:   mix   s   zWhisperFbank.mixfeaturesc                 C   s   t tt| S r@   )floatrd   sumrf   )rh   r9   r9   r:   compute_energy   s   zWhisperFbank.compute_energyr@   )rF   rG   rH   namer<   config_typer   rR   propertyr   rK   r$   r   r
   r!   r'   rI   rY   rd   ndarrayr\   r`   rL   ri   rg   rk   __classcell__r9   r9   rV   r:   rM   d   s:    
rM   )r   r   Nr   r   N)dataclassesr   typingr   r   r   r   r]   rd   r$   lhotse.features.baser   r   lhotse.utilsr	   r
   r   r   r   ro   r\   rI   rK   r   r;   r<   rM   r9   r9   r9   r:   <module>   sB    
F