o
    Xi                     @   s   d dl Z d dlmZ d dlmZ d dlZd dlZd dlZd dl	m
  mZ ddlmZ dZdZdZd	Zd
Zee ZeeeZefdedefddZefdddedefddZeddefdedejfddZefdeeejejf defddZdS )    N)	lru_cache)Union   )	exact_divi>  i  P         filesrc              
   C   s   zt j| ddjdddd|djdd	gd
d
d\}}W n t jy3 } ztd|j  |d}~ww t	|tj
 tjd S )a?  
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    r   )threads-s16le	pcm_s16ler   )formatacodecacarffmpegz-nostdinT)cmdcapture_stdoutcapture_stderrzFailed to load audio: Ng      @)r   inputoutputrunErrorRuntimeErrorstderrdecodenp
frombufferint16flattenastypefloat32)r	   r
   out_e r'   A/home/ubuntu/.local/lib/python3.10/site-packages/whisper/audio.py
load_audio   s   
r)   )axislengthr+   c                C   s   t | rC| j| |kr| j|t j|| jdd} | j| |k rAdg| j }d|| j|  f||< t| dd |ddd D } | S | j| |krS| j	t
||d	} | j| |k rqdg| j }d|| j|  f||< t| |} | S )
zO
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    )device)dimindex)r   r   r   c                 S   s   g | ]	}|D ]}|qqS r'   r'   ).0sizespadr'   r'   r(   
<listcomp>?   s    zpad_or_trim.<locals>.<listcomp>Nr*   )indicesr+   )torch	is_tensorshapeindex_selectaranger-   ndimFr2   takeranger   )arrayr,   r+   
pad_widthsr'   r'   r(   pad_or_trim4   s   
 
r@   )maxsizen_melsreturnc                 C   sr   |dksJ d| t tjtjtdd}t|d|  	| W  d   S 1 s2w   Y  dS )a  
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    r   zUnsupported n_mels: assetszmel_filters.npzmel_N)
r   loadospathjoindirname__file__r5   
from_numpyto)r-   rB   fr'   r'   r(   mel_filtersL   s    $rO   audioc                 C   s   t | st| trt| } t | } t t| j	}t j
| tt|dd}|ddddf  d }t| j	|}|| }t j|dd }t || d }|d	 d	 }|S )
a  
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    T)windowreturn_complexNr*      g|=)ming       @g      @)r5   r6   
isinstancestrr)   rL   hann_windowN_FFTrM   r-   stft
HOP_LENGTHabsrO   clamplog10maximummax)rP   rB   rQ   rY   
magnitudesfiltersmel_speclog_specr'   r'   r(   log_mel_spectrogram\   s   


rd   )rG   	functoolsr   typingr   r   numpyr   r5   torch.nn.functionalnn
functionalr;   utilsr   SAMPLE_RATErX   N_MELSrZ   CHUNK_LENGTH	N_SAMPLESN_FRAMESrV   intr)   r@   TensorrO   ndarrayrd   r'   r'   r'   r(   <module>   s(    
(