o
    9wi>                     @   s.  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
m  mZ d dlmZ dZdZdZdZee ZeeeZed	 ZeeeZeeeZefd
ededejfddZefdddedefddZedddede	jfddZ	 	ddeeeje	jf dededeeee	j f  fddZ!dS )    N)	lru_cache)OptionalUnion)	exact_divi>  i           filesrreturnc                 C   s   zddddd| dddd	d
ddt |dg}tj|dddj}W n tjy6 } ztd|j  |d}~ww t	|tj
 tjd S )a?  
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    ffmpegz-nostdinz-threads0z-iz-fs16lez-ac1z-acodec	pcm_s16lez-ar-T)capture_outputcheckzFailed to load audio: Ng      @)str
subprocessrunstdoutCalledProcessErrorRuntimeErrorstderrdecodenp
frombufferint16flattenastypefloat32)r	   r
   cmdoute r%   K/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisperx/audio.py
load_audio   s.   r'   )axislengthr)   c                C   s   t | rC| j| |kr| j|t j|| jdd} | j| |k rAdg| j }d|| j|  f||< t| dd |ddd D } | S | j| |krS| j	t
||d	} | j| |k rqdg| j }d|| j|  f||< t| |} | S )
zO
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    )device)dimindex)r   r   r   c                 S   s   g | ]	}|D ]}|qqS r%   r%   ).0sizespadr%   r%   r&   
<listcomp>Q   s    zpad_or_trim.<locals>.<listcomp>Nr(   )indicesr)   )torch	is_tensorshapeindex_selectaranger+   ndimFr0   takeranger   )arrayr*   r)   
pad_widthsr%   r%   r&   pad_or_trimD   s"   
 
r>   )maxsizen_melsc                 C   sr   |dv sJ d| t tjtjtdd}t|d|  	| W  d   S 1 s2w   Y  dS )a  
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    )P      zUnsupported n_mels: assetszmel_filters.npzmel_N)
r   loadospathjoindirname__file__r3   
from_numpyto)r+   r@   fr%   r%   r&   mel_filters^   s   $rN   audiopaddingr+   c           
      C   s   t | st| trt| } t | } |dur| |} |dkr(t| d|f} t 	t
| j}t j| t
t|dd}|dddf  d }t| j|}|| }t j|dd	 }	t |	|	 d
 }	|	d d }	|	S )ap  
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    Nr   T)windowreturn_complex.r(   r   g|=)ming       @g      @)r3   r4   
isinstancer   r'   rK   rL   r9   r0   hann_windowN_FFTr+   stft
HOP_LENGTHabsrN   clamplog10maximummax)
rO   r@   rP   r+   rQ   rW   
magnitudesfiltersmel_speclog_specr%   r%   r&   log_mel_spectrogramp   s"   



rb   )r   N)"rF   r   	functoolsr   typingr   r   numpyr   r3   torch.nn.functionalnn
functionalr9   whisperx.utilsr   SAMPLE_RATErV   rX   CHUNK_LENGTH	N_SAMPLESN_FRAMESN_SAMPLES_PER_TOKENFRAMES_PER_SECONDTOKENS_PER_SECONDr   intndarrayr'   r>   TensorrN   r+   rb   r%   r%   r%   r&   <module>   s@    


+