o
    i\A                     @   s   d Z ddlmZmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ e r9ddlZeeZG d
d deZdgZdS )z%
Feature extractor class for Whisper
    )OptionalUnionN   )is_torch_available)mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                       s>  e Zd ZdZdgZ									d( fd
d	ZdejdedejfddZ	d)dejdedejfddZ
e	d*deej deej dedeej fddZ										d+deejee eej eee  f dedee d eeeef  d!ee d"ee dee d#ee d$ee dee d%ee defd&d'Z  ZS ),WhisperFeatureExtractora  
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
    input_featuresP   >                  Fc	           
   	      st   t  jd||||d|	 || _|| _|| _|| | _| j| | _|| _|| _t	d|d  |dd|ddd| _
d S )	N)feature_sizesampling_ratepadding_valuereturn_attention_mask      r   g     @@slaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   ditherr   mel_filters)
selfr   r   r&   r'   r%   r   r*   r   kwargs	__class__r"   j/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/whisper/feature_extraction_whisper.pyr$   H   s0   

z WhisperFeatureExtractor.__init__waveform_batchdevicereturnc                 C   s   |dkrt d| dg }|D ]5}t|t| jd| j| jd| j| jdd}|dddd	f }t||	 d
 }|d d }|
| qt|}|S )z
        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
        implementation with 1e-5 tolerance.
        cpuzGot device `z` for feature extraction, but feature extraction on CUDA accelerator devices requires torch, which is not installed. Either set `device='cpu'`, or install torch according to the official instructions: https://pytorch.org/get-started/locally/hanng       @log10)frame_lengthr&   powerr*   r+   log_melN       @      @)
ValueErrorr   r   r%   r&   r*   r+   npmaximummaxappendarray)r,   r1   r2   log_spec_batchwaveformlog_specr"   r"   r0   _np_extract_fbank_featuresl   s,   



z2WhisperFeatureExtractor._np_extract_fbank_featuresr4   rD   c           
      C   s*  t ||t j}t j| j|d}| jdkr'|| jt j|j|j	|j
d 7 }t j|| j| j|dd}|dddf  d	 }t | j|t j}|j| }t j|d
d }| d	krw|jd	ddd jdddd }	t ||	d }n
t || d }|d d }|dkr|  }| S )z
        Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
        yielding results similar to cpu computing with 1e-5 tolerance.
        )r2   r   )dtyper2   T)windowreturn_complex.Nr:   r   g|=)min)dimkeepdimr   r   r;   r<   r4   )torch
from_numpytofloat32hann_windowr%   r*   randnshaperG   r2   stftr&   absr+   Tclampr6   rK   r@   r?   detachr4   numpy)
r,   rD   r2   rH   rT   
magnitudesr+   mel_specrE   max_valr"   r"   r0   _torch_extract_fbank_features   s"   
 
 z5WhisperFeatureExtractor._torch_extract_fbank_featuresinput_valuesattention_maskr   c                 C   s   |durEt |t j}g }t| |dD ]-\}}||d|   t |d|  d  }||jd k r=|||d< |	| q|S dd | D }|S )z[
        Every array in the list is normalized to have zero mean and unit variance
        Nr:   Hz>r   c                 S   s*   g | ]}||   t| d   qS )r`   )meanr>   sqrtvar).0xr"   r"   r0   
<listcomp>   s   * zCWhisperFeatureExtractor.zero_mean_unit_var_norm.<locals>.<listcomp>)
r>   rB   int32zipsumra   rb   rc   rS   rA   )r^   r_   r   normed_input_valuesvectorlengthnormed_slicer"   r"   r0   zero_mean_unit_var_norm   s   .z/WhisperFeatureExtractor.zero_mean_unit_var_normTN
max_length
raw_speech
truncationpad_to_multiple_ofreturn_tensorsr   paddingr   do_normalizereturn_token_timestampsc              
      s  |dur| j krtd jj d j  d j  d| d	ntd jj d t|tjo6t	|j
d	k}|rGt	|j
d
krGtd  |pZt|ttfoZt|d tjttf}|redd |D }n&|svt|tjsvtj|tjd}nt|tjr|jttju r|tj}|st|gjg}td|i} j|||r|n j|||p|	d}|	rɈ j|d |d  jd|d< tj|d dd|d< |dd
dd	}t rو jn j}||d |
}t|d trdd |D |d< n||d< |r$|d dddd jf }|d j
d	  j dkr |ddddf }||d< |dur?td jj d  fdd|D |d< |durI| |}|S )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   c                 S   s    g | ]}t j|gt jd jqS rG   )r>   asarrayrP   rV   )rd   speechr"   r"   r0   rf     s     z4WhisperFeatureExtractor.__call__.<locals>.<listcomp>ry   r   )rt   ro   rq   rr   r   r_   )r_   r   )axisc                 S   s   g | ]
}t j|t jd qS rx   )r>   rz   rP   )rd   featurer"   r"   r0   rf   B  s    r:   z,`return_token_timestamps` is deprecated for z~ and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.c                    s   g | ]	}t | j qS r"   )lenr&   )rd   raw_speech_ir,   r"   r0   rf   V  s    
num_frames)!r   r=   r/   __name__loggerwarning
isinstancer>   ndarrayr~   rS   listtuplerz   rP   rG   float64astyperV   r
   padr(   rn   r   stackget	transposer   r]   rF   r&   warning_onceconvert_to_tensors)r,   rp   rq   rr   rs   r   rt   ro   r   ru   r2   rv   r-   is_batched_numpy
is_batchedbatched_speechpadded_inputsr   extract_fbank_featuresrescaled_attention_maskr"   r   r0   __call__   s   D
"




z WhisperFeatureExtractor.__call__)r   r   r   r   r   r   r   F)r4   )r   )
TNNNro   NNNr4   N)r   
__module____qualname____doc__model_input_namesr$   r>   r   strrF   r]   staticmethodr   floatrn   r   boolr   intr   r
   r   __classcell__r"   r"   r.   r0   r   $   sx    !$"	
r   )r   typingr   r   rY   r>    r   audio_utilsr   r   r   !feature_extraction_sequence_utilsr	   feature_extraction_utilsr
   utilsr   r   rM   
get_loggerr   r   r   __all__r"   r"   r"   r0   <module>   s   
  
<