o
    i[3                     @   s   d dl mZmZ d dlZd dlZddlmZ ddlm	Z	 ddl
mZmZmZ ddlmZ e r3d dlZdZd	ZeeZed
dG dd deZdgZdS )    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)
TensorTypeis_librosa_availablelogging)requiresgh㈵>g      p>)torchlibrosa)backendsc                       s   e Zd ZdZddgZ								
d! fdd	Zd"ddZ										d#deej	e
e e
ej	 e
e
e  f dedee deeeef  dee dee dee dee dee dee dee defdd Z  ZS )$ParakeetFeatureExtractora  
    Constructs a Parakeet feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        n_fft (`int`, *optional*, defaults to 512):
            Size of the Fourier transform.
        win_length (`int`, *optional*, defaults to 400):
            The window length for the STFT computation.
        preemphasis (`float`, *optional*, defaults to 0.97):
            A preemphasis filter coefficient. 0.0 means no preemphasis filter.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
    input_featuresattention_maskP   >          
ףp=
?        c           
         sf   t  jd|||d| || _|| _|| _|| _tjj|||d|d dd}	t	
|	t	j| _d S )N)feature_sizesampling_ratepadding_valuer      slaney)srn_fftn_melsfminfmaxnorm )super__init__
hop_lengthr   
win_lengthpreemphasisr   filtersmelr   
from_numpytofloat32mel_filters)
selfr   r   r&   r   r'   r(   r   kwargsr.   	__class__r#   l/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/parakeet/feature_extraction_parakeet.pyr%   C   s   z!ParakeetFeatureExtractor.__init__cpuc              	   C   s   t j| jd|d}t j|| j| j| j|ddd}t |}t |d	d}|d}| j
|}|| }t |t }|ddd	}|S )
NF)periodicdeviceTconstant)r&   r'   windowreturn_complexpad_moder   r      )r   hann_windowr'   stftr   r&   view_as_realsqrtpowsumr.   r,   logLOG_ZERO_GUARD_VALUEpermute)r/   waveformr6   r8   r>   
magnitudesr.   mel_specr#   r#   r3   _torch_extract_fbank_featurese   s$   

z6ParakeetFeatureExtractor._torch_extract_fbank_featuresFNlongest
raw_speech
truncationpad_to_multiple_ofreturn_tensorsreturn_attention_maskpadding
max_lengthr   do_normalizer6   return_token_timestampsreturnc              
   K   s  |dur|| j krtd| jj d| j  d| j  d| d	ntd| jj d t|tjr6t	
|}nt|ttfrLt|d	 tjrLd
d |D }t|t	joXt|jdk}|rrt|jdkrrtd| jj d |d}t|ttf}|r|D ]}t|jdkrtd| jj d |d}q}|s|rdd |D }n|dddf t	jg}dd |D }t||d}| j|||||dd}|jd}| jdurt	j|jd |jdd	|jdk }t	j|ddddf |ddddf | j|ddddf   gdd}|| d}| ||
}t	|j| j d d  | j  | j!}t	j|jd |
ddddf |dddf k }|d}|| }|j"dd|d }|d}|| d | j"dd|d d }t	#|d}|| |t$  }||9 }t||d|dS )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   c                 S   s   g | ]}t |qS r#   )r   tensor.0speechr#   r#   r3   
<listcomp>   s    z5ParakeetFeatureExtractor.__call__.<locals>.<listcomp>r<   r   z2Only mono-channel audio is supported for input to z;. We will take the mean of the channels to convert to mono.r;   c                 S   s$   g | ]}|d d d f  tjqS )N)r,   r   r-   rW   r#   r#   r3   rZ      s   $ c                 S   s   g | ]}t |qS r#   )lenrW   r#   r#   r3   rZ      s    )r   audio_lengthspt)rP   rQ   rL   rM   rN   )r6   )dimr   )r   r   )datatensor_type)%r   
ValueErrorr2   __name__loggerwarning
isinstancenpndarrayr   rV   listtupleTensorr[   shapemeanr,   r-   r   padr   squeezer(   aranger6   	unsqueezer\   catmasked_fillrI   floor_divider   r&   rB   r@   EPSILON)r/   rK   rL   rM   rN   rO   rP   rQ   r   rR   r6   rS   r0   is_batched_torchis_batched_sequencerY   r\   batched_speechpadded_inputsr   timemaskfeatures_lengthsr   maskinput_features_maskedrl   variancestdr#   r#   r3   __call__   s   D



B0

&z!ParakeetFeatureExtractor.__call__)r   r   r   r   r   r   r   )r4   )
FNNNrJ   NNNr4   N)rb   
__module____qualname____doc__model_input_namesr%   rI   r   rf   rg   rh   floatboolr   intstrr   r   r   __classcell__r#   r#   r1   r3   r   %   s^    
""	
r   )typingr   r   numpyrf   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r	   utils.import_utilsr
   r   rt   rD   
get_loggerrb   rc   r   __all__r#   r#   r#   r3   <module>   s    
 
z