o
    i;                     @   s   d dl Z d dlmZ d dlmZmZ d dlZddlm	Z	 ddl
mZ ddlmZmZmZ eeZ	dded	ed
ededededee dejfddZdejdedededejf
ddZG dd de	ZdgZdS )    N)Sequence)OptionalUnion   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingn_freqsf_minf_maxn_melssample_rate
fft_lengthnormreturnc                 C   s^  |dur|dkrt dtj| tjd||  }dtd|d   }dtd|d   }	t||	|d }
dd	|
d  d  }|d
d |dd  }t|dt|d
 }tjd
tjd}d|ddddf  |dd  }|ddddf |d
d  }t	|t
||}|dur|dkrd|d|d  |d|   }|t|d9 }|S )a  Create a frequency bin conversion matrix (NumPy version).

    Args:
        n_freqs (int): Number of frequencies to highlight/apply
        f_min (float): Minimum frequency (Hz)
        f_max (float): Maximum frequency (Hz)
        n_mels (int): Number of mel filterbanks
        sample_rate (int): Sample rate of the audio waveform
        fft_length (int): FFT length
        norm (Optional[str]): If 'slaney', divide the triangular mel weights by
          the width of the mel band (area normalization). (Default: ``None``)

    Returns:
        np.ndarray: Triangular filter banks (fb matrix) of size (``n_freqs``,
        ``n_mels``)
        meaning number of frequencies to highlight/apply to x the number of
        filterbanks.
        Each column is a filterbank so that assuming there is a matrix A of
        size (..., ``n_freqs``), the applied result would be
        ``A @ create_fb_matrix_numpy(A.shape[-1], ...)``.
    Nslaneyz$norm must be one of None or 'slaney'dtypeg     F@      ?g     @   
      r   g      g       @)
ValueErrornparangefloat32mathlog10linspaceexpand_dimszerosmaximumminimum)r   r   r   r   r   r   r   	all_freqsm_minm_maxm_ptsf_ptsf_diffslopeszerodown_slopes	up_slopesfbenorm r3   j/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.pycreate_fb_matrix   s"   $  r5   array	dimensionsizestepc           	      C   s   | j dkr	td|dkr|| j d krtd| j\}}|| | d }|dkr4tj|d|f| jdS |||f}| jd | jd | | jd f}tjjj	| ||dS )	zNA basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim.r   zFThis unfold implementation currently supports 2D arrays (batch, time).r   r   zFThis unfold implementation only supports unfolding the last dimension.r   r   )shapestrides)
ndimr   r:   r   r$   r   r;   libstride_tricks
as_strided)	r6   r7   r8   r9   
batch_sizeoriginal_length
num_framesoutput_shapeoutput_stridesr3   r3   r4   _unfold[   s   


 rE   c                #       s<  e Zd ZdZddgZ								
									d1dededededededededededededededee	e  dee	e  f  fd d!Z
d"ejd#ejd$eejejf fd%d&Z	'	(				d2d)eejee eej eee  f d*eeeef d+ee d,ed-ee d.eeeef  dee d$efd/d0Z  ZS )3Gemma3nAudioFeatureExtractoraT
  An audio feature extractor Universal Speech Models https://huggingface.co/papers/2303.01037.

    Args:
        feature_size (`int`, *optional*, defaults to 128):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether to return the attention mask for the generated MEL spectrograms.
        frame_length_ms (`float`, *optional*, defaults to 32.0):
            The length of a frame in milliseconds.
        hop_length_ms (`float`, *optional*, defaults to 10.0):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        min_frequency (`float`, *optional*, defaults to 125.0):
            The minimum frequency (in Hz) for the Mel filterbank.
        max_frequency (`float`, *optional*, defaults to 7600.0):
            The maximum frequency (in Hz) for the Mel filterbank.
        preemphasis (`float`, *optional*, defaults to 0.97):
            The preemphasis coefficient.
        preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
            Whether to use HTK-style preemphasis.
        fft_overdrive (`bool`, *optional*, defaults to `True`):
            Whether to use FFT overdrive.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
        input_scale_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor applied to the input waveform.
        mel_floor (`float`, *optional*, defaults to 1e-05):
            Minimum value for Mel spectrograms to avoid log(0).
        per_bin_mean (`Optional[Sequence[float]]`, *optional*):
            Mean values for per-bin normalization.
        per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
            Standard deviation values for per-bin normalization.
    input_featuresinput_features_mask   >          T      @@      $@     @_@     @
ףp=
?r   h㈵>Nfeature_sizesampling_ratepadding_valuereturn_attention_maskframe_length_mshop_length_msmin_frequencymax_frequencypreemphasispreemphasis_htk_flavorfft_overdriveditherinput_scale_factor	mel_floorper_bin_meanper_bin_stddevc              	      sf  t  jd||||d| || _|| _|	| _|
| _|| _|| _|| _t	t
|| d | _t	t
|| d | _tj|tjd| _dtt| j }| jrT|d9 }|| _tj| jtjd}ddtdtj | | j   }|tj| _t| jd d |||| jd |d| _|d urt|dd|| _nd | _|d urt|dd|| _ d S d | _ d S )	N)rR   rS   rT   rU   g     @@r   r   g      ?r   )r   r   r   r   r   r   r   r3   )!super__init__rX   rY   rZ   r[   r\   r]   r^   introundframe_length
hop_lengthr   r6   float64r_   r    ceillog2r   r   r   cospiastypewindowr5   rS   mel_filtersreshaper`   ra   )selfrR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   kwargsr   hann_arangern   	__class__r3   r4   rc      sP   "

z%Gemma3nAudioFeatureExtractor.__init__waveformattention_maskr   c                 C   s  |j dkrtj|dd}| jdkr!|| jtjj|j |j  }| j	dkr+|| j	 }| j
d }t|d|| jd}| jdkr| jrk|dd	df d| j  }|dddf | j|dd	d
f   }tj||gdd}n|ddd	f | j|dd	df   }n|dd	df }|| j }tjj|| jdd}t|}	t|	| j}
tt|
| j}| jd	ur|| j }| jd	ur|| j }|d}|d	d	| j t}||d	|jd  fS ) r   r   )axisrK   r   r   )r7   r8   r9   .Nr   )nry   )r<   r   r#   r]   randomrandnr:   rm   r   r^   rf   rE   rg   rZ   r[   concatenatern   fftrfftr   absmatmulro   logr%   r_   r`   ra   squeezebool)rq   rv   rw   frame_size_for_unfoldframes_to_processfirst_in_framerest_in_frameframesstftmagnitude_specmel_speclog_mel_specmel_spectrogrammaskr3   r3   r4   _extract_spectrogram   s6   

 



&(






z1Gemma3nAudioFeatureExtractor._extract_spectrogramlongest S 
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc                 K   s   t |tjot|jdk}	t |tot |d tjtf}
|	p|
}|r*dd |D }n|s7t |tjs7t|}|s@t|gg}| jtd|i|||||d}g }g }t	|j
|jD ]\}}| |j|\}}||tj || qZt||d|dS )	a  Creates a batch of MEL spectrograms from the provided raw speech.

        This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
        `transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
        carefully when selecting an audio feature extractor, especially with pre-trained models.

        Args:
            raw_speech:
                The audio for which MEL spectrograms are created.
            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
                The padding strategy to use for batches of audio with different lengths.
            max_length (`int`, *optional*, defaults to 480000):
                If provided, defines the maximum length of the audio to allow. Audio longer than this will be
                truncated if `truncation=True`.
            truncation (`bool`, *optional*, defaults to `True`):
                Whether or not to truncate audio above `max_length`.
            pad_to_multiple_of (`int`, *optional*, defaults to 128):
                When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
            return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
                The type of tensors to return (e.g., NumPy, Torch, JAX, TensorFlow).
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the attention mask for the generated MEL spectrograms.
        r   r   c                 S   s   g | ]	}t |gjqS r3   )r   asarrayT).0rsr3   r3   r4   
<listcomp>5  s    z9Gemma3nAudioFeatureExtractor.__call__.<locals>.<listcomp>rG   )r   r   r   r   rU   )rG   rH   )tensor_type)
isinstancer   ndarraylenr:   r   r   padr   ziprG   rw   r   r   appendrm   r   )rq   r   r   r   r   r   r   rU   rr   is_batched_numpyis_batched_sequence
is_batchedbatched_speechprepared_speechprepared_speech_maskspeechr   r3   r3   r4   __call__  s6   #

	z%Gemma3nAudioFeatureExtractor.__call__)rI   rJ   rK   TrL   rM   rN   rO   rP   TTrK   r   rQ   NN)r   r   TrI   NT)__name__
__module____qualname____doc__model_input_namesrd   floatr   r   r   rc   r   r   tupler   r   liststrr   r	   r   r   __classcell__r3   r3   rt   r4   rF   n   s    +	


&D0"
rF   )N)r    collections.abcr   typingr   r   numpyr   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   
get_loggerr   loggerrd   r   r   r   r5   rE   rF   __all__r3   r3   r3   r4   <module>   s<   


"= 
e