o
    wik                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZ d dlZd dl	Z
d dlmZ d dlZd dlmZ dZzd dlmZ d dlmZ ddiZW n eyO   dZY nw e Zd	d
 e D Zeeee ef Zddejdee dejfddZ ddededefddZ!G dd de"ZdS )    N)IterableListOptionalUnion)loggingT)AudioSegment)CouldntDecodeErroropusFc                 C   s   g | ]}d |   qS ).)lower).0i r   m/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/asr/parts/preprocessing/segment.py
<listcomp>>       r   signalchannel_selectorreturnc                 C   s  | j dkr|dvrtdt|| S | jd }| j| }||kr'td|| |du r.	 | S |dkr;tj| dd} | S t	|t
rW||krOtd	| d
| d| d|f } | S t	|trt||krmtd| d
| d| d|f } t|dkrtj| dd} | S td| d)a  
    Convert a multi-channel signal to a single-channel signal by averaging over channels or
    selecting a single channel, or pass-through multi-channel signal when channel_selector is `None`.

    Args:
        signal: numpy array with shape (..., num_channels)
        channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
                          or an iterable of integers denoting a subset of channels. Channel selector is
                          using zero-based indexing. If set to `None`, the original signal will be returned.
                          Uses zero-based indexing.

    Returns:
        numpy array
       Nr   averageJInput signal is one-dimensional, channel selector (%s) cannot not be used.zjNumber of channels (%d) is greater or equal than number of samples (%d). Check for possible transposition.Nr   axiszCannot select channel z from a signal with z
 channels..zCannot select channel subset z'Unexpected value for channel_selector ())ndim
ValueErrorstrshapesizer   warningnpmean
isinstanceintr   maxlensqueeze)r   r   num_channelsnum_samplesr   r   r   select_channelsD   sF   




r,   >  float32
audio_file	target_srdtypec                 C   sd   t | d"}|j|d}|j|krtjj||j|d}| }W d   |S 1 s+w   Y  |S )a  
    Read the samples from the given audio_file path. If not specified, the input audio file is automatically
    resampled to 16kHz.

    Args:
        audio_file (str):
            Path to the input audio file
        target_sr (int):
            Targeted sampling rate
    Returns:
        samples (numpy.ndarray):
            Time-series sample data from the given audio file
    rr1   orig_srr0   N)sf	SoundFileread
sampleratelibrosacoreresample	transpose)r/   r0   r1   fsamplesr   r   r   get_samples   s   


r@   c                   @   s  e Zd ZdZddejddddddddddfdee dee d	ee	e
ee
 f  d
ee dee f
ddZdd Zdd Zdd Zedd ZedddddejdddddddfddZe						d;ddZe							d<ddZedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zd0d1 Z d2d3 Z!d=d5d6Z"d>d7d8Z#d?d9d:Z$dS )@r   a  Audio segment abstraction.
    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
    NF<   i   i   normalize_dbref_channelr/   offsetdurationc                 C   s  |  |}|jdkr|
dvrtdt|
|jdkr t||
}n	|jdkr)td|durD||krD| }tjj	|||d}| }|}|r[| }tj
j|||||d	\}}| }|| _|| _|	durg|	n|| _|| _|| _|| _|| _|| _|dur| || dS dS )
zwCreate audio segment from samples.
        Samples are convert float32 internally, with int scaled to [-1, 1].
        r   r   r         zTSignals with more than two dimensions (sample, channel) are currently not supported.Nr4   )top_dbrefframe_length
hop_length)_convert_samples_to_float32r   r   r   r,   NotImplementedErrorr=   r:   r;   r<   effectstrim_samples_sample_rate_orig_sr_ref_channel_normalize_db_audio_file_offset	_durationrB   )selfr?   sample_rater0   rO   trim_reftrim_top_dbtrim_frame_lengthtrim_hop_lengthr5   r   rB   rC   r/   rD   rE   _r   r   r   __init__   sB   




zAudioSegment.__init__c                 C   sR   t |t | ur
dS | j|jkrdS | jj|jjkrdS t| j|jkr'dS dS )z%Return whether two objects are equal.FT)typerQ   rP   r    r#   anyr?   rX   otherr   r   r   __eq__   s   zAudioSegment.__eq__c                 C   s   |  | S )z'Return whether two objects are unequal.)rd   rb   r   r   r   __ne__   s   zAudioSegment.__ne__c                 C   s^   | j dkrdt| | j| j| j| jf S ddd | jD }dt| | j| j| j| j |f S )z0Return human-readable representation of segment.r   z@%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, rms=%.2fdBz, c                 S   s   g | ]}|d dqS )z.2fdBr   )r   rmsr   r   r   r      r   z(AudioSegment.__str__.<locals>.<listcomp>zO%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, num_channels=%d, rms=[%s])r*   r`   r+   rY   rE   rms_dbjoin)rX   
rms_db_strr   r   r   __str__   s"   
zAudioSegment.__str__c                 C   st   |  d}| jtjtjtjtjfv r%t| jj}|dd|d   9 }|S | jtj	tj
tjfv r3	 |S td| j )zConvert sample type to float32.
        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        r.   g      ?rF   r   zUnsupported sample type: %s.)astyper1   r#   int8int16int32int64iinfobitsfloat16r.   float64	TypeError)r?   float32_samplesrr   r   r   r   rL     s   
z(AudioSegment._convert_samples_to_float32r   c                 C   s  d}t |tr| j|||||||||	|
||||dS t |tr)tj|d tv rzLt	|d<}|r5dnd}|j
}|durK|dkrK|t||  |dur_|dkr_|jt|| |d}n|j|d}W d   n1 sow   Y  W n( ty } ztd	| d
| d t|dr|d W Y d}~nd}~ww tr|du rzStj|ttj|d d}|j}|j}|dur|dkr|d }|t|d }|dur|dkr|d }|dt| }t| }|dkrt|d|f}W n ty } ztd	| d| d W Y d}~nd}~ww |du r0tr#dnd}td| d| d| |||||||	|
|||||||dS )a  
        Load a file supported by librosa and return as an AudioSegment.
        :param audio_file: path of file to load.
                           Alternatively, a list of paths of single-channel files can be provided
                           to form a multichannel signal.
        :param target_sr: the desired sample rate
        :param int_values: if true, load samples as 32-bit integers
        :param offset: offset in seconds when loading audio
        :param duration: duration in seconds when loading audio
        :param trim: if true, trim leading and trailing silence from an audio signal
        :param trim_ref: the reference amplitude. By default, it uses `np.max` and compares to the peak amplitude in
                         the signal
        :param trim_top_db: the threshold (in decibels) below reference to consider as silence
        :param trim_frame_length: the number of samples per analysis frame
        :param trim_hop_length: the number of samples between analysis frames
        :param orig_sr: the original sample rate
        :param channel selector: string denoting the downmix mode, an integer denoting the channel to be selected,
                                 or an iterable of integers denoting a subset of channels. Channel selector is using
                                 zero-based indexing. If set to `None`, the original signal will be used.
        :param normalize_db (Optional[float]): if not None, normalize the audio signal to a target RMS value
        :param ref_channel (Optional[int]): channel to use as reference for normalizing multi-channel audio,
                                            set None to use max RMS across channels
        :return: AudioSegment instance
        N)audio_file_listr0   
int_valuesrD   rE   rO   rZ   r[   r\   r]   r5   r   rB   rC   r   r2   ro   r.   r   r3   Loading % via SoundFile raised RuntimeError: `z+`. NeMo will fallback to loading via pydub.seek)codeci  r   z' via pydub raised CouldntDecodeError: ``.zsoundfile, and pydub	soundfilezYour audio file z& could not be decoded. We tried using r
   )r0   rO   rZ   r[   r\   r]   r5   r   rB   rC   r/   rD   rE   )r%   listfrom_file_listr   ospathsplitextsf_supported_formatsr6   r7   r9   r{   r&   r8   RuntimeErrorr   errorhasattr
HAVE_PYDUBAudio	from_fileffmpeg_codecsget
frame_ratechannelsr#   arrayget_array_of_samplesreshaper   	Exception)clsr/   r0   rx   rD   rE   rO   rZ   r[   r\   r]   r5   r   rB   rC   r?   r>   r1   rY   er*   secondserrlibsr   r   r   r     s   *
	

	 $
zAudioSegment.from_filec                 O   s"  t |tr|t|krtd| dt| || g}d}d}
|D ]V}| j||||||ddd|	}|jdkrDtd|j d| |du rK|j}|jdddf }|
du r[|}
q"t|t|
krotd	|j d
|
j t	j
|
|gdd}
q"t	|
}
|}| |
|g|R ||||d|	S )a  
        Function wrapper for `from_file` method. Load a list of files from `audio_file_list`.
        The length of each audio file is unified with the duration item in the input manifest file.
        See `from_file` method for arguments.

        If a list of files is provided, load samples from individual single-channel files and
        concatenate them along the channel dimension.
        z-Channel cannot be selected: channel_selector=z, num_audio_files=NF)r/   r0   rx   rD   rE   r   rO   r   z4Expecting a single-channel audio signal, but loaded z channels from file z.Loaded samples need to have identical length: z != r   )r0   rO   r   r/   )r%   r&   r(   r   r   r*   rY   r?   r    r#   concatenater)   )r   rw   r0   rx   rD   rE   rO   r   argskwargsr?   a_file	a_segment	a_samplesrY   r   r   r   r     sl   

	


zAudioSegment.from_file_listr.   c	              
   C   s  d}	zt |d}
|
j}|durt|| | }n|}d|  k r)t|
k ran n6t|
| }|du r<td|}nt|| }||krRt	d| d| d|

| |
j||d}d	}	n$|t|
krtd
| dt|
 d| d |
j|d}n|
j|d}W d   n1 sw   Y  W n t	y } ztd| d| d |d}~ww | ||||||d}|	r|jd| |_|S )aS  Grabs n_segments number of samples from audio_file.
        If offset is not provided, n_segments are selected randomly.
        If offset is provided, it is used to calculate the starting sample.

        Note that audio_file can be either the file path, or a file-like object.

        :param audio_file: path to a file or a file-like object
        :param target_sr: sample rate for the output samples
        :param n_segments: desired number of samples
        :param trim: if true, trim leading and trailing silence from an audio signal
        :param orig_sr: the original sample rate
        :param channel selector: select a subset of channels. If set to `None`, the original signal will be used.
        :param offset: fixed offset in seconds
        :param dtype: data type to load audio as.
        :return: numpy array of samples
        Fr2   Nr   zProvided audio start (z') is larger than the maximum possible (r   r3   TzNumber of segments (z) is greater than the length (z) of the audio file z). This may lead to shape mismatch errors.ry   rz   r}   )r0   rO   r5   r   )r6   r7   r9   mathceilr(   randomrandintfloorr   r{   r8   r   r"   r   rP   )r   r/   r0   
n_segmentsrO   r5   r   rD   r1   is_segmentedr>   rY   n_segments_at_original_srmax_audio_startaudio_startr?   r   featuresr   r   r   segment_from_file  sV   
zAudioSegment.segment_from_filec                 C   s
   | j  S )zReturns a copy of the samples.)rP   copyrX   r   r   r   r?   /  s   
zAudioSegment.samplesc                 C      | j S )z'Returns the sample rate of the segment.)rQ   r   r   r   r   rY   4     zAudioSegment.sample_ratec                 C   s   | j jdkrdS | j jd S )z.Returns the number of channels in the segment.r   r   )rP   r   r    r   r   r   r   r*   9  s   zAudioSegment.num_channelsc                 C   s   | j jd S )z-Returns the number of samples in the segment.r   )rP   r    r   r   r   r   r+   A  s   zAudioSegment.num_samplesc                 C   s   | j t| j S )z/Returns the duration of the segment in seconds.)r+   floatrQ   r   r   r   r   rE   F  s   zAudioSegment.durationc                 C   s"   t j| jd dd}dt | S )zReturn per-channel RMS value.rF   r   r   
   )r#   r$   rP   log10rX   mean_squarer   r   r   rh   K  s   zAudioSegment.rms_dbc                 C   r   )z0Returns the original sample rate of the segment.)rR   r   r   r   r   r5   Q  r   zAudioSegment.orig_src                 C      | j dur
t| j S dS )z(Returns the offset used for the segment.N)rV   r   r   r   r   r   rD   V     zAudioSegment.offsetc                 C   r   )z8Returns the audio file that the segment was loaded from.N)rU   r   r   r   r   r   r/   [  r   zAudioSegment.audio_filec                 C   s,   t t j| jd dd}| jdkp|dkS )zChecks if the segment is empty.rF   r   r   )r#   sumr$   rP   r+   r   r   r   r   is_empty`  s   zAudioSegment.is_emptyc                 C   s   |  j d|d  9  _ dS )zReturns the gain in decibels.g      $@g      4@N)rP   )rX   gainr   r   r   gain_dbe  s   zAudioSegment.gain_dbc                 C   s>   | j }| jdkr|du rt|n|| }|| }| | dS )zNormalize the signal to a target RMS value in decibels.
        For multi-channel audio, the RMS value is determined by the reference channel (if not None),
        otherwise it will be the maximum RMS across all channels.
        r   N)rh   r*   r'   r   )rX   	target_dbrC   rh   r   r   r   r   rB   i  s
   
zAudioSegment.normalize_dbc                 C   sp   | j j}|dkr|r|nd|f}n|dkr$|r||fdfnd|fdf}ntd| dtj| j |dd| _ d	S )
zAdd zero padding to the sample. The pad size is given in number
        of samples.
        If symmetric=True, `pad_size` will be added to both sides. If false,
        `pad_size`
        zeros will be added only to the end.
        r   r   rF   )r   r   z\Padding not implemented for signals with more that 2 dimensions. Current samples dimension: r
   constant)modeN)rP   r   rM   r#   pad)rX   pad_size	symmetricsamples_ndim	pad_widthr   r   r   r   t  s   zAudioSegment.padc                 C   s   |du rdn|}|du r| j n|}|dk r| j | }|dk r#| j | }|dk r-td| |dk r7td| ||krCtd||f || j krQtd|| j f tt|| j }tt|| j }| j|| | _dS )a  Cut the AudioSegment between given boundaries.
        Note that this is an in-place transformation.
        :param start_time: Beginning of subsegment in seconds.
        :type start_time: float
        :param end_time: End of subsegment in seconds.
        :type end_time: float
        :raise ValueError: If start_time or end_time is incorrectly set,
        e.g. out of bounds in time.
        Ng        z1The slice start position (%f s) is out of bounds.z/The slice end position (%f s) is out of bounds.zFThe slice start position (%f s) is later than the end position (%f s).z7The slice end position (%f s) is out of bounds (> %f s))rE   r   r&   roundrQ   rP   )rX   
start_timeend_timestart_sample
end_sampler   r   r   
subsegment  s&   




zAudioSegment.subsegment)NFr   r   FN)Nr   FNNNr.   )r   N)F)NN)%__name__
__module____qualname____doc__r#   r'   r   r   r&   r   r   r   r_   rd   re   rk   staticmethodrL   classmethodr   r   r   propertyr?   rY   r*   r+   rE   rh   r5   rD   r/   r   r   rB   r   r   r   r   r   r   r      s    
<
yYG










r   )N)r-   r.   )#r   r   r   typingr   r   r   r   r:   numpyr#   numpy.typingnptr~   r6   
nemo.utilsr   r   pydubr   r   pydub.exceptionsr   r   ModuleNotFoundErroravailable_formatskeysr   r&   r   ChannelSelectorTypeNDArrayr,   r@   objectr   r   r   r   <module>   s.   # ?