o
    pi9                  
   @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlm  mZ ddlZddlmZ ddlmZ zddlZddlmZ dd	lmZmZ W n eyg Z zed
e  W Y dZ[ndZ[ww eeB eB eB ZdZdeddfddZG dd dZ dS )z]
# Audio IO

pyannote.audio relies on torchcodec for reading and torchaudio for resampling.

    N)IOBase)Path)MappingOptionalTuple)Segment)Tensor)AudioSamples)AudioDecoderAudioStreamMetadataz
torchcodec is not installed correctly so built-in audio decoding will fail. Solutions are:
* use audio preloaded in-memory as a {'waveform': (channel, time) torch.Tensor, 'sample_rate': int} dictionary;
* fix torchcodec installation. Error message was:

aI  
Audio files can be provided to the Audio class using different types:
    - a "str" or "Path" instance: "audio.wav" or Path("audio.wav")
    - a "IOBase" instance with "read" and "seek" support: open("audio.wav", "rb")
    - a "Mapping" with any of the above as "audio" key: {"audio": ...}
    - a "Mapping" with both "waveform" and "sample_rate" key:
        {"waveform": (channel, time) torch.Tensor, "sample_rate": 44100}

For last two options, an additional "channel" key can be provided as a zero-indexed
integer to load a specific channel: {"audio": "stereo.wav", "channel": 0}
filereturnr   c                 C   s.   t | d j}t| d tr| d d |S )a6  Protocol preprocessor used to cache audio metadata

    This is useful to speed future random access to this file, e.g.
    in dataloaders using Audio.crop a lot....

    Parameters
    ----------
    file : AudioFile

    Returns
    -------
    metadata : AudioStreamMetadata
        Audio file metadata
    audior   )r
   metadata
isinstancer   seek)r   r    r   J/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/core/io.pyget_audio_metadataF   s   r   c                       s   e Zd ZdZdZededefddZedede	fdd	Z
ddef fddZ	
ddededed
B deeef fddZdedefddZ	
ddedee defddZdedeeef fddZ	ddededeeef fddZ  ZS )AudioaA  Audio IO

    Parameters
    ----------
    sample_rate: int, optional
        Target sampling rate. Defaults to using native sampling rate.
    mono : {'random', 'downmix'}, optional
        In case of multi-channel audio, convert to single-channel audio
        using one of the following strategies: select one channel at
        'random' or 'downmix' by averaging all channels.

    Usage
    -----
    >>> audio = Audio(sample_rate=16000, mono='downmix')
    >>> waveform, sample_rate = audio({"audio": "/path/to/audio.wav"})
    >>> assert sample_rate == 16000
    >>> sample_rate = 44100
    >>> two_seconds_stereo = torch.rand(2, 2 * sample_rate)
    >>> waveform, sample_rate = audio({"waveform": two_seconds_stereo, "sample_rate": sample_rate})
    >>> assert sample_rate == 16000
    >>> assert waveform.shape[0] == 1
    gMbP?waveformr   c                 C   s"   |   jddd }| |d  S )zPower-normalize waveform

        Parameters
        ----------
        waveform : (..., time) Tensor
            Waveform(s)

        Returns
        -------
        waveform: (..., time) Tensor
            Power-normalized waveform(s)
        Tdimkeepdimg:0yE>)squaremeansqrt)r   rmsr   r   r   power_normalizey   s   zAudio.power_normalizer   c                 C   s
  t | trn t | ttfrt| t| jd} nt | tr"| ddS ttd| v rY| d }t|j	dks?|j	d |j	d krCtd| 
dd	}|d	u rQtd
| dd | S d| v rt | d trf| S t| d }| sxtd| d| d|j | S td)a  Validate file for use with the other Audio methods

        Parameter
        ---------
        file: AudioFile

        Returns
        -------
        validated_file : Mapping
            {"audio": str, "uri": str, ...}
            {"waveform": tensor, "sample_rate": int, "uri": str, ...}
            {"audio": file, "uri": "stream"} if `file` is an IOBase instance

        Raises
        ------
        ValueError if file format is not valid or file does not exist.

        )r   uristreamr      r      z>'waveform' must be provided as a (channel, time) torch Tensor.sample_rateNz5'waveform' must be provided with their 'sample_rate'.r    r   zFile z does not existz:Neither 'waveform' nor 'audio' is available for this file.)r   r   strr   stemr   
ValueErrorAudioFileDocStringlenshapeget
setdefaultis_file)r   r   r$   pathr   r   r   validate_file   s>   


"zAudio.validate_fileNr$   c                    s   t    || _|| _d S N)super__init__r$   mono)selfr$   r3   	__class__r   r   r2      s   

zAudio.__init__channelc                 C   s   |dur|||d  }|j d }|dkr7| jdkr+td|d }|||d  }n| jdkr7|jddd}| jdurM| j|krMtj||| j}| j}||fS )a  Downmix and resample

        Parameters
        ----------
        waveform : (channel, time) Tensor
            Waveform.
        sample_rate : int
            Sample rate.
        channel : int, optional
            Channel to use.

        Returns
        -------
        waveform : (channel, time) Tensor
            Remixed and resampled waveform
        sample_rate : int
            New sample rate
        Nr#   r   randomdownmixTr   )	r*   r3   r8   randintr   r$   
torchaudio
functionalresample)r4   r   r$   r7   num_channelsr   r   r   downmix_and_resample   s   


zAudio.downmix_and_resamplec                 C   s>   |  |}d|v rt|d j}|d }|| S t|}|jS )zGet audio file duration in seconds

        Parameters
        ----------
        file : AudioFile
            Audio file.

        Returns
        -------
        duration : float
            Duration in seconds.
        r   r$   )r/   r)   Tr   duration_seconds_from_header)r4   r   framesr$   r   r   r   r   get_duration   s   
zAudio.get_durationdurationc                 C   s&   |p| j }|du rtdt|| S )z=Deterministic number of samples from duration and sample rateNz<`sample_rate` must be provided to compute number of samples.)r$   r'   round)r4   rD   r$   r   r   r   get_num_samples  s   
zAudio.get_num_samplesc                 C   s   |  |}|dd}d|v r|d }|d }| j|||dS t|d }| }|j}|j}t|d tr=|d 	d | j|||dS )a%  Obtain waveform

        Parameters
        ----------
        file : AudioFile

        Returns
        -------
        waveform : (channel, time) torch.Tensor
            Waveform
        sample_rate : int
            Sample rate

        See also
        --------
        AudioFile
        r7   Nr   r$   r7   r   r   )
r/   r+   r?   r
   get_all_samplesdatar$   r   r   r   )r4   r   r7   r   r$   decodersamplesr   r   r   __call__#  s   
zAudio.__call__raisesegmentc              
   C   s  |  |}|dd}d|v r|d }|j\}}|d }|| }	| |j|}
td|
 }|
dk rA|dkr?td|jdd	d}
| |j|}t||| }||krn|dkrltd
|jdd|dd d|	dd|}|dd|
|f }t	|||f}| j
|||dS t|d }|j}|j}|j}	| |j|}t|j}t|j}td| | |}|dk r|dkrtd|dd	d}t| |||| }||	kr|dkrtd
|dd|dd d|	dd|	}|||}|j}|j}t|d tr	|d d | |j|}|j\}}|| | | }t|dkr9td| d|dd d| d| d	|dkrI|ddddf }n	|dkrR|d7 }t	|||f}| j
|||dS )a6  Fast version of self(file).crop(segment, **kwargs)

        Parameters
        ----------
        file : AudioFile
            Audio file.
        segment : `pyannote.core.Segment`
            Temporal segment to load.
        mode : {'raise', 'pad'}, optional
            Specifies how out-of-bounds segments will behave.
            * 'raise' -- raise an error (default)
            * 'pad' -- zero pad

        Returns
        -------
        waveform : (channel, time) torch.Tensor
            Waveform
        sample_rate : int
            Sample rate

        r7   Nr   r$   r   rM   z,requested chunk with negative start time (t=z.3fzs)z!requested chunk with end time (t=zs) greater than r    z	in-memoryz file duration (zs).rG   r   g        r#   zrequested chunk z from z file resulted in z! samples instead of the expected z	 samples.r   )r/   r+   r*   rF   startmaxr'   endFpadr?   r
   r   r$   rA   floatget_samples_played_in_rangerI   r   r   r   rD   abs)r4   r   rN   moder7   r   _num_samplesr$   rD   start_sample	pad_start
end_samplepad_endrI   rJ   r   rO   rQ   rK   expected_num_samplesactual_num_samples
differencer   r   r   cropK  s   









z
Audio.crop)NNr0   )rM   )__name__
__module____qualname____doc__	PRECISIONstaticmethodr   r   	AudioFiler   r/   intr2   r   r?   rT   rC   r   rF   rL   r   ra   __classcell__r   r   r5   r   r   _   sH    @

,
,
r   )!re   r8   warningsior   pathlibr   typingr   r   r   torch.nn.functionalnnr<   rR   r;   pyannote.corer   torchr   
torchcodecr	   torchcodec.decodersr
   r   	Exceptionewarnr%   rh   r(   r   r   r   r   r   r   <module>   s4   	