o
    Si                     @   sz  d dl mZ d dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZ d d	lmZ d d
lmZmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ee:ee: f Z;eG dd dZ<		ddej=de2dee2 de<dee2 de>dej=fddZ?dS )    )	dataclass)BytesIO)ceilisclose)Path)CallableDictListLiteralOptionalSequenceTupleUnionN)ROUND_HALF_UP)get_current_audio_backendinfo
save_audio)AudioSource)AudioLoadingErrorDurationMismatchError	VideoInfo%get_audio_duration_mismatch_tolerance)AudioTransformClippingCompressDereverbWPELoudnessNormalization
NarrowbandResampleReverbWithImpulseResponseSpeedTempoVolume)Codec)	PathlikeSecondsSetContainingAnythingasdict_nonullcompute_num_samplesfastcopyifnoneperturb_num_samplesrich_exception_infoc                   @   s  e Zd ZU dZeed< ee ed< eed< eed< e	ed< dZ
eee  ed< dZeeeeef   ed	< d
d Zedee fddZedefddZedee fddZedefddZedefddZedefddZedefddZe				dmdedeeeeegef f  dee dee d edd fd!d"Zed#e dedd fd$d%Z!				dnd&ee" d'e	dee	 d(ee dd f
d)d*Z#de$fd+d,Z%d-d. Z&e'		/	dod&ee" d'e	dee	 de(j)fd0d1Z*e'		/		2	2dpd&ee" d'e	dee	 d3ed4ede+e,j-ee,j- f fd5d6Z.d7d8 Z/d9ee(j) de(j)fd:d;Z0d'e	dee	 defd<d=Z1dedd fd>d?Z2d@edAedd fdBdCZ3dqdDe4dEedd fdFdGZ5dqdDe4dEedd fdHdIZ6dqdDe4dEedd fdJdKZ7	2drdLedMedEedd fdNdOZ8dsdPe4dEedd fdQdRZ9dqdEedd fdSdTZ:		2		2			dtdUed  dVedWedEedXee;e  dYee dZee dd fd[d\Z<dedd fd]d^Z=		/	2	_	dud`edae4dbedcee dEedd fdddeZ>	gdvdLe?dhe4dd fdidjZ@ed#e$dd fdkdlZAdS )w	Recordinga  
    The :class:`~lhotse.audio.Recording` manifest describes the recordings in a given corpus.
    It contains information about the recording, such as its path(s), duration, the number of samples, etc.
    It allows to represent multiple channels coming from one or more files.

    This manifest does not specify any segmentation information or supervision such as the transcript or the speaker
    -- we use :class:`~lhotse.supervision.SupervisionSegment` for that.

    Note that :class:`~lhotse.audio.Recording` can represent both a single utterance (e.g., in LibriSpeech)
    and a 1-hour session with multiple channels and speakers (e.g., in AMI).
    In the latter case, it is partitioned into data suitable for model training using :class:`~lhotse.cut.Cut`.

    Internally, Lhotse supports multiple audio backends to read audio file.
    By default, we try to use libsoundfile, then torchaudio (with FFMPEG integration starting with torchaudio 2.1),
    and then audioread (which is an ffmpeg CLI wrapper).
    For sphere files we prefer to use sph2pipe binary as it can work with certain unique encodings such as "shorten".

    Audio backends in Lhotse are configurable. See:

    * :func:`~lhotse.audio.backend.available_audio_backends`
    * :func:`~lhotse.audio.backend.audio_backend`,
    * :func:`~lhotse.audio.backend.get_current_audio_backend`
    * :func:`~lhotse.audio.backend.set_current_audio_backend`
    * :func:`~lhotse.audio.backend.get_default_audio_backend`


    Examples

        A :class:`~lhotse.audio.Recording` can be simply created from a local audio file::

            >>> from lhotse import RecordingSet, Recording, AudioSource
            >>> recording = Recording.from_file('meeting.wav')
            >>> recording
            Recording(
                id='meeting',
                sources=[AudioSource(type='file', channels=[0], source='meeting.wav')],
                sampling_rate=16000,
                num_samples=57600000,
                duration=3600.0,
                transforms=None
            )

        This manifest can be easily converted to a Python dict and serialized to JSON/JSONL/YAML/etc::

            >>> recording.to_dict()
            {'id': 'meeting',
             'sources': [{'type': 'file',
               'channels': [0],
               'source': 'meeting.wav'}],
             'sampling_rate': 16000,
             'num_samples': 57600000,
             'duration': 3600.0}

        Recordings can be also created programatically, e.g. when they refer to URLs stored in S3 or somewhere else::

            >>> s3_audio_files = ['s3://my-bucket/123-5678.flac', ...]
            >>> recs = RecordingSet.from_recordings(
            ...     Recording(
            ...         id=url.split('/')[-1].replace('.flac', ''),
            ...         sources=[AudioSource(type='url', source=url, channels=[0])],
            ...         sampling_rate=16000,
            ...         num_samples=get_num_samples(url),
            ...         duration=get_duration(url)
            ...     )
            ...     for url in s3_audio_files
            ... )

        It allows reading a subset of the audio samples as a numpy array::

            >>> samples = recording.load_audio()
            >>> assert samples.shape == (1, 16000)
            >>> samples2 = recording.load_audio(offset=0.5)
            >>> assert samples2.shape == (1, 8000)

        See also: :class:`~lhotse.audio.recording.Recording`, :class:`~lhotse.cut.Cut`, :class:`~lhotse.cut.CutSet`.
    idsourcessampling_ratenum_samplesdurationNchannel_ids
transformsc                 C   sD   | j d u rtdd | jD | _ tdd | jD dk s J dd S )Nc                 s   s     | ]}|j D ]}|V  qqd S Nchannels).0sourcecid r;   J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/audio/recording.py	<genexpr>   s    z*Recording.__post_init__.<locals>.<genexpr>c                 s   s    | ]}|j V  qd S r5   )	has_video)r8   r9   r;   r;   r<   r=      s       zRLhotse does not currently support recordings with more than a single video stream.)r3   sortedr/   sumselfr;   r;   r<   __post_init__   s   

zRecording.__post_init__returnc                 C   s   | j }|d u r	d S |jS r5   )_video_sourcevideorC   sr;   r;   r<   rG      s   zRecording.videoc                 C   s
   | j d uS r5   )rF   rB   r;   r;   r<   r>         
zRecording.has_videoc                 C   s   | j D ]	}|jr|  S qd S r5   )r/   r>   rH   r;   r;   r<   rF      s
   
zRecording._video_sourcec                 C      t dd | jD S )Nc                 s       | ]}|j d kV  qdS memoryNtyper8   rI   r;   r;   r<   r=          z)Recording.is_in_memory.<locals>.<genexpr>anyr/   rB   r;   r;   r<   is_in_memory      zRecording.is_in_memoryc                 C   rK   )Nc                 s   rL   )sharNrO   rQ   r;   r;   r<   r=      rR   z+Recording.is_placeholder.<locals>.<genexpr>rS   rB   r;   r;   r<   is_placeholder   rV   zRecording.is_placeholderc                 C   s
   t | jS r5   )lenr3   rB   r;   r;   r<   num_channels   rJ   zRecording.num_channelsc                 C   s4   t tdd | jD }t|dkr|d S td)zInfer format of the audio sources.
        If all sources have the same format, return it.
        If sources have different formats, raise an error.
        c                 S   s   g | ]}|j qS r;   formatrQ   r;   r;   r<   
<listcomp>   s    z+Recording.source_format.<locals>.<listcomp>   r   zMSources have different formats. Resolving to a single format not implemented.)listsetr/   rY   NotImplementedError)rC   source_formatsr;   r;   r<   source_format   s   zRecording.source_formatFpathrecording_idrelative_path_depthforce_opus_sampling_rateforce_read_audioc                 C   s   t | } |du r| jn	t|r|| n|}t| ||d}|jdur,|jj}t||j}n|j}|j}t	||j||t
dtt|j|durS|dkrSd| j| d nt| |jdgdS )a  
        Read an audio file's header and create the corresponding ``Recording``.
        Suitable to use when each physical file represents a separate recording session.

        .. caution::
            If a recording session consists of multiple files (e.g. one per channel),
            it is advisable to create the ``Recording`` object manually, with each
            file represented as a separate ``AudioSource`` object.

        :param path: Path to an audio file supported by libsoundfile (pysoundfile).
        :param recording_id: recording id, when not specified ream the filename's stem ("x.wav" -> "x").
            It can be specified as a string or a function that takes the recording path and returns a string.
        :param relative_path_depth: optional int specifying how many last parts of the file path
            should be retained in the ``AudioSource``. By default writes the path as is.
        :param force_opus_sampling_rate: when specified, this value will be used as the sampling rate
            instead of the one we read from the manifest. This is useful for OPUS files that always
            have 48kHz rate and need to be resampled to the real one -- we will perform that operation
            "under-the-hood". For non-OPUS files this input is undefined.
        :param force_read_audio: Set it to ``True`` for audio files that do not have any metadata
            in their headers (e.g., "The People's Speech" FLAC files).
        :return: a new ``Recording`` instance pointing to the audio file.
        N)rg   rh   filer   /)rP   r7   r9   rG   r.   r0   r1   r2   r/   )r   stemcallabler   rG   r2   r(   
samplerateframesr-   r   r_   ranger7   joinpartsstr)rd   re   rf   rg   rh   
audio_infor2   r1   r;   r;   r<   	from_file   s@   

zRecording.from_filedatac              
   C   sB   t | }t |}t||j|j|jtdtt	|j
| dgdS )a  
        Like :meth:`.Recording.from_file`, but creates a manifest for a byte string with
        raw encoded audio data. This data is first decoded to obtain info such as the
        sampling rate, number of channels, etc. Then, the binary data is attached to the
        manifest. Calling :meth:`.Recording.load_audio` does not perform any I/O and
        instead decodes the byte string contents in memory.

        .. note:: Intended use of this method is for packing Recordings into archives
            where metadata and data should be available together
            (e.g., in WebDataset style tarballs).

        .. caution:: Manifest created with this method cannot be stored as JSON
            because JSON doesn't allow serializing binary data.

        :param data: bytes, byte string containing encoded audio contents.
        :param recording_id: recording id, unique string identifier.
        :return: a new ``Recording`` instance that owns the byte string data.
        rN   rP   r7   r9   rk   )r   r   r   r-   rn   ro   r2   r   r_   rp   r7   )rv   re   streamrt   r;   r;   r<   
from_bytes  s   zRecording.from_bytesr7   offsetr\   c           	      C   s  t dd | jD r| S dd }t dd |||fD s:|du s'||| jkrH|du s0t|drH|du s:t|| jrHdd	 | jD }t| |d
S | j|t|d|d}t }t	|t
|| j|d t|| j}t|trp|g}t| jtd|| dg| j|jd t|| jdS )a?  
        Read audio data and return a copy of the manifest with binary data attached.
        Calling :meth:`.Recording.load_audio` on that copy will not trigger I/O.

        If all arguments are left as defaults, we won't decode the audio and attach
        the bytes we read from disk/other source as-is.
        If ``channels``, ``duration``, or ``offset`` are specified, we'll decode the
        audio and re-encode it into ``format`` before attaching.
        The default format is FLAC, other formats compatible with torchaudio.save are
        also accepted.
        c                 s   rL   rM   rO   )r8   srcr;   r;   r<   r=   =  rR   z+Recording.move_to_memory.<locals>.<genexpr>c                 S   s   t | tr| gS | S r5   )
isinstanceint)xr;   r;   r<   _aslist@  s   
z)Recording.move_to_memory.<locals>._aslistc                 s   s    | ]}|d u V  qd S r5   r;   )r8   optr;   r;   r<   r=   F  s    N        c                 S   s(   g | ]}t d |jt|jd dqS )rN   rbrw   )r   r7   openr9   read)r8   
old_sourcer;   r;   r<   r]   K  s    z,Recording.move_to_memory.<locals>.<listcomp>r/   r   )r7   rz   r2   r[   rN   rw   r^   )r.   r/   r0   r1   r2   )allr/   r3   r   r2   r)   
load_audior*   r   r   torch
from_numpyr0   r|   r}   r-   r.   r   getvalueshape)	rC   r7   rz   r2   r\   r   memory_sourcesaudiorx   r;   r;   r<   move_to_memory*  s>   

zRecording.move_to_memoryc                 C   s*   t | }| jd urdd | jD |d< |S )Nc                 S   s"   g | ]}t |tr|n| qS r;   )r|   dictto_dictr8   tr;   r;   r<   r]   p  s    z%Recording.to_dict.<locals>.<listcomp>r4   )r'   r4   )rC   dr;   r;   r<   r   m  s   

zRecording.to_dictc                 C   sT   ddl m}m} | jdkr|n|}|| jd| j| jdkr$| jd | dS | j| dS )zz
        Create a Cut out of this recording --- MonoCut or MultiCut, depending on the
        number of channels.
        r   )MonoCutMultiCutr^   r   )r.   startr2   channel	recording)
lhotse.cutr   r   rZ   r.   r2   r3   )rC   r   r   clsr;   r;   r<   to_cutu  s   zRecording.to_cutr   c                    s  || j ksJ d| j  d| d|}|dur!t|| j ddr!d} du r)t  n!tt tr2 gn  t| j} |sJJ d| d  d	d
d | jpQg D }||}}t	|D ]}	|	j
||| jd\}}q]g }
| jD ]-} |jsyqp|j||| jd} fddt|jD }|rtj||dd}|
| qp| |
}|D ]}	|	|| j}q| jrt|||| ddd}|S t|||| d}|S )a  
        Read the audio samples from the underlying audio source (path, URL, unix pipe/command).

        :param channels: int or iterable of ints, a subset of channel IDs to read (reads all by default).
        :param offset: seconds, where to start reading the audio (at offset 0 by default).
            Note that it is only efficient for local filesystem files, i.e. URLs and commands will read
            all the samples first and discard the unneeded ones afterwards.
        :param duration: seconds, indicates the total audio time to read (starting from ``offset``).
        :return: a numpy array of audio samples with shape ``(num_channels, num_samples)``.
        3Cannot load audio because the Recording's duration 's is smaller than the requested offset s.NMbP?abs_tolbRequested to load audio from a channel that does not exist in the recording: (recording channels:  -- requested channels: )c                 S   $   g | ]}t |tr|nt|qS r;   r|   r   	from_dictr8   tnfmr;   r;   r<   r]         z(Recording.load_audio.<locals>.<listcomp>rz   r2   r0   )rz   r2   rg   c                       g | ]
\}}| vr|qS r;   r;   r8   idxr:   r6   r;   r<   r]         r   axis    .Aconstantrz   r2   r   	tolerancepad_mode)rz   r2   r   )r2   r   r&   	frozensetr|   r}   r3   issubsetr4   reversedreverse_timestampsr0   r/   intersectionr7   r   	enumeratenpdeleteappend_stack_audio_channelsr>    assert_and_maybe_fix_num_samples)rC   r7   rz   r2   orig_durationrecording_channelsr4   
offset_augduration_augtfnsamples_per_sourcer9   sampleschannels_to_remover   r;   r6   r<   r     sz   





zRecording.load_audioT
with_audioforce_consistent_durationc                    sN  | j sJ d| j d|| jksJ d| j d| dt| jdD ]}t|tr5|d dvs4J d	q#t|ttfr@J d	q#|sR| j	j
||d
d\}}|dfS |}	|durbt|| jddrbd} du rjt  n!tt trs gn  t| j}
 |
sJ d|
 d  ddd | jpg D }||}}t|D ]}|j||| jd\}}qg }d}| jD ]8}|j r|j
||d\}}n|j||d} |jsq fddt|jD }|rtj||dd}|| q|dusJ | |}|D ]}||| j}q|rt|||jd | jj  | ddd}n	t|||	| dd}|t!"|fS )a  
        Read the video frames and audio samples from the underlying source (path, URL, unix pipe/command).

        :param channels: int or iterable of ints, a subset of channel IDs to read (reads all by default).
        :param offset: seconds, where to start reading the video (at offset 0 by default).
            Note that it is only efficient for local filesystem files, i.e. URLs and commands will read
            all the samples first and discard the unneeded ones afterwards.
        :param duration: seconds, indicates the total video time to read (starting from ``offset``).
        :param with_audio: bool, whether to load and return audio alongside video. True by default.
        :param force_consistent_duration: bool, if audio duration is different than video duration
            (as counted by ``num_frames / fps``), we'll either truncate or pad the audio with zeros.
            True by default.
        :return: a tuple of video tensor and optional audio tensor (or None).
        z
Recording z has no video to load.r   r   r   r;   name)r    r!   zARecording.load_video() does not support speed/tempo perturbation.F)rz   r2   r   Nr   r   r   r   r   c                 S   r   r;   r   r   r;   r;   r<   r]   1  r   z(Recording.load_video.<locals>.<listcomp>r   )rz   r2   c                    r   r;   r;   r   r6   r;   r<   r]   Q  r   r   r   r   zeror   reflect)rz   r2   r   r   )#r>   r.   r2   r*   r4   r|   r   r    r!   rF   
load_videor   r&   r   r}   r3   r   r   r   r0   r/   r   r   r7   r   r   r   r   r   r   r   rG   fpsr   r   )rC   r7   rz   r2   r   r   r   rG   _r   r   r4   r   r   r   r   r9   r   r   r   r;   r6   r<   r     s   







zRecording.load_videoc                 C   s$   | j rddlm} || jjdS d S )Nr   )Video)filename)r>   IPython.displayr   rF   r9   )rC   r   r;   r;   r<   
play_videoy  s   zRecording.play_videor   c              	   C   s   t tt | jd}t|dkr\dd |D }tdd |D }t|D ].\}}||jd  |krFt	|dd||jd  ffd	}|||< q$t
d
| j dt  dtj|dd}|S t|}|S )Nr0   r^   c                 S   s*   g | ]}|j d kr|dddf n|qS r^   N)ndimrQ   r;   r;   r<   r]     s    z3Recording._stack_audio_channels.<locals>.<listcomp>c                 s   s    | ]}|j d  V  qdS r   )r   rQ   r;   r;   r<   r=     rR   z2Recording._stack_audio_channels.<locals>.<genexpr>r   r   r   r   zVThe mismatch between the number of samples in the different channels of the recording z' is greater than the allowed tolerance .r   )r}   r(   r   r0   rY   maxr   r   r   padr   r.   concatenatevstack)rC   r   allowed_diffmax_samplesirI   r   r;   r;   r<   r     s4    

zRecording._stack_audio_channelsc                 C   s:   |dkr|d u r| j S |d ur|n| j| }t|| jdS )Nr   r   )r1   r2   r(   r0   )rC   rz   r2   r;   r;   r<   _expected_num_samples  s   zRecording._expected_num_samplesc                    s   t |  fdd| jD dS )Nc                    s   g | ]}|  qS r;   )with_path_prefixrQ   rd   r;   r<   r]         z.Recording.with_path_prefix.<locals>.<listcomp>r   r)   r/   )rC   rd   r;   r   r<   r     s   zRecording.with_path_prefixwidthheightc                    s   t |  fdd| jD dS )Nc                    s   g | ]	}|j  d qS ))r   r   )with_video_resolutionrQ   r   r   r;   r<   r]     s    z3Recording.with_video_resolution.<locals>.<listcomp>r   r   )rC   r   r   r;   r   r<   r     s   zRecording.with_video_resolutionfactoraffix_idc                 C   f   | j dur
| j  ng }|t|d t| j|}|| j }t| |r+| j d| n| j|||dS )a  
        Return a new ``Recording`` that will lazily perturb the speed while loading audio.
        The ``num_samples`` and ``duration`` fields are updated to reflect the
        shrinking/extending effect of speed.

        :param factor: The speed will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_sp{factor}".
        :return: a modified copy of the current ``Recording``.
        Nr   _spr.   r1   r2   r4   )	r4   copyr   r    r+   r1   r0   r)   r.   rC   r   r   r4   new_num_samplesnew_durationr;   r;   r<   perturb_speed  s   
zRecording.perturb_speedc                 C   r   )aD  
        Return a new ``Recording`` that will lazily perturb the tempo while loading audio.

        Compared to speed perturbation, tempo preserves pitch.
        The ``num_samples`` and ``duration`` fields are updated to reflect the
        shrinking/extending effect of tempo.

        :param factor: The tempo will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_tp{factor}".
        :return: a modified copy of the current ``Recording``.
        Nr   _tpr   )	r4   r   r   r!   r+   r1   r0   r)   r.   r   r;   r;   r<   perturb_tempo  s   
zRecording.perturb_tempoc                 C   R   | j dur
| j  ng }|t|d t| |r#| j d| |dS | j|dS )a}  
        Return a new ``Recording`` that will lazily perturb the volume while loading audio.

        :param factor: The volume scale to be applied (e.g. factor=1.1 means 1.1x louder).
        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_tp{factor}".
        :return: a modified copy of the current ``Recording``.
        Nr   _vpr.   r4   )r4   r   r   r"   r)   r.   )rC   r   r   r4   r;   r;   r<   perturb_volume     	zRecording.perturb_volumecodecrestore_orig_src                 C   s   | j dur
| j  ng }|t|| j|d  t| j|r!| jndtd}t	| |r2| j
 d| n| j
||r=| j|dS d|dS )z
        Return a new ``Recording`` that will lazily apply narrowband effect while loading audio.
            by affixing it with "_nb_{codec}".

        :return: a modified copy of the current ``Recording``.
        N)r   source_sampling_rater   @  rounding_nb_)r.   r1   r0   r4   )r4   r   r   r   r0   r   r(   r2   r   r)   r.   )rC   r   r   r   r4   r   r;   r;   r<   
narrowband  s0   	zRecording.narrowbandtargetc                 C   r   )aY  
        Return a new ``Recording`` that will lazily apply WPE dereverberation.

        :param target: The target loudness (in dB) to normalize to.
        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_ln{factor}".
        :return: a modified copy of the current ``Recording``.
        N)r  _lnr   )r4   r   r   r   r)   r.   )rC   r  r   r4   r;   r;   r<   normalize_loudness  r   zRecording.normalize_loudnessc                 C   sJ   | j dur
| j  ng }|t  t| |r| j d|dS | j|dS )a  
        Return a new ``Recording`` that will lazily apply WPE dereverberation.

        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_wpe".
        :return: a modified copy of the current ``Recording``.
        N_wper   )r4   r   r   r   r)   r.   )rC   r   r4   r;   r;   r<   dereverb_wpe#  s   zRecording.dereverb_wperir_recordingnormalize_output
early_onlyrir_channelsroom_rng_seedsource_rng_seedc              	   C   s   |dur|j | j ksJ d|j  d| j  d| jdks&|du s&t|dkr*| j}nttt|}|du rEddlm}	 |	| j ||d}
nd}
| jdurQ| j	 ng }|
t||||dur_|ndg|
d	 t| |rp| j d
n| j||dS )ao  
        Return a new ``Recording`` that will lazily apply reverberation based on provided
        impulse response while loading audio. If no impulse response is provided, we will
        generate an RIR using a fast random generator (https://arxiv.org/abs/2208.04101).

        :param rir_recording: The impulse response to be used.
        :param normalize_output: When true, output will be normalized to have energy as input.
        :param early_only: When true, only the early reflections (first 50 ms) will be used.
        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_rvb".
        :param rir_channels: The channels of the impulse response to be used (in case of multi-channel
            impulse responses). By default, only the first channel is used. If no RIR is
            provided, we will generate one with as many channels as this argument specifies.
        :param room_rng_seed: The seed to be used for the room configuration.
        :param source_rng_seed: The seed to be used for the source position.
        :return: the perturbed ``Recording``.
        Nz1Sampling rate mismatch between RIR vs recording: z vs r   r^   r   )FastRandomRIRGenerator)sr	room_seedsource_seed)rirr  r  r  rir_generator_rvb)r.   r3   r4   )r0   rZ   rY   r3   r_   rp   lhotse.augmentation.utilsr  r4   r   r   r   r)   r.   )rC   r  r  r  r   r  r  r  new_channel_idsr  r  r4   r;   r;   r<   
reverb_rir3  s>   	zRecording.reverb_rirc                 C   sh   || j kr	t| S | jdur| j ng }|t| j |d t| j|td}|| }t| ||||dS )z
        Return a new ``Recording`` that will be lazily resampled while loading audio.
        :param sampling_rate: The new sampling rate.
        :return: A resampled ``Recording``.
        Nr   target_sampling_rater  )r2   r1   r0   r4   )	r0   r)   r4   r   r   r   r(   r2   r   )rC   r0   r4   r   r   r;   r;   r<   resample  s(   
zRecording.resample   hardgain_db	normalizeoversamplingc                 C   s   | j dur
| j  ng }|dur|t| j| j| d |t||| |dur7|t| j| | jd t| |rG| j d|d|dS | j|dS )a  
        Return a new ``Recording`` that will lazily apply a clipping effect while loading audio.
        Saturates input signal in [-1, 1] range.

        :param hard: If True, apply hard clipping (sharp cutoff); otherwise, apply soft clipping (saturation).
        :param gain_db: The amount of gain in decibels to apply before clipping (and to revert back to original level after).
        :param normalize: If True, normalize the input signal to 0 dBFS before applying clipping.
        :param oversampling: If provided, we will oversample the input signal by the given integer factor before applying saturation and then downsample back to the original sampling rate.
        :param affix_id: When true, we will modify the ``Recording.id`` field
            by affixing it with "_cl{gain_db}".
        :return: a modified copy of the current ``Recording`` with the saturation transform applied.
        Nr  _clz.1fr   )r4   r   r   r   r0   r   r)   r.   )rC   r  r   r!  r"  r   r4   r;   r;   r<   clip_amplitude  s0   zRecording.clip_amplitudeopusGz?compression_levelc                 C   s   |t jvrtd| ddt j d|  krdks%n td| | jdur/| j ng }|dkrW| jd	krW|t| jd	d
 |t || |td	| jd
 n|t || t	| |dS )a  
        Return a new ``Recording`` that will lazily apply audio compression while loading audio.

        :param codec: The codec to use for compression. Supported codecs are "opus", "mp3", "vorbis", "gsm".
        :param compression_level: The compression level between 0.0 and 1.0 (higher means more compression).
        :return: a modified copy of the current ``Recording``.
        zInvalid codec: z. Must be one of: z, r   g      ?z3Compression level must be between 0.0 and 1.0, got Ngsmr  r  )r4   )
r   supported_codecs
ValueErrorrq   r4   r   r0   r   r   r)   )rC   r   r'  r4   r;   r;   r<   compress  s.   

zRecording.compressc                 C   s\   |  d}z|  d}dd |D }W n ty   d }Y nw tddd |D |d| S )Nr/   r4   c                 S      g | ]}t |qS r;   )r   r   r   r;   r;   r<   r]     r   z'Recording.from_dict.<locals>.<listcomp>c                 S   r,  r;   )r   r   rQ   r;   r;   r<   r]     r   )r/   r4   r;   )popKeyErrorr-   )rv   raw_sourcesr4   r;   r;   r<   r     s   

zRecording.from_dict)NNNF)NNNN)Nr   N)Nr   NTT)T)TT)F)NTFTNNN)Fr   Tr  F)r%  r&  )B__name__
__module____qualname____doc__rs   __annotations__r	   r   r}   r%   r3   r   r4   r   r   r   rD   propertyr   rG   boolr>   rF   rU   rX   rZ   rc   staticmethodr$   r   r   ru   bytesry   Channelsr   r   r   r   r,   r   ndarrayr   r   r   Tensorr   r   r   r   r   r   floatr   r   r   r  r  r
  r   r  r  r$  r#   r+  r   r;   r;   r;   r<   r-   0   sf  
 M	C(
Ch  
	

	
L"
-
$r-   r   r   rz   r2   r   r   r   rE   c           	      C   s   |d u rt  }t|d ur|n|j| |jd}|| jd  }|dkr$| S tt||j }d|  k r7|krGn ntj| dd|ff|d} | S | |  krRdk r`n n| d d d |f } | S t	d| d| d| d	| j d
| 
)N)r2   r0   r^   r   r   )modezjThe number of declared samples in the recording diverged from the one obtained when loading audio (offset=z, duration=z). This could be internal Lhotse's error or a faulty transform implementation. Please report this issue in Lhotse and show the following: diff=z, audio.shape=z, recording=)
r   r(   r2   r0   r   r}   r   r   r   r   )	r   rz   r2   r   r   r   expected_num_samplesdiffr   r;   r;   r<   r     s:   r   )Nr   )@dataclassesr   ior   mathr   r   pathlibr   typingr   r   r	   r
   r   r   r   r   numpyr   r   _decimalr   lhotse.audio.backendr   r   r   lhotse.audio.sourcer   lhotse.audio.utilsr   r   r   r   lhotse.augmentationr   r   r   r   r   r   r   r   r    r!   r"   lhotse.augmentation.compressr#   lhotse.utilsr$   r%   r&   r'   r(   r)   r*   r+   r,   r}   r9  r-   r:  rs   r   r;   r;   r;   r<   <module>   sR    (4,       Y