o
    Si'                     @   s   d dl mZ d dlmZmZmZmZmZmZm	Z	 d dl
Zd dlZd dlmZmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) G dd dZ*dS )    )Path)CallableDictListLiteralOptionalSetTupleN)IntervalIntervalTree)AudioSource	Recording	VideoInfo)
save_audio)	AugmentFn)FeatureExtractor)SupervisionSegment)DecibelsPathlikeSecondsSetContainingAnythingadd_durationsasdict_nonullcompute_num_samplescompute_num_windows'compute_start_duration_for_extended_cutfastcopyifnoneoverlapsto_hashablec                   @   sd  e Zd ZU dZeed< eed< eed< eed< ee	 ed< e
e ed< e
e ed< e
e ed	< e
e ed
< e
e ed< eed< eed< eed< e
e ed< eegd f ed< eg ejf ed< eg eeje
ej f f ed< eg ejf ed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed< eed < eed!< eed"< eed#< eed$< eed%< eed&< eed'< eed(< eed)< ed*efd+d,Zd*efd-d.Zd/d0 Zed*efd1d2Zed*ee	 fd3d4Zd5ed*ed6 fd7d8Z	9	:	;	;dd<d d=ed>ed?e
e d@e
e d*d fdAdBZ	;	;dd<d d?e
e d@e
e d*d fdCdDZ	;ddEedFe
e  d*ejfdGdHZ!ddIdJZ"dKdL Z#dMdN Z$ddPefdQdRZ%	S	;	T	:ddUedVe
e dWe&dX dYed*dZf
d[d\Z'	;	;	]	:dd^ed_e
e d`e
e daedYed*dZfdbdcZ(	9dd_ed*dZfdddeZ)	;	Sddedfe
e dged*dZfdhdiZ*	;ddjedke
e+e  d*eee,f fdldmZ-	;	;	;ddne.doe
e dpe
e dFe
e  d*d f
dqdrZ/	;	;	;ddse
e dte
eeef  due
e d*ejfdvdwZ0	;	;	;ddse
e dte
eeef  due
e d*ejfdxdyZ1	;ddue
e d*ejfdzd{Z2	;ddue
e d*ejfd|d}Z3d~ed*d fddZ4d;S )CutaI  
    .. caution::
        :class:`~lhotse.cut.Cut` is just an abstract class -- the actual logic is implemented by its child classes (scroll down for references).

    :class:`~lhotse.cut.Cut` is a base class for audio cuts.
    An "audio cut" is a subset of a :class:`~lhotse.audio.Recording` -- it can also be thought of as a "view"
    or a pointer to a chunk of audio.
    It is not limited to audio data -- cuts may also point to (sub-spans of) precomputed
    :class:`~lhotse.features.base.Features`.

    Cuts are different from :class:`~lhotse.supervision.SupervisionSegment` in that they may be arbitrarily
    longer or shorter than supervisions; cuts may even contain multiple supervisions for creating contextual
    training data, and unsupervised regions that provide real or synthetic acoustic background context
    for the supervised segments.

    The following example visualizes how a cut may represent a part of a single-channel recording with
    two utterances and some background noise in between::

                          Recording
        |-------------------------------------------|
        "Hey, Matt!"     "Yes?"        "Oh, nothing"
        |----------|     |----|        |-----------|
                   Cut1
        |------------------------|

    This scenario can be represented in code, using :class:`~lhotse.cut.MonoCut`, as::

        >>> from lhotse import Recording, SupervisionSegment, MonoCut
        >>> rec = Recording(id='rec1', duration=10.0, sampling_rate=8000, num_samples=80000, sources=[...])
        >>> sups = [
        ...     SupervisionSegment(id='sup1', recording_id='rec1', start=0, duration=3.37, text='Hey, Matt!'),
        ...     SupervisionSegment(id='sup2', recording_id='rec1', start=4.5, duration=0.9, text='Yes?'),
        ...     SupervisionSegment(id='sup3', recording_id='rec1', start=6.9, duration=2.9, text='Oh, nothing'),
        ... ]
        >>> cut = MonoCut(id='rec1-cut1', start=0.0, duration=6.0, channel=0, recording=rec,
        ...     supervisions=[sups[0], sups[1]])

    .. note::
        All Cut classes assume that the :class:`~lhotse.supervision.SupervisionSegment` time boundaries are relative
        to the beginning of the cut.
        E.g. if the underlying :class:`~lhotse.audio.Recording` starts at 0s (always true), the cut starts at 100s,
        and the SupervisionSegment inside the cut starts at 3s, it really did start at 103rd second of the recording.
        In some cases, the supervision might have a negative start, or a duration exceeding the duration of the cut;
        this means that the supervision in the recording extends beyond the cut.

    Cut allows to check and read audio data or features data::

        >>> assert cut.has_recording
        >>> samples = cut.load_audio()
        >>> if cut.has_features:
        ...     feats = cut.load_features()

    It can be visualized, and listened to, inside Jupyter Notebooks::

        >>> cut.plot_audio()
        >>> cut.play_audio()
        >>> cut.plot_features()

    Cuts can be used with Lhotse's :class:`~lhotse.features.base.FeatureExtractor` to compute features.

        >>> from lhotse import Fbank
        >>> feats = cut.compute_features(extractor=Fbank())

    It is also possible to use a :class:`~lhotse.features.io.FeaturesWriter` to store the features and attach
    their manifest to a copy of the cut::

        >>> from lhotse import LilcomChunkyWriter
        >>> with LilcomChunkyWriter('feats.lca') as storage:
        ...     cut_with_feats = cut.compute_and_store_features(
        ...         extractor=Fbank(),
        ...         storage=storage
        ...     )

    Cuts have several methods that allow their manipulation, transformation, and mixing.
    Some examples (see the respective methods documentation for details)::

        >>> cut_2_to_4s = cut.truncate(offset=2, duration=2)
        >>> cut_padded = cut.pad(duration=10.0)
        >>> cut_extended = cut.extend_by(duration=5.0, direction='both')
        >>> cut_mixed = cut.mix(other_cut, offset_other_by=5.0, snr=20)
        >>> cut_append = cut.append(other_cut)
        >>> cut_24k = cut.resample(24000)
        >>> cut_sp = cut.perturb_speed(1.1)
        >>> cut_vp = cut.perturb_volume(2.)
        >>> cut_rvb = cut.reverb_rir(rir_recording)

    .. note::
        All cut transformations are performed lazily, on-the-fly, upon calling ``load_audio`` or ``load_features``.
        The stored waveforms and features are untouched.

    .. caution::
        Operations on cuts are not mutating -- they return modified copies of :class:`.Cut` objects,
        leaving the original object unmodified.

    A :class:`.Cut` that contains multiple segments (:class:`SupervisionSegment`) can be decayed into
    smaller cuts that correspond directly to supervisions::

        >>> smaller_cuts = cut.trim_to_supervisions()

    Cuts can be detached from parts of their metadata::

        >>> cut_no_feat = cut.drop_features()
        >>> cut_no_rec = cut.drop_recording()
        >>> cut_no_sup = cut.drop_supervisions()

    Finally, cuts provide convenience methods to compute feature frame and audio sample masks for supervised regions::

        >>> sup_frames = cut.supervisions_feature_mask()
        >>> sup_samples = cut.supervisions_audio_mask()

    See also:

        - :class:`lhotse.cut.MonoCut`
        - :class:`lhotse.cut.MixedCut`
        - :class:`lhotse.cut.CutSet`
    idstartdurationsampling_ratesupervisionsnum_samples
num_framesnum_featuresframe_shiftfeatures_typehas_recordinghas_features	has_videovideo	from_dict
load_audio
load_videoload_featurescompute_and_store_featuresdrop_featuresdrop_recordingdrop_supervisionsdrop_alignmentsdrop_in_memory_data	iter_datatruncatepad	extend_byresampleperturb_speedperturb_tempoperturb_volumephone
reverb_rirmap_supervisionsmerge_supervisionsfilter_supervisionsfill_supervisionwith_features_path_prefixwith_recording_path_prefixreturnc                 C   s   t | j| j| jdS )Nr$   )r   r"   r#   r$   self rM   C/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/cut/base.pyend   s   zCut.endc                 C   s   t | }i |dt| jiS )Ntype)r   rP   __name__)rL   drM   rM   rN   to_dict   s   zCut.to_dictc                 K   s   t | di i | j|S )a  
        Returns a shallow copy of self, with specified attributes overwritten.

        Example:
            >>> cut = MonoCut(id="old-id", ...)
            ... cut2 = cut.copy(id="new-id")
            ... assert cut.id == "old-id"
            ... assert cut2.id == "new-id"
        NrM   )rP   __dict__)rL   replace_attrsrM   rM   rN   copy   s   
zCut.copyc                 C   sT   t | jdk r	dS ddlm} |dt| jdd dD ]\}}t||r' dS qdS )	N   Fr   )sliding_windowc                 S      | j S Nr"   srM   rM   rN   <lambda>       z2Cut.has_overlapping_supervisions.<locals>.<lambda>keyT)lenr%   cytoolzrX   sortedr   )rL   rX   leftrightrM   rM   rN   has_overlapping_supervisions   s   
z Cut.has_overlapping_supervisionsc                    s    fdd j D S )a  
        Return the supervisions in this Cut that have modified time boundaries so as not to exceed
        the Cut's start or end.

        Note that when ``cut.supervisions`` is called, the supervisions may have negative ``start``
        values that indicate the supervision actually begins before the cut, or ``end`` values
        that exceed the Cut's duration (it means the supervision continued in the original recording
        after the Cut's ending).

        .. caution::
            For some tasks such as speech recognition (ASR), trimmed supervisions
            could result in corrupted training data. This is because a part of the transcript
            might actually reside outside of the cut.
        c                    s   g | ]}|  jqS rM   )trimr#   .0r]   rK   rM   rN   
<listcomp>   s    z,Cut.trimmed_supervisions.<locals>.<listcomp>r%   rK   rM   rK   rN   trimmed_supervisions   s   zCut.trimmed_supervisions	timestamp)r    r    c                 C   sN   d|  k r| j k sn J d| d| j  | j|d}| j|d}||fS )z
        Split a cut into two cuts at ``timestamp``, which is measured from the start of the cut.
        For example, a [0s - 10s] cut split at 4s yields:
            - left cut [0s - 4s]
            - right cut [4s - 10s]
        r   z0 < z < )r#   )offset)r#   r:   )rL   rn   re   rf   rM   rM   rN   split   s   .z	Cut.split        FNotheroffset_other_byallow_paddingsnrpreserve_idc                 C   s    ddl m} || |||||dS )z3Refer to :function:`~lhotse.cut.mix` documentation.   mix)ro   rt   ru   rv   )setry   )rL   rr   rs   rt   ru   rv   ry   rM   rM   rN   ry   	  s   	zCut.mixc                 C   s    ddl m} || || j||dS )a^  
        Append the ``other`` Cut after the current Cut. Conceptually the same as ``mix`` but with an offset
        matching the current cuts length. Optionally scale down (positive SNR) or scale up (negative SNR)
        the ``other`` cut.
        Returns a MixedCut, which only keeps the information about the mix; actual mixing is performed
        during the call to ``load_features``.

        :param preserve_id: optional string ("left", "right"). When specified, append will preserve the cut ID
            of the left- or right-hand side argument. Otherwise, a new random ID is generated.
        rw   rx   )ro   ru   rv   )rz   ry   r#   )rL   rr   ru   rv   ry   rM   rM   rN   append  s   z
Cut.append	extractor
augment_fnc                 C   s*   |   }|dur||| j}||| jS )aH  
        Compute the features from this cut. This cut has to be able to load audio.

        :param extractor: a ``FeatureExtractor`` instance used to compute the features.
        :param augment_fn: optional ``WavAugmenter`` instance for audio augmentation.
        :return: a numpy ndarray with the computed features.
        N)r0   r$   extract)rL   r|   r}   samplesrM   rM   rN   compute_features1  s   zCut.compute_featuresc                 K   s~   ddl m} |  jdd}|du r| \}}|td| jt	|| | j
D ]}|| j}|j|j|jddd q)|S )zV
        Display a plot of the waveform. Requires matplotlib to be installed.
        r   N)axisgreeng?)coloralpha)matplotlib.pyplotpyplotr0   sumsubplotsplotnplinspacer#   rb   r%   rh   axvspanr"   rO   )rL   axkwargspltr   figsupervisionrM   rM   rN   
plot_audioB  s   
zCut.plot_audioc                 C   s&   ddl m} |   }||| jdS )z
        Display a Jupyter widget that allows to listen to the waveform.
        Works only in Jupyter notebook/lab or similar (e.g. Colab).
        r   )Audio)rate)IPython.displayr   r0   squeezer$   )rL   r   r   rM   rM   rN   
play_audioQ  s   zCut.play_audioc                 C   s.   ddl m} t|  ddd}||S )z^
        Display the feature matrix as an image. Requires matplotlib to be installed.
        r   Nrw   )r   r   r   flipr2   	transposematshow)rL   r   featuresrM   rM   rN   plot_features[  s   
zCut.plot_featureswordalignment_typec              	   C   s^  ddl m} ddlm} ddlm} t| jdksJ d| jd }|jdur,||jv s4J d| d| }|j	j
}| ||}|jpGd	}	|jpLd	}
|t|ddd}|d
| j d |	 d |
  |jddddddd t|j| D ],\}}t|d }||j|j|d}|j|d |rdnd|jdddd |j|dd q||  dS )ze
        Display the alignment on top of a spectrogram. Requires matplotlib to be installed.
        r   N)Fbank)compute_num_framesrw   zHCannot plot alignment: there has to be exactly one supervision in a Cut.zBCannot plot alignment: missing alignment field or alignment type ''z	<unknown>zCut ID:z
, Speaker:z, Language:bothmajorTF)r   whichlabelbottomlabeltopbottomtoprW   )r)   r$      F   -      wvertical)fontsizer   rotationk)r   )r   r   lhotser   lhotse.utilsr   rb   r%   	alignmentr|   r$   r=   r   speakerlanguager   r   r   r   titler!   tick_params	enumerateboolrO   r)   textsymbolaxvlineshow)rL   r   r   r   r   supfbankr$   featsr   r   r   idxitemis_even	end_framerM   rM   rN   plot_alignmentd  sX   



	
zCut.plot_alignmentTcenterkeep_overlappingmin_durationcontext_direction)r   re   rf   randomkeep_all_channelsCutSetc                    s"  ddl m} ddlm} ddlm} g }| jdd}	| jD ]n |du r+ j j	}
}nt
 j j	||d\}
}| j|
|||	d	}|sK| fd
d}|s{t||s{ttdd |jD dksdJ d|jd j|_t||r{|jdkr{| d }t|jdkr j|_|| q||S )a>  
        Splits the current :class:`.Cut` into as many cuts as there are supervisions (:class:`.SupervisionSegment`).
        These cuts have identical start times and durations as the supervisions.
        When there are overlapping supervisions, they can be kept or discarded via ``keep_overlapping`` flag.

        For example, the following cut::

                    Cut
            |-----------------|
             Sup1
            |----|  Sup2
               |-----------|

        is transformed into two cuts::

             Cut1
            |----|
             Sup1
            |----|
               Sup2
               |-|
                    Cut2
               |-----------|
               Sup1
               |-|
                    Sup2
               |-----------|

        For the case of a multi-channel cut with multiple supervisions, we can either trim
        while respecting the supervision channels (in which case output cut has the same channels
        as the supervision) or ignore the channels (in which case output cut has the same channels
        as the input cut).

        .. hint:: If the resulting trimmed cut contains a single supervision, we set the cut id to
            the ``id`` of this supervision, for better compatibility with downstream tools, e.g.
            comparing the hypothesis of ASR with the reference in icefall.

        .. hint:: If a MultiCut is trimmed and the resulting trimmed cut contains a single channel,
            we convert it to a MonoCut.

        :param keep_overlapping: when ``False``, it will discard parts of other supervisions that overlap with the
            main supervision. In the illustration above, it would discard ``Sup2`` in ``Cut1`` and ``Sup1`` in ``Cut2``.
            In this mode, we guarantee that there will always be exactly one supervision per cut.
        :param min_duration: An optional duration in seconds; specifying this argument will extend the cuts
            that would have been shorter than ``min_duration`` with actual acoustic context in the recording/features.
            If there are supervisions present in the context, they are kept when ``keep_overlapping`` is true.
            If there is not enough context, the returned cut will be shorter than ``min_duration``.
            If the supervision segment is longer than ``min_duration``, the return cut will be longer.
        :param context_direction: Which direction should the cut be expanded towards to include context.
            The value of "center" implies equal expansion to left and right;
            random uniformly samples a value between "left" and "right".
        :param keep_all_channels: If ``True``, the output cut will have the same channels as the input cut. By default,
            the trimmed cut will have the same channels as the supervision.
        :return: a list of cuts.
        rw   MixedCut)MultiCutr   Tindex_mixed_tracksN)r"   r#   new_duration	directionro   r#   keep_excessive_supervisions_supervisions_indexc                    s   | j  j kS rZ   r!   r\   segmentrM   rN   r^     s    z*Cut.trim_to_supervisions.<locals>.<lambda>c                 s       | ]}t |jV  qd S rZ   )r   channelri   rM   rM   rN   	<genexpr>      z+Cut.trim_to_supervisions.<locals>.<genexpr>zTrimmed cut has supervisions with different channels. Either set `keep_all_channels=True` to keep original channels or `keep_overlapping=False` to retain only 1 supervision per trimmed cut.r   )mixedr   multir   rz   r   index_supervisionsr%   r"   r#   r   r:   rE   
isinstancerb   r   num_channelsto_monor!   r{   	from_cuts)rL   r   r   r   r   r   r   r   cutssupervisions_index	new_startr   trimmedrM   r   rN   trim_to_supervisions  sB   >


zCut.trim_to_supervisions rP   	max_pausemax_segment_duration	delimiterc                    s  ddl m} |du rd}|du r| j}g }| jD ]}|jdu s)||jvs)|j| s*qt|j| dd d  d dgfg}	t dd D ]K\}
}|j d	krQqE|	d
 \}}|j	|j
 |kr|j
|j	 |kr|||j|jg|j	|j
|j	 d}|||
d g f|	d
< qE|	||
d gf qEt|	D ]1\}
\}}|t|j d|
 |j|j	| j	 |j|j|j|j|j|j| fdd|D id
 qqt| |d}|jd|dS )a
  
        Splits the current :class:`.Cut` into its constituent alignment items (:class:`.AlignmentItem`).
        These cuts have identical start times and durations as the alignment item. Additionally,
        the `max_pause` option can be used to merge alignment items that are separated by a pause
        shorter than `max_pause`. If `max_segment_duration` is specified, we will keep merging
        consecutive segments until the duration of the merged segment exceeds `max_segment_duration`.

        For the case of a multi-channel cut with multiple alignments, we can either trim
        while respecting the supervision channels (in which case output cut has the same channels
        as the supervision) or ignore the channels (in which case output cut has the same channels
        as the input cut).

        .. hint:: If the resulting trimmed cut contains a single supervision, we set the cut id to
            the ``id`` of this supervision, for better compatibility with downstream tools, e.g.
            comparing the hypothesis of ASR with the reference in icefall.

        .. hint:: If a MultiCut is trimmed and the resulting trimmed cut contains a single channel,
            we convert it to a MonoCut.

        .. hint:: If you have a Cut with multiple supervision segments and you want to trim it to
            the word-level alignment, you can use the :meth:`.Cut.merge_supervisions` method
            first to merge the supervisions into a single one, followed by the
            :meth:`.Cut.trim_to_alignments` method. For example::

                >>> cut = cut.merge_supervisions(type='word', delimiter=' ')
                >>> cut = cut.trim_to_alignments(type='word', max_pause=1.0)

        .. hint:: The above technique can also be used to segment long cuts into roughly equal
            duration segments, while respecting alignment boundaries. For example, to split a
            Cut into 10s segments, you can do::

                >>> cut = cut.merge_supervisions(type='word', delimiter=' ')
                >>> cut = cut.trim_to_alignments(type='word', max_pause=10.0, max_segment_duration=10.0)

        :param type: The type of the alignment to trim to (e.g. "word").
        :param max_pause: The maximum pause allowed between the alignments to merge them. If ``None``,
            no merging will be performed. [default: None]
        :param delimiter: The delimiter to use when joining the alignment items.
        :param keep_all_channels: If ``True``, the output cut will have the same channels as the input cut. By default,
            the trimmed cut will have the same channels as the supervision.
        :param num_jobs: Number of parallel workers to process the cuts.
        :return: a CutSet object.
        r   )AlignmentItemNg      c                 S   rY   rZ   r[   )arM   rM   rN   r^   W  r_   z(Cut.trim_to_alignments.<locals>.<lambda>r`   rw    )r   r"   r#   -c                    s   g | ]} | qS rM   rM   )rj   j
alignmentsrM   rN   rk     s    z*Cut.trim_to_alignments.<locals>.<listcomp>)
r!   recording_idr"   r#   r   r   r   r   genderr   rl   F)r   r   )lhotse.supervisionr   r#   r%   r   rd   r   r   stripr"   rO   joinr{   r   r!   r   r   r   r   r   r   r   )rL   rP   r   r   r   r   r   new_supervisionsr   merged_alignmentsir   	prev_itemprev_indicesnew_itemindicesnew_cutsrM   r   rN   trim_to_alignments  s`   3




zCut.trim_to_alignmentsc              
   C   s  ddl m} | js|| gS t| jdd d}|d g}|d j}g }d}|dd D ]D}|j| |kr@|| t||j}q+|d j}	t||	 | j	d}
|| j
|	|
d	d
| j d| d|  |d7 }|g}|j}q+t|dkr|d j}	t||	 | j	d}
|| j
|	|
d	d
| j d| d|  tdd |D t| jksJ dtdd |D  dt| j d|  d| ||S )uO  
        Return a new CutSet with Cuts based on supervision groups. A supervision group is
        a set of supervisions with no gaps between them (or gaps shorter than ``max_pause``).
        This is similar to the concept of an `utterance group` as described in this paper:
        https://arxiv.org/abs/2211.00482

        For example, the following cut::

                                                Cut
        ╔═════════════════════════════════════════════════════════════════════════════════╗
        ║┌──────────────────────┐                              ┌────────┐                 ║
        ║│ Hello this is John.  │                              │   Hi   │                 ║
        ║└──────────────────────┘                              └────────┘                 ║
        ║            ┌──────────────────────────────────┐            ┌───────────────────┐║
        ║            │     Hey, John. How are you?      │            │  What do you do?  │║
        ║            └──────────────────────────────────┘            └───────────────────┘║
        ╚═════════════════════════════════════════════════════════════════════════════════╝

        is transformed into two cuts::

                            Cut 1                                       Cut 2
        ╔════════════════════════════════════════════════╗    ╔═══════════════════════════╗
        ║┌──────────────────────┐                        ║    ║┌────────┐                 ║
        ║│ Hello this is John.  │                        ║    ║│   Hi   │                 ║
        ║└──────────────────────┘                        ║    ║└────────┘                 ║
        ║            ┌──────────────────────────────────┐║    ║      ┌───────────────────┐║
        ║            │     Hey, John. How are you?      │║    ║      │  What do you do?  │║
        ║            └──────────────────────────────────┘║    ║      └───────────────────┘║
        ╚════════════════════════════════════════════════╝    ╚═══════════════════════════╝

        For the case of a multi-channel cut with multiple supervisions, we keep all the channels
        in the recording.

        :param max_pause: An optional duration in seconds; if the gap between two supervisions
            is longer than this, they will be treated as separate groups. By default, this is
            set to 0.0, which means that no gaps are allowed between supervisions.
        :return: a ``CutSet``.
        rw   r   c                 S   rY   rZ   r[   r\   rM   rM   rN   r^     r_   z0Cut.trim_to_supervision_groups.<locals>.<lambda>r`   r   NrJ   F)ro   r#   r   r   c                 s   r   rZ   rb   r%   rj   crM   rM   rN   r     r   z1Cut.trim_to_supervision_groups.<locals>.<genexpr>zQThe total number of supervisions decreased after trimming to supervision groups.
c                 s   r   rZ   r  r  rM   rM   rN   r     r   z != z
This is likely a bug. Please report it here: https://github.com/lhotse-speech/lhotse/issues/new, and provide the following information:
original cut: z,
new cuts: )rz   r   r%   rd   rO   r"   r{   maxr   r$   r:   with_idr!   rb   r   r   )rL   r   r   r%   supervision_groupcur_endr  	group_idxr   ro   r#   rM   rM   rN   trim_to_supervision_groups  sd   *







zCut.trim_to_supervision_groupshopr   c           	   
   C   s   ddl m} |s
|}| jrI|| jj  s-J d| j d| d| jj d|| jj  d	|| jj  sIJ d| d| jj d|| jj  dg }t| j||}| j	d	d
}t
|D ]}|| j|| |||d| j d|  q\||S )a  
        Return a list of shorter cuts, made by traversing this cut in windows of
        ``duration`` seconds by ``hop`` seconds.

        The last window might have a shorter duration if there was not enough audio,
        so you might want to use either filter or pad the results.

        :param duration: Desired duration of the new cuts in seconds.
        :param hop: Shift between the windows in the new cuts in seconds.
        :param keep_excessive_supervisions: bool. When a cut is truncated in the
            middle of a supervision segment, should the supervision be kept.
        :return: a list of cuts made from shorter duration windows.
        rw   r   z[cut.id=z[] Window duration must be defined to result in an integer number of video frames (duration=z * fps=z = z).zb[cut.id={self.id}] Window hop must be defined to result in an integer number of video frames (hop=Tr   r   r   )rz   r   r-   r.   fps
is_integerr!   r   r#   r   ranger{   r:   r
  r   )	rL   r#   r  r   r   r  	n_windowsr   r   rM   rM   rN   cut_into_windows  sH   



zCut.cut_into_windowsr   keep_idsc                    sv   ddl m} t t  | jt fdd| jD i}|r9t| |r9| jD ]}t fdd|j	jD ||j	j< q%|S )a  
        Create a two-level index of supervision segments. It is a mapping from a Cut's ID to an
        interval tree that contains the supervisions of that Cut.

        The interval tree can be efficiently queried for overlapping and/or enveloping segments.
        It helps speed up some operations on Cuts of very long recordings (1h+) that contain many
        supervisions.

        :param index_mixed_tracks: Should the tracks of MixedCut's be indexed as additional, separate entries.
        :param keep_ids: If specified, we will only index the supervisions with the specified IDs.
        :return: a mapping from Cut ID to an interval tree of SupervisionSegments.
        rw   r   c                 3   *    | ]}|j  v rt|j|j|V  qd S rZ   r!   r
   r"   rO   ri   r  rM   rN   r   0      
z)Cut.index_supervisions.<locals>.<genexpr>c                 3   r  rZ   r  ri   r  rM   rN   r   9  r  )
r   r   r   r   r!   r   r%   r   trackscut)rL   r   r  r   indexedtrackrM   r  rN   r     s   

zCut.index_supervisionsstorage_pathformatencodingc              
   K   s   t |}| jd	i |}|dur||| j}t||| j||d t|j| j|jd |jd | j tdtt	| j
t|dgd}t| | j| jt| drQ| jnd| jrZ| jdS ddS )
a  
        Store this cut's waveform as audio recording to disk.

        :param storage_path: The path to location where we will store the audio recordings.
        :param format: Audio format argument supported by ``torchaudio.save`` or ``soundfile.write``.
            Please refer to the relevant library's documentation depending on which audio backend you're using.
        :param encoding: Audio encoding argument supported by ``torchaudio.save`` or ``soundfile.write``.
            Please refer to the relevant library's documentation depending on which audio backend you're using.
        :param augment_fn: an optional callable used for audio augmentation.
            Be careful with the types of augmentations used: if they modify
            the start/end/duration times of the cut and its supervisions,
            you will end up with incorrect supervision information when using this API.
            E.g. for speed perturbation, use ``CutSet.perturb_speed()`` instead.
        :param kwargs: additional arguments passed to ``Cut.load_audio()``. Example, if
            saving a MixedCut, we can specify `mono_downmix=True` to downmix the tracks
            to mono before saving.
        :return: a new Cut instance.
        N)r$   r  r   rw   file)rP   channelssource)r!   r$   r&   r#   sourcescustom)r!   r%   r%  r   rM   )r   r0   r$   r   r   stemshaper   listr  r   strr   to_cutr!   r%   hasattrr%  r,   r   )rL   r  r  r   r}   r   r   	recordingrM   rM   rN   r   @  s@   
zCut.save_audiomin_speaker_dimspeaker_to_idx_mapuse_alignment_if_existsc                 C   sP  | j sJ d| j d|du r#dd tttdd | jD D }t|}|dur0t||}t	|| j
f}| jD ]j}||j }|r}|jr}||jv r}|j| D ]*}|jdkr`t|j| j nd}	|j| jk rpt|j| j n| j
}
d	|||	|
f< qQq;|jdkrt|j| j nd}	|j| jk rt|j| j n| j
}
d	|||	|
f< q;|S )
a  
        Return a matrix of per-speaker activity in a cut. The matrix shape is (num_speakers, num_frames),
        and its values are 0 for nonspeech **frames** and 1 for speech **frames** for each respective speaker.

        This is somewhat inspired by the TS-VAD setup: https://arxiv.org/abs/2005.07272

        :param min_speaker_dim: optional int, when specified it will enforce that the matrix shape is at least
            that value (useful for datasets like CHiME 6 where the number of speakers is always 4, but some cuts
            might have less speakers than that).
        :param speaker_to_idx_map: optional dict mapping speaker names (strings) to their global indices (ints).
            Useful when you want to preserve the order of the speakers (e.g. speaker XYZ is always mapped to index 2)
        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask. If not
            exists, fall back on supervision time spans.
        zPNo features available. Can't compute supervisions feature mask for cut with ID: .Nc                 S      i | ]\}}||qS rM   rM   rj   r   spkrM   rM   rN   
<dictcomp>      z-Cut.speakers_feature_mask.<locals>.<dictcomp>c                 s       | ]}|j V  qd S rZ   r   ri   rM   rM   rN   r         z,Cut.speakers_feature_mask.<locals>.<genexpr>r   rw   )r,   r!   r   rd   rz   r%   rb   minr   zerosr'   r   r   r"   roundr)   rO   r#   rL   r-  r.  r/  num_speakersmaskr   speaker_idxalistetrM   rM   rN   speakers_feature_mask  sP   




zCut.speakers_feature_maskc                 C   sX  | j sJ d| j d|du r#dd tttdd | jD D }t|}|dur0t||}t	|| j
f}| jD ]n}||j }|r|jr||jv r|j| D ],}|jdkr_t|j| jnd}	|j| jk rnt|j| jnt| j| j}
d	|||	|
f< qQq;|jdkrt|j| jnd}	|j| jk rt|j| jnt| j| j}
d	|||	|
f< q;|S )
a  
        Return a matrix of per-speaker activity in a cut. The matrix shape is (num_speakers, num_samples),
        and its values are 0 for nonspeech **samples** and 1 for speech **samples** for each respective speaker.

        This is somewhat inspired by the TS-VAD setup: https://arxiv.org/abs/2005.07272

        :param min_speaker_dim: optional int, when specified it will enforce that the matrix shape is at least
            that value (useful for datasets like CHiME 6 where the number of speakers is always 4, but some cuts
            might have less speakers than that).
        :param speaker_to_idx_map: optional dict mapping speaker names (strings) to their global indices (ints).
            Useful when you want to preserve the order of the speakers (e.g. speaker XYZ is always mapped to index 2)
        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask. If not
            exists, fall back on supervision time spans.
        ONo recording available. Can't compute supervisions audio mask for cut with ID: r0  Nc                 S   r1  rM   rM   r2  rM   rM   rN   r4    r5  z+Cut.speakers_audio_mask.<locals>.<dictcomp>c                 s   r6  rZ   r7  ri   rM   rM   rN   r     r8  z*Cut.speakers_audio_mask.<locals>.<genexpr>r   rw   )r+   r!   r   rd   rz   r%   rb   r9  r   r:  r&   r   r   r"   r   r$   rO   r#   r<  rM   rM   rN   speakers_audio_mask  sV   





zCut.speakers_audio_maskc                 C   s   ddl m} || |dS )aU  
        Return a 1D numpy array with value 1 for **frames** covered by at least one supervision,
        and 0 for **frames** not covered by any supervision.

        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask. If not
            exists, fall back on supervision time spans.
        rw   )compute_supervisions_frame_mask)r/  )rz   rF  )rL   r/  rF  rM   rM   rN   supervisions_feature_mask  s   
zCut.supervisions_feature_maskc                 C   s  | j sJ d| j dtj| jtjd}| jD ]k}|rX|jrX||jv rX|j| D ]-}|jdkr8t	|j| j
 nd}|j| jk rHt	|j| j
 nt	| j| j
 }d|||< q)q|jdkret	|j| j
 nd}|j| jk rut	|j| j
 nt	| j| j
 }d|||< q|S )aW  
        Return a 1D numpy array with value 1 for **samples** covered by at least one supervision,
        and 0 for **samples** not covered by any supervision.

        :param use_alignment_if_exists: optional str, key for alignment type to use for generating the mask. If not
            exists, fall back on supervision time spans.
        rD  r0  )dtyper   g      ?)r+   r!   r   r:  r&   float32r%   r   r"   r;  r$   rO   r#   )rL   r/  r>  r   r@  rA  rB  rM   rM   rN   supervisions_audio_mask  s<   



zCut.supervisions_audio_maskid_c                 C   s   t | |dS )z'Return a copy of the Cut with a new ID.r   )r   )rL   rK  rM   rM   rN   r
  ?  s   zCut.with_id)rq   FNN)NNrZ   )r   )TNr   F)NNr   F)rq   )NT)FN)NNN)5rQ   
__module____qualname____doc__r)  __annotations__r   intr   r   r   r   r   r   r   r   ndarrayr	   torchTensorpropertyrO   dictrS   rV   rg   rm   rp   r   ry   r{   r   r   r   r   r   r   r   r   r   r  r  r  r   r   r   r   r   rC  rE  rG  rJ  r
  rM   rM   rM   rN   r    "   s  
 w 




	:
w
}
`
6


&
B
B
E

+r    )+pathlibr   typingr   r   r   r   r   r   r	   numpyr   rR  intervaltreer
   r   lhotse.audior   r   r   lhotse.audio.backendr   lhotse.augmentationr   lhotse.featuresr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    rM   rM   rM   rN   <module>   s    $<