o
    2wiA                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZ d dlm	Z	m
Z
mZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' G dd deZ(eG dd deZ)G dd deeZ*dS )    N)defaultdict)	dataclass)groupbyislice)	AnyCallableDictIterableListMapping
NamedTupleOptionalUnion)tqdm)CustomFieldMixin)AlgorithmMixin)Serializable)PathlikeSecondsTimeSpanadd_durationsasdict_nonullcompute_num_samplesexactly_one_not_nullfastcopyifnoneis_equal_or_contains	overspansperturb_num_samplessplit_manifest_lazysplit_sequencec                   @   s   e Zd ZU dZeed< eed< eed< dZee	 ed< e
deeef dd fd	d
ZdefddZedefddZdedd fddZde	dedd fddZddededd fddZdeegef dd fddZdS )AlignmentItemz
    This class contains an alignment item, for example a word, along with its
    start time (w.r.t. the start of recording) and duration. It can potentially
    be used to store other kinds of alignment items, such as subwords, pdfid's etc.
    symbolstartdurationNscoredatareturnc                 C   s"   t | trtt|   S t|  S N)
isinstancedictr!   listvaluesr&    r.   O/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/supervision.pydeserialize8   s   
zAlignmentItem.deserializec                 C   s   t | S r(   r+   selfr.   r.   r/   	serialize?   s   zAlignmentItem.serializec                 C      t | j| j ddS N   ndigitsroundr#   r$   r2   r.   r.   r/   endB      zAlignmentItem.endoffsetc                 C   s"   t t| j|dd| j| j| jdS )z\Return an identical ``AlignmentItem``, but with the ``offset`` added to the ``start`` field.逻  sampling_rate)r#   r$   r"   r%   )r!   r   r#   r$   r"   r%   r3   r>   r.   r.   r/   with_offsetF   s   zAlignmentItem.with_offsetfactorrA   c                 C   sH   t | j|}t | j|}t||| }t||| }t| j||| jdS )z
        Return an ``AlignmentItem`` that has time boundaries matching the
        recording/cut perturbed with the same factor. See :meth:`SupervisionSegment.perturb_speed`
        for details.
        r"   r#   r$   r%   )r   r#   r$   r   r!   r"   r%   )r3   rD   rA   start_samplenum_samples	new_startnew_durationr.   r.   r/   perturb_speedO   s   zAlignmentItem.perturb_speedr   r<   c              	   C   sZ   |dksJ t td| j| }td| j| }t| jt|| jt| j| | dddS )z6
        See :meth:`SupervisionSegment.trim`.
        r   r?   r@   )r"   r#   r$   )	absminr#   maxr<   r!   r"   r   r$   r3   r<   r#   start_exceeds_byend_exceeds_byr.   r.   r/   trim]   s   
zAlignmentItem.trimtransform_fnc                 C   s   t || j| j| j| jdS )zL
        Perform specified transformation on the alignment content.
        rE   )r!   r"   r#   r$   r%   r3   rR   r.   r.   r/   	transforml   s   zAlignmentItem.transformr   )__name__
__module____qualname____doc__str__annotations__r   r%   r   floatstaticmethodr   r
   r   r0   r+   r4   propertyr<   rC   intrJ   rQ   r   rT   r.   r.   r.   r/   r!   (   s   
 	 r!   c                	   @   s0  e Zd ZU dZeed< eed< eed< eed< dZee	e
e	 f ed< dZee ed	< dZee ed
< dZee ed< dZee ed< dZeeeef  ed< dZeeee
e f  ed< edefddZdede
e dd fddZdedd fddZ	d8dede	dedd fddZ	d8dede	dedd fddZ	d8dededd fd d!Zd8d"ededd fd#d$Z	d9dedeee	e
e	 f  dd fd%d&Zd:d'ededd fd(d)Z d*e!d gd f dd fd+d,Z"d*e!egef dd fd-d.Z#	/d;d*e!egef d0ee dd fd1d2Z$de%fd3d4Z&e'd5e%dd fd6d7Z(dS )<SupervisionSegmenta  
    :class:`~lhotse.supervsion.SupervisionSegment` represents a time interval (segment) annotated with some
    supervision labels and/or metadata, such as the transcription, the speaker identity, the language, etc.

    Each supervision has unique ``id`` and always refers to a specific recording (via ``recording_id``)
    and one or more ``channel`` (by default, 0). Note that multiple channels of the recording
    may share the same supervision, in which case the ``channel`` field will be a list of integers.

    It's also characterized by the start time (relative to the beginning of a :class:`~lhotse.audio.Recording`
    or a :class:`~lhotse.cut.Cut`) and a duration, both expressed in seconds.

    The remaining fields are all optional, and their availability depends on specific corpora.
    Since it is difficult to predict all possible types of metadata, the ``custom`` field (a dict) can be used to
    insert types of supervisions that are not supported out of the box.

    :class:`~lhotse.supervsion.SupervisionSegment` may contain multiple types of alignments.
    The ``alignment`` field is a dict, indexed by alignment's type (e.g., ``word`` or ``phone``),
    and contains a list of :class:`~lhotse.supervision.AlignmentItem` objects -- simple structures
    that contain a given symbol and its time interval.
    Alignments can be read from CTM files or created programatically.

    Examples

        A simple segment with no supervision information::

            >>> from lhotse import SupervisionSegment
            >>> sup0 = SupervisionSegment(
            ...     id='rec00001-sup00000', recording_id='rec00001',
            ...     start=0.5, duration=5.0, channel=0
            ... )

        Typical supervision containing transcript, speaker ID, gender, and language::

            >>> sup1 = SupervisionSegment(
            ...     id='rec00001-sup00001', recording_id='rec00001',
            ...     start=5.5, duration=3.0, channel=0,
            ...     text='transcript of the second segment',
            ...     speaker='Norman Dyhrentfurth', language='English', gender='M'
            ... )

        Two supervisions denoting overlapping speech on two separate channels in a microphone array/multiple headsets
        (pay attention to ``start``, ``duration``, and ``channel``)::

            >>> sup2 = SupervisionSegment(
            ...     id='rec00001-sup00002', recording_id='rec00001',
            ...     start=15.0, duration=5.0, channel=0,
            ...     text="i have incredibly good news for you",
            ...     speaker='Norman Dyhrentfurth', language='English', gender='M'
            ... )
            >>> sup3 = SupervisionSegment(
            ...     id='rec00001-sup00003', recording_id='rec00001',
            ...     start=18.0, duration=3.0, channel=1,
            ...     text="say what",
            ...     speaker='Hervey Arman', language='English', gender='M'
            ... )

        A supervision with a phone alignment::

            >>> from lhotse.supervision import AlignmentItem
            >>> sup4 = SupervisionSegment(
            ...     id='rec00001-sup00004', recording_id='rec00001',
            ...     start=33.0, duration=1.0, channel=0,
            ...     text="ice",
            ...     speaker='Maryla Zechariah', language='English', gender='F',
            ...     alignment={
            ...         'phone': [
            ...             AlignmentItem(symbol='AY0', start=33.0, duration=0.6),
            ...             AlignmentItem(symbol='S', start=33.6, duration=0.4)
            ...         ]
            ...     }
            ... )

        A supervision shared across multiple channels of a recording (e.g. a microphone array)::

            >>> sup5 = SupervisionSegment(
            ...     id='rec00001-sup00005', recording_id='rec00001',
            ...     start=33.0, duration=1.0, channel=[0, 1],
            ...     text="ice",
            ...     speaker='Maryla Zechariah',
            ... )

        Converting :class:`~lhotse.supervsion.SupervisionSegment` to a ``dict``::

            >>> sup0.to_dict()
            {'id': 'rec00001-sup00000', 'recording_id': 'rec00001', 'start': 0.5, 'duration': 5.0, 'channel': 0}


    idrecording_idr#   r$   r   channelNtextlanguagespeakergendercustom	alignmentr'   c                 C   r5   r6   r:   r2   r.   r.   r/   r<      r=   zSupervisionSegment.endkindc                 C   s&   | j }|d u r	i }|||< t| |dS )Nri   )ri   r   )r3   rj   ri   alisr.   r.   r/   with_alignment   s
   z!SupervisionSegment.with_alignmentr>   c                 C   s@   t | j| jt| j| dd| j| j| j| j| j	| j
| j| jdS )zaReturn an identical ``SupervisionSegment``, but with the ``offset`` added to the ``start`` field.r7   r8   )ra   rb   r#   r$   rc   rd   re   rf   rg   rh   ri   )r`   ra   rb   r;   r#   r$   rc   rd   re   rf   rg   rh   ri   rB   r.   r.   r/   rC      s   zSupervisionSegment.with_offsetTrD   rA   affix_idc              	      s   t | j}t | j}t|  }t|  }t| |r&| j d  n| j|r2| j d  n| j||| jrH fdd| j D dS ddS )a_  
        Return a ``SupervisionSegment`` that has time boundaries matching the
        recording/cut perturbed with the same factor.

        :param factor: The speed will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param sampling_rate: The sampling rate is necessary to accurately perturb the start
            and duration (going through the sample counts).
        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
            by affixing it with "_sp{factor}".
        :return: a modified copy of the current ``SupervisionSegment``.
        _spc                    &   i | ]\}}| fd d|D qS )c                       g | ]	}|j  d qS )rD   rA   )rJ   .0itemrr   r.   r/   
<listcomp>  s    z?SupervisionSegment.perturb_speed.<locals>.<dictcomp>.<listcomp>r.   rt   typealirr   r.   r/   
<dictcomp>  s    z4SupervisionSegment.perturb_speed.<locals>.<dictcomp>N)ra   rb   r#   r$   ri   )	r   r#   r$   r   r   ra   rb   ri   items)r3   rD   rA   rn   rF   rG   rH   rI   r.   rr   r/   rJ      s&   z SupervisionSegment.perturb_speedc                 C   sN   | j ||dd}t||r| j d| n| j|r"| j d| dS | jdS )a_  
        Return a ``SupervisionSegment`` that has time boundaries matching the
        recording/cut perturbed with the same factor.

        :param factor: The tempo will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param sampling_rate: The sampling rate is necessary to accurately perturb the start
            and duration (going through the sample counts).
        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
            by affixing it with "_tp{factor}".
        :return: a modified copy of the current ``SupervisionSegment``.
        F)rn   _tpra   rb   )rJ   r   ra   rb   )r3   rD   rA   rn   	perturbedr.   r.   r/   perturb_tempo!  s   z SupervisionSegment.perturb_tempoc                 C   >   t | |r| j d| n| j|r| j d| dS | jdS )a~  
        Return a ``SupervisionSegment`` with modified ids.

        :param factor: The volume will be adjusted this many times (e.g. factor=1.1 means 1.1x louder).
        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
            by affixing it with "_vp{factor}".
        :return: a modified copy of the current ``SupervisionSegment``.
        _vpr}   r   ra   rb   )r3   rD   rn   r.   r.   r/   perturb_volume:  s   z!SupervisionSegment.perturb_volumecodecc                 C   r   )a8  
        Return a ``SupervisionSegment`` with modified ids.

        :param codec: Codec name.
        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
            by affixing it with "_nb_{codec}".
        :return: a modified copy of the current ``SupervisionSegment``.
        _nb_r}   r   )r3   r   rn   r.   r.   r/   
narrowbandN  s   
zSupervisionSegment.narrowbandc                 C   sF   t | |r
| j dn| j|r| j dn| j|dur|dS | jdS )a  
        Return a ``SupervisionSegment`` with modified ids.

        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
            by affixing it with "_rvb".
        :return: a modified copy of the current ``SupervisionSegment``.
        _rvbN)ra   rb   rc   )r   ra   rb   rc   )r3   rn   rc   r.   r.   r/   
reverb_rir`  s   
zSupervisionSegment.reverb_rirr<   c              	      s~   dksJ t td| j }td| j  }t| t| jt| j| | dd| jr; fdd| j	 D dS ddS )a.  
        Return an identical ``SupervisionSegment``, but ensure that ``self.start`` is not negative (in which case
        it's set to 0) and ``self.end`` does not exceed the ``end`` parameter. If a `start` is optionally
        provided, the supervision is trimmed from the left (note that start should be relative to the cut times).

        This method is useful for ensuring that the supervision does not exceed a cut's bounds,
        in which case pass ``cut.duration`` as the ``end`` argument, since supervision times are relative to the cut.
        r   r?   r@   c                    rp   )c                    rq   )r<   r#   )rQ   rs   r   r.   r/   rv         z6SupervisionSegment.trim.<locals>.<dictcomp>.<listcomp>r.   rw   r   r.   r/   rz     s    z+SupervisionSegment.trim.<locals>.<dictcomp>N)r#   r$   ri   )
rK   rL   r#   rM   r<   r   r   r$   ri   r{   rN   r.   r   r/   rQ   r  s    	
zSupervisionSegment.trimrR   c                 C   s   || S )z
        Return a copy of the current segment, transformed with ``transform_fn``.

        :param transform_fn: a function that takes a segment as input, transforms it and returns a new segment.
        :return: a modified ``SupervisionSegment``.
        r.   rS   r.   r.   r/   map  s   	zSupervisionSegment.mapc                 C   s    | j du r| S t| || j dS )a.  
        Return a copy of the current segment with transformed ``text`` field.
        Useful for text normalization, phonetic transcription, etc.

        :param transform_fn: a function that accepts a string and returns a string.
        :return: a ``SupervisionSegment`` with adjusted text.
        N)rd   )rd   r   rS   r.   r.   r/   transform_text  s   

z!SupervisionSegment.transform_textwordrx   c                    s0   | j du r| S t|  fdd| j  D dS )a  
        Return a copy of the current segment with transformed ``alignment`` field.
        Useful for text normalization, phonetic transcription, etc.

        :param type:  alignment type to transform (key for alignment dict).
        :param transform_fn: a function that accepts a string and returns a string.
        :return: a ``SupervisionSegment`` with adjusted alignments.
        Nc                    s(   i | ]\ }  fd d|D qS )c                    s$   g | ]} kr|j d n|qS )rR   )rT   rs   )ali_typerR   rx   r.   r/   rv     s    zESupervisionSegment.transform_alignment.<locals>.<dictcomp>.<listcomp>r.   )rt   ry   rR   rx   )r   r/   rz     s    z:SupervisionSegment.transform_alignment.<locals>.<dictcomp>rk   )ri   r   r{   r3   rR   rx   r.   r   r/   transform_alignment  s   
z&SupervisionSegment.transform_alignmentc                 C   sB   | j d u r	t| S dd | j  D }tt| d d}||d< |S )Nc                 S       i | ]\}}|d d |D qS )c                 S   s   g | ]}|  qS r.   )r4   rs   r.   r.   r/   rv     s    z9SupervisionSegment.to_dict.<locals>.<dictcomp>.<listcomp>r.   )rt   rj   ry   r.   r.   r/   rz         z.SupervisionSegment.to_dict.<locals>.<dictcomp>rk   ri   )ri   r   r{   r   )r3   rl   r&   r.   r.   r/   to_dict  s   
zSupervisionSegment.to_dictr&   c                 C   sP   ddl m} d| v r|| d  d| v r!dd | d  D | d< tdi | S )Nr   )deserialize_custom_fieldrh   ri   c                 S   r   )c                 S      g | ]}t |qS r.   )r!   r0   )rt   xr.   r.   r/   rv     s    z;SupervisionSegment.from_dict.<locals>.<dictcomp>.<listcomp>r.   rt   kvr.   r.   r/   rz     r   z0SupervisionSegment.from_dict.<locals>.<dictcomp>r.   )lhotse.serializationr   r{   r`   )r&   r   r.   r.   r/   	from_dict  s   

zSupervisionSegment.from_dict)T)TNrU   r   ))rV   rW   rX   rY   rZ   r[   r   rc   r   r_   r
   rd   r   re   rf   rg   rh   r   r   ri   r!   r^   r<   rm   rC   r\   boolrJ   r   r   r   r   rQ   r   r   r   r   r*   r   r]   r   r.   r.   r.   r/   r`   x   s   
 Y
	
&





r`   c                   @   s  e Zd ZU dZdNdeee  ddfddZdd defdd	Z	e
deeeef ee f fd
dZe
dee fddZedee dd fddZeZedee dd fddZedeeee f dd fddZ			dOdededededd f
ddZdPdededdfddZdee fd d!Z	dQd"ed#ed$eded  fd%d&Z	'dRd(ed)ed*eded  fd+d,Z	dSd-ee d.ee dd fd/d0Zd1eegef dd fd2d3Z 	dPd1eegef dedd fd4d5Z!		6			7dTd8ed9ee d:e"d;ee" d<ed=e"dee fd>d?Z#dZ$eeeee f  e%d@< dAdB Z&defdCdDZ'dEeeef defdFdGZ(deeef defdHdIZ)dee fdJdKZ*defdLdMZ+dS )USupervisionSetaI  
    :class:`~lhotse.supervision.SupervisionSet` represents a collection of segments containing some
    supervision information (see :class:`~lhotse.supervision.SupervisionSegment`).

    It acts as a Python ``list``, extended with an efficient ``find`` operation that indexes and caches
    the supervision segments in an interval tree.
    It allows to quickly find supervision segments that correspond to a specific time interval.
    However, it can also work with lazy iterables.

    When coming from Kaldi, think of :class:`~lhotse.supervision.SupervisionSet` as a ``segments`` file on steroids,
    that may also contain *text*, *utt2spk*, *utt2gender*, *utt2dur*, etc.

    Examples

        Building a :class:`~lhotse.supervision.SupervisionSet`::

            >>> from lhotse import SupervisionSet, SupervisionSegment
            >>> sups = SupervisionSet.from_segments([SupervisionSegment(...), ...])

        Writing/reading a :class:`~lhotse.supervision.SupervisionSet`::

            >>> sups.to_file('supervisions.jsonl.gz')
            >>> sups2 = SupervisionSet.from_file('supervisions.jsonl.gz')

        Using :class:`~lhotse.supervision.SupervisionSet` like a dict::

            >>> 'rec00001-sup00000' in sups
            True
            >>> sups['rec00001-sup00000']
            SupervisionSegment(id='rec00001-sup00000', recording_id='rec00001', start=0.5, ...)
            >>> for segment in sups:
            ...     pass

        Searching by ``recording_id`` and time interval::

            >>> matched_segments = sups.find(recording_id='rec00001', start_after=17.0, end_before=25.0)

        Manipulation::

            >>> longer_than_5s = sups.filter(lambda s: s.duration > 5)
            >>> first_100 = sups.subset(first=100)
            >>> split_into_4 = sups.split(num_splits=4)
            >>> shuffled = sups.shuffle()
    Nsegmentsr'   c                 C   s   t |i | _d S r(   )r   r   )r3   r   r.   r.   r/   __init__	     zSupervisionSet.__init__otherc                 C   s   | j |j kS r(   r   r3   r   r.   r.   r/   __eq__  s   zSupervisionSet.__eq__c                 C      | j S )z$Alias property for ``self.segments``r   r2   r.   r.   r/   r&     s   zSupervisionSet.datac                 C      dd | D S )Nc                 s   s    | ]}|j V  qd S r(   ra   rt   sr.   r.   r/   	<genexpr>  s    z%SupervisionSet.ids.<locals>.<genexpr>r.   r2   r.   r.   r/   ids  s   zSupervisionSet.idsc                 C   s   t t| S r(   )r   r+   r   r.   r.   r/   from_segments  s   zSupervisionSet.from_segmentsr&   c                 C   s   t dd | D S )Nc                 s   s    | ]}t |V  qd S r(   )r`   r   r   r.   r.   r/   r   "      

z,SupervisionSet.from_dicts.<locals>.<genexpr>r   r   r-   r.   r.   r/   
from_dicts   s   
zSupervisionSet.from_dictspathc           	      C   s   ddl m} t| |tfr| gn| } g }| D ]d}t|dU}t|D ]H\}}|  }t|dks<J d| d| |d }t	|d dkrIq"|
t| d	|d
|t|d t	|d t	|d |d d q"W d   n1 suw   Y  qt|S )a  
        Read an RTTM file located at ``path`` (or an iterator) and create a :class:`.SupervisionSet` manifest for them.
        Can be used to create supervisions from custom RTTM files (see, for example, :class:`lhotse.dataset.DiarizationDataset`).

        .. code:: python

            >>> from lhotse import SupervisionSet
            >>> sup1 = SupervisionSet.from_rttm('/path/to/rttm_file')
            >>> sup2 = SupervisionSet.from_rttm(Path('/path/to/rttm_dir').rglob('ref_*'))

        The following description is taken from the [dscore](https://github.com/nryant/dscore#rttm) toolkit:

        Rich Transcription Time Marked (RTTM) files are space-delimited text files
        containing one turn per line, each line containing ten fields:

        - ``Type``  --  segment type; should always by ``SPEAKER``
        - ``File ID``  --  file name; basename of the recording minus extension (e.g.,
        ``rec1_a``)
        - ``Channel ID``  --  channel (1-indexed) that turn is on; should always be
        ``1``
        - ``Turn Onset``  --  onset of turn in seconds from beginning of recording
        - ``Turn Duration``  -- duration of turn in seconds
        - ``Orthography Field`` --  should always by ``<NA>``
        - ``Speaker Type``  --  should always be ``<NA>``
        - ``Speaker Name``  --  name of speaker of turn; should be unique within scope
        of each file
        - ``Confidence Score``  --  system confidence (probability) that information
        is correct; should always be ``<NA>``
        - ``Signal Lookahead Time``  --  should always be ``<NA>``

        For instance:

            SPEAKER CMU_20020319-1400_d01_NONE 1 130.430000 2.350 <NA> <NA> juliet <NA> <NA>
            SPEAKER CMU_20020319-1400_d01_NONE 1 157.610000 3.060 <NA> <NA> tbc <NA> <NA>
            SPEAKER CMU_20020319-1400_d01_NONE 1 130.490000 0.450 <NA> <NA> chek <NA> <NA>

        :param path: Path to RTTM file or an iterator of paths to RTTM files.
        :return: a new ``SupervisionSet`` instance containing segments from the RTTM file.
        r   )Pathr
   zInvalid RTTM line in file z:       -06d         )ra   rb   rc   r#   r$   rf   N)pathlibr   r)   rZ   open	enumeratestripsplitlenr\   appendr`   r_   r   r   )	r   r   r   filefidxlinepartsrb   r.   r.   r/   	from_rttm&  s2   ) 



zSupervisionSet.from_rttmr   Fctm_filerx   match_channelverbosec                    s  g }i }t |<}|rt|ddn|}|D ]'}|  ^}	}
}}}}||	t|
t|t|||r8t|d ndf qW d   n1 sGw   Y  t|dd d}tt	dd	 t
|d
d dD }g }t|}d}tdd | D }|r}t|ddn|}|D ]=}	|	|v r| j|	dD ] fdd||	 D }|t|7 }|t|id qq|fdd| j|	dD  qt| d| d t|S )a  
        Add alignments from CTM file to the supervision set.

        :param ctm: Path to CTM file.
        :param type: Alignment type (optional, default = `word`).
        :param match_channel: if True, also match channel between CTM and SupervisionSegment
        :param verbose: if True, show progress bar
        :return: A new SupervisionSet with AlignmentItem objects added to the segments.
        zReading words from CTM file)descr   Nc                 S   s   | d | d fS )Nr   r   r.   r   r.   r.   r/   <lambda>  s    z8SupervisionSet.with_alignment_from_ctm.<locals>.<lambda>)keyc                 S   s   i | ]	\}}|t |qS r.   r1   r   r.   r.   r/   rz     r   z:SupervisionSet.with_alignment_from_ctm.<locals>.<dictcomp>c                 S   s   | d S )Nr   r.   r   r.   r.   r/   r     s    c                 S   s   g | ]}|j qS r.   rb   r   r.   r.   r/   rv     s    z:SupervisionSet.with_alignment_from_ctm.<locals>.<listcomp>zAdding alignmentsr   c              	      s^   g | ]+}t t|d  |d  |d  r-j|d ks st|d |d  |d |d dqS )r   r   r   r      rE   )r   r   rc   r!   )rt   r   )r   segr.   r/   rv     s    	rk   c                    s   g | ]
}t | g id qS )rk   )r   r   rx   r.   r/   rv     s    z alignments added out of zH total. If there are several missing, there could be a mismatch problem.)r   r   r   r   r   r_   r\   sortedr   r+   r   r   setfindr   extendlogginginfor   r   )r3   r   rx   r   r   	ctm_wordschannel_to_intr   r   reco_idrc   r#   r$   r"   r%   reco_to_ctmr   	num_totalnum_overspanned
recordingsri   r.   )r   r   rx   r/   with_alignment_from_ctmh  sZ   



z&SupervisionSet.with_alignment_from_ctmc                 C   s   t |dh}| D ]\}||jv rd|j| D ]O}t|jtr!|jd n|j}|jdu rD||j d| d|jdd|j	dd|j
 d
 q||j d| d|jdd|j	dd|j
 d|jdd qqW d   dS 1 spw   Y  dS )z
        Write alignments to CTM file.

        :param ctm_file: Path to output CTM file (will be created if not exists)
        :param type: Alignment type to write (default = `word`)
        wr   N z.02f
)r   ri   r)   rc   r+   r%   writerb   r#   r$   r"   )r3   r   rx   r   r   ry   cr.   r.   r/   write_alignment_to_ctm  s   

,6"z%SupervisionSet.write_alignment_to_ctmc                 C   r   )Nc                 s   s    | ]}|  V  qd S r(   )r   r   r.   r.   r/   r     s    z*SupervisionSet.to_dicts.<locals>.<genexpr>r.   r2   r.   r.   r/   to_dicts  s   zSupervisionSet.to_dicts
num_splitsshuffle	drop_lastc                 C   s   dd t | |||dD S )a^  
        Split the :class:`~lhotse.SupervisionSet` into ``num_splits`` pieces of equal size.

        :param num_splits: Requested number of splits.
        :param shuffle: Optionally shuffle the recordings order first.
        :param drop_last: determines how to handle splitting when ``len(seq)`` is not divisible
            by ``num_splits``. When ``False`` (default), the splits might have unequal lengths.
            When ``True``, it may discard the last element in some splits to ensure they are
            equally long.
        :return: A list of :class:`~lhotse.SupervisionSet` pieces.
        c                 S   r   r.   r   )rt   subsetr.   r.   r/   rv     s    z(SupervisionSet.split.<locals>.<listcomp>)r   r   r   )r    )r3   r   r   r   r.   r.   r/   r     s
   zSupervisionSet.split 
output_dir
chunk_sizeprefixc                 C   s   t | |||dS )a;  
        Splits a manifest (either lazily or eagerly opened) into chunks, each
        with ``chunk_size`` items (except for the last one, typically).

        In order to be memory efficient, this implementation saves each chunk
        to disk in a ``.jsonl.gz`` format as the input manifest is sampled.

        .. note:: For lowest memory usage, use ``load_manifest_lazy`` to open the
            input manifest for this method.

        :param it: any iterable of Lhotse manifests.
        :param output_dir: directory where the split manifests are saved.
            Each manifest is saved at: ``{output_dir}/{prefix}.{split_idx}.jsonl.gz``
        :param chunk_size: the number of items in each chunk.
        :param prefix: the prefix of each manifest.
        :return: a list of lazily opened chunk manifests.
        )r   r   r   )r   )r3   r   r   r   r.   r.   r/   
split_lazy  s   zSupervisionSet.split_lazyfirstlastc                 C   s   t ||s	J d|dur|dksJ tt| |}|S |dur>|dks'J |t| kr/| S tt| t| | t| S dS )aq  
        Return a new ``SupervisionSet`` according to the selected subset criterion.
        Only a single argument to ``subset`` is supported at this time.

        :param first: int, the number of first supervisions to keep.
        :param last: int, the number of last supervisions to keep.
        :return: a new ``SupervisionSet`` with the subset results.
        z*subset() can handle only one non-None arg.Nr   )r   r   
from_itemsr   r   r   )r3   r   r   outr.   r.   r/   r     s"   zSupervisionSet.subsetrR   c                    s   t  fdd| D S )aK  
        Return a copy of the current ``SupervisionSet`` with the segments having a transformed ``text`` field.
        Useful for text normalization, phonetic transcription, etc.

        :param transform_fn: a function that accepts a string and returns a string.
        :return: a ``SupervisionSet`` with adjusted text.
        c                 3   s    | ]}|  V  qd S r(   )r   r   r   r.   r/   r     r   z0SupervisionSet.transform_text.<locals>.<genexpr>r   rS   r.   r   r/   r     s   zSupervisionSet.transform_textc                    s   t  fdd| D S )a  
        Return a copy of the current ``SupervisionSet`` with the segments having a transformed ``alignment`` field.
        Useful for text normalization, phonetic transcription, etc.

        :param transform_fn: a function that accepts a string and returns a string.
        :param type:  alignment type to transform (key for alignment dict).
        :return: a ``SupervisionSet`` with adjusted text.
        c                 3   s    | ]
}|j  d V  qdS )r   N)r   r   r   r.   r/   r   )  s    
z5SupervisionSet.transform_alignment.<locals>.<genexpr>r   r   r.   r   r/   r     s   z"SupervisionSet.transform_alignmentr   MbP?rb   rc   start_after
end_beforeadjust_offset	tolerancec                    s*   |   } fdd||g D S )a  
        Return an iterable of segments that match the provided ``recording_id``.

        :param recording_id: Desired recording ID.
        :param channel: When specified, return supervisions in that channel - otherwise, in all channels.
        :param start_after: When specified, return segments that start after the given value.
        :param end_before: When specified, return segments that end before the given value.
        :param adjust_offset: When true, return segments as if the recordings had started at ``start_after``.
            This is useful for creating Cuts. From a user perspective, when dealing with a Cut, it is no
            longer helpful to know when the supervisions starts in a recording - instead, it's useful to
            know when the supervision starts relative to the start of the Cut.
            In the anticipated use-case, ``start_after`` and ``end_before`` would be
            the beginning and end of a cut;
            this option converts the times to be relative to the start of the cut.
        :param tolerance: Additional margin to account for floating point rounding errors
            when comparing segment boundaries.
        :return: An iterator over supervision segments satisfying all criteria.
        c                 3   s\    | ])}d u st |jr+|j kr+d u s |j kr r(| n|V  qd S r(   )r   rc   r#   r<   rC   )rt   segmentr   rc   r   r   r   r.   r/   r   I  s    z&SupervisionSet.find.<locals>.<genexpr>) _index_by_recording_id_and_cacheget)r3   rb   rc   r   r   r   r   segment_by_recording_idr.   r   r/   r   -  s   
zSupervisionSet.find_segments_by_recording_idc                 C   s,   | j d u rddlm} |dd | | _ | j S )Nr   )r   c                 S   r   r(   r   )r   r.   r.   r/   r   Z  s    zASupervisionSet._index_by_recording_id_and_cache.<locals>.<lambda>)r  cytoolzr   )r3   r   r.   r.   r/   r   V  s   
z/SupervisionSet._index_by_recording_id_and_cachec                 C   s   dt |  dS )NzSupervisionSet(len=))r   r2   r.   r.   r/   __repr__]  r   zSupervisionSet.__repr__index_or_idc                    sZ   z| j   W S  ty,   | jrt fddt| D  Y S t fdd| D  Y S w )Nc                 3   s     | ]\}}| kr|V  qd S r(   r.   )rt   r   ru   r  r.   r/   r   f  s    z-SupervisionSet.__getitem__.<locals>.<genexpr>c                 3   s    | ]
}|j  kr|V  qd S r(   r   rs   r  r.   r/   r   i  s    )r   	TypeErroris_lazynextr   )r3   r  r.   r  r/   __getitem__`  s   zSupervisionSet.__getitem__c                    s6   t  trt fdd| D S t fdd| D S )Nc                 3   s    | ]} |j kV  qd S r(   r   rs   r   r.   r/   r   m  s    z.SupervisionSet.__contains__.<locals>.<genexpr>c                 3   s    | ]	} j |j kV  qd S r(   r   rs   r  r.   r/   r   o  s    )r)   rZ   anyr   r.   r  r/   __contains__k  s   
zSupervisionSet.__contains__c                 c   s    | j E d H  d S r(   r   r2   r.   r.   r/   __iter__q  s   zSupervisionSet.__iter__c                 C   s
   t | jS r(   )r   r   r2   r.   r.   r/   __len__t  s   
zSupervisionSet.__len__r(   )r   FFr   )FF)r   )NN)Nr   NFr   ),rV   rW   rX   rY   r   r	   r`   r   r   r   r^   r   r   rZ   r&   r   r]   r   r   r   r   r   r   r   r*   r   r_   r
   r   r   r   r   r   r   r   r   r  r[   r   r  r  r  r  r  r.   r.   r.   r/   r     s   
 - D
I




'r   )+r   collectionsr   dataclassesr   	itertoolsr   r   typingr   r   r   r	   r
   r   r   r   r   r   lhotse.customr   lhotse.lazyr   r   r   lhotse.utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r`   r   r.   r.   r.   r/   <module>   s     ,@P  d