o
    SiB;                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% eG dd deZ&dS )    N)	dataclass)partialreduce)add)AnyCallableIterableListOptionalSequenceTupleUnion)	Recording)DataCut)Features)SupervisionSegment)add_durationsfastcopyhash_str_to_intis_equal_or_containsmerge_items_with_delimiteroverlapsrich_exception_infouuid4c                   @   s@  e Zd ZU dZeed< edefddZede	e
j fddZede	e
j fdd	Ze	
d%dede	eeje	ej f  fddZdeee ef defddZ		
		
			d&de	eeef  dedededee de	e de	e defddZ		d'dede	eeee gef  dd fd d!Zed"e dd fd#d$Z!dS )(MonoCuta  
    :class:`~lhotse.cut.MonoCut` is a :class:`~lhotse.cut.Cut` of a single channel of
    a :class:`~lhotse.audio.Recording`. In addition to Cut, it has a specified channel attribute. This is the most commonly used type of cut.

    Please refer to the documentation of :class:`~lhotse.cut.Cut` to learn more about using cuts.

    See also:

        - :class:`lhotse.cut.Cut`
        - :class:`lhotse.cut.MixedCut`
        - :class:`lhotse.cut.CutSet`
    channelreturnc                 C   s   dS )N    selfr   r   C/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/cut/mono.pynum_channels,   s   zMonoCut.num_channelsc                 C   s   | j rA| jj| j| jd}|jd | j dkr$|d| jddf }|S |jd | j dkr?tj||ddddf fdd}|S dS )z
        Load the features from the underlying storage and cut them to the relevant
        [begin, duration] region of the current MonoCut.
        )startdurationr   r   N)axis)	has_featuresfeaturesloadr#   r$   shape
num_framesnpconcatenate)r    featsr   r   r!   load_features1   s   "zMonoCut.load_featuresc                 C   s"   | j r| jj| j| j| jdS dS )a  
        Load the audio by locating the appropriate recording in the supplied RecordingSet.
        The audio is trimmed to the [begin, end] range specified by the MonoCut.

        :return: a numpy ndarray with audio samples, with shape (1 <channel>, N <samples>)
        )channelsoffsetr$   N)has_recording	recording
load_audior   r#   r$   r   r   r   r!   r4   D   s   zMonoCut.load_audioT
with_audioc                 C   s$   | j r| jj| j| j| j|dS dS )a  
        Load the subset of video (and audio) from attached recording.
        The data is trimmed to the [begin, end] range specified by the MonoCut.

        :param with_audio: bool, whether to load and return audio alongside video. True by default.
        :return: a tuple of video tensor and optionally audio tensor (or ``None``),
            or ``None`` if this cut has no video.
        )r0   r1   r$   r5   N)	has_videor3   
load_videor   r#   r$   )r    r5   r   r   r!   r7   T   s   zMonoCut.load_videor0   c              	      s   t  t}t|r gn t| jjs!J d d| jj|p(t dk}|rM|s0 \ t| j d  | j| j	| j
  fdd| jD | jdS dd	lm} || j dt  d
| j	| j
  fdd| jD | jdS )a-  
        Select specified channels from this cut.
        Supports extending to other channels available in the underlying :class:`Recording`.
        If a single channel is provided, we'll return a :class:`~lhotse.cut.MonoCut`,
        otherwise we'll return a :class:`~lhotse.cut.MultiCut`.
        zCannot select channels=z= because they are not a subset of self.recording.channel_ids=r   -c                    s$   g | ]}t |j rt| d qS ))r   )r   r   r   .0sr0   r   r!   
<listcomp>   s    

z)MonoCut.with_channels.<locals>.<listcomp>)idr3   r#   r$   r   supervisionscustomr   MultiCutchanc                    s   g | ]
}t  |jr|qS r   )r   r   r9   r<   r   r!   r=      s    
)r>   r#   r$   r   r?   r@   )
isinstanceintsetissubsetr3   channel_idslenr   r>   r#   r$   r?   r@   lhotserB   )r    r0   channel_is_intmonorB   r   r<   r!   with_channelsj   s@   



zMonoCut.with_channelsNFr   rir_recordingnormalize_output
early_onlyaffix_idrir_channelsroom_rng_seedsource_rng_seedc              	      s>  | j sJ d| jrtd d| _du s%tfdd|D s%J ddu r@dg}|du r:ttt | j	 }|du r@|}t
|dkrn| jj|| |||d	} fd
d| jD }	t|  rg| j	 dn| j	||	dS ddlm}
 ttt
|| jj|| |||d	} fdd| jD }	t|
| ||	dS )a  
        Return a new ``DataCut`` that will convolve the audio with the provided impulse response.
        If the `rir_recording` is multi-channel, the `rir_channels` argument determines which channels
        will be used. By default, we use the first channel and return a MonoCut. If we reverberate
        with a multi-channel RIR, we return a MultiCut.

        If no ``rir_recording`` is provided, we will generate an impulse response using a fast random
        generator (https://arxiv.org/abs/2208.04101). Note that the generator only supports simulating
        reverberation with a single microphone, so we will return a MonoCut in this case.

        :param rir_recording: The impulse response to use for convolving.
        :param normalize_output: When true, output will be normalized to have energy as input.
        :param early_only: When true, only the early reflections (first 50 ms) will be used.
        :param affix_id: When true, we will modify the ``MonoCut.id`` field
            by affixing it with "_rvb".
        :param rir_channels: The channels of the impulse response to use. First channel is used by default.
            If multiple channels are specified, this will produce a MultiCut instead of a MonoCut.
        :param room_rng_seed: The seed for the room configuration.
        :param source_rng_seed: The seed for the source position.
        :return: a modified copy of the current ``MonoCut``.
        z:Cannot apply reverberation on a MonoCut without Recording.zAttempting to reverberate a MonoCut that references pre-computed features. The feature manifest will be detached, as we do not support feature-domain reverberation.Nc                 3   s    | ]}| j k V  qd S N)r"   )r:   c)rO   r   r!   	<genexpr>       

z%MonoCut.reverb_rir.<locals>.<genexpr>z(Invalid channel index in `rir_channels`.r   r   )rO   rP   rQ   rR   rS   rT   rU   c                    s   g | ]}|j  d qS )rR   
reverb_rirr9   rZ   r   r!   r=      s    z&MonoCut.reverb_rir.<locals>.<listcomp>_rvb)r>   r3   r?   rA   c                    s   g | ]	}|j  d qS ))rR   r   r[   r9   )rR   r0   r   r!   r=      s    )r3   r?   r   )r2   r'   loggingwarningr(   allr   strr   r>   rI   r3   r\   r?   r   multirB   listrange	from_mono)r    rO   rP   rQ   rR   rS   rT   rU   recording_rvbsupervisions_rvbrB   r   )rR   r0   rO   r!   r\      sv   !



zMonoCut.reverb_rir	delimitermerge_policycustom_merge_fnc           	         s|  t td|dkd|dur| nfdd t| jdd dtd	kr'| S d
 j}d j}t|| | jd}t	dd D }t	dd D }t
dd td	d D rot
dd D rotd| j d tdd D d
 j||d
 jddd D dd D dd D dd D  fdd|D fdd|D d}t| |gdS )a0  
        Return a copy of the cut that has all of its supervisions merged into
        a single segment.

        The new start is the start of the earliest superivion, and the new duration
        is a minimum spanning duration for all the supervisions. The text fields of
        all segments are concatenated with a whitespace.

        :param merge_policy: one of "keep_first" or "delimiter". If "keep_first", we
            keep only the first segment's field value, otherwise all string fields
            (including IDs) are prefixed with "cat#" and concatenated with a hash symbol "#".
            This is also applied to ``custom`` fields. Fields with a ``None`` value are omitted.
        :param custom_merge_fn: a function that will be called to merge custom fields values.
            We expect ``custom_merge_fn`` to handle all possible custom keys.
            When not provided, we will treat all custom values as strings.
            It will be called roughly like:
            ``custom_merge_fn(custom_key, [s.custom[custom_key] for s in sups])``
        #
keep_first)rh   return_firstNc                    s    t t|S rV   )mapra   )kvs)merge_func_r   r!   <lambda>,  s    z,MonoCut.merge_supervisions.<locals>.<lambda>c                 S   s   | j S rV   )r#   )r;   r   r   r!   rr   .  s    )keyr   r   r%   )sampling_ratec                 s   .    | ]}|j d ur|j  D ]}|V  qqd S rV   )r@   keysr:   r;   ro   r   r   r!   rX   8      $z-MonoCut.merge_supervisions.<locals>.<genexpr>c                 s   ru   rV   )	alignmentrv   rw   r   r   r!   rX   ;  rx   c                 s   s    | ]
\}}t ||V  qd S rV   )r   )r:   s1s2r   r   r!   rX   ?  s    c                 s   s    | ]}|j d uV  qd S rV   textr9   r   r   r!   rX   ?  rY   zYou are merging overlapping supervisions that have text transcripts. The result is likely to be unusable if you are going to train speech recognition models (cut id: z).c                 s   s    | ]}|j V  qd S rV   )r>   r9   r   r   r!   rX   I  s     c                 s       | ]	}|j r|j V  qd S rV   r|   r9   r   r   r!   rX   N      c                 s   r   rV   )speakerr9   r   r   r!   rX   O  r   c                 s   r   rV   )languager9   r   r   r!   rX   P  r   c                 s   r   rV   )genderr9   r   r   r!   rX   Q  r   c                    s&   i | ]    fd dD qS )c                 3   .    | ]}|j d ur |j v r|j   V  qd S rV   )r@   r9   ro   r   r!   rX   U      8MonoCut.merge_supervisions.<locals>.<dictcomp>.<genexpr>r   r:   )merge_customsupsr   r!   
<dictcomp>R  s    	
z.MonoCut.merge_supervisions.<locals>.<dictcomp>c                    s&   i | ]  t t fd dD qS )c                 3   r   rV   )ry   r9   r   r   r!   rX   a  r   r   )r   r   r   )r   r   r!   r   ]  s    

)r>   recording_idr#   r$   r   r}   r   r   r   r@   ry   )r?   )r   r   sortedr?   rI   r#   endr   rt   rF   anyzipwarningswarnr>   r   r   r   joinr   )	r    ri   rj   mstartmend	mdurationcustom_keysalignment_keysmsupr   )r   rq   r   r!   merge_supervisions  s\   

(	

#zMonoCut.merge_supervisionsdatac                 C   s   ddl m} | dd  d| v rt| dnd }d| v r&t| dnd }d| v r1| dng }d| v r=|| d  d| v rF| d tdi | ||dd	 |D d
S )Nr   )deserialize_custom_fieldtyper(   r3   r?   r@   c                 S   s   g | ]}t |qS r   )r   	from_dictr9   r   r   r!   r=     s    z%MonoCut.from_dict.<locals>.<listcomp>)r(   r3   r?   r   )lhotse.serializationr   popr   r   r   r   )r   r   r(   r3   supervision_infosr   r   r!   r   m  s$   

zMonoCut.from_dict)T)NTFTrN   NN)rh   N)"__name__
__module____qualname____doc__rE   __annotations__propertyr"   r   r
   r,   ndarrayr/   r4   boolr   torchTensorr7   r   r	   r   rM   r   r   r\   ra   r   r   r   r   staticmethoddictr   r   r   r   r!   r      sh   
 /	
s
er   )'r^   r   dataclassesr   	functoolsr   r   operatorr   typingr   r   r   r	   r
   r   r   r   numpyr,   r   lhotse.audior   lhotse.cut.datar   lhotse.featuresr   lhotse.supervisionr   lhotse.utilsr   r   r   r   r   r   r   r   r   r   r   r   r!   <module>   s    ((