o
    SiES                  
   @   s  U d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZmZ d dlmZmZmZ d dlmZmZmZmZmZ d dlmZmZ d d	l m!Z!m"Z" d d
l#m$Z$m%Z%m&Z& i Z'e	e(ef e)d< d<dede*ddfddZ+dede"deee"f fddZ,	d<deeef dee"e!f de*ddfddZ-dede"deee"f fddZ.	d=deeef de
e! de*de"fddZ/dd Z0e0d<d ede*ddfd!d"Z1e0	d<d#e!de*ddfd$d%Z2e0	d>d&ede*d'eej3 ddfd(d)Z4e0d<d*ede*ddfd+d,Z5e0d<d*ede*ddfd-d.Z6e0d<d/ede*ddfd0d1Z7e0d<dede*ddfd2d3Z8e0de"ddfd4d5Z9e0d<d6ede*ddfd7d8Z:e0d<d9ede*ddfd:d;Z;dS )?    N)Counterdefaultdict)isclose)AnyCallableDictIterableOptionalTupleUnion)ArrayTemporalArray)	RecordingRecordingSet%get_audio_duration_mismatch_tolerance)CutCutSetMixedCutMonoCut
PaddingCut)Features
FeatureSet)SupervisionSegmentSupervisionSet)compute_num_framesis_equal_or_containsoverlaps_VALIDATORSFobj	read_datareturnc                 C   sX   d}t D ]}t| |rt | } nq|du r$tdt|  dtt  || |d dS )a  
    Validate a Lhotse manifest object.
    It checks for conditions such as positive duration, matching channels, ids, etc.
    It raises AssertionError when it finds some mismatch.

    Optionally it can load the audio/feature data from disk and inspect whether the
    num samples/frames/features declared in the manifests are matching the actual data.

    This function determines the passed object's type and automatically calls
    the proper validator for that object.
    Nz1Object of unknown type passed to validate() (T = z, known types = r   )r   
isinstance
ValueErrortypelist)r   r   	validatorregistered_type r(   =/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/qa.pyvalidate   s   
r*   
recordingssupervisionsc                 C   sd   t | |\} }ttdd | D dksJ dt| |}ttdd |D dks.J d| |fS )aZ  
    Fix a pair of :class:`~lhotse.audio.RecordingSet` and :class:`~lhotse.supervision.SupervisionSet`,
    which is conceptually similar to how Kaldi's ``utils/fix_data_dir.sh`` works.

    We will:
        - remove all supervisions without a corresponding recording;
        - remove all recordings without a corresponding supervision;
        - remove all supervisions that exceed the duration of a recording;
        - trim supervisions that exceed the duration of a recording but start before its end;
        - and possibly other operations in the future.

    :param recordings: a :class:`~lhotse.audio.RecordingSet` instance.
    :param supervisions: a corresponding :class:`~lhotse.supervision.SupervisionSet` instance.
    :return: a pair of ``recordings`` and ``supervisions`` that were fixed:
        the original manifests are not modified.
    c                 s       | ]}|j V  qd S Nid.0rr(   r(   r)   	<genexpr>B       z fix_manifests.<locals>.<genexpr>r   z.No recordings left after fixing the manifests.c                 s   r-   r.   r/   r2   sr(   r(   r)   r4   G   r5   z0No supervisions left after fixing the manifests.)*remove_missing_recordings_and_supervisionslen	frozensettrim_supervisions_to_recordings)r+   r,   r(   r(   r)   fix_manifests*   s   
r<   c           
      C   sz  t | tr
t| g} t |trt|g}|  } | }t| |d t| dd | D }|D ]Z}||j}|dusGJ d|j	 d|j d|j
  kr[|j  kr[|jd ksqn J d|j	 d	|j
 d
|j d|j d	t|j|jsJ d|j	 d|j d|j dq/| }tdd |D }|| }|rtdt| d || }	|	rtdt|	 d dS dS )aG  
    Validate the recording and supervision manifests separately,
    and then check if they are consistent with each other.

    This method will emit warnings, instead of errors, when some recordings or supervisions
    are missing their counterparts.
    These items will be discarded by default when creating a CutSet.
    r!   c                 S      i | ]}|j |qS r(   r/   r1   r(   r(   r)   
<dictcomp>e       z8validate_recordings_and_supervisions.<locals>.<dictcomp>NSupervision z# references non-existent recording gMbPgMbP?zI: exceeded the bounds of its corresponding recording (supervision spans [z, z]; recording spans [0, z])z
: channel zD does not exist in its corresponding Recording (recording channels: )c                 s   r-   r.   recording_idr6   r(   r(   r)   r4   u   r5   z7validate_recordings_and_supervisions.<locals>.<genexpr>z
There are zR recordings that do not have any corresponding supervisions in the SupervisionSet.zR supervisions that are missing their corresponding recordings in the RecordingSet.)r"   r   r   r   r   to_eagerr*   getrC   r0   startenddurationr   channel_idschannelkeysr:   loggingwarningr9   )
r+   r,   r   id2recr7   r3   recording_idsrecording_ids_in_supsonly_in_recordingsonly_in_supervisionsr(   r(   r)   $validate_recordings_and_supervisionsM   sR   




,
rS   c                    s   t dd | D }t dd |D }||   r,|  fdd} tdt  d || rct dd |D }|fd	d}t d
d |D }t|t| }td| dt d | |fS )a?  
    Fix the recording and supervision manifests by removing all entries that
    miss their counterparts.

    :param recordings: a :class:`RecordingSet` object.
    :param supervisions: a :class:`RecordingSet` object.
    :return: A pair of :class:`RecordingSet` and :class:`SupervisionSet` with removed entries.
    c                 s   r-   r.   r/   r1   r(   r(   r)   r4      r5   z=remove_missing_recordings_and_supervisions.<locals>.<genexpr>c                 s   r-   r.   rB   r6   r(   r(   r)   r4      r5   c                    
   | j  vS r.   r/   )r3   )rQ   r(   r)   <lambda>      
 z<remove_missing_recordings_and_supervisions.<locals>.<lambda>Removed z/ recordings with no corresponding supervisions.c                 s   r-   r.   r/   r6   r(   r(   r)   r4      r5   c                    rT   r.   rB   )r7   )rR   r(   r)   rU      rV   c                 s   r-   r.   r/   r6   r(   r(   r)   r4      r5   z? supervisions with no corresponding recordings (for a total of z recording IDs).)r:   filterrL   rM   r9   )r+   r,   rO   rP   supervision_idssupervision_ids_aftern_removed_supsr(   )rQ   rR   r)   r8      s,   
r8   Tverbosec           	      C   s   t | tr
t| g} dd | D }g }d}d}|D ]&}||j j}|j|kr+|d7 }q|j|kr:|d7 }|j|d}|| q|rM|rMt	
d| d |rZ|rZt	
d| d	 t|S )
z
    Return a new :class:`~lhotse.supervision.SupervisionSet` with supervisions that are
    not exceeding the duration of their corresponding :class:`~lhotse.audio.Recording`.
    c                 S   r=   r(   r/   r1   r(   r(   r)   r>      r?   z3trim_supervisions_to_recordings.<locals>.<dictcomp>r      )rG   rW   z6 supervisions starting after the end of the recording.zTrimmed z1 supervisions exceeding the end of the recording.)r"   r   r   rC   rH   rF   rG   trimappendrL   rM   r   from_segments)	r+   r,   r\   rN   supsremovedtrimmedr7   rG   r(   r(   r)   r;      s0   
	





r;   c                 C   s   t t| j }| t|< | S )z~
    Decorator registers the function to be invoked inside ``validate()``
    when the first argument's type is matching.
    )nextiter__annotations__valuesr   )fnfirst_arg_typer(   r(   r)   register_validator   s   rj   r3   c                 C   s   | j dksJ d| j d| j  d| j| j }| jdks&J d| j dt|| j  t ks@J d| j d| j  d| d|rs|  }|j\}}| j|ks_J d| j d	| j d
| | j|ksuJ d| j d	| j d| d S d S )Nr   z
Recording (: duration has to be greater than 0 (is rA   z: no channels availablez : mismatched declared duration (z$) with num_samples / sampling_rate ().: expected z channels, got  samples, got )	rH   r0   num_samplessampling_ratenum_channelsabsr   
load_audioshape)r3   r   expected_durationsamplesn_chn_sr(   r(   r)   validate_recording   s*   
ry   r7   c                 K   s   | j dksJ d| j d| j  d| jd urkt| jts&J d| j d| j D ]A\}}t|tr;t||d q+t|trjt	||d t
| j |j sjtd| j d| j  d	| d
|j d|j d|j  d q+d S d S )Nr   r@   rk   rA   zSupervisionSegment /: custom field has to be set to a dict or None.r!   z4: possibly mismatched duration between supervision ('s) and temporal array in custom field '' (num_frames= * frame_shift= == duration=rl   )rH   r0   customr"   dictitemsr   validate_arrayr   validate_temporal_arrayr   rL   rM   
num_framesframe_shift)r7   r   kwargskeyvaluer(   r(   r)   validate_supervision   s>   



r   f
feats_datac              
   C   s  | j dksJ d| j  d| jdksJ d| j d| jdks*J d| j d| jdks8J d| j d| jdksFJ d| j d| jdksTJ d| j dt| j| j d	d
}tt||ksvJ d| j d| j d| dt	| j| j| jd}|| jksJ d| j d| j d| j d| d	|s|d ur|r| 
 }|j\}}| j|ksJ d| j d| | j|ksJ d| j d| d S d S )Nr   z-Features: start has to be greater than 0 (is rA   z0Features: duration has to be greater than 0 (is z2Features: num_frames has to be greater than 0 (is z4Features: num_features has to be greater than 0 (is z5Features: sampling_rate has to be greater than 0 (is z3Features: frame_shift has to be greater than 0 (is    )ndigitszFeatures: frame_shift of zX is incorrect because it is physically impossible; multiplying it by a sampling rate of z' results in a fractional window hop of z	 samples.)rH   r   rp   z;Features: manifest is inconsistent: declared num_frames is z, but duration (zs) / frame_shift (zs) results in z frames. If you're using a custom feature extractor, you might need to ensure that it preserves this relationship between duration, frame_shift and num_frames (use rounding up if needed - see lhotse.utils.compute_num_frames).zFeatures: expected num_frames: 
, actual: z!Features: expected num_features: )rF   rH   r   num_featuresrp   r   roundfloatintr   loadrt   )r   r   r   
window_hopexpected_num_framesn_frn_ftr(   r(   r)   validate_features
  s`   


r   arrc                 C   s$   |r|   }|j| jksJ d S d S r.   )r   rt   r   r   datar(   r(   r)   r   :  s
   r   c                 C   s   | j dks	J d| j | jk sJ d| j  d| j d| jdks%J d| jdks.J d|r<|  }|j| jks>J d S d S )Nr   z/TemporalArray: temporal_dim cannot be negative.zTemporalArray: temporal_dim z canot be greater than ndim .z,TemporalArray: frame_shift must be positive.z*TemporalArray: start must be non-negative.)temporal_dimndimr   rF   r   rt   r   r(   r(   r)   r   A  s   
r   cc                 C   s  t | tr9t| jdksJ d| j dt| jD ]\}}t|j|d |jdks6J d| j d| dqd S | j	dksKJ d| j d| j	 d	| j
dks]J d| j d
| j
 d	| jdksoJ d| j d| j d	| js~| js~J d| j dt | trd S | jrt| j | j| jjksJ |r|  }|j\}}| j|ksJ d| j d| j d| | j|ksJ d| j d| j d| | jrt| j t| jj| jsJ |r|  }| j|jd ksJ d| j d| j d|jd  t | trM| jD ]H}t| |j | j ks&J d| j d|j d| j  d|j  d		t|j| jr6t| j|jsKJ d| j d|j d| j d|j d		q| j!d urt | j!t"scJ d| j d| j!# D ]h\}	}
t |
t$rzt%|
|d qht |
t&rt'|
|d t(| j
|
j
st)*d| j d| j
 d|	 d|
j d|
j+ d|
j
 d t,| |
sJ d| j d|	 d|
j	 d |
j- d!| j	 d | j- d"qhd S d S )#Nr   zMonoCut z): Mixed cut must have at least one track.r!   z	MonoCut: z: track z has a negative offset.z": start must be 0 or greater (got rA   z': duration must be greater than 0 (got z,: sampling_rate must be greater than 0 (got z2: must have either Features or Recording attached.z: expected num_frames: r   z: expected num_features: r]   rm   rn   z: supervision z) has a mismatched recording_id (expected z, supervision has z$ has a mismatched channel (expected rz   z,: possibly mismatched duration between cut (r{   r|   r}   r~   rl   z!: TemporalArray at custom field 'z@' seems to have incorrect start time (the array with time span [zs - z-s] does not overlap with cut with time span [zs]).).r"   r   r9   tracksr0   	enumeratevalidate_cutcutoffsetrF   rH   rp   has_featureshas_recordingr   r   featuresrJ   channelsload_featuresrt   r   r   ry   	recordingr   rI   rs   ro   r   r,   r   rC   r   r   r   r   r   r   r   r   rL   rM   r   r   rG   )r   r   idxtrackfeatsr   r   rv   r7   r   r   r(   r(   r)   r   O  s   
$





r   c                 C   s   t  }t }| D ]}t||d ||j ||j  d7  < qt|dkr/td| d |	dd d dks>J dd S )Nr!   r]   z@RecordingSet contains recordings with different sampling rates (z$). Make sure that this was intended.r   z0RecordingSet has recordings with duplicated IDs.)
setr   ry   addrp   r0   r9   rL   rM   most_common)r+   r   ratesidsr3   r(   r(   r)   validate_recording_set  s   
r   c           
      K   s   t  }| D ]}t| ||j  d7  < q|dd d dks$J d|   | j D ]D\}}tt}|D ]}t	|j
trB|j
nt|j
}||  t|jdk7  < q7| D ]\}}	|	dkrptd|	 d| d| d qYq-d S )Nr]   r   z4SupervisionSet has supervisions with duplicated IDs.zSupervisionSet contains z, supervisions that start at 0 for recording z
 (channel z1). Did you forget to set supervision start times?)r   r   r0   r    _index_by_recording_id_and_cache_segments_by_recording_idr   r   r   r"   rJ   tuplerF   rL   rM   )
r,   r   r   r7   ridra   cntr_per_channelr   rJ   countr(   r(   r)   validate_supervision_set  s.   r   r   c              	   C   s   t t| }|j}|j}|j}t| D ]F\}}t||d |j|ks1J d| d|j d| d|j|ksEJ d| d|j d| d|j|ksYJ d| d|j d| dqd S )Nr!   zFFeatureSet: mismatched sampling rate (the first Features manifest had z, got z in Features at index rA   zEFeatureSet: mismatched num_features (the first Features manifest had zEFeatureSet: mismatched feature_type (the first Features manifest had )rd   re   rp   r   r$   r   r   )r   r   firstrp   r   features_typer   r   r(   r(   r)   validate_feature_set  s8   r   cutsc                 C   s   | D ]}t ||d qd S )Nr!   )r   )r   r   r   r(   r(   r)   validate_cut_set  s   r   )F)T)FN)<rL   collectionsr   r   mathr   typingr   r   r   r   r	   r
   r   numpynplhotse.arrayr   r   lhotse.audior   r   r   
lhotse.cutr   r   r   r   r   lhotse.featuresr   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   strrf   boolr*   r<   rS   r8   r;   rj   ry   r   ndarrayr   r   r   r   r   r   r   r   r(   r(   r(   r)   <module>   s   
 $

&


7

&

$/g