o
    pi                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZmZ d dlmZ d dlmZmZmZ d dlmZmZ d d	lm Z m!Z!m"Z"m#Z#m$Z$ d d
l%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; zd dl<m=Z=m>Z> dZ?W n e@y   dZ?Y nw eAe2jBZCeAe1jBZDG dd de7ZEG dd deZFdS )    N)Counter)partial)DictLiteralOptionalSequenceTextUnion)MLFlowLoggerTensorBoardLogger)pyplot)Problem
ResolutionSpecifications)SegmentationTaskTask)OptimalDiarizationErrorRate$OptimalDiarizationErrorRateThresholdOptimalFalseAlarmRateOptimalMissedDetectionRateOptimalSpeakerConfusionRate)binary_cross_entropy)	permutate)create_rng_for_worker)SegmentSlidingWindowFeature)SpeakerDiarizationProtocol)ScopeSubset)track)
DataLoaderIterableDataset)BaseWaveformTransform)Metric)MixITLossWrappermultisrc_neg_sisdrTFc                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	
ValDatasetzValidation dataset class

    Val dataset needs to be iterable so that mixture of mixture generation
    can be performed in the same way for both training and development.

    Parameters
    ----------
    task : PixIT
        Task instance.
    taskc                    s   t    || _d S N)super__init__r'   )selfr'   	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/tasks/separation/PixIT.pyr*   R   s   

zValDataset.__init__c                 C   
   | j  S r(   )r'   val__iter__r+   r.   r.   r/   __iter__V      
zValDataset.__iter__c                 C   r0   r(   )r'   
val__len__r2   r.   r.   r/   __len__Y   r4   zValDataset.__len__)	__name__
__module____qualname____doc__r   r*   r3   r6   __classcell__r.   r.   r,   r/   r&   F   s
    r&   c                !       s  e Zd ZdZ															d@dedeeedf  d	ed
ee	 dee	 de
deee  dee de	dee	 de
dee deeee eeef f dee	 ded def  fddZdA fdd	Zde	ded	efddZd efd!d"Zd#d$ Zd%ejfd&d'Zd%ejfd(d)Zd%ejfd*d+ZdBd-d.Zd ejfd/d0Z	dAd1ejd2ejdeej d ejfd3d4Z d5d6 Z!d7d8 Z"d9e	fd:d;Z#d eeee eeef f fd<d=Z$d9e	fd>d?Z%  Z&S )CPixITuW  Joint speaker diarization and speaker separation task based on PixIT

    Parameters
    ----------
    protocol : SpeakerDiarizationProtocol
        pyannote.database protocol
    cache : str, optional
        As (meta-)data preparation might take a very long time for large datasets,
        it can be cached to disk for later (and faster!) re-use.
        When `cache` does not exist, `Task.prepare_data()` generates training
        and validation metadata from `protocol` and save them to disk.
        When `cache` exists, `Task.prepare_data()` is skipped and (meta)-data
        are loaded from disk. Defaults to a temporary path.
    duration : float, optional
        Chunks duration. Defaults to 5s.
    max_speakers_per_chunk : int, optional
        Maximum number of speakers per chunk (must be at least 2).
        Defaults to estimating it from the training set.
    max_speakers_per_frame : int, optional
        Maximum number of (overlapping) speakers per frame.
        Setting this value to 1 or more enables `powerset multi-class` training.
        Default behavior is to use `multi-label` training.
    weigh_by_cardinality: bool, optional
        Weigh each powerset classes by the size of the corresponding speaker set.
        In other words, {0, 1} powerset class weight is 2x bigger than that of {0}
        or {1} powerset classes. Note that empty (non-speech) powerset class is
        assigned the same weight as mono-speaker classes. Defaults to False (i.e. use
        same weight for every class). Has no effect with `multi-label` training.
    balance: Sequence[Text], optional
        When provided, training samples are sampled uniformly with respect to these keys.
        For instance, setting `balance` to ["database","subset"] will make sure that each
        database & subset combination will be equally represented in the training samples.
    weight: str, optional
        When provided, use this key as frame-wise weight in loss function.
    batch_size : int, optional
        Number of training samples per batch. Defaults to 32.
    num_workers : int, optional
        Number of workers used for generating training samples.
        Defaults to multiprocessing.cpu_count() // 2.
    pin_memory : bool, optional
        If True, data loaders will copy tensors into CUDA pinned
        memory before returning them. See pytorch documentation
        for more details. Defaults to False.
    augmentation : BaseWaveformTransform, optional
        torch_audiomentations waveform transform, used by dataloader
        during training.
    metric : optional
        Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
        Defaults to AUROC (area under the ROC curve).
    separation_loss_weight : float, optional
        Scaling factor between diarization and separation losses. Defaults to 0.5.

    References
    ----------
    Joonas Kalda, Clément Pagés, Ricard Marxer, Tanel Alumäe, and Hervé Bredin.
    "PixIT: Joint Training of Speaker Diarization and Speech Separation
    from Real-world Multi-speaker Recordings"
    Odyssey 2024. https://arxiv.org/abs/2403.02288
    N      @F          ?protocolcachedurationmax_speakers_per_chunkmax_speakers_per_frameweigh_by_cardinalitybalanceweight
batch_sizenum_workers
pin_memoryaugmentationmetricmax_num_speakersloss)bcemseseparation_loss_weightc              
      s   t stdt j|||	|
||||d t|tstd|d u r,|d ur,|}td |d ur5td |d ur=t	d|	d dkrGtd	|| _
|| _|| _|| _|| _|| _ttd
d| _d S )Nz'asteroid' must be installed to train separation models with PixIT . `pip install pyannote-audio[separation]` should do the trick.)rB   rH   rI   rJ   rK   rL   rA   z>SpeakerDiarization task requires a SpeakerDiarizationProtocol.zL`max_num_speakers` has been deprecated in favor of `max_speakers_per_chunk`.z-`loss` has been deprecated and has no effect.zODiarization is done on masks separately which is incompatible powerset training   r   z-`batch_size` must be divisible by 2 for PixITT)generalized)ASTEROID_IS_AVAILABLEImportErrorr)   r*   
isinstancer   
ValueErrorwarningswarnNotImplementedErrorrC   rD   rE   rF   rG   rQ   r$   r%   
mixit_loss)r+   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rQ   r,   r.   r/   r*      sL   

zPixIT.__init__c              
      s   t  | | jd u r-| jd d tdk}g }d| jdd}tt	|d |dD ]}| jd	 t	| jd	 d
 |kd  }| jd t	| jd d
 |kd  }|D ]U}|d }	|d |d  }
|t	|d |	k|d |
k d  }t
|	|
| j d| j D ]&}|| j }|t	|d |k|d |k d  }|tt|d  qqWq-ttt|  \}}t|t|}}t|}|| }|| }t|t| }t||D ]-\}}|dkrtd|dd q|dkrtd|dd qtd|dd| d qtd|t	|dkd d  | _td| j d| j d | jd urG| j| jkrGtd| j d| j dt| jtj| jd u rUtjntjd d!d" t | jD | jd#}t| jtjtjd$d" t | jD d%}||f| _!d S )&Naudio-metadatasubsettrainz*Estimating maximum number of speakers per gzs chunk in the training setr   )descriptionannotations-segmentsfile_idannotations-regionsstartrB   endg      ?file_label_idxz   - z7.2%z( of all chunks contain no speech at all.   z contain 1 speaker or lessz	 contain z speakers or lessrR   g
ףp=
?z$Setting `max_speakers_per_chunk` to za. You can override this value (or avoid this estimation step) by passing `max_speakers_per_chunk=z` to the task constructor.z`max_speakers_per_frame` (z1) must be smaller than `max_speakers_per_chunk` ()Tc                 S      g | ]	}d |d  qS zspeaker#rg   r.   .0ir.   r.   r/   
<listcomp>B      zPixIT.setup.<locals>.<listcomp>)rB   
resolutionproblempermutation_invariantclassespowerset_max_classesc                 S   ri   rj   r.   rk   r.   r.   r/   rn   J  ro   )rB   rp   rq   rs   )"r)   setuprC   prepared_dataSubsetsindexrB   r   npwherearangeappendlenuniqueziplistr   itemsarrayargsortcumsumsumprintmaxrD   rW   r   r   FRAMEr   MULTI_LABEL_CLASSIFICATIONMONO_LABEL_CLASSIFICATIONrangespecifications)r+   stagetrainingnum_unique_speakersprogress_descriptionrb   annotationsannotated_regionsregionregion_start
region_endregion_annotationswindow_start
window_endwindow_annotationsnum_speakerscountssorting_indicesratioskratiospeaker_diarizationspeaker_separationr,   r.   r/   ru      s   









zPixIT.setuprb   
start_timec                    s  |  |}t| jd | d  }| d}t||| }t }| jj||\|d< }	| jd | jd d |k }
|
|
d |jk |
d |j	k@  }| jj
j}d	| jj
j }t|d |j	|j	 | }td
t|| t}t|d |j|j	 | }t|| t}tt|| }t|}|| jkr	 | jt|| jjj }tj||ftjd}dd t|D }t|||| D ]\}}}|| }d|||d |f< qt|| jj
|d|d< | jd |   fdd jj D |d< ||d d< |S )a  Prepare chunk

        Parameters
        ----------
        file_id : int
            File index
        start_time : float
            Chunk start time
        duration : float
            Chunk duration.

        Returns
        -------
        sample : dict
            Dictionary containing the chunk data with the following keys:
            - `X`: waveform
            - `y`: target as a SlidingWindowFeature instance where y.labels is
                   in meta.scope space.
            - `meta`:
                - `scope`: target scope (0: file, 1: database, 2: global)
                - `database`: database index
                - `file`: file index
        r\   scope
_label_idxXra   rb   rd   re   r?   r   dtypec                 S   s   i | ]\}}||qS r.   r.   )rl   idxlabelr.   r.   r/   
<dictcomp>      z'PixIT.prepare_chunk.<locals>.<dictcomp>rg   )labelsyc                    s   i | ]}| | qS r.   r.   rl   keymetadatar.   r/   r     r   metafile)!get_fileScopesrv   r   dictmodelaudiocropre   rd   receptive_fieldsteprB   ry   maximumroundastypeintminimumr   r~   r}   rC   
num_frameshparamssample_ratezerosuint8	enumerater   r   r   names)r+   rb   r   rB   r   label_scopelabel_scope_keychunksample_r   chunk_annotationsr   halfrd   	start_idxre   end_idxr   
num_labelsr   r   mappingr   mapped_labelr.   r   r/   prepare_chunkO  sJ   




zPixIT.prepare_chunkreturnc              
   C   s(   t t| | j| j| jdt| jdddS )zwValidation data loader

        Returns
        -------
        DataLoader
            Validation data loader.
        Tr^   )r   )rH   rI   rJ   	drop_last
collate_fn)r    r&   rH   rI   rJ   r   r   r2   r.   r.   r/   val_dataloader  s   zPixIT.val_dataloaderc                 #   s    t  j}t dd}|du r |}n't }t fdd|D D ]}dd t||D } j|fi |||< q%	 |durI||| }t	|V  q>)aX  Iterate over validation samples

        Yields
        ------
        dict:
            X: (time, channel)
                Audio chunks.
            y: (frame, )
                Frame-level targets. Note that frame < time.
                `frame` is infered automagically from the
                example model output.
            ...
        rF   Nc                    s   g | ]} j | qS r.   )metadata_unique_valuesr   r2   r.   r/   rn     r   z%PixIT.val__iter__.<locals>.<listcomp>c                 S   s   i | ]\}}||qS r.   r.   )rl   r   valuer.   r.   r/   r     r   z%PixIT.val__iter__.<locals>.<dictcomp>)
r   r   getattrval__iter__helperr   	itertoolsproductr   choicenext)r+   rngrF   chunks	subchunksr   filtersr.   r2   r/   r1     s    

zPixIT.val__iter__r   c           %      k   s   | j d d t|k}| D ]\}}|| j d | | j d | |kM }qt|d }| j d | }t|t| }	| j}
t	| dd}	 ||	
|  }| j d	 t| j d	 d
 |kd  }t|D ]}t| j d d
 |kd }t| j d d | t| j d d |  }||
|  }| j d | \}}}|||| |
 }||d ||
 k |d |k@  }tt|d }|t|d | }|jdkr| |||
}tjj||d}| j d | \}}}|||| |
 }| |||
}|d j|d j }t|| jkr|V  |V  qk|d d |d d gg}|D ]&\}}}}}}|d }||d krJt|d ||d< q,|||g q,g }d}| j d d |d  } |D ]H}!|!d | j d d ||  | j d d ||   kr|d7 }| j d d ||  } |!d |  |
kr|| |!d |!d |  f |!d } qdg d}"tj||"d}t|d dkr| |||
}|d t|d  }#tjj||#d}!|!\}}}||||
 }$| ||$|
}|d j|d j }t|| jkr|V  |V  qkqK)ab  Iterate over samples with optional domain filtering

        Mixtures are paired so that they have no speakers in common, come from the
        same file, and the combined number of speakers is no greater than
        max_speaker_per_chunk.

        Parameters
        ----------
        rng : random.Random
            Random number generator
        filters : dict, optional
            When provided (as {key: value} dict), filter files so that
            only files such as file[key] == value are used for generating chunks.

        Yields
        ------
        chunk : dict
            Chunks.
        r\   r]   r   r   zaudio-annotatednum_chunks_per_filerg   Tra   rb   rc   rB   rd   re   rf   )pr   ))rd   f)re   r   )rB   r   r   )rv   rw   rx   r   ry   rz   r   r   rB   r   searchsortedrandomr   uniformr   r~   isinsizer   r   r   r}   rC   r   r|   r   )%r+   splitr   r   split_filesr   r   file_idsannotated_durationcum_prob_annotated_durationrB   r   rb   r   r   annotated_region_indices#cum_prob_annotated_regions_durationannotated_region_indexregion_durationrd   r   r   previous_speaker_labelsrepeated_speaker_annotationsfirst_chunksecond_chunkr   merged_repeated_segmentsre   previoussegments_without_repeatcurrent_region_indexprevious_timesegmentr   prob_segments_durationnew_start_timer.   r.   r/   common__iter__helper  s  






 zPixIT.common__iter__helperc                 K      | j d|fi |S )a  Iterate over validation samples with optional domain filtering

        Parameters
        ----------
        rng : random.Random
            Random number generator
        filters : dict, optional
            When provided (as {key: value} dict), filter validation files so that
            only files such as file[key] == value are used for generating chunks.

        Yields
        ------
        chunk : dict
            validation chunks.
        developmentr  r+   r   r   r.   r.   r/   r        zPixIT.val__iter__helperc                 K   r  )a  Iterate over training samples with optional domain filtering

        Parameters
        ----------
        rng : random.Random
            Random number generator
        filters : dict, optional
            When provided (as {key: value} dict), filter training files so that
            only files such as file[key] == value are used for generating chunks.

        Yields
        ------
        chunk : dict
            Training chunks.
        r^   r  r  r.   r.   r/   train__iter__helper  r  zPixIT.train__iter__helperr^   c                 C   sb   |  |}| |}| |}| jj|dkd | j|| jjj|dd}|j	|j
d|dS )a  Collate function used for most segmentation tasks

        This function does the following:
        * stack waveforms into a (batch_size, num_channels, num_samples) tensor batch["X"])
        * apply augmentation when in "train" stage
        * convert targets into a (batch_size, num_frames, num_classes) tensor batch["y"]
        * collate any other keys that might be present in the batch using pytorch default_collate function

        Parameters
        ----------
        batch : list of dict
            List of training samples.

        Returns
        -------
        batch : dict
            Collated batch as {"X": torch.Tensor, "y": torch.Tensor} dict.
        r^   moderg   )samplesr   targets)r   r   r   )	collate_X	collate_ycollate_metarK   r^   r   r   r   	unsqueezer  r  squeeze)r+   batchr   
collated_X
collated_ycollated_meta	augmentedr.   r.   r/   r     s   



zPixIT.collate_fnc                 C   s   g }|D ]I}|d j }t|d j}|| jkr2tjtj|dd dd}|dd|d| j f }n|| jk rGtj|dd| j| ffdd}n	 || qt	
t|S )a  

        Parameters
        ----------
        batch : list
            List of samples to collate.
            "y" field is expected to be a SlidingWindowFeature.

        Returns
        -------
        y : torch.Tensor
            Collated target tensor of shape (num_frames, self.max_speakers_per_chunk)
            If one chunk has more than `self.max_speakers_per_chunk` speakers, we keep
            the max_speakers_per_chunk most talkative ones. If it has less, we pad with
            zeros (artificial inactive speakers).
        r   r   )axisN)r   r   constantr
  )datar}   r   rC   ry   r   r   padr|   torch
from_numpystack)r+   r  r  br   r   indicesr.   r.   r/   r    s    


zPixIT.collate_ypermutated_predictiontargetc                 C   s   t || |d}|S )a  Permutation-invariant segmentation loss

        Parameters
        ----------
        permutated_prediction : (batch_size, num_frames, num_classes) torch.Tensor
            Permutated speaker activity predictions.
        target : (batch_size, num_frames, num_speakers) torch.Tensor
            Speaker activity.
        weight : (batch_size, num_frames, 1) torch.Tensor, optional
            Frames weight.

        Returns
        -------
        seg_loss : torch.Tensor
            Permutation-invariant segmentation loss
        rG   )r   float)r+   r!  r"  rG   seg_lossr.   r.   r/   segmentation_loss  s   
zPixIT.segmentation_lossc                 C   s  |j d }|| }|jdddkjdd}|jdddkjdd}g }	t|D ]S}
tj||
 dd||
 jdddkf ||
 dd||
 jdddkf fdd}|j d ||
  ||
  }tj|j d |f|jd}tj||fdd}|	| q't|	}	||	||fS )a  
        Creates mixtures of mixtures and corresponding diarization targets.
        Keeps track of how many speakers came from each mixture in order to
        reconstruct the original mixtures.

        Parameters
        ----------
        mix1 : torch.Tensor
            First mixture.
        mix2 : torch.Tensor
            Second mixture.
        target1 : torch.Tensor
            First mixture diarization targets.
        target2 : torch.Tensor
            Second mixture diarization targets.

        Returns
        -------
        mom : torch.Tensor
            Mixtures of mixtures.
        targets : torch.Tensor
            Diarization targets for mixtures of mixtures.
        num_active_speakers_mix1 : torch.Tensor
            Number of active speakers in the first mixture.
        num_active_speakers_mix2 : torch.Tensor
            Number of active speakers in the second mixture.
        r   rg   dimNrR   device)	shaper   r   r  catr   r*  r|   r  )r+   mix1mix2target1target2rH   momnum_active_speakers_mix1num_active_speakers_mix2r  rm   r"  padding_dimpadding_tensorr.   r.   r/   create_mixtures_of_mixtures3  s2   
""
z!PixIT.create_mixtures_of_mixturesc              	   C   sp  |d }|d }|j d }|dk rdS |d dkr|dd }|ddd d}|ddd d}| |||ddd |ddd \}}}	}	tj|ddd |ddd |fdd}| tj|||fdd\}
}||d }|
j \}}}	t| d	d}||tj||d| jj	d
}t
||
\}}	| j|||d}| |ddt||fdd }|||
||fS )a8  Common step for training and validation

        Parameters
        ----------
        batch : dict of torch.Tensor
            Current batch.

        Returns
        -------
        seg_loss : torch.Tensor
            Segmentation loss.
        separation_loss : torch.Tensor
            Separation loss.
        diarization : torch.Tensor
            Diarization predictions.
        permutated_diarization : torch.Tensor
            Permutated diarization predictions that minizimise seg_loss.
        target : torch.Tensor
            Diarization target.
        r   r   r   rR   Nr   rg   r'  rG   r)  r#  )r+  r  r6  r  r,  r   r   getonesr*  r   r&  r[   	transposer  mean)r+   r  r"  waveformbszr-  r.  r1  
mom_targetr   diarizationsourcesmom_sourcesrH   r   
weight_keyrG   permutated_diarizationr%  separation_lossr.   r.   r/   common_stepj  sJ   
"(zPixIT.common_step	batch_idxc                 C   s   |  |\}}}}}| jjd|ddddd | jjd|ddddd d| j | | j|  }t|r5dS | jjd|ddddd | jsv| j }	t|	t	rP|	n|	g}	|	D ]}
|

  qU| j| |	D ]}
| jj|
| jjd	d
 |
  qdd|iS )a(  Compute PixIT loss for training

        Parameters
        ----------
        batch : (usually) dict of torch.Tensor
            Current batch.
        batch_idx: int
            Batch index.

        Returns
        -------
        loss : {str: torch.tensor}
            {"loss": loss}
        zloss/train/separationFTon_stepon_epochprog_barloggerzloss/train/segmentationrg   Nz
loss/trainnorm)gradient_clip_valgradient_clip_algorithmrN   )rD  r   logrQ   r  isnanautomatic_optimization
optimizersrV   r   	zero_gradmanual_backwardclip_gradientsrL  r   )r+   r  rE  r%  rC  r>  rB  r"  rN   rQ  	optimizerr.   r.   r/   training_step  sh   	





zPixIT.training_stepc                 C   s   t  t t t t dS )z1Returns diarization error rate and its components)DiarizationErrorRatezDiarizationErrorRate/ThresholdzDiarizationErrorRate/ConfusionzDiarizationErrorRate/MisszDiarizationErrorRate/FalseAlarm)r   r   r   r   r   r2   r.   r.   r/   default_metric  s   zPixIT.default_metricc                 C   s  |  |\}}}}}| jjd|ddddd | jjd|ddddd d| j | | j|  }| jjd|ddddd | jt|ddt|dd | jj| jjddddd | jjd	ksmt	
| jjd d	ksm|d	krod
S |   }	|  }
t| jd}t	t	|}t	|| }tjd| |ddd\}}tj|	|	d	k< t|	jdkr|	d
d
d
d
tjf }	|	t|	jd 9 }	t|D ]d}|| }|| }||d d	 |f }|	| }|| |d	t| |d|jd  | d |  d ||d d |f }|
| }|| |dd |d	t| | d qt!  | jj"D ]+}t#|t$rH|j%&d|| jj q4t#|t'r^|j%j(|j)|d| jj dd q4t*| d
S )zCompute validation loss and metric

        Parameters
        ----------
        batch : dict of torch.Tensor
            Current batch.
        batch_idx: int
            Batch index.
        zloss/val/separationFTrF  zloss/val/segmentationrg   zloss/valrR   r   N	   )      )nrowsncolsfigsizer  r   gg?r  samples_epochz.png)run_idfigureartifact_file)+rD  r   rN  rQ   validation_metricr  r9  log_dictcurrent_epochmathlog2r$  cpunumpyminrH   ceilsqrtpltsubplotsry   nanr}   r+  newaxisr{   r   plotset_xlimset_ylim	get_xaxisset_visible	get_yaxistight_layoutloggersrV   r   
experiment
add_figurer
   
log_figurer`  close)r+   r  rE  r%  rC  r>  rB  r"  rN   r   y_prednum_samplesr\  r]  figaxes
sample_idxrow_idxcol_idxax_refsample_yax_hypsample_y_predrJ  r.   r.   r/   validation_step  s   	
	



zPixIT.validation_step)Nr=   NNFNNr>   NFNNNNr?   r(   )r^   )'r7   r8   r9   r:   r   r   r	   strr$  r   boolr   r   r"   r#   r   r   r*   ru   r   r    r   r1   r   Randomr  r   r	  r   r  Tensorr  r&  r6  rD  rV  rX  r  r;   r.   r.   r,   r/   r<   ]   s    ?
	
EpW( 9
+2
7OM
r<   )Gr   rf  r   rX   collectionsr   	functoolsr   typingr   r   r   r   r   r	   ri  ry   r  lightning.pytorch.loggersr
   r   
matplotlibr   rm  pyannote.audio.core.taskr   r   r   (pyannote.audio.tasks.segmentation.mixinsr   r   pyannote.audio.torchmetricsr   r   r   r   r   pyannote.audio.utils.lossr    pyannote.audio.utils.permutationr   pyannote.audio.utils.randomr   pyannote.corer   r   pyannote.database.protocolr   #pyannote.database.protocol.protocolr   r   rich.progressr   torch.utils.datar    r!   /torch_audiomentations.core.transforms_interfacer"   torchmetricsr#   asteroid.lossesr$   r%   rT   rU   r   __args__rw   r   r&   r<   r.   r.   r.   r/   <module>   sD    

