o
    pis                     @   s>  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZ ddlZddlZddlmZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/ ddl0m1Z1 dde2fddZ3G dd de eZ4dS )zSpeech separation pipelines    N)Path)CallableOptionalTextTupleUnion)	rearrange)Audio	InferenceModelPipeline)	AudioFile)
Clustering)PretrainedSpeakerEmbedding)PipelineModelSpeakerDiarizationMixin	get_model)set_num_speakers)binarize)
AnnotationSlidingWindowSlidingWindowFeature)GreedyDiarizationErrorRate)Categorical	ParamDictUniform)binary_dilation    
batch_sizec                 C   s   t | g| }tj|d|iS )zBatchify iterable	fillvalue)iter	itertoolszip_longest)iterabler   r   args r%   ^/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/pipelines/speech_separation.pybatchify6   s   r'   c                       s^  e Zd ZdZ										d5d	ed
edededededede	e
 deedf deeedf f fddZedefddZejdefddZdd Zdd Zedd Z	d6deeef fd d!Z		d7d"ed#ed$e	e fd%d&Zd'ed(ejd)edefd*d+Z					d8d,ed-e	e d.e	e d/e	e d0ed$e	e defd1d2Zde fd3d4Z!  Z"S )9SpeechSeparationu>  Speech separation pipeline

    Parameters
    ----------
    segmentation : Model, str, or dict, optional
        Pretrained segmentation model and separation model.
        See pyannote.audio.pipelines.utils.get_model for supported format.
    segmentation_step: float, optional
        The segmentation model is applied on a window sliding over the whole audio file.
        `segmentation_step` controls the step of this window, provided as a ratio of its
        duration. Defaults to 0.1 (i.e. 90% overlap between two consecuive windows).
    embedding : Model, str, or dict, optional
        Pretrained embedding model. Defaults to "speechbrain/spkrec-ecapa-voxceleb@5c0be38".
        See pyannote.audio.pipelines.utils.get_model for supported format.
    embedding_exclude_overlap : bool, optional
        Exclude overlapping speech regions when extracting embeddings.
        Defaults (False) to use the whole speech.
    clustering : str, optional
        Clustering algorithm. See pyannote.audio.pipelines.clustering.Clustering
        for available options. Defaults to "AgglomerativeClustering".
    segmentation_batch_size : int, optional
        Batch size used for speaker segmentation. Defaults to 1.
    embedding_batch_size : int, optional
        Batch size used for speaker embedding. Defaults to 1.
    der_variant : dict, optional
        Optimize for a variant of diarization error rate.
        Defaults to {"collar": 0.0, "skip_overlap": False}. This is used in `get_metric`
        when instantiating the metric: GreedyDiarizationErrorRate(**der_variant).
    token : str or bool, optional
        Huggingface token to be used for downloading from Huggingface hub.
    cache_dir: Path or str, optional
        Path to the folder where files downloaded from Huggingface hub are stored.

    Usage
    -----
    >>> pipeline = SpeechSeparation()
    >>> diarization, separation = pipeline("/path/to/audio.wav")
    >>> diarization, separation = pipeline("/path/to/audio.wav", num_speakers=4)
    >>> diarization, separation = pipeline("/path/to/audio.wav", min_speakers=2, max_speakers=10)

    Hyper-parameters
    ----------------
    segmentation.min_duration_off : float
        Fill intra-speaker gaps shorter than that many seconds.
    segmentation.threshold : float
        Mark speaker has active when their probability is higher than this.
    clustering.method : {'centroid', 'average', ...}
        Linkage used for agglomerative clustering
    clustering.min_cluster_size : int
        Minium cluster size.
    clustering.threshold : float
        Clustering threshold used to stop merging clusters.
    separation.leakage_removal : bool
        Zero-out sources when speaker is inactive.
    separation.asr_collar
        When using leakage removal, keep that many seconds before and after each speaker turn

    References
    ----------
    Joonas Kalda, Clément Pagés, Ricard Marxer, Tanel Alumäe, and Hervé Bredin.
    "PixIT: Joint Training of Speaker Diarization and Speech Separation
    from Real-world Multi-speaker Recordings"
    Odyssey 2024. https://arxiv.org/abs/2403.02288
    pyannote/separation-ami-1.0皙?Jspeechbrain/spkrec-ecapa-voxceleb@5c0be3875fda05e81f3c004ed8c7c06be308de1eFAgglomerativeClustering   Nsegmentationsegmentation_step	embeddingembedding_exclude_overlap
clusteringembedding_batch_sizesegmentation_batch_sizeder_varianttoken	cache_dirc              	      sR  t    || _t||	|
d}|| _|| _|| _|| _|| _|p$ddd| _	|j
d j}t||| j| d|d| _| jjj
d jrKttddd	| _nttd
dtddd| _| jdkr`d}nt| j|	|
d| _t| jjdd| _| jj}zt| }W n ty   tddttj dw |j|d| _ tt!ddgtddd| _"d S )N)r6   r7           F)collarskip_overlapr   T)durationstepskip_aggregationr         ?)min_duration_offr*   g?)	thresholdr?   OracleClusteringnot_applicabledownmix)sample_ratemonozclustering must be one of [, ])metric)leakage_removal
asr_collar)#super__init__segmentation_modelr   r/   r0   r3   r1   
klusteringr5   specificationsr;   r
   _segmentationmodelpowersetr   r   r.   r   
_embeddingr	   rD   _audiorH   r   KeyError
ValueErrorjoinlist__members__valuer2   r   
separation)selfr.   r/   r0   r1   r2   r3   r4   r5   r6   r7   rQ   segmentation_durationrH   
Klustering	__class__r%   r&   rL      sX   



zSpeechSeparation.__init__returnc                 C   s   | j jS NrP   r   r\   r%   r%   r&   r4      s   z(SpeechSeparation.segmentation_batch_sizer   c                 C   s   || j _d S rb   rc   )r\   r   r%   r%   r&   r4      s   c                 C   s   t  rb   )NotImplementedErrorrd   r%   r%   r&   default_parameters   s   z#SpeechSeparation.default_parametersc                 c   s     d}	 d|dV  |d7 }q)Nr   TSPEAKER_02dr-   r%   )r\   speakerr%   r%   r&   classes   s   zSpeechSeparation.classesc                 C   s   dS )Nztraining_cache/segmentationr%   rd   r%   r%   r&   CACHED_SEGMENTATION   s   z$SpeechSeparation.CACHED_SEGMENTATIONc                 C   s~   |durt |dd}| jr2| j|v r|| j \}}||fS | j||d\}}||f|| j< ||fS | j||d\}}||fS )aI  Apply segmentation model

        Parameter
        ---------
        file : AudioFile
        hook : Optional[Callable]

        Returns
        -------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
        separations : (num_chunks, num_samples, num_speakers) SlidingWindowFeature
        Nr.   hook)	functoolspartialtrainingrk   rP   )r\   filerm   segmentationsseparationsr%   r%   r&   get_segmentations   s   
z"SpeechSeparation.get_segmentationsbinary_segmentationsexclude_overlaprm   c                    s  j r"dt }d|v r"jjjd js|d jjkr"|d S  j	j
} jj\}}}	|rYjj}
|jj }t||
 | dtj jddddk  }t j|  j	n	d	t j j	 fd
d}t| jdd}t||	 j }g }|dur|dd|dd t|dD ]1\}}ttdd | \}}t|}t|}j||d}|| |dur|d|||d qt|}t|d|d}j rjjjd jrd|id< |S jj|dd< |S )a  Extract embeddings for each (chunk, speaker) pair

        Parameters
        ----------
        file : AudioFile
        binary_segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Binarized segmentation.
        exclude_overlap : bool, optional
            Exclude overlapping speech regions when extracting embeddings.
            In case non-overlapping speech is too short, use the whole speech.
        hook: Optional[Callable]
            Called during embeddings after every batch to report the progress

        Returns
        -------
        embeddings : (num_chunks, num_speakers, dimension) array
        ztraining_cache/embeddings
embeddingsr   segmentation.thresholdr>      Taxiskeepdimsc                  3   s    t  D ]M\\} }\}}jj| dd\}}tj|ddtj}tj|ddtj}t |j|jD ]\}}t|krD|}n|}|d  t	
|d  fV  q6qd S )Npad)moder8   )nan)ziprT   cropnp
nan_to_numastypefloat32Tsumtorch
from_numpy)chunkmasks_clean_maskswaveformspeaker_activation_with_context
clean_mask	used_maskru   clean_segmentationsrq   min_num_framesr\   r%   r&   iter_waveform_and_mask<  s(   
z?SpeechSeparation.get_embeddings.<locals>.iter_waveform_and_mask)NN)r   r   N)total	completedr-   c                 S   s   | d d uS )Nr   r%   )br%   r%   r&   <lambda>j  s    z1SpeechSeparation.get_embeddings.<locals>.<lambda>)r   z(c s) d -> c s d)c)rx   rw   )rp   getdictrP   rQ   rO   rR   r.   r@   sliding_windowr;   datashaperS   min_num_samplesrD   mathceilr   r   r   r'   r3   	enumerater   filterr   vstackappendr   )r\   rq   ru   rv   rm   cacher;   
num_chunks
num_framesnum_speakersr   num_samplesclean_framesr   batchesbatch_countembedding_batchesibatch	waveformsr   waveform_batch
mask_batchembedding_batchrw   r%   r   r&   get_embeddings   sp    



	
zSpeechSeparation.get_embeddingsrr   hard_clusterscountc                 C   s   |j j\}}}t|d }tjt|||f }tt||D ]+\}	\}
\}}t|
D ]}|dkr4q-tj|dd|
|kf dd||	dd|f< q-q t	||j
}|S )a;  Build final discrete diarization out of clustered segmentation

        Parameters
        ----------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Raw speaker segmentation.
        hard_clusters : (num_chunks, num_speakers) array
            Output of clustering step.
        count : (total_num_frames, 1) SlidingWindowFeature
            Instantaneous number of active speakers.

        Returns
        -------
        discrete_diarization : SlidingWindowFeature
            Discrete (0s and 1s) diarization.
        r-   Nr{   )r   r   r   maxr   zerosr   r   uniquer   r   )r\   rr   r   r   r   r   local_num_speakersnum_clustersclustered_segmentationsr   clusterr   r.   kr%   r%   r&   reconstruct  s&   	zSpeechSeparation.reconstructrq   r   min_speakersmax_speakersreturn_embeddingsc                     s  | j ||d}t|||d\}}}| j||d\}}|d| |d| | jjjd jr/|}	n	t|| jj	dd}	| j
|	| jjjdd	}
|d
|
 t|
jdkrit|d d}|re|dtd| jjffS |dfS | jdkrs|ssd}n| j||	| j|d}|d| | j||	||||| jjjd\}}}t|d }||k s||krttd| d| d| d| d	 t|
j|tj|
_tj|	jdddk}d||< |  |||
}| !||
}tj|dddk}|jdd|f |_|jj"\}}|d| |  |||
}|j#j$|jj"d  }t%|d| d}t&j'||dddd}|jj"\}}t(|jd dtd|| ff|_|jdd|f |_| j)j*rt+| jj,t-| j)j.| j/j0 }|dkrt1|j}t2|D ] }|jj3| }|dk}t4|dgd|  }|tj|j3|< qft5||j#}|j|6|j |_|jtjt7|jddd!d"  |_| j8|d| jj9d#}|d |_:d$|v r|d$ r| j;|d$ |dd%\}fd&d'|< D nd(d' t=|< | > D |j?d)}d*d' @ D  |jdd fd+d,|< D f |_|s||fS |du r||dfS tA|< |j"d kr:t(|dtA|< |j"d  fd f}| fd-d,|< D  }|||fS ).a  Apply speaker diarization

        Parameters
        ----------
        file : AudioFile
            Processed file.
        num_speakers : int, optional
            Number of speakers, when known.
        min_speakers : int, optional
            Minimum number of speakers. Has no effect when `num_speakers` is provided.
        max_speakers : int, optional
            Maximum number of speakers. Has no effect when `num_speakers` is provided.
        return_embeddings : bool, optional
            Return representative speaker embeddings.
        hook : callable, optional
            Callback called after each major steps of the pipeline as follows:
                hook(step_name,      # human-readable name of current step
                     step_artefact,  # artifact generated by current step
                     file=file)      # file being processed
            Time-consuming steps call `hook` multiple times with the same `step_name`
            and additional `completed` and `total` keyword arguments usable to track
            progress of current step.

        Returns
        -------
        diarization : Annotation
            Speaker diarization
        sources : SlidingWindowFeature
            Separated sources
        embeddings : np.array, optional
            Representative speaker embeddings such that `embeddings[i]` is the
            speaker embedding for i-th speaker in diarization.labels().
            Only returned when `return_embeddings` is True.
        rl   )r   r   r   r.   rs   r   F)onsetinitial_state)r8   r8   )warm_upspeaker_countingr8   uri)r   NrA   )rv   rm   rw   )rw   rr   r   min_clustersmax_clustersrq   framesr-   z2
                The detected number of speakers (z/) is outside
                the given bounds [rF   zS]. This can happen if the
                given audio file is too short to contain zh or more speakers.
                Try to lower the desired minimal number of speakers.
                r   r   discrete_diarizationry   )r<   r;   T)r   hammingmissingskip_average)r   r   rz   g:0yE>)min_duration_onr?   
annotation)return_mappingc                    s   i | ]	}|  ||qS r%   )r   ).0keymappingr%   r&   
<dictcomp>  s    z*SpeechSeparation.apply.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r%   r%   )r   labelexpected_labelr%   r%   r&   r     s    r   c                 S   s   i | ]\}}||qS r%   r%   )r   indexr   r%   r%   r&   r     s    c                       g | ]} | qS r%   r%   r   r   inverse_mappingr%   r&   
<listcomp>      z*SpeechSeparation.apply.<locals>.<listcomp>c                    r   r%   r%   r   r   r%   r&   r     r   )B
setup_hookr   rt   rP   rQ   rO   rR   r   r.   r@   speaker_countreceptive_fieldr   nanmaxr   r   r   rS   	dimensionrN   r   r1   r2   r   warningswarntextwrapdedentminimumr   int8r   r   to_diarizationr   r   r;   r   r
   	aggregater~   r[   rI   intr   roundrJ   rT   rD   
zeros_likeranger   r   r   alignabsto_annotationr?   r   optimal_mappinglabelsr   rj   rename_labelsitemslen) r\   rq   r   r   r   r   rm   rr   rs   binarized_segmentationsr   diarizationrw   r   r   	centroidsnum_different_speakersinactive_speakersr   active_speakersr   clustered_separationsframe_durationr   sourcesnum_sourcesasr_collar_framesdilated_speaker_activationsr   speaker_activation
non_silentdilated_non_silentr%   )r   r   r&   apply  s"  -












zSpeechSeparation.applyc                 C   s   t di | jS )Nr%   )r   r5   rd   r%   r%   r&   
get_metric  s   zSpeechSeparation.get_metric)
r)   r*   r+   Fr,   r-   r-   NNNrb   )FN)NNNFN)#__name__
__module____qualname____doc__r   floatboolstrr   r   r   r   r   r   rL   propertyr4   setterrf   rj   rk   r   r   rt   r   r   r   ndarrayr   r   r   r  r   r  __classcell__r%   r%   r_   r&   r(   =   s    C	

G


"
 
4
  r(   )r   N)5r  rn   r!   r   r   r   pathlibr   typingr   r   r   r   r   numpyr   r   einopsr   pyannote.audior	   r
   r   r   pyannote.audio.core.ior   #pyannote.audio.pipelines.clusteringr   -pyannote.audio.pipelines.speaker_verificationr   pyannote.audio.pipelines.utilsr   r   r   *pyannote.audio.pipelines.utils.diarizationr   pyannote.audio.utils.signalr   pyannote.corer   r   r   pyannote.metrics.diarizationr   pyannote.pipeline.parameterr   r   r   scipy.ndimager   r   r'   r(   r%   r%   r%   r&   <module>   s0   