o
    9wiq                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
mZ ddlZddlZddlmZ ddlmZmZmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ dd	l m!Z! dd
l"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+ dde,fddZ-G dd de(eZ.dS )zSpeech separation pipelines    N)CallableOptionalTextTupleUnion)	rearrange)
AnnotationSlidingWindowSlidingWindowFeature)GreedyDiarizationErrorRate)Categorical	ParamDictUniform)Audio	InferenceModelPipeline)	AudioFile)
Clustering)PretrainedSpeakerEmbedding)PipelineModelSpeakerDiarizationMixin	get_model)binarize    
batch_sizec                 C   s   t | g| }tj|d|iS )zBatchify iterable	fillvalue)iter	itertoolszip_longest)iterabler   r   args r"   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/pipelines/speech_separation.pybatchify3   s   r$   c                       sN  e Zd ZdZ									d3ded	ed
ededededede	e
 deedf f fddZedefddZejdefddZdd Zdd Zedd Z	d4deeef fddZ		d5d ed!ed"e	e fd#d$Zd%ed&ejd'edefd(d)Z					d6d*ed+e	e d,e	e d-e	e d.ed"e	e defd/d0Zdefd1d2Z   Z!S )7SpeechSeparationuJ  Speech separation pipeline

    Parameters
    ----------
    segmentation : Model, str, or dict, optional
        Pretrained segmentation model and separation model.
        See pyannote.audio.pipelines.utils.get_model for supported format.
    segmentation_step: float, optional
        The segmentation model is applied on a window sliding over the whole audio file.
        `segmentation_step` controls the step of this window, provided as a ratio of its
        duration. Defaults to 0.1 (i.e. 90% overlap between two consecuive windows).
    embedding : Model, str, or dict, optional
        Pretrained embedding model. Defaults to "pyannote/embedding@2022.07".
        See pyannote.audio.pipelines.utils.get_model for supported format.
    embedding_exclude_overlap : bool, optional
        Exclude overlapping speech regions when extracting embeddings.
        Defaults (False) to use the whole speech.
    clustering : str, optional
        Clustering algorithm. See pyannote.audio.pipelines.clustering.Clustering
        for available options. Defaults to "AgglomerativeClustering".
    segmentation_batch_size : int, optional
        Batch size used for speaker segmentation. Defaults to 1.
    embedding_batch_size : int, optional
        Batch size used for speaker embedding. Defaults to 1.
    der_variant : dict, optional
        Optimize for a variant of diarization error rate.
        Defaults to {"collar": 0.0, "skip_overlap": False}. This is used in `get_metric`
        when instantiating the metric: GreedyDiarizationErrorRate(**der_variant).
    use_auth_token : str, optional
        When loading private huggingface.co models, set `use_auth_token`
        to True or to a string containing your hugginface.co authentication
        token that can be obtained by running `huggingface-cli login`

    Usage
    -----
    >>> pipeline = SpeakerDiarization()
    >>> diarization, separation = pipeline("/path/to/audio.wav")
    >>> diarization, separation = pipeline("/path/to/audio.wav", num_speakers=4)
    >>> diarization, separation = pipeline("/path/to/audio.wav", min_speakers=2, max_speakers=10)

    Hyper-parameters
    ----------------
    segmentation.min_duration_off : float
        Fill intra-speaker gaps shorter than that many seconds.
    segmentation.threshold : float
        Mark speaker has active when their probability is higher than this.
    clustering.method : {'centroid', 'average', ...}
        Linkage used for agglomerative clustering
    clustering.min_cluster_size : int
        Minium cluster size.
    clustering.threshold : float
        Clustering threshold used to stop merging clusters.
    separation.leakage_removal : bool
        Zero-out sources when speaker is inactive.
    separation.asr_collar
        When using leakage removal, keep that many seconds before and after each speaker turn

    References
    ----------
    Joonas Kalda, Clément Pagés, Ricard Marxer, Tanel Alumäe, and Hervé Bredin.
    "PixIT: Joint Training of Speaker Diarization and Speech Separation
    from Real-world Multi-speaker Recordings"
    Odyssey 2024. https://arxiv.org/abs/2403.02288
    N皙?Jspeechbrain/spkrec-ecapa-voxceleb@5c0be3875fda05e81f3c004ed8c7c06be308de1eFAgglomerativeClustering   segmentationsegmentation_step	embeddingembedding_exclude_overlap
clusteringembedding_batch_sizesegmentation_batch_sizeder_variantuse_auth_tokenc
              	      sN  t    || _t||	d}
|| _|| _|| _|| _|| _|p#ddd| _	|
j
d j}t|
|| j| d|d| _| jjj
d jrJttddd	| _nttd
dtddd| _| jdkr_d}nt| j|	d| _t| jjdd| _| jj}zt| }W n ty   tddttj dw |j|d| _ tt!ddgtddd| _"d S )N)r2           F)collarskip_overlapr   T)durationstepskip_aggregationr         ?)min_duration_offr&   g?)	thresholdr:   OracleClusteringnot_applicabledownmix)sample_ratemonozclustering must be one of [, ])metric)leakage_removal
asr_collar)#super__init__segmentation_modelr   r+   r,   r/   r-   
klusteringr1   specificationsr6   r   _segmentationmodelpowersetr   r   r*   r   
_embeddingr   r?   _audiorC   r   KeyError
ValueErrorjoinlist__members__valuer.   r   
separation)selfr*   r+   r,   r-   r.   r/   r0   r1   r2   rL   segmentation_durationrC   
Klustering	__class__r"   r#   rG   |   sX   



zSpeechSeparation.__init__returnc                 C   s   | j jS NrK   r   rW   r"   r"   r#   r0      s   z(SpeechSeparation.segmentation_batch_sizer   c                 C   s   || j _d S r]   r^   )rW   r   r"   r"   r#   r0      s   c                 C   s   t  r]   )NotImplementedErrorr_   r"   r"   r#   default_parameters   s   z#SpeechSeparation.default_parametersc                 c   s     d}	 d|dV  |d7 }q)Nr   TSPEAKER_02dr)   r"   )rW   speakerr"   r"   r#   classes   s   zSpeechSeparation.classesc                 C   s   dS )Nztraining_cache/segmentationr"   r_   r"   r"   r#   CACHED_SEGMENTATION   s   z$SpeechSeparation.CACHED_SEGMENTATIONc                 C   sz   |durt |dd}| jr0| j|v r|| j \}}||fS | j||d\}}||| j< ||fS | j||d\}}||fS )aI  Apply segmentation model

        Parameter
        ---------
        file : AudioFile
        hook : Optional[Callable]

        Returns
        -------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
        separations : (num_chunks, num_samples, num_speakers) SlidingWindowFeature
        Nr*   hook)	functoolspartialtrainingrf   rK   )rW   filerh   segmentationsseparationsr"   r"   r#   get_segmentations   s   

z"SpeechSeparation.get_segmentationsbinary_segmentationsexclude_overlaprh   c                    s  j r"dt }d|v r"jjjd js|d jjkr"|d S  j	j
 jj\}}}|rYjj}	jj }
t||	 |
 dtj jddddk  }t j|  j	n	d	t j j	 fd
d}t| jdd}t|| j }g }|dur|dd|dd t|dD ]1\}}ttdd | \}}t|}t|}j||d}|| |dur|d|||d qt|}t|d|d}j rjjjd jrd|id< |S jj|dd< |S )a  Extract embeddings for each (chunk, speaker) pair

        Parameters
        ----------
        file : AudioFile
        binary_segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Binarized segmentation.
        exclude_overlap : bool, optional
            Exclude overlapping speech regions when extracting embeddings.
            In case non-overlapping speech is too short, use the whole speech.
        hook: Optional[Callable]
            Called during embeddings after every batch to report the progress

        Returns
        -------
        embeddings : (num_chunks, num_speakers, dimension) array
        ztraining_cache/embeddings
embeddingsr   segmentation.thresholdr9      T)axiskeepdimsc                  3   s    t  D ]N\\} }\}}jj| dd\}}tj|ddtj}tj|ddtj}t |j|jD ]\}}t|krE|}n|}|d  t	
|d  fV  q7qd S )Npad)r6   moder3   )nan)ziprO   cropnp
nan_to_numastypefloat32Tsumtorch
from_numpy)chunkmasks_clean_maskswaveformspeaker_activation_with_context
clean_mask	used_maskrp   clean_segmentationsr6   rl   min_num_framesrW   r"   r#   iter_waveform_and_mask8  s*   
	z?SpeechSeparation.get_embeddings.<locals>.iter_waveform_and_mask)NN)r   r   N)total	completedr)   c                 S   s   | d d uS )Nr   r"   )br"   r"   r#   <lambda>g  s    z1SpeechSeparation.get_embeddings.<locals>.<lambda>)r   z(c s) d -> c s d)c)rs   rr   )rk   getdictrK   rL   rJ   rM   r*   r;   sliding_windowr6   datashaperN   min_num_samplesr?   mathceilr}   r   r
   r$   r/   	enumerater{   filterr   vstackappendr   )rW   rl   rp   rq   rh   cache
num_chunks
num_framesnum_speakersr   num_samplesclean_framesr   batchesbatch_countembedding_batchesibatch	waveformsr   waveform_batch
mask_batchembedding_batchrr   r"   r   r#   get_embeddings   sp   !



	
zSpeechSeparation.get_embeddingsrm   hard_clusterscountc                 C   s   |j j\}}}t|d }tjt|||f }tt||D ]+\}	\}
\}}t|
D ]}|dkr4q-tj|dd|
|kf dd||	dd|f< q-q t	||j
}|S )a;  Build final discrete diarization out of clustered segmentation

        Parameters
        ----------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Raw speaker segmentation.
        hard_clusters : (num_chunks, num_speakers) array
            Output of clustering step.
        count : (total_num_frames, 1) SlidingWindowFeature
            Instantaneous number of active speakers.

        Returns
        -------
        discrete_diarization : SlidingWindowFeature
            Discrete (0s and 1s) diarization.
        r)   Nru   )r   r   r}   maxrz   zerosr   r{   uniquer
   r   to_diarization)rW   rm   r   r   r   r   local_num_speakersnum_clustersclustered_segmentationsr   clusterr   r*   kr"   r"   r#   reconstruct  s&   	zSpeechSeparation.reconstructrl   r   min_speakersmax_speakersreturn_embeddingsc                    s  | j ||d}| j|||d\}}}| j||d\}}|d| |d| | jjjd jr0|}	n	t|| jj	dd}	| j
|	| jjjdd	}
|d
|
 t|
jdkrjt|d d}|rf|dtd| jjffS |dfS | jdkrt|std}n| j||	| j|d}|d| | j||	||||| jjjd\}}}t|d }||k s||krttd| d| d| d| d	 t|
j|tj|
_tj|	jdddk}d||< |  |||
}| !||
}|d| |  |||
}|j"j#|jj$d  }t%|d| d}t&j'||dddd}| j(j)rt*| jj+| j(j,| j-j.   dkrt/|jj$d D ]p}|jj0| }t1|dkd t1t2d  kd } fd d!|D }d  krft3dd   g| }d" |j$d   k r|t3d"   |j$d g }tj4t5|t6d#}d|t7|< ||jj0|< q)|jj$d }|j|8|jddd|f  |_| j9|d| jj:d$}|d |_;d%|v r|d% r| j<|d% |dd&\}fd'd(|= D nd)d( t>|= | ? D |j@d*}|s||fS |du r||dfS t5|= |j$d kr'tA|dt5|= |j$d  fd+f}d,d( B D |fd-d!|= D  }|||fS ).a  Apply speaker diarization

        Parameters
        ----------
        file : AudioFile
            Processed file.
        num_speakers : int, optional
            Number of speakers, when known.
        min_speakers : int, optional
            Minimum number of speakers. Has no effect when `num_speakers` is provided.
        max_speakers : int, optional
            Maximum number of speakers. Has no effect when `num_speakers` is provided.
        return_embeddings : bool, optional
            Return representative speaker embeddings.
        hook : callable, optional
            Callback called after each major steps of the pipeline as follows:
                hook(step_name,      # human-readable name of current step
                     step_artefact,  # artifact generated by current step
                     file=file)      # file being processed
            Time-consuming steps call `hook` multiple times with the same `step_name`
            and additional `completed` and `total` keyword arguments usable to track
            progress of current step.

        Returns
        -------
        diarization : Annotation
            Speaker diarization
        sources : SlidingWindowFeature
            Separated sources
        embeddings : np.array, optional
            Representative speaker embeddings such that `embeddings[i]` is the
            speaker embedding for i-th speaker in diarization.labels().
            Only returned when `return_embeddings` is True.
        rg   )r   r   r   r*   rn   r   F)onsetinitial_state)r3   r3   )warm_upspeaker_countingr3   uri)r   Nr<   )rq   rh   rr   )rr   rm   r   min_clustersmax_clustersrl   framesr)   z2
                The detected number of speakers (z/) is outside
                the given bounds [rA   zS]. This can happen if the
                given audio file is too short to contain zh or more speakers.
                Try to lower the desired minimal number of speakers.
                r   r   discrete_diarizationrt   )r7   r6   T)r   hammingmissingskip_averagec                    s,   g | ]}t |   |d     qS )r)   )r}   arange).0gap)asr_collar_frames
non_silentr"   r#   
<listcomp>o  s    
z*SpeechSeparation.apply.<locals>.<listcomp>rw   )dtype)min_duration_onr:   
annotation)return_mappingc                    s   i | ]	}|  ||qS r"   )r   )r   keymappingr"   r#   
<dictcomp>  s    z*SpeechSeparation.apply.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r"   r"   )r   labelexpected_labelr"   r"   r#   r     s    r   )r   r   c                 S   s   i | ]\}}||qS r"   r"   )r   indexr   r"   r"   r#   r     s    c                    s   g | ]} | qS r"   r"   )r   r   )inverse_mappingr"   r#   r     s    )C
setup_hookset_num_speakersro   rK   rL   rJ   rM   r   r*   r;   speaker_countreceptive_fieldr}   nanmaxr   r   r   rN   	dimensionrI   r   r-   r.   r   warningswarntextwrapdedentminimumr   int8r   r   r   r   r6   r   r	   r   	aggregaterV   rD   intr   rE   rO   r?   ranger   wherediffr   oneslenfloatconcatenatealignto_annotationr:   r   optimal_mappinglabelsr{   re   rename_labelsrx   items)rW   rl   r   r   r   r   rh   rm   rn   binarized_segmentationsr   diarizationrr   r   r   	centroidsnum_different_speakersinactive_speakersr   clustered_separationsframe_durationr   sourcesr   speaker_activationremaining_gapsremaining_zerosr   num_sourcesr"   )r   r   r   r   r#   apply  s(  -





	

 





zSpeechSeparation.applyc                 C   s   t di | jS )Nr"   )r   r1   r_   r"   r"   r#   
get_metric  s   zSpeechSeparation.get_metric)	Nr&   r'   Fr(   r)   r)   NNr]   )FN)NNNFN)"__name__
__module____qualname____doc__r   r   boolstrr   r   r   r   r   rG   propertyr0   setterra   re   rf   r   r
   ro   r   r   r}   ndarrayr   r   r   r  r   r	  __classcell__r"   r"   rZ   r#   r%   :   s    C	

F


"
 
5
  r%   )r   N)/r  ri   r   r   r   r   typingr   r   r   r   r   numpyr}   r   einopsr   pyannote.corer   r	   r
   pyannote.metrics.diarizationr   pyannote.pipeline.parameterr   r   r   pyannote.audior   r   r   r   pyannote.audio.core.ior   #pyannote.audio.pipelines.clusteringr   -pyannote.audio.pipelines.speaker_verificationr   pyannote.audio.pipelines.utilsr   r   r   pyannote.audio.utils.signalr   r   r$   r%   r"   r"   r"   r#   <module>   s*   