o
    9wi d                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ dd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z& ddl'm(Z( dde)fddZ*G dd de%eZ+dS )zSpeaker diarization pipelines    N)CallableOptionalTextUnion)	rearrange)
AnnotationSlidingWindowFeature)GreedyDiarizationErrorRate)	ParamDictUniform)Audio	InferenceModelPipeline)	AudioFile)
Clustering)PretrainedSpeakerEmbedding)PipelineModelSpeakerDiarizationMixin	get_model)binarize    
batch_sizec                 C   s   t | g| }tj|d|iS )zBatchify iterable	fillvalue)iter	itertoolszip_longest)iterabler   r   args r   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/pyannote/audio/pipelines/speaker_diarization.pybatchify3   s   r!   c                       sD  e Zd ZdZ									d4d	ed
edededededede	e
 deedf f fddZedefddZejdefddZdd Zdd Zedd Zd5defdd Z		d6d!ed"ed#e	e fd$d%Zd&ed'ejd(edefd)d*Z					d7d+ed,e	e d-e	e d.e	e d/ed#e	e defd0d1Zdefd2d3Z  Z S )8SpeakerDiarizationan
  Speaker diarization pipeline

    Parameters
    ----------
    segmentation : Model, str, or dict, optional
        Pretrained segmentation model. Defaults to "pyannote/segmentation@2022.07".
        See pyannote.audio.pipelines.utils.get_model for supported format.
    segmentation_step: float, optional
        The segmentation model is applied on a window sliding over the whole audio file.
        `segmentation_step` controls the step of this window, provided as a ratio of its
        duration. Defaults to 0.1 (i.e. 90% overlap between two consecuive windows).
    embedding : Model, str, or dict, optional
        Pretrained embedding model. Defaults to "pyannote/embedding@2022.07".
        See pyannote.audio.pipelines.utils.get_model for supported format.
    embedding_exclude_overlap : bool, optional
        Exclude overlapping speech regions when extracting embeddings.
        Defaults (False) to use the whole speech.
    clustering : str, optional
        Clustering algorithm. See pyannote.audio.pipelines.clustering.Clustering
        for available options. Defaults to "AgglomerativeClustering".
    segmentation_batch_size : int, optional
        Batch size used for speaker segmentation. Defaults to 1.
    embedding_batch_size : int, optional
        Batch size used for speaker embedding. Defaults to 1.
    der_variant : dict, optional
        Optimize for a variant of diarization error rate.
        Defaults to {"collar": 0.0, "skip_overlap": False}. This is used in `get_metric`
        when instantiating the metric: GreedyDiarizationErrorRate(**der_variant).
    use_auth_token : str, optional
        When loading private huggingface.co models, set `use_auth_token`
        to True or to a string containing your hugginface.co authentication
        token that can be obtained by running `huggingface-cli login`

    Usage
    -----
    # perform (unconstrained) diarization
    >>> diarization = pipeline("/path/to/audio.wav")

    # perform diarization, targetting exactly 4 speakers
    >>> diarization = pipeline("/path/to/audio.wav", num_speakers=4)

    # perform diarization, with at least 2 speakers and at most 10 speakers
    >>> diarization = pipeline("/path/to/audio.wav", min_speakers=2, max_speakers=10)

    # perform diarization and get one representative embedding per speaker
    >>> diarization, embeddings = pipeline("/path/to/audio.wav", return_embeddings=True)
    >>> for s, speaker in enumerate(diarization.labels()):
    ...     # embeddings[s] is the embedding of speaker `speaker`

    Hyper-parameters
    ----------------
    segmentation.threshold
    segmentation.min_duration_off
    clustering.???
    pyannote/segmentation@2022.07皙?Jspeechbrain/spkrec-ecapa-voxceleb@5c0be3875fda05e81f3c004ed8c7c06be308de1eFAgglomerativeClustering   Nsegmentationsegmentation_step	embeddingembedding_exclude_overlap
clusteringembedding_batch_sizesegmentation_batch_sizeder_variantuse_auth_tokenc
              	      s*  t    || _t||	d}
|| _|| _|| _|| _|| _|p#ddd| _	|
j
j}t|
|| j| d|d| _| jjj
jrFttddd| _nttd	d
tddd| _| jdkr[d}nt| j|	d| _t| jjdd| _| jj}zt| }W n ty   tddttj dw |j|d| _ d S )N)r0           F)collarskip_overlapT)durationstepskip_aggregationr         ?)min_duration_offr$   g?)	thresholdr8   OracleClusteringnot_applicabledownmix)sample_ratemonozclustering must be one of [, ])metric)!super__init__segmentation_modelr   r)   r*   r-   r+   
klusteringr/   specificationsr4   r   _segmentationmodelpowersetr
   r   r(   r   
_embeddingr   r=   _audiorA   r   KeyError
ValueErrorjoinlist__members__valuer,   )selfr(   r)   r*   r+   r,   r-   r.   r/   r0   rH   segmentation_durationrA   
Klustering	__class__r   r    rC   s   sP   


zSpeakerDiarization.__init__returnc                 C   s   | j jS NrG   r   rR   r   r   r    r.      s   z*SpeakerDiarization.segmentation_batch_sizer   c                 C   s   || j _d S rX   rY   )rR   r   r   r   r    r.      s   c                 C   s   t  rX   )NotImplementedErrorrZ   r   r   r    default_parameters   s   z%SpeakerDiarization.default_parametersc                 c   s     d}	 d|dV  |d7 }q)Nr   TSPEAKER_02dr'   r   )rR   speakerr   r   r    classes   s   zSpeakerDiarization.classesc                 C   s   dS )Nztraining_cache/segmentationr   rZ   r   r   r    CACHED_SEGMENTATION   s   z&SpeakerDiarization.CACHED_SEGMENTATIONc                 C   sb   |durt |dd}| jr(| j|v r|| j }|S | j||d}||| j< |S | j||d}|S )zApply segmentation model

        Parameter
        ---------
        file : AudioFile
        hook : Optional[Callable]

        Returns
        -------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
        Nr(   hook)	functoolspartialtrainingra   rG   )rR   filerc   segmentationsr   r   r    get_segmentations   s   


z$SpeakerDiarization.get_segmentationsbinary_segmentationsexclude_overlaprc   c                    s  j r dt }d|v r jjjjs|d jjkr |d S  j	j
 jj\}}}|rWjj}	jj }
t||	 |
 dtj jddddk  }t j|  j	n	dt j j	 fd	d
}t| jdd}t|| j }g }|dur|dd|dd t|dD ]1\}}ttdd | \}}t|}t|}j||d}|| |dur|d|||d qt|}t|d|d}j rjjjjrd|id< |S jj|dd< |S )a  Extract embeddings for each (chunk, speaker) pair

        Parameters
        ----------
        file : AudioFile
        binary_segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Binarized segmentation.
        exclude_overlap : bool, optional
            Exclude overlapping speech regions when extracting embeddings.
            In case non-overlapping speech is too short, use the whole speech.
        hook: Optional[Callable]
            Called during embeddings after every batch to report the progress

        Returns
        -------
        embeddings : (num_chunks, num_speakers, dimension) array
        ztraining_cache/embeddings
embeddingssegmentation.thresholdr7      T)axiskeepdimsc                  3   s    t  D ]N\\} }\}}jj| dd\}}tj|ddtj}tj|ddtj}t |j|jD ]\}}t|krE|}n|}|d  t	
|d  fV  q7qd S )Npad)r4   moder1   )nan)ziprK   cropnp
nan_to_numastypefloat32Tsumtorch
from_numpy)chunkmasks_clean_maskswaveformmask
clean_mask	used_maskrj   clean_segmentationsr4   rg   min_num_framesrR   r   r    iter_waveform_and_mask'  s&   
	zASpeakerDiarization.get_embeddings.<locals>.iter_waveform_and_mask)NN)r   r   Nr   )total	completedr'   c                 S   s   | d d uS )Nr   r   )br   r   r    <lambda>T  s    z3SpeakerDiarization.get_embeddings.<locals>.<lambda>)r   z(c s) d -> c s d)c)rm   rl   )rf   getdictrG   rH   rF   rI   r(   r9   sliding_windowr4   datashaperJ   min_num_samplesr=   mathceilrw   r|   r   r!   r-   	enumerateru   filterr}   vstackappendr   )rR   rg   rj   rk   rc   cache
num_chunks
num_framesnum_speakersr   num_samplesclean_framesr   batchesbatch_countembedding_batchesibatch	waveformsr   waveform_batch
mask_batchembedding_batchrl   r   r   r    get_embeddings   sp   




	
z!SpeakerDiarization.get_embeddingsrh   hard_clusterscountc                 C   s   |j j\}}}t|d }tjt|||f }tt||D ]+\}	\}
\}}t|
D ]}|dkr4q-tj|dd|
|kf dd||	dd|f< q-q t	||j
}| ||S )a;  Build final discrete diarization out of clustered segmentation

        Parameters
        ----------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Raw speaker segmentation.
        hard_clusters : (num_chunks, num_speakers) array
            Output of clustering step.
        count : (total_num_frames, 1) SlidingWindowFeature
            Instantaneous number of active speakers.

        Returns
        -------
        discrete_diarization : SlidingWindowFeature
            Discrete (0s and 1s) diarization.
        r'   Nro   )r   r   rw   maxrt   zerosr   ru   uniquer   r   to_diarization)rR   rh   r   r   r   r   local_num_speakersnum_clustersclustered_segmentationsr   clusterr   r(   kr   r   r    reconstructy  s&   	zSpeakerDiarization.reconstructrg   r   min_speakersmax_speakersreturn_embeddingsc                    s  | j ||d}| j|||d\}}}| j||d}|d| |jj\}}	}
| jjjjr.|}n	t	|| j
jdd}| j|| jjjdd}|d| t|jd	kret|d
 d}|rc|td| jjffS |S |sz|dk rztj||
ftjd}d}d}n+| jdkr|sd}n| j||| j|d}|d| | j||||||| jjjd\}}}t|d }||k s||krttd| d| d| d| d	 t|j|tj|_tj |jdddk}d||< | !|||}|d| | j"|d	| j
j#d}|d
 |_$d|v r |d r | j%|d |dd \}fd!d"|& D nd#d" t'|& | ( D |j)d$}|s9|S |du rB|dfS t*|& |jd krat+|dt*|& |jd  fd%f}d&d" , D  | fd'd(|& D  }||fS ))a  Apply speaker diarization

        Parameters
        ----------
        file : AudioFile
            Processed file.
        num_speakers : int, optional
            Number of speakers, when known.
        min_speakers : int, optional
            Minimum number of speakers. Has no effect when `num_speakers` is provided.
        max_speakers : int, optional
            Maximum number of speakers. Has no effect when `num_speakers` is provided.
        return_embeddings : bool, optional
            Return representative speaker embeddings.
        hook : callable, optional
            Callback called after each major steps of the pipeline as follows:
                hook(step_name,      # human-readable name of current step
                     step_artefact,  # artifact generated by current step
                     file=file)      # file being processed
            Time-consuming steps call `hook` multiple times with the same `step_name`
            and additional `completed` and `total` keyword arguments usable to track
            progress of current step.

        Returns
        -------
        diarization : Annotation
            Speaker diarization
        embeddings : np.array, optional
            Representative speaker embeddings such that `embeddings[i]` is the
            speaker embedding for i-th speaker in diarization.labels().
            Only returned when `return_embeddings` is True.
        rb   )r   r   r   r(   F)onsetinitial_state)r1   r1   )warm_upspeaker_countingr1   uri)r   r   rn   )dtypeNr:   )rk   rc   rl   )rl   rh   r   min_clustersmax_clustersrg   framesr'   z2
                The detected number of speakers (z/) is outside
                the given bounds [r?   zS]. This can happen if the
                given audio file is too short to contain zh or more speakers.
                Try to lower the desired minimal number of speakers.
                r   r   discrete_diarization)min_duration_onr8   
annotationT)return_mappingc                    s   i | ]	}|  ||qS r   )r   ).0keymappingr   r    
<dictcomp>^  s    z,SpeakerDiarization.apply.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r   r   )r   labelexpected_labelr   r   r    r   c  s    r   )r   r   c                 S   s   i | ]\}}||qS r   r   )r   indexr   r   r   r    r     s    c                    s   g | ]} | qS r   r   )r   r   )inverse_mappingr   r    
<listcomp>  s    z,SpeakerDiarization.apply.<locals>.<listcomp>)-
setup_hookset_num_speakersri   r   r   rG   rH   rF   rI   r   r(   r9   speaker_countreceptive_fieldrw   nanmaxr   r   rJ   	dimensionint8rE   r   r+   r,   r   warningswarntextwrapdedentminimumry   r|   r   to_annotationr8   r   optimal_mappinglabelsru   r`   rename_labelslenrr   items)rR   rg   r   r   r   r   rc   rh   r   r   r   binarized_segmentationsr   diarizationr   rl   	centroidsr   num_different_speakersinactive_speakersr   r   )r   r   r    apply  s   +







zSpeakerDiarization.applyc                 C   s   t di | jS )Nr   )r	   r/   rZ   r   r   r    
get_metric  s   zSpeakerDiarization.get_metric)	r#   r$   r%   Fr&   r'   r'   NNrX   )FN)NNNFN)!__name__
__module____qualname____doc__r   floatboolstrintr   r   r   r   rC   propertyr.   setterr\   r`   ra   r   ri   r   r   rw   ndarrayr   r   r   r   r	   r   __classcell__r   r   rU   r    r"   :   s    :	

A

 
5
 ^r"   )r   N),r   rd   r   r   r   r   typingr   r   r   r   numpyrw   r}   einopsr   pyannote.corer   r   pyannote.metrics.diarizationr	   pyannote.pipeline.parameterr
   r   pyannote.audior   r   r   r   pyannote.audio.core.ior   #pyannote.audio.pipelines.clusteringr   -pyannote.audio.pipelines.speaker_verificationr   pyannote.audio.pipelines.utilsr   r   r   pyannote.audio.utils.signalr   r   r!   r"   r   r   r   r    <module>   s*   