o
    piFu                     @   sT  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ ddlZddlZddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 dde3fddZ4eG dd dZ5G dd de$eZ6dS )zSpeaker diarization pipelines    N)Path)CallableMappingOptionalTextUnionAny)	dataclass)	rearrange)Audio	InferenceModelPipeline)	AudioFile)
Clustering)PretrainedSpeakerEmbedding)PipelineModelPipelinePLDASpeakerDiarizationMixin	get_modelget_plda)set_num_speakers)binarize)
AnnotationSlidingWindowFeature)GreedyDiarizationErrorRate)	ParamDictUniform    
batch_sizec                 C   s   t | g| }tj|d|iS )zBatchify iterable	fillvalue)iter	itertoolszip_longest)iterabler   r    args r&   `/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/pipelines/speaker_diarization.pybatchify8   s   r(   c                   @   sF   e Zd ZU eed< eed< dZejdB ed< dee	e
f fddZdS )DiarizeOutputspeaker_diarizationexclusive_speaker_diarizationNspeaker_embeddingsreturnc                 C   s   g }| j jddD ]\}}}|t|jdt|jd|d q	g }| jjddD ]\}}}|t|jdt|jd|d q)||dS )a  Serialize diarization output

        Example
        -------
        {
            'diarization': [{
                'start': 6.665,
                'end': 7.165,
                'speaker': 'SPEAKER_00'},
                ...],
            'exclusive_diarization': [{
                'start': 6.665,
                'end': 7.165,
                'speaker': 'SPEAKER_00'},
                ...],
        }
        T)yield_label   )startendspeaker)diarizationexclusive_diarization)r*   
itertracksappendroundr0   r1   r+   )selfr3   speech_turn_r2   r4   r&   r&   r'   	serializeN   s.   



	zDiarizeOutput.serialize)__name__
__module____qualname__r   __annotations__r,   npndarraydictstrr   r;   r&   r&   r&   r'   r)   ?   s
   
 r)   c                       sp  e Zd ZdZddddddddddddd	d
d
dddfdededededededede	de	de
e deedf deeedf f fddZede	fddZejde	fddZdd Zdd Zed d! Zd6defd"d#Z		d7d$ed%ed&e
e fd'd(Zd)ed*ejd+edefd,d-Z				d8d.ed/e
e	 d0e
e	 d1e
e	 d&e
e deeB fd2d3Z de!fd4d5Z"  Z#S )9SpeakerDiarizationa  Speaker diarization pipeline

    Parameters
    ----------
    legacy : bool, optional
        Return only the diarization output. Defaults to return the full output
        with diarization, exclusive diarization, and speaker embeddings.
    segmentation : Model, str, or dict, optional
        Pretrained segmentation model. 
        See pyannote.audio.pipelines.utils.get_model for supported format.
    segmentation_step: float, optional
        The segmentation model is applied on a window sliding over the whole audio file.
        `segmentation_step` controls the step of this window, provided as a ratio of its
        duration. Defaults to 0.1 (i.e. 90% overlap between two consecuive windows).
    embedding : Model, str, or dict, optional
        Pretrained embedding model. 
        See pyannote.audio.pipelines.utils.get_model for supported format.
    embedding_exclude_overlap : bool, optional
        Exclude overlapping speech regions when extracting embeddings.
        Defaults (False) to use the whole speech.
    plda : PLDA, str, or dict, optional
        Pretrained PLDA.
        See pyannote.audio.pipelines.utils.get_plda for supported format.
    clustering : str, optional
        Clustering algorithm. See pyannote.audio.pipelines.clustering.Clustering
        for available options. 
    segmentation_batch_size : int, optional
        Batch size used for speaker segmentation. Defaults to 1.
    embedding_batch_size : int, optional
        Batch size used for speaker embedding. Defaults to 1.
    der_variant : dict, optional
        Optimize for a variant of diarization error rate.
        Defaults to {"collar": 0.0, "skip_overlap": False}. This is used in `get_metric`
        when instantiating the metric: GreedyDiarizationErrorRate(**der_variant).
    token : str or bool, optional
        Huggingface token to be used for downloading from Huggingface hub.
    cache_dir: Path or str, optional
        Path to the folder where files downloaded from Huggingface hub are stored.
        
    Usage
    -----
    # process audio file
    >>> output = pipeline("/path/to/audio.wav")

    # print diarization
    >>> assert isinstance(output.speaker_diarization, pyannote.core.Annotation)
    >>> for turn, speaker in output.speaker_diarization:
    ...     print(f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}")
    
    # get one speaker embedding per speaker
    >>> assert isinstance(output.speaker_embeddings, np.ndarray)
    >>> for s, speaker in enumerate(output.speaker_diarization.labels()):
    ...     # output.speaker_embeddings[s] is the embedding of speaker `speaker`

    # exclusive diarization is the same as diarization except 
    # that it does not contain overlapping speech segments
    >>> assert isinstance(output.exclusive_speaker_diarization, pyannote.core.Annotation)

    # force exactly 4 speakers
    >>> output = pipeline("/path/to/audio.wav", num_speakers=4)

    # force between 2 and 10 speakers
    >>> output = pipeline("/path/to/audio.wav", min_speakers=2, max_speakers=10)
    Fz(pyannote/speaker-diarization-community-1segmentation)
checkpoint	subfolder皙?	embeddingpldaVBxClustering   Nlegacysegmentation_stepembedding_exclude_overlap
clusteringembedding_batch_sizesegmentation_batch_sizeder_varianttoken	cache_dirc              	      sr  t    || _|| _t|||d}|| _|| _|| _|| _|| _	t
|||d| _|| _|
p2ddd| _|jj}t||| j| d|	d| _| jjjjrUttddd| _nttd	d
tddd| _| jdkrjd}nt| j||d| _t| jjdd| _| jj}zt| }W n ty   tdd t!tj" dw | jdkr|j#| j|d| _$n|j#|d| _$| j$j%| _&d S )N)rT   rU           F)collarskip_overlapT)durationstepskip_aggregationr         ?)min_duration_offrH   g?)	thresholdr]   OracleClusteringnot_applicabledownmix)sample_ratemonozclustering must be one of [, ]rK   )metric)'super__init__rM   segmentation_modelr   rN   rI   rQ   rO   rJ   r   _plda
klusteringrS   specificationsrY   r   _segmentationmodelpowersetr   r   rE   r   
_embeddingr   rb   _audiorf   r   KeyError
ValueErrorjoinlist__members__valuerP   expects_num_clusters_expects_num_speakers)r8   rM   rE   rN   rI   rO   rJ   rP   rQ   rR   rS   rT   rU   rn   segmentation_durationrf   
Klustering	__class__r&   r'   rh      s\   



zSpeakerDiarization.__init__r-   c                 C   s   | j jS Nrm   r   r8   r&   r&   r'   rR     s   z*SpeakerDiarization.segmentation_batch_sizer   c                 C   s   || j _d S r~   r   )r8   r   r&   r&   r'   rR     s   c                 C   s   ddidddddS )Nr]   rV   g333333?gQ?g?)r^   FaFb)rE   rP   r&   r   r&   r&   r'   default_parameters!  s   
z%SpeakerDiarization.default_parametersc                 c   s     d}	 d|dV  |d7 }q)Nr   TSPEAKER_02drL   r&   )r8   r2   r&   r&   r'   classes'  s   zSpeakerDiarization.classesc                 C   s   dS )Nztraining_cache/segmentationr&   r   r&   r&   r'   CACHED_SEGMENTATION-  s   z&SpeakerDiarization.CACHED_SEGMENTATIONc                 C   sb   |durt |dd}| jr(| j|v r|| j }|S | j||d}||| j< |S | j||d}|S )zApply segmentation model

        Parameter
        ---------
        file : AudioFile
        hook : Optional[Callable]

        Returns
        -------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
        NrE   hook)	functoolspartialtrainingr   rm   )r8   filer   segmentationsr&   r&   r'   get_segmentations1  s   


z$SpeakerDiarization.get_segmentationsbinary_segmentationsexclude_overlapr   c                    s  j r dt }d|v r jjjjs|d jjkr |d S  j	j
} jj\}}}	|rWjj}
|jj }t||
 | dtj jddddk  }t j|  j	n	dt j j	 fd	d
}t| jdd}t||	 j }g }|dur|dd|dd t|dD ]1\}}ttdd | \}}t|}t|}j||d}|| |dur|d|||d qt|}t|d|d}j rjjjjrd|id< |S jj|dd< |S )a  Extract embeddings for each (chunk, speaker) pair

        Parameters
        ----------
        file : AudioFile
        binary_segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Binarized segmentation.
        exclude_overlap : bool, optional
            Exclude overlapping speech regions when extracting embeddings.
            In case non-overlapping speech is too short, use the whole speech.
        hook: Optional[Callable]
            Called during embeddings after every batch to report the progress

        Returns
        -------
        embeddings : (num_chunks, num_speakers, dimension) array
        ztraining_cache/embeddings
embeddingssegmentation.thresholdr\      T)axiskeepdimsc                  3   s    t  D ]M\\} }\}}jj| dd\}}tj|ddtj}tj|ddtj}t |j|jD ]\}}t|krD|}n|}|d  t	
|d  fV  q6qd S )Npad)moderV   )nan)ziprq   cropr@   
nan_to_numastypefloat32Tsumtorch
from_numpy)chunkmasksr:   clean_maskswaveformmask
clean_mask	used_maskr   clean_segmentationsr   min_num_framesr8   r&   r'   iter_waveform_and_mask  s$   
zASpeakerDiarization.get_embeddings.<locals>.iter_waveform_and_mask)NN)r   r    Nr   )total	completedrL   c                 S   s   | d d uS )Nr   r&   )br&   r&   r'   <lambda>  s    z3SpeakerDiarization.get_embeddings.<locals>.<lambda>)r   z(c s) d -> c s d)c)r   r   )r   getrB   rm   rn   rl   ro   rE   r^   sliding_windowrY   datashaperp   min_num_samplesrb   mathceilr@   r   r   r(   rQ   	enumerater   filterr   vstackr6   r
   )r8   r   r   r   r   cacherY   
num_chunks
num_framesnum_speakersr   num_samplesclean_framesr   batchesbatch_countembedding_batchesibatch	waveformsr   waveform_batch
mask_batchembedding_batchr   r&   r   r'   get_embeddingsL  sp   




	
z!SpeakerDiarization.get_embeddingsr   hard_clusterscountc                 C   s   |j j\}}}t|d }tjt|||f }tt||D ]+\}	\}
\}}t|
D ]}|dkr4q-tj|dd|
|kf dd||	dd|f< q-q t	||j
}| ||S )a;  Build final discrete diarization out of clustered segmentation

        Parameters
        ----------
        segmentations : (num_chunks, num_frames, num_speakers) SlidingWindowFeature
            Raw speaker segmentation.
        hard_clusters : (num_chunks, num_speakers) array
            Output of clustering step.
        count : (total_num_frames, 1) SlidingWindowFeature
            Instantaneous number of active speakers.

        Returns
        -------
        discrete_diarization : SlidingWindowFeature
            Discrete (0s and 1s) diarization.
        rL   Nr   )r   r   r@   maxr   zerosr   r   uniquer   r   to_diarization)r8   r   r   r   r   r   local_num_speakersnum_clustersclustered_segmentationsr   clusterr   rE   kr&   r&   r'   reconstruct  s&   	zSpeakerDiarization.reconstructr   r   min_speakersmax_speakersc                    s  t |dkrtddt|   | j||d}t|||d\}}}| jrH|du rHt	|t
r?d|v r?t |d  }n	td| j d	| j||d}|d
| |jj\}}	}
| jjjjrd|}n	t|| jjdd}| j|| jjjdd}|d| t|jdkrtt|d dt|d dtd| jj fd}| j!r|j"S |S | j#||| j$|d}|d| | j%||||||| jjjd\}}}t&|d }||k s||krtt'(d| d|d  d| d| d| d t)|j|*tj+|_tj,|jdddk}d||< | -|||}|d| | j.|d| jj/d }|d |_0t)|jd*tj+|_| -|||}| j.|d| jj/d }|d |_0d|v rh|d rh| j1|d |d!d"\}fd#d$| D nd%d$ t2| | 3 D |j4d&}|j4d&}|du rt|||d}| j!r|j"S |S t | |jd krt5|dt | |jd  fd'f}d(d$ 6 D  | fd)d*| D  }t|||d}| j!r|j"S |S )+aC  Apply speaker diarization

        Parameters
        ----------
        file : AudioFile
            Processed file.
        num_speakers : int, optional
            Number of speakers, when known.
        min_speakers : int, optional
            Minimum number of speakers. Has no effect when `num_speakers` is provided.
        max_speakers : int, optional
            Maximum number of speakers. Has no effect when `num_speakers` is provided.
        hook : callable, optional
            Callback called after each major steps of the pipeline as follows:
                hook(step_name,      # human-readable name of current step
                     step_artefact,  # artifact generated by current step
                     file=file)      # file being processed
            Time-consuming steps call `hook` multiple times with the same `step_name`
            and additional `completed` and `total` keyword arguments usable to track
            progress of current step.

        Returns
        -------
        output : DiarizeOutput (or Annotation if `self.legacy` is True)
        r   z'Ignoring unexpected keyword arguments: rd   r   )r   r   r   N
annotationz)num_speakers must be provided when using z clusteringrE   F)onsetinitial_state)rV   rV   )warm_upspeaker_countingrV   uri)r   )r*   r+   r,   )r   r   r   )r   r   r   min_clustersmax_clustersr   framesrL   z2
                The detected number of speakers (z) for z. is outside
                the given bounds [zS]. This can happen if the
                given audio file is too short to contain zh or more speakers.
                Try to lower the desired minimal number of speakers.
                r   r   discrete_diarization)min_duration_onr]   T)return_mappingc                    s   i | ]	}|  ||qS r&   )r   ).0keymappingr&   r'   
<dictcomp>  s    z,SpeakerDiarization.apply.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r&   r&   )r   labelexpected_labelr&   r&   r'   r     s    r   )r   r   c                 S   s   i | ]\}}||qS r&   r&   )r   indexr   r&   r&   r'   r     s    c                    s   g | ]} | qS r&   r&   )r   r   )inverse_mappingr&   r'   
<listcomp>  s    z,SpeakerDiarization.apply.<locals>.<listcomp>)7lenwarningswarnrt   ru   keys
setup_hookr   ry   
isinstancer   labelsrs   rk   r   r   r   rm   rn   rl   ro   r   rE   r^   speaker_countreceptive_fieldr@   nanmaxr)   r   r   rp   	dimensionrM   r*   r   rO   rP   r   textwrapdedentminimumr   int8r   r   to_annotationr]   r   optimal_mappingr   r   rename_labelsr   items)r8   r   r   r   r   r   kwargsr   r   r   r   binarized_segmentationsr   outputr   r   r:   	centroidsnum_different_speakersinactive_speakersr   r3   exclusive_discrete_diarizationr4   r&   )r   r   r'   apply  s  $









zSpeakerDiarization.applyc                 C   s   t di | jS )Nr&   )r   rS   r   r&   r&   r'   
get_metric  s   zSpeakerDiarization.get_metricr~   )FN)NNNN)$r<   r=   r>   __doc__boolr   floatr   rC   intr   rB   r   r   r   rh   propertyrR   setterr   r   r   r   r   r   r   r@   rA   r   r   r)   r   r  r   r  __classcell__r&   r&   r|   r'   rD      s    C
X

 
5
  rD   )r   N)7r  r   r"   r   r   r   pathlibr   typingr   r   r   r   r   r   dataclassesr	   numpyr@   r   einopsr
   pyannote.audior   r   r   r   pyannote.audio.core.ior   #pyannote.audio.pipelines.clusteringr   -pyannote.audio.pipelines.speaker_verificationr   pyannote.audio.pipelines.utilsr   r   r   r   r   *pyannote.audio.pipelines.utils.diarizationr   pyannote.audio.utils.signalr   pyannote.corer   r   pyannote.metrics.diarizationr   pyannote.pipeline.parameterr   r   r  r(   r)   rD   r&   r&   r&   r'   <module>   s4    ?