o
    ॵi2                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZd dl	Z
d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ e ZdgZej ej!ej"dG dd deZ#dS )    N)AnyDictListUnion)File)	Pipelines)
OutputKeys)pipeline)
InputModelPipeline)	PIPELINES)Tasks)
get_loggerSegmentationClusteringPipeline)module_namec                
       s  e Zd ZdZdef fddZdeeej	e
f deeef fddZd	e
dej	fd
dZdej	dej	fddZde
de
dej	dej	de
f
ddZdeeej	e
f de
fddZde
fddZde
de
fddZdededeej	e
f dej	fddZdd Zd d! Zd%d#d$Z  ZS )&r   aX  Segmentation and Clustering Pipeline
    use `model` to create a Segmentation and Clustering Pipeline.

    Args:
        model (SegmentationClusteringPipeline): A model instance, or a model local dir, or a model id in the model hub.
        kwargs (dict, `optional`):
            Extra kwargs passed into the pipeline's constructor.
    Example:
    >>> from modelscope.pipelines import pipeline
    >>> from modelscope.utils.constant import Tasks
    >>> p = pipeline(
    >>>    task=Tasks.speaker_diarization, model='damo/speech_campplus_speaker-diarization_common')
    >>> print(p(audio))

    modelc                    sZ   t  jd
d|i| | jj| _ddd}| j| | jd | _td| jd d| _d	S )zuse `model` to create a speaker diarization pipeline for prediction
        Args:
            model (str): a valid offical model id
        r         ?g      ?)seg_dur	seg_shiftsample_ratezspeaker-verificationspeaker_modeltaskr   N )	super__init__r   other_configconfigupdatefsr	   sv_pipeline)selfr   kwargsr   	__class__r   o/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/pipelines/audio/segmentation_clustering_pipeline.pyr   -   s   

z'SegmentationClusteringPipeline.__init__audioreturnc                 K   s   | j | td | |}| | td | |}td | |}td | |}td | 	||||}t
j|iS )a   extract the speaker embeddings of input audio and do cluster
        Args:
            audio (str, np.ndarray, list): If it is represented as a str or a np.ndarray, it
            should be a complete speech signal and requires VAD preprocessing. If the audio
            is represented as a list, it should contain only the effective speech segments
            obtained through VAD preprocessing. The list should be formatted as [[0(s),3.2,
            np.ndarray], [5.3,9.1, np.ndarray], ...]. Each element is a sublist that contains
            the start time, end time, and the numpy array of the speech segment respectively.
        zDoing VAD...zDoing segmentation...zExtracting embeddings...zClustering...zPost processing...)r   r   loggerinfo
preprocesscheck_audio_listchunkforward
clusteringpostprocessr   TEXT)r!   r&   paramsvad_segmentssegments
embeddingslabelsoutputr   r   r%   __call__=   s   










z'SegmentationClusteringPipeline.__call__inputc                 C   sL   g }|D ]}| j |d gdd}|d jdkr||d  qt|}|S )N   T)
output_embembs)      )r    shapeappendnpconcatenate)r!   r8   r4   s	save_dictr   r   r%   r-   \   s   
z&SegmentationClusteringPipeline.forwardr4   c                 C   s   | j |fi | j}|S )N)r   r   )r!   r4   r5   r   r   r%   r.   e   s   z)SegmentationClusteringPipeline.clusteringr3   r2   r5   c                 C   s  t |t |ks
J | |}g }tt |D ]}||| d || d || g q| |}g }t| d D ]}|||k d}|| q;t|}dd }	tdt |D ]}|	||d  d || d r|| d ||d  d  d }
d| j	v rt
| dsttj| j	d d| _t|
d	 ||d  d }t|
d	 || d }|| dkr| |||}||d  d }|| d }| j||| || gd
d\}}|d ur|| }
|
|| d< |
||d  d< q\| |}|S )Nr   r<   c                 S   s   | |d krdS dS )Ng-C6?TFr   )t1t2r   r   r%   is_overlappedz   s   zASegmentationClusteringPipeline.postprocess.<locals>.is_overlappedr9   change_locatorchange_locator_pipeliner   r   T)
output_res)lencorrect_labelsranger?   merge_sequemaxmeanr@   stackr   hasattrr	   r   speaker_diarizationrH   min	cut_audiosmooth)r!   r3   r2   r5   r4   distribute_resispk_embsspk_embrF   pshort_utt_stshort_utt_ed
audio_dataspk1spk2_ctr   r   r%   r/   i   sP   
&

 



z*SegmentationClusteringPipeline.postprocessc           
   
   C   s  t |tr|jdd d |S t |trat|}tjt|dd\}}t	|j
dkr4|d d df }|| jkratd| j d	 tjjt|d|d
t| jggd\}}|d }t	|j
dkslJ d|jdv ry|d d}n|d}t| dsttj| jd dd| _| j|| jddd d }g }t |trt|}nt |tr|}nt dt!| |D ](}t"|d d }t"|d d }	|#||	|t"|| j t"|	| j  g q|S )Nc                 S   s   | d S )Nr   r   )xr   r   r%   <lambda>   s    z;SegmentationClusteringPipeline.preprocess.<locals>.<lambda>)keyfloat32)dtyper9   r   z+[WARNING]: The sample rate of audio is not z, resample it.rate)effectsr<   %modelscope error: Wrong audio format.)int16int32int64i   vad_pipeline	vad_modelzv2.0.2)r   r   model_revisionT)r   is_finalvaluezIncorrect vad result. Get %si  )$
isinstancelistsortstrr   readsfioBytesIOrJ   r>   r   r(   r)   
torchaudiosox_effectsapply_effects_tensortorch
from_numpy	unsqueezesqueezenumpyrf   astyperQ   r	   r   voice_activity_detectionr   rm   astliteral_eval
ValueErrortypeintr?   )
r!   r&   
file_bytesr   vad_timer2   vad_time_listtstedr   r   r%   r*      s`   









$z)SegmentationClusteringPipeline.preprocessc                 C   s   d}t t|D ]Y}|| }|d |d ksJ dt|d tjs&J dt|d | j t|d | j  |d jd ksCJ d|dkrW|d ||d  d ksWJ d||d |d  7 }q|dksjJ dd S )	Nr   r<   z$modelscope error: Wrong time stamps.r9   z"modelscope error: Wrong data type.zFmodelscope error: audio data in list is inconsistent with time length.   z<modelscope error: The effective audio duration is too short.)rL   rJ   rr   r@   ndarrayr   r   r>   )r!   r&   	audio_durrW   segr   r   r%   r+      s8   z/SegmentationClusteringPipeline.check_audio_listc                    s4    fdd}g }t |D ]\}}||| q|S )Nc           
         s   | d }| d }t  jd  j }t  jd  j }d}g }td|jd |D ]I}t|| |jd }||kr< |S |}td|| }||| }	|	jd |k r`t|	d||	jd  fd}	|	| j | | j | |	g q)|S )Nr   r9   r   r   constant)
r   r   r   rL   r>   rS   rN   r@   padr?   )
seg_dataseg_stdata	chunk_lenchunk_shiftlast_chunk_edseg_reschunk_stchunk_ed
chunk_datar!   r   r%   	seg_chunk   s0   z7SegmentationClusteringPipeline.chunk.<locals>.seg_chunk)	enumerateextend)r!   r2   r   segsrW   rB   r   r   r%   r,      s
   z$SegmentationClusteringPipeline.chunkcut_stcut_edc              	   C   s`  t |tjr|t|| j t|| j  S t |trtt|D ]L}|dkr2||| d k r1|}n|||d  d krF||| d k rF|}|t|d krY||| d krX|}q!||| d krm|||d  d krm|}q!|||d  }g }tt|D ]&}|| \}	}
}||tt	||	|	 | j tt
||
|	 | j   q~t|}|S td)Nr   r<   ri   )rr   r@   r   r   r   rs   rL   rJ   r?   rN   rS   rA   r   )r!   r   r   r&   rW   st_ied_i
audio_segscut_datas_sts_edr   r   r   r%   rT      sB    
$$
z(SegmentationClusteringPipeline.cut_audioc                 C   sF   d}i }g }|D ]}||vr|||< |d7 }| ||  qt|S )Nr   r<   )r?   r@   array)r!   r5   	labels_idid2id
new_labelsrW   r   r   r%   rK     s   
z-SegmentationClusteringPipeline.correct_labelsc                 C   sv   |d g}t dt|D ],}|| d |d d ks&|| d |d d kr.|||  q|| d |d d< q|S )Nr   r<   r9   )rL   rJ   r?   )r!   rV   resrW   r   r   r%   rM   &  s   

z*SegmentationClusteringPipeline.merge_sequer<   c                 C   s&  t t|D ]}t|| d d|| d< t|| d d|| d< || d || d  |k r|dkrA||d  d || d< q|t|d krV||d  d || d< q|| d ||d  d  ||d  d || d  kr||d  d || d< q||d  d || d< q| |}|S )Nr   r9   r<   )rL   rJ   roundrM   )r!   r   mindurrW   r   r   r%   rU   0  s   8
z%SegmentationClusteringPipeline.smooth)r<   )__name__
__module____qualname____doc__r
   r   r   ru   r@   r   rs   r   r   r7   r-   r.   r/   r*   r+   r,   floatrT   rK   rM   rU   __classcell__r   r   r#   r%   r      s4    

	

3-
 
!
)$r   rx   typingr   r   r   r   r   r@   	soundfilerw   r}   rz   modelscope.fileior   modelscope.metainfor   modelscope.outputsr   modelscope.pipelinesr	   modelscope.pipelines.baser
   r   modelscope.pipelines.builderr   modelscope.utils.constantr   modelscope.utils.loggerr   r(   __all__register_modulerR   segmentation_clusteringr   r   r   r   r%   <module>   s*   