o
    ih                  ,   @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1m2Z2 d dl3m4Z4 G dd dZ5de6de7de6de7de7de7de7dee7e6f deee6e6e6f  de
e6 de
e6 d e
e6 d!e
e6 d"e8d#e
e9 d$e
e9 d%e8d&e8d'e
e7 d(e8d)e8d*e8f,d+d,Z:d-d. Z;d2d/d0Z<e=d1kre<  dS dS )3    N)permutations)Path)AnyListOptionalSequenceTupleUnion)trange)check_argument_types)FrequencyDomainMSE)	SISNRLoss)	PITSolver)NpyScpWriter)SoundScpWriter)DiarizationTask)
EnhS2TTask)	to_device)set_all_random_seed)config_argparse) humanfriendly_parse_size_or_noneint_or_nonestr2boolstr2triple_strstr_or_none)get_commandline_argsc                   @   s   e Zd ZdZ												d'deeef deeef dee d	ee d
e	de	de	dee
 dedede	de	fddZe 	d(deejejf de
deej fddZe d)ddZe	d*dee dee fdd Zd!d" Zd#d$ Zd%d& ZdS )+DiarizeSpeecha  DiarizeSpeech class

    Examples:
        >>> import soundfile
        >>> diarization = DiarizeSpeech("diar_config.yaml", "diar.pth")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> diarization(audio)
        [(spk_id, start, end), (spk_id2, start2, end2)]

    NFcpufloat32train_config
model_filesegment_sizehop_sizenormalize_segment_scaleshow_progressbarnormalize_output_wavnum_spkdevicedtypeenh_s2t_taskmultiply_diar_resultc                 C   s  t  sJ |s	tnt}||||	\}}|r |jddgdgd |jtt|
d  |	| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _|d uoU| | _|d uo`|d uo`|| _| jrttd td| d S | jrtd td	|| d S td
 d S )Ndecoder	attractormask_module)inherite_s2t_attrsinherite_enh_attrs)r(   z(Perform segment-wise speaker diarizationzSegment length = {} secz6Perform segment-wise speech separation and diarizationz,Segment length = {} sec, hop length = {} secz/Perform direct speaker diarization on the input)r   r   r   build_model_from_fileinherite_attributestogetattrtorchevalr'   r(   diar_train_args
diar_modelr!   r"   r#   r%   r$   r&   r*   r)   segmenting_diarsegmenting_enh_diarlogginginfoformat)selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   taskr7   r6    r?   N/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/diar_inference.py__init__0   sP   



zDiarizeSpeech.__init__@  speechfsreturnc                    s  t  sJ t|tjrt|}| dksJ | |d}|t	t| j
}|j|gtj|dd}t|| jd}t|| jd}| jr|d | j| krtt|d| j|  }t| j|  }}|ddd|f j}g }	| jr{tnt}
|
|D ]`}t|| j | }|| }||d kr|d }||}|| }|dd||f |ddd|f< n|}|dd||f }|j|gtj|d}| ||\}}| ||\}}|	| qtdd |	D fdd|	D }	tj|	dd	}dn| ||\}}| ||\}}| jr| jr|d | j| krtt|| j| j   }tt|d| | j |  }t| j|  }}|ddd|f j}g }| jr[tnt}
|
|D ]}t|| j  | }|| }||d kr|d }||}|| }|dd||f |ddd|f< n|}|dd||f }|j|gtj|d}| j!"|||\}}}| j#rt$tj%|ddd|f &d
dddt$tj%t'|ddd|f &d
ddd  fdd|D }|tj(|dd	 qa|d td|D ]}| j)dddd| df || ddddd|f dd}t|D ]}|| || |f || dd|f< q0||d krld|| dddd|df< || dddd||f }n|| dddd|df }dddd| df || ddddd|f  d
 dddd| df< tj|gd
d	q	d
|dksJ j|jftj*dd	n| j!"|||\}}| j+r| ,|\}}fddt|D | j-rdd D n
dd D nd| j.dur|d
| j.ksJ |d
| j.f|d|ks0J |d|f|/ 0 }ddt1|   }| jrH|fS |fS )zInference

        Args:
            speech: Input speech data (Batch, Nsamples [, Channels])
            fs: sample rate
        Returns:
            [speaker_info1, speaker_info2, ...]

           r   )r(   
fill_valuer'   Nc                 S   s   g | ]}| d qS )   )size.0xr?   r?   r@   
<listcomp>       z*DiarizeSpeech.__call__.<locals>.<listcomp>c              
      s2   g | ]}t jj|d  |d fdtdqS )r   rI   constantz-inf)r4   nn
functionalpadrJ   floatrK   )max_lenr?   r@   rN      s    dimrI   TrW   keepdimc                    s   g | ]}|   qS r?   r?   rL   w)
enh_energy
mix_energyr?   r@   rN      s    si_snr	criterionc                    s*   g | ]}|  d d d d |f  qS Nr?   )rL   i)interp_predictionwavesr?   r@   rN   (  s    c                 S   s2   g | ]}|t |jd ddd  d   qS )rF   TrX   r   g?)absmaxr   numpyrZ   r?   r?   r@   rN   ,  s    $c                 S   s   g | ]}|   qS r?   )r   rg   rZ   r?   r?   r@   rN   1  s    )2r   
isinstancenpndarrayr4   	as_tensorrW   rJ   r2   r3   r(   new_fulllongr   r'   r8   r!   intceilshaper$   r
   range	new_zerosencodedecodeappendrf   catr)   r9   roundr"   r7   encode_diarr#   sqrtmeanpowsumstackcal_permumationunbindr*   permute_diarr%   r&   r   rg   exp)r=   rC   rD   
batch_sizelengthsnum_segmentstT	pad_shapediarized_wavsrange_rb   sten
speech_seglengths_segencoder_outencoder_out_lensspk_prediction_r&   overlap_length	enh_wavesprocessed_wavpermbatchenh_waves_res_ir?   )r\   rc   rU   r]   rd   r@   __call__z   s  



&


&

( &
zDiarizeSpeech.__call__r^   c           	      C   s6   t td| }t| d}|||\}}}|d }|S )a|  Calculate the permutation between seaprated streams in two adjacent segments.

        Args:
            ref_wavs (List[torch.Tensor]): [(Batch, Nsamples)]
            enh_wavs (List[torch.Tensor]): [(Batch, Nsamples)]
            criterion (str): one of ("si_snr", "mse", "corr)
        Returns:
            perm (torch.Tensor): permutation for enh_wavs (Batch, num_spk)
        )r^   mser_   r   )r   r   r   )	r=   ref_wavsenh_wavsr`   criterion_class
pit_solverr   othersr   r?   r?   r@   r~   C  s
   zDiarizeSpeech.cal_permumation	model_tagkwargsc                 K   s^   | dur(zddl m} W n ty   td  w | }|jdi ||  tdi |S )a'  Build DiarizeSpeech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            DiarizeSpeech: DiarizeSpeech instance.

        Nr   )ModelDownloaderzZ`espnet_model_zoo` is not installed. Please install via `pip install -U espnet_model_zoo`.r?   )espnet_model_zoo.downloaderr   ImportErrorr:   errorupdatedownload_and_unpackr   )r   r   r   dr?   r?   r@   from_pretrainedW  s   zDiarizeSpeech.from_pretrainedc                 C   s2  t |}dd tt|D }g }tjt|dd|d ddddd}|D ]A}|d d d d |f }dg}	t|D ]'}
|	t	
tt||
   t|d d d d |
f   d 7 }	q?||	 q+tjtt	|dd	\}}|d d d d || f |d d d d || f || fS )
Nc                 S   s   g | ]}t |qS r?   )ri   array)rL   pr?   r?   r@   rN   z  rO   z.DiarizeSpeech.permute_diar.<locals>.<listcomp>rF   rI   r   linear)rJ   mode)r   rF   rV   )lenr   rq   Finterpolater4   sigmoid	transposerJ   ri   corrcoefsqueezere   r   rg   ru   rf   
from_numpyr   )r=   rd   r   r&   permute_list	corr_listrc   r   	diar_perm	corr_permqmax_corrmax_idxr?   r?   r@   r   u  s4   "zDiarizeSpeech.permute_diarc                 C   sL   | j r| j||| j\}}}||fS d  }}| j||||\}}||fS ra   )r)   r7   rx   r&   rs   )r=   rC   r   r   r   r   bottleneck_featsbottleneck_feats_lengthsr?   r?   r@   rs     s   
zDiarizeSpeech.encodec                 C   sZ  | j jd u r| jd usJ d| j ||}| j}||fS | jd ur]| j ||tt|d| jd |d| jd\}}t	||d d d | jd d f 
ddd}| j}||fS d}| j ||tt|d|d |d| jd\}}t|}tt|D ]}||  dk r nqt	||d d d |d d f 
ddd}||fS )Nz$Argument "num_spk" must be specifiedr   rF   rI   rH      )r7   r,   r&   r+   r   r4   zerosrJ   r'   bmmpermuter   rq   r   item)r=   r   r   r   r&   r,   att_probmax_num_spkr?   r?   r@   rt     s\   ,
$
$zDiarizeSpeech.decode)NNNNFFFNr   r   FF)rB   )r^   ra   )__name__
__module____qualname____doc__r	   r   strr   rT   boolrn   rA   r4   no_gradTensorri   rj   r   r   r~   staticmethodr   r   r   rs   rt   r?   r?   r?   r@   r   $   sz    

	

J Ir   
output_dirr   r(   rD   ngpuseednum_workers	log_leveldata_path_and_name_and_typekey_filer   r    r   allow_variable_data_keysr!   r"   r#   r$   r&   r%   r*   r)   c           %      C   s  t  sJ |dkrtd|dkrtdtj|dd |dkr#d}nd}t| t|
|||||||||||d}tjdd	|i|}tj	||||	|t
|jd
t|jd
|dd	}t|  d|  d}|rg }|jd urt|jD ]}|t|  d|d  |  d|d  d qtn!t|jjjD ]}|t|  d|d  |  d|d  d q|D ]\}}t|tsJ t|tdd |D sJ |ttt| }t||ksJ t| d| dd | D }|r%|di |\} }!t|D ]!}"|!|" |||" < t| D ]\}#}$||$|" f||# ||" < qqq|di |}!t|D ]}"|!|" |||" < q0q|rJ|D ]}$|$  qB|  d S )NrF   z!batch decoding is not implementedz%only single GPU decoding is supportedz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelr<   cudar   )r   r    r!   r"   r#   r$   r%   r&   r'   r(   r*   r)   r   FT)r(   r   r   r   preprocess_fn
collate_fnr   	inferencez/predictionsz/diarize.scpz/wavs/z/spkz.scpc                 s   s    | ]}t |tV  qd S ra   )rh   r   )rL   sr?   r?   r@   	<genexpr>7  s    zinference.<locals>.<genexpr>z != c                 S   s    i | ]\}}| d s||qS )_lengths)endswith)rL   kvr?   r?   r@   
<dictcomp>:  s     zinference.<locals>.<dictcomp>r?   )r   NotImplementedErrorr:   basicConfigr   dictr   r   r   build_streaming_iteratorbuild_preprocess_fnr6   build_collate_fnr   r&   rq   ru   r   r7   r-   r   rh   typeallr   nextitervaluesitems	enumerateclose)%r   r   r(   rD   r   r   r   r   r   r   r   r    r   r   r!   r"   r#   r$   r&   r%   r*   r)   r'   diarize_speech_kwargsdiarize_speechloaderwriterwav_writersrb   keysr   _bsrd   spk_predictionsbspkr[   r?   r?   r@   r     s   

"""r   c                  C   s  t jdtjd} | jddd dddd	 | jd
tdd | jdtddd | jdtddd | jddg ddd | jdtddd | jdtddd | d}|jdt	dd d! |jd"t
d# |jd$td%d& | d'}|jd(td)d* |jd+td,d* |jd-td.d* | d/}|jd0tdd1d | d2}|jd3td d4d |jd5td d6d |jd7td%d8d |jd9td d:d | d;}|jd<td%d=d |jd>td%d?d |jd@td%dAd |jdBtd%dCd | S )DNzSpeaker Diarization inference)descriptionformatter_classz--log_levelc                 S   s   |   S ra   )upper)rM   r?   r?   r@   <lambda>W  s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGr   DEBUGNOTSETzThe verbose level of logging)r   defaultchoiceshelpz--output_dirT)r   requiredz--ngpur   z(The number of gpus. 0 indicates CPU mode)r   r  r  z--seedzRandom seedz--dtyper   )float16r   float64z	Data type)r  r  r  z--fsrB   zSampling ratez--num_workersrF   z)The number of workers used for DataLoaderzInput data relatedz--data_path_and_name_and_typeru   )r   r  actionz
--key_file)r   z--allow_variable_data_keysF)r   r  zThe model configuration relatedz--train_configz"Diarization training configuration)r   r  z--model_filez Diarization model parameter filez--model_tagz]Pretrained model tag. If specify this option, train_config and model_file will be overwrittenzData loading relatedz--batch_sizezThe batch size for inferencezDiarize speech relatedz--segment_sizez>Segment length in seconds for segment-wise speaker diarizationz
--hop_sizezDHop length in seconds for segment-wise speech enhancement/separationz--show_progressbarzOWhether to show a progress bar when performing segment-wise speaker diarizationz	--num_spkz.Predetermined number of speakers for inferencezEnh + Diar relatedz--enh_s2t_taskz'enhancement and diarization joint modelz--normalize_segment_scalezHWhether to normalize the energy of the separated streams in each segmentz--normalize_output_wavz0Whether to normalize the predicted wav to [-1~1]z--multiply_diar_resultz3Whether to multiply diar results to separated waves)r   ArgumentParserargparseArgumentDefaultsHelpFormatteradd_argumentr   rn   r   add_argument_groupr   r   r   rT   r   )parsergroupr?   r?   r@   
get_parserM  s   




r  c                 C   sF   t t tjd t }|| }t|}|dd  tdi | d S )N)fileconfigr?   )	printr   sysstderrr  
parse_argsvarspopr   )cmdr  argsr   r?   r?   r@   main  s   
r  __main__ra   )>r  r:   r  	itertoolsr   pathlibr   typingr   r   r   r   r   r	   rg   ri   r4   torch.nn.functionalrQ   rR   r   tqdmr
   	typeguardr   %espnet2.enh.loss.criterions.tf_domainr   'espnet2.enh.loss.criterions.time_domainr   $espnet2.enh.loss.wrappers.pit_solverr   espnet2.fileio.npy_scpr   espnet2.fileio.sound_scpr   espnet2.tasks.diarr   espnet2.tasks.enh_s2tr    espnet2.torch_utils.device_funcsr   'espnet2.torch_utils.set_all_random_seedr   espnet2.utilsr   espnet2.utils.typesr   r   r   r   r   espnet.utils.cli_utilsr   r   r   rn   r   rT   r   r  r  r   r?   r?   r?   r@   <module>   s       0
	

| 
	
