o
    i\                     @   sZ  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZmZ d dlZd dlZd dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, eej-edkrd dl.m/Z/ nedddZ/G dd de%Z0dS )    N)contextmanager)LooseVersion)permutations)Dict)Optional)TupleList)
functional)	to_device)make_pad_mask)
AbsDecoder)
AbsEncoder)AbsFrontend)
AbsSpecAug)AbsProfileAug)AbsNormalize)force_gatherable)FunASRModel)LabelSmoothingLossSequenceBinaryCrossEntropy)int2vec)	hint_oncez1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   T/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sond/e2e_diar_sond.pyr   #   s   
r   c                /       s<  e Zd ZdZ													
d^dedee dee dee dee	 de
jjdee
jj de
jjdee
jj de
jjdededededee
jj dedededededed ed!ef. fd"d#Zd$d% Zd&d' Zd(d) Z					d_d*e
jd+e
jd,e
jd-e
jd.e
jd/e
jd0ee
jeee
jf e
jf fd1d2Zd3d4 Zd5e
jd6e
jd7e
jd0e
jfd8d9Zd,e
jd-e
jd0e
jfd:d;Zd<d= Zd>e
jd?e
jd@e
jdAe
jd0ee
je
jf f
dBdCZ				d`d*e
jd+e
jd,e
jd-e
jd.e
jd/e
jd0eee
jf fdDdEZd,e
jd-e
jd0ee
je
jf fdFdGZd*e
jd+e
jd0ee
je
jf fdHdIZ e!d*e
jdJe
jd0e
jfdKdLZ"		dadMe
jdNe
jdOe
jdPe
jd0ee
je
jf f
dQdRZ#dSdT Z$	dbd*e
jd+e
jd,e
jd-e
jdUed0e
jee gfdVdWZ%d*e
jd+e
jd0ee
je
jf fdXdYZ&d*e
jd+e
jd0ee
je
jf fdZd[Z'e!d\d] Z(  Z)S )cDiarSondModelzcSpeaker overlap-aware neural diarization model
    reference: https://arxiv.org/abs/2211.10243
    皙?F   N      ?        rawT
vocab_sizefrontendspecaug
profileaug	normalizeencoderspeaker_encoder	ci_scorer	cd_scorerdecoder
token_list
lsm_weightlength_normalized_lossmax_spk_numlabel_aggregatornormalize_speech_speaker	ignore_id"speaker_discrimination_loss_weightinter_score_loss_weightinputs_typemodel_regularizer_weightfreeze_encoderonfly_shuffle_speakerc                    s  t    || _|| _|| _|	| _|| _|| _|| _|| _	|| _
|
| _|| _|| _|| _|| _|| _|| _|| _t||||d| _t|d| _|  | _tdt|tjtjd d f   | _tt | j!t"tjtjd d f " | _#|| _$|| _%d| _&|| _'d | _(d S )N)sizepadding_idx	smoothingnormalize_length)r>      r   ))super__init__r)   r*   r+   r,   r(   r%   r&   r'   r2   r-   r.   r1   r3   r4   r8   r9   r:   r   criterion_diarr   criterion_bcegenerate_pse_embeddingpse_embeddingtorch
from_numpynparangenewaxisfloatpower_weightarrayastypeintint_token_arrr5   r6   forward_stepsr7   to_regularize_parameters)selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   	__class__r   r   rA   -   sR   

"
zDiarSondModel.__init__c                 C   s   g g }}|   D ]/\}}d|v r1d|v r1d|vr1d|v s)d|v s)d|v s)d|v r1|||f q	|||f q	|| _||fS )Nr)   weightbnconv2conv1conv_scdense)named_parametersappendrR   )rS   rR   normal_parametersnameparamr   r   r   get_regularize_parameterso   s   
 z'DiarSondModel.get_regularize_parametersc                 C   sX   t jt| j| jft jd}t| jD ]\}}tt|| jt jd}|||< qt	
|S )N)dtype)vec_dimrb   )rH   zeroslenr.   r1   float32	enumerater   rO   rF   rG   )rS   	embeddingidx	pse_labelembr   r   r   rD   ~   s
   

z$DiarSondModel.generate_pse_embeddingc           	      C   s   |j d |j d ksJ d|j d |j d t|}t|}|j d |j d }}t|D ]%}tt|}t| || |ddf ||< || dd|f ||< q1||fS )zI
        raw_profile: B, N, D
        raw_binary_labels: B, T, N
           r?   zNum profile: {}, Num label: {}r    r   N)shapeformatrF   clonerangelistrandomshuffle)	rS   raw_profileraw_binary_labelsprofilebinary_labelsbsznum_spkiri   r   r   r   rand_permute_speaker   s   


z"DiarSondModel.rand_permute_speakerspeechspeech_lengthsrv   profile_lengthsrw   binary_labels_lengthsreturnc           +      C   s  |j d |j d ksJ |j |j f|j d }| jr&tdddd | j  | jd | _| jj|jkrK| j|j| _| j	|j| _	| j
|j| _
| jr]tdddd | ||\}}| jdurj| ||\}}| jdur| jr| ||||||\}}}t|d| j|j d	  fd
d}tj|| j	 d	dd}	tj|	 | j
k d	d}
| j||||dd\}}|\\}}\}}\}}d	}t|
j d |j d  }||krt|j d |
j d }|
ddd|f }
|ddd|f }|ddd|f }|ddd|f }| ||
|}| ||}| |||
|\}}d}| jdkr.| jdur.|   }t!||
j d d|
j}|| j"|  | j#||   }| j$t%|jd	d|  | jt%|
|  | j|d\	}}}}}}}} }!|dkr|dkr|| || || | | |!| || ||  |! | f\}"}#}$}%}&}'}(n	d\}"}#}$}%}&}'}(t&|' |dur|' nd|dur|' nd|dur|' nd|dur|' nd|dur|' nd|"|#|$|%|&|'|(| jd})t(||)|f|j\}})}*||)|*fS )a  Frontend + Encoder + Speaker Encoder + CI Scorer + CD Scorer + Decoder + Calc loss

        Args:
            speech: (Batch, samples) or (Batch, frames, input_size)
            speech_lengths: (Batch,) default None for chunk interator,
                                     because the chunk-iterator does not
                                     have the speech_lengths returned.
                                     see in
                                     espnet2/iterators/chunk_iter_factory.py
            profile: (Batch, N_spk, dim)
            profile_lengths: (Batch,)
            binary_labels: (Batch, frames, max_spk_num)
            binary_labels_lengths: (Batch,)
        r   zFreeze encoderr9   )rankrl   z'On-the-fly shuffle speaker permutation.r:   Nr?   constantr"   Tdimkeepdimr   )return_inter_outputsmaxlen)predlabellength)r   r   r   r   r   r   r   )loss	loss_diarloss_spk_disloss_inter_ciloss_inter_cdregularizer_losssad_mrsad_frmifacfaccderrQ   ))rm   r9   r   r)   evalrQ   rE   devicetorL   rP   r:   r{   r2   r'   trainingFpadr1   rF   sumargmaxrO   rK   prediction_forwardabsminclassification_lossspeaker_discrimination_lossinternal_score_lossr8   rR   calculate_regularizer_lossr   r5   r6   calc_diarization_errorrh   dictdetachr   )+rS   r|   r}   rv   r~   rw   r   
batch_sizepad_bin_labelsraw_pse_labels
pse_labelsr   inter_outputsci_scorecd_scorelength_diff_tolerancelength_diffmin_lenr   r   r   r   r   
label_maskr   correct
num_framesspeech_scoredspeech_missspeech_falarmspeaker_scoredspeaker_missspeaker_falarmspeaker_errorr   r   r   r   r   r   r   statsrV   r   r   r   forward   s   $









zDiarSondModel.forwardc                 C   s*   d}| j D ]\}}|tj|dd }q|S )Nr"   r?   )p)rR   rF   norm)rS   r   r_   r`   r   r   r   r   -  s   z(DiarSondModel.calculate_regularizer_losspredictionslabelsprediction_lengthsc                 C   s>   t ||jd d}|j||j| jd}| | |}|S )Nrl   r   value)r   rm   masked_fillr   r   r4   rB   
contiguous)rS   r   r   r   mask
pad_labelsr   r   r   r   r   3  s   z!DiarSondModel.classification_lossc           	      C   s   t jj|dddddk }t ||dd}|dt | jd	|  }d}t jj|| d| |  ddd| }t
j|d|dd	|d
| }t |d	| d| }t
|| |d   |  }|S )Nr?   Tordr   r   r   rl   r!   g-q=r   r    )r   epsr"   )rF   linalgr   rK   matmul	transposeeyer1   	unsqueezer   r   cosine_similaritycliprelur   )	rS   rv   r~   profile_maskr   r   coding_norm	cos_thetar   r   r   r   r   <  s"     "z)DiarSondModel.speaker_discrimination_lossc                 C   s@   t ||jd d}|j||jdd|}t|| j}|S )Nrl   r   r   r   )r   rm   r   r   r   r   rh   rE   )rS   r   pse_labels_lengthsr   padding_labelsmulti_labelsr   r   r   calculate_multi_labelsU  s   z$DiarSondModel.calculate_multi_labelsr   r   r   r   c                 C   s0   |  ||}| |||}| |||}||fS r   )r   rC   )rS   r   r   r   r   r   ci_losscd_lossr   r   r   r   \  s   z!DiarSondModel.internal_score_lossc           	      C   s   |  ||\}}||dS )N)featsfeats_lengths)_extract_feats)	rS   r|   r}   rv   r~   rw   r   r   r   r   r   r   collect_featsh  s   	
zDiarSondModel.collect_featsc              
   C   s   t dW |jd | jk r!t|ddd| j|jd  ddgdd}tjj|dddddk }tj	|dd	}| j
d urP| 
||d }|| |fW  d    S ||fW  d    S 1 s^w   Y  d S )
NFrl   r   r   r"   r?   Tr   r   )r   rm   r1   r   r   rF   r   r   rK   r(   r*   )rS   rv   r~   r   r   r   r   encode_speakert  s   
 

$zDiarSondModel.encode_speakerc                 C   sb   | j d ur-| jdkr-| ||\}}t||jd d }||jd }|| |fS ||fS )Nr#   rl   r   r    )	r)   r7   encoder   rm   r   r   r   rK   )rS   r|   r}   speech_maskr   r   r   encode_speech  s   zDiarSondModel.encode_speechivcc                 C   sd   |j d | j d }}| jdd} | d|dd} |jdd}|dd|d}tj| |gdd}|S )Nrl   r   r    r?      )rm   r   expandrF   cat)r|   r   nnttsd_inr   r   r   concate_speech_ivc  s   z DiarSondModel.concate_speech_ivcspeech_encoder_outputsspeaker_encoder_outputsseq_lenspk_lenc                 C   s0  |j d |j d }}|j d |j d }}| jr'tj|dd}tj|dd}| ||}	t|	|| j ||| g}	|d	d| j}
t|
|| j g}
| 
|	|
d }t||| j|dg}|jddg d}t| jtr| |	|
d }t||| j|gg d}||fS | ||}||fS )Nr   rl   r?   r   r    r   )r   r?   rl   )rm   r3   r   r(   r   rF   reshaper1   r   r   r,   squeezepermute
isinstancer+   r   )rS   r   r   r   r   bbr   d_sphd_spkge_inge_lencd_simici_simir   r   r   calc_similarity  s$   zDiarSondModel.calc_similarityc                 C   s   |  ||d }|S )Nr   )r-   )rS   simir   logitsr   r   r   post_net_forward  s   zDiarSondModel.post_net_forwardr   c           
      C   st   |  ||\}}| ||\}}| ||||\}}tj||gdd}| ||}	|r8|	||f||f||fgfS |	S )Nr?   r   )r   r   r   rF   r   r  )
rS   r|   r}   rv   r~   r   r   r   
similarityr  r   r   r   r     s   	z DiarSondModel.prediction_forwardc                 C   s   t d; | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}| ||}|dd \}}W d   n1 sBw   Y  |d|dks\J | |df|d| ksoJ | | f||fS )zxFrontend + Encoder

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch,)
        FNr?   r   rl   )r   r   r&   r   r(   r)   r;   max)rS   r|   r}   r   r   encoder_outputsencoder_outencoder_out_lensr   r   r   r     s$   
	
zDiarSondModel.encodec                 C   s   |j d }|d ur|nt| |j d  }| dks"J |j |d d d | f }| jd ur?| ||\}}||fS ||}}||fS )Nr   rl   )rm   rF   onesrO   r   r  r%   )rS   r|   r}   r   r   r   r   r   r   r     s   


zDiarSondModel._extract_featsc              	   C   s  |  \}}}t||jd dd  }|j  t}| j  dkt}|| }|| }|j  }t	j
|dd}	t	j
|dd}
tt	
|	dk}tt	
t	|	dk|
dk}tt	
t	|	dk|
dk}tt	
|	}tt	
t	|	|
 d}tt	
t	|
|	 d}t	j
t	|dk|dkdd}tt	
t	|	|
| }tdt	
||k|  | }t	
|}|||||||||f	S )Nrl   r   r    r   r?   )axisr!   )r;   r   rm   r   numpydatacpurN   rO   rH   r   rK   logical_andmaximumminimum)r   r   r   r   max_len
num_outputr   label_nppred_npn_refn_sysr   r   r   r   r   r   n_mapr   r   r   r   r   r   r     s:   
z$DiarSondModel.calc_diarization_error)r   Fr   NFr    r!   r"   r#   r"   FT)NNNNN)NNNN)NN)F)*__name__
__module____qualname____doc__rO   r   r   r   r   r   rF   r   Modulerq   rK   boolstrrA   ra   rD   r{   Tensorr   r   r   r   r   r   r   r   r   r   r   staticmethodr   r   r  r   r   r   r   __classcell__r   r   rT   r   r   (   s   
	


B
 
	







&
r   )T)1loggingrr   
contextlibr   distutils.versionr   	itertoolsr   typingr   r   r   r   r  rH   rF   torch.nnr	   r   *funasr.models.transformer.utils.nets_utilsr
   r   !funasr.models.decoder.abs_decoderr   !funasr.models.encoder.abs_encoderr   funasr.frontends.abs_frontendr   !funasr.models.specaug.abs_specaugr   $funasr.models.specaug.abs_profileaugr   funasr.layers.abs_normalizer   funasr.train_utils.device_funcsr   funasr.models.base_modelr   "funasr.losses.label_smoothing_lossr   r   funasr.utils.miscr   funasr.utils.hinterr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s:   