o
    iL:                     @   s  d dl mZ d dlmZ d dlmZmZmZ d dlZ	d dl
Z
d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% ee
j&edkryd dl'm(Z( nedddZ(G dd de#Z)dS )    )contextmanager)permutations)DictOptionalTupleN)parse)check_argument_types)
AbsEncoder)AbsFrontend)
AbsSpecAug)AbsAttractor)
AbsDecoder)AbsNormalize)force_gatherable)AbsESPnetModel)	to_devicez1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/diar/espnet_model.pyr      s   
r   c                       s`  e Zd ZdZ		d(dee dee dee dej	j
deded	ee d
edef fddZ			d)dejdejdejdejdeejeeejf ejf f
ddZ		d*dejdejdejdejdeeejf f
ddZdejdejdejdejdeejejf f
ddZdejdejdeejejf fddZdd Zd d! Zd"d# Zd$d% Zed&d' Z  ZS )+ESPnetDiarizationModelak  Speaker Diarization model

    If "attractor" is "None", SA-EEND will be used.
    Else if "attractor" is not "None", EEND-EDA will be used.
    For the details about SA-EEND and EEND-EDA, refer to the following papers:
    SA-EEND: https://arxiv.org/pdf/1909.06247.pdf
    EEND-EDA: https://arxiv.org/pdf/2005.09921.pdf, https://arxiv.org/pdf/2106.10654.pdf
          ?frontendspecaug	normalizelabel_aggregatorencoderdecoder	attractordiar_weightattractor_weightc
           
         sx   t  sJ t   || _|| _|| _|| _|| _|| _|	| _	|| _
|| _| j
d ur/d | _d S | jd ur:|j| _d S tr   )r   super__init__r   r   r   r   r   r    r!   r   r   num_spkNotImplementedError)
selfr   r   r   r   r   r   r   r    r!   	__class__r   r   r#   +   s    




zESPnetDiarizationModel.__init__Nspeechspeech_lengths
spk_labelsspk_labels_lengthsreturnc           *      K   s  |j d |j d ksJ |j |j f|j d }|dd}|dd}| ||||\}	}
| jdu r9| |	|
}nX|	 }tt|
D ]}|	|t	|
| ddf ||d|
| ddf< qC| ||
t
| t|	d|dd |	d\}}t|	|ddddddf ddd}| ||\}}d}|j d |j d  }|dkr||kr|ddd|j d ddf }| jdu rd\}}| |||
\}}}}n| |||
\}}}}| ||}| j| | j|  }| |||
\	}}}}}}}}} |dkr*|dkr*|| || || || | | || || |  | f\}!}"}#}$}%}&}'n	d	\}!}"}#}$}%}&}'t| |dur@| nd|durJ| nd|!|"|#|$|%|&|'d

}(t||(|f|j\}}(})||(|)fS )a  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, samples)
            speech_lengths: (Batch,) default None for chunk interator,
                                     because the chunk-iterator does not
                                     have the speech_lengths returned.
                                     see in
                                     espnet2/iterators/chunk_iter_factory.py
            spk_labels: (Batch, )
            kwargs: "utt_id" is among the input.
        r   bottleneck_featsNbottleneck_feats_lengths      NN)r   r   r   r   r   r   r   )
lossloss_attloss_pitsad_mrsad_frmifacfaccder)shapegetencoder   r   clonerangelentorchrandpermr   zerossizebmmpermuter   pit_lossattractor_lossr    r!   calc_diarization_errordictdetachr   device)*r&   r)   r*   r+   r,   kwargs
batch_sizer.   r/   encoder_outencoder_out_lenspredencoder_out_shuffledir   att_problength_diff_tolerancelength_diffr6   r5   r4   perm_idx	perm_list
label_permcorrect
num_framesspeech_scoredspeech_missspeech_falarmspeaker_scoredspeaker_missspeaker_falarmspeaker_errorr7   r8   r9   r:   r;   r<   r=   statsweightr   r   r   forwardL   s   $

, 


zESPnetDiarizationModel.forwardc                 K   s   |  ||\}}||dS )N)featsfeats_lengths)_extract_feats)r&   r)   r*   r+   r,   rP   ri   rj   r   r   r   collect_feats   s   
z$ESPnetDiarizationModel.collect_featsr.   r/   c           
      C   sF  t dk | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}|du r8| ||\}}}	n0| jdu rG| ||\}}}	n!tj|	dd|j
d d	dd}| t||fd|\}}}	W d   n1 srw   Y  |d|dksJ | |df|d| ksJ | | f||fS )zFrontend + Encoder

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch,)
            bottleneck_feats: (Batch, Length, ...): used for enh + diar
        FNr1   r0   )rG   r   )r   rk   r   trainingr   r   r   Finterpolate	transposer>   rD   catrG   max)
r&   r)   r*   r.   r/   ri   rj   rR   rS   _r   r   r   r@      s>   



"zESPnetDiarizationModel.encodec                 C   s   |j d }|d ur|nt| |j d  }| dks"J |j |d d d | f }| jd ur?| ||\}}||fS ||}}||fS )Nr   r1   )r>   rD   onesintdimrr   r   )r&   r)   r*   rQ   ri   rj   r   r   r   rk   
  s   


z%ESPnetDiarizationModel._extract_featsc                 C   sd   t jjdd}| ||d|d}|||}|| }t jt j|dddd}t j|dd}|S )Nnone	reductionr1   r0   rv   )rD   nnBCEWithLogitsLosscreate_length_maskrG   summean	unsqueeze)r&   rT   labellengthbce_lossmaskr4   r   r   r   pit_loss_single_permute$  s   
z.ESPnetDiarizationModel.pit_loss_single_permutec              	   C   s   | d}dd tt|D }g }|D ]}|d d d d |f }| |||}	||	 qtj|dd}
tj|
dd\}}t|t|	  }
t
|}g }t|D ]}|||d d |||  f j   qTtt|	 }|
|||fS )Nr0   c                 S   s   g | ]}t |qS r   )nparray).0pr   r   r   
<listcomp>0  s    z3ESPnetDiarizationModel.pit_loss.<locals>.<listcomp>r1   rz   )rG   r   rB   r   appendrD   rq   minr~   floatrC   datacpunumpy
from_numpyr   r   )r&   rT   r   lengths
num_outputpermute_list	loss_listr   r\   	loss_permr4   min_lossmin_idxrQ   
label_listrV   label_permuter   r   r   rJ   -  s    
,zESPnetDiarizationModel.pit_lossc                 C   sL   t |}t|||}t|D ]}d||d || d d f< qt| |}|S )Nr1   )rC   rD   rF   rB   r   )r&   r   max_lenr   rQ   r   rV   r   r   r   r}   @  s   
z)ESPnetDiarizationModel.create_length_maskc                 C   sv   t |}tjjdd}t| t||dd d}d|d d d |dd d f< |||}ttj|dd}|S )Nrw   rx   r0   r1   rz   )rC   rD   r{   r|   r   rF   rG   r   )r&   rW   r   rQ   r   	att_labelr4   r   r   r   rK   H  s    
z%ESPnetDiarizationModel.attractor_lossc              	   C   s  |  \}}}t|||f}t|D ]}d||d || d d f< q|j  t}| j  dkt}	|| }|	| }	|j  }tj	|dd}
tj	|	dd}t
t	|
dk}t
t	t|
dk|dk}t
t	t|
dk|dk}t
t	|
}t
t	t|
| d}t
t	t||
 d}tj	t|dk|	dkdd}t
t	t|
|| }t
dt	||	k|  | }t	|}|||||||||f	S )Nr1   r   r0   )axisr   )rG   r   rF   rB   r   r   r   astyperu   r~   r   logical_andmaximumminimum)rT   r   r   rQ   r   r   r   rV   label_nppred_npn_refn_sysr_   r`   ra   rb   rc   rd   n_mapre   r]   r^   r   r   r   rL   S  s>   
z-ESPnetDiarizationModel.calc_diarization_error)r   r   )NNNr3   )__name__
__module____qualname____doc__r   r
   r   r   rD   r{   Moduler	   r   r   r   r#   Tensorr   r   strrh   rl   r@   rk   r   rJ   r}   rK   staticmethodrL   __classcell__r   r   r'   r   r   !   s    	
$
|

;
	r   )T)*
contextlibr   	itertoolsr   typingr   r   r   r   r   rD   torch.nn.functionalr{   
functionalrn   packaging.versionr   V	typeguardr   espnet2.asr.encoder.abs_encoderr	   !espnet2.asr.frontend.abs_frontendr
   espnet2.asr.specaug.abs_specaugr   $espnet2.diar.attractor.abs_attractorr    espnet2.diar.decoder.abs_decoderr   espnet2.layers.abs_normalizer    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   &espnet.nets.pytorch_backend.nets_utilsr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s,   