o
    iZ&                     @   s
  d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z
d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d d	lmZ d d
lmZ d dlmZ eej edkrgnedddZ!dd Z"dd Z#dd Z$G dd dej%Z&dS )    )contextmanager)LooseVersion)DictListTupleOptionalN)WavFrontendMel23)EENDOLATransformerEncoder)EncoderDecoderAttractor)standard_losscal_power_lossfast_batch_pit_n_speaker_loss)create_powerlabel)generate_mapping_dict)force_gatherablez1.6.0Tc                 c   s    d V  d S N )enabledr   r   X/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/eend/e2e_diar_eend_ola.pyautocast   s   
r   c                 C   sD   | j \}}||k r tj| t|| |tj| jgdd} | S )Nr   dim)shapetorchcatzerostofloat32device)attmax_n_speakersCDr   r   r   pad_attractor   s   
$r#   c                 C   sL   t | D ]\}}|jd |k r#tj|d||jd  ddfddd| |< q| S )N   r   constantg        )modevalue)	enumerater   Fpad)tsout_sizeitr   r   r   
pad_labels(   s
   (r/   c                 C   st   g }t | D ]1\}}|jd |k r2|tj|t|jd ||jd  tj|jgdd q|| q|S )Nr$   r   r   )	r(   r   appendr   r   r   r   r   r   )ysr,   	ys_paddedr-   yr   r   r   pad_results/   s   r4   c                       s   e Zd ZdZ				d&dee deded	ed
ede	f fddZ
dd Zdd Zdeej deej dejdeejeeejf ejf fddZ			d'dejdedede	fddZdd  Zd!d" Zd#ejdeeejf fd$d%Z  ZS )(DiarEENDOLAModelzEEND-OLA diarization model            ?Nfrontendencoderencoder_decoder_attractorn_unitsmax_n_speakerattractor_loss_weightc           	         sr   t    || _|| _|| _|| _|| _|d u r!t| jd}|| _t	j
| j|ddd| _t	||d d | _d S )N)max_speaker_numr$   T)batch_firstoov)super__init__r9   encr;   r>   r=   r   mapping_dictnnLSTMpostnetLinearoutput_layer)	selfr9   r:   r;   r<   r=   r>   rE   kwargs	__class__r   r   rC   F   s   
zDiarEENDOLAModel.__init__c                    s   t jjj ddd  j} fdd|D }tj jjj|dddd}|  |}tj|	|d |d dddd	}d
d t
||D }|S )NTr@   padding_valuec                    s   g | ]}t | jqS r   )r   onesr   r   ).0ilenxsr   r   
<listcomp>a   s    z4DiarEENDOLAModel.forward_encoder.<locals>.<listcomp>r   r$   r   c                 S   s    g | ]\}}|d  d| qS r   Nr   )rS   erT   r   r   r   rW   g        )rF   utilsrnnpad_sequencer   r   	unsqueezerD   splitviewzip)rK   rV   ilens	pad_shapexs_maskembr   rU   r   forward_encoder^   s   
"z DiarEENDOLAModel.forward_encoderc                    s   t  t j }tjjj|ddd}tjjj	| 
 t jddd}|\}\}}tjjj|dd|dd } fdd	t|D }fd
d	|D }|S )NTrO   rP   F)r@   enforce_sorted)r@   rQ   total_lengthr   c                    s,   g | ]\}}|d  |  tj  qS r   )r   r   intitem)rS   r-   output)rc   r   r   rW   t   s   , z5DiarEENDOLAModel.forward_post_net.<locals>.<listcomp>c                       g | ]}  |qS r   )rJ   )rS   rl   rK   r   r   rW   u       )r   maxr   rj   rk   rF   r\   r]   r^   pack_padded_sequencecpuint64rH   pad_packed_sequencer(   )rK   logitsrc   maxlenoutputs_r   )rc   rK   r   forward_post_netj   s   z!DiarEENDOLAModel.forward_post_netspeechspeaker_labelsordersreturnc                    s  t |t |ksJ t |t |ftdd |D tj}tdd |D tj}t |}|| dd t |D |\}}dd t |D }	t|	|}
t	|	|
}t
   fdd|
D }W d    n1 suw   Y  fdd|D }dd t |D }||}t||}|| j|  }t }| |d	< | |d
< | |d< ||d< t| |d< t|||f|j\}}}|||fS )Nc                 S      g | ]}t |qS r   lenrS   sphr   r   r   rW          z,DiarEENDOLAModel.forward.<locals>.<listcomp>c                 S      g | ]}|j d  qS )rO   r   )rS   spkr   r   r   rW      ro   c                 S   s   g | ]\}}|| qS r   r   rS   rZ   orderr   r   r   rW          c              	   S   $   g | ]\}}t ||d dqS r$   r   r   matmulpermuterS   rZ   r   r   r   r   rW      s    c                    s4   g | ]}t |  jjj d  jddqS )r   T)non_blocking)r   rr   numpyrE   r=   r   r   )rS   labelencoder_outrK   r   r   rW      s    
c                    s   g | ]}t | jqS r   )r#   r=   rS   r   rn   r   r   rW      r   c              	   S   r   r   r   )rS   rZ   pad_attr   r   r   rW      s    pse_losspit_lossattractor_loss
batch_sizeloss)r   r   tensorr   rs   rg   r;   rb   r   r   no_gradry   r   r>   dictdetachcloner   r   )rK   rz   r{   r|   speech_lengthsspeaker_labels_lengthsr   r   
attractorsspeaker_logitspit_speaker_labelsr   power_tspad_attractorspse_speaker_logitsr   r   statsweightr   r   r   forwardx   sF   $




zDiarEENDOLAModel.forwardT      ?
n_speakersshuffle	thresholdc                    sz  t dd D t j} |}|r;dd |D }|D ]}	tj|	 q j	fddt
||D \}
}n j	|\}
}g }t
||
|D ]?\}}}|rc|dkrc|d |f }|| qK|d urt ||k d }|jrw|d nd }|d |f }|| qKtd qKdd |D } fdd|D }
d	d t
||
D } ||} fd
dt
||D }|||
|fS )Nc                 S   r~   r   r   r   r   r   r   rW      r   z8DiarEENDOLAModel.estimate_sequential.<locals>.<listcomp>c                 S   s   g | ]
}t |jd  qS r   )nparanger   )rS   rZ   r   r   r   rW      s    c                    s2   g | ]\}}|t |t j d  j qS r   )r   
from_numpyr   longr   r   )rz   r   r   rW      s     r   z(n_speakers or threshold has to be given.c                 S   r   r   r   r   r   r   r   rW      ro   c                    s6   g | ]}|j d   jkrt| jn|d j qS rY   )r   r=   r#   r   rn   r   r   rW      s    c              	   S   r   r   r   r   r   r   r   rW      s   $ c                    s   g | ]
\}}  ||qS r   )recover_y_from_powerlabel)rS   logitraw_n_speakerrn   r   r   rW      s    
)r   r   r   rs   rg   r   randomr   r;   estimaterb   r0   nonzerosizeNotImplementedErrorry   )rK   rz   r   r   r   rL   r   rf   r|   r   r   probsattractors_activepr   rZ   silencen_spkraw_n_speakersr1   ru   r   )rK   rz   r   estimate_sequential   sB   




z$DiarEENDOLAModel.estimate_sequentialc                    s   t jt j|dddd}t | jd kd }|D ]}|dkr)||d  ||< qd||< q fdd|D } fdd|D }t tjd	d |D dd
|j	t j
}|d d d |f }|S )NrO   r   rA   r   r$   c                    rm   r   )inv_mapping_funcrS   r-   rn   r   r   rW      ro   z>DiarEENDOLAModel.recover_y_from_powerlabel.<locals>.<listcomp>c                    s.   g | ]}t |d d  jddd qS )   NrO   )binzfillr=   )rS   numrn   r   r   rW      s   . c                 S   s    g | ]}t d d |D qS )c                 S   r~   r   )rj   r   r   r   r   rW      r   zIDiarEENDOLAModel.recover_y_from_powerlabel.<locals>.<listcomp>.<listcomp>)r   array)rS   decr   r   r   rW      r[   )axis)r   argmaxsoftmaxwhererE   r   r   stackr   r   r   )rK   r   	n_speakerpred	oov_indexr-   	decisionsr   rn   r   r      s    
z*DiarEENDOLAModel.recover_y_from_powerlabelc                 C   s>   t |ts	t|}|| jd  v r| jd | }|S d}|S )N	label2decrO   )
isinstancerj   rE   keys)rK   r   r   r   r   r   r      s   
z!DiarEENDOLAModel.inv_mapping_funcbatchc                 K   s   d S r   r   )rK   r   r   r   r   collect_feats  s   zDiarEENDOLAModel.collect_feats)r6   r7   r8   N)NTr   )__name__
__module____qualname____doc__r   r   r	   r
   rj   floatrC   rg   ry   r   r   Tensorr   r   strr   boolr   r   r   r   __classcell__r   r   rM   r   r5   C   sZ    
@
4&
r5   )T)'
contextlibr   distutils.versionr   typingr   r   r   r   r   r   r   torch.nnrF   torch.nn.functional
functionalr)   funasr.frontends.wav_frontendr   funasr.models.eend.encoderr	   ,funasr.models.eend.encoder_decoder_attractorr
   funasr.models.eend.utils.lossesr   r   r   funasr.models.eend.utils.powerr   r   funasr.train_utils.device_funcsr   __version__r   r#   r/   r4   Moduler5   r   r   r   r   <module>   s,    	