o
    iK                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	Z
d dlZd dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" eej#edkrqd dl$m%Z% nedddZ%G dd de"Z&dS )    N)contextmanager)DictListTupleUnion)parse)linear_sum_assignment)check_argument_types)ESPnetASRModel)ESPnetDiarizationModel)ESPnetEnhancementModel)ESPnetSTModel)force_gatherable)AbsESPnetModelz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   T/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/enh/espnet_enh_s2t_model.pyr      s   
r   c                       s\  e Zd ZdZ		d(dedeeeef de	de
f fdd	Z	
d)dejdejdeejeeejf ejf fddZdejdejdeeejf fddZdejdejdeejejf fddZdejdejdedeejejf fddZdejdejdejdejdejf
ddZejZdd Zd)dd Zd!ejfd"d#Zg g fd$ee d%ee fd&d'Z  ZS )*ESPnetEnhS2TModelz+Joint model Enhancement and Speech to Text.Tr   	enh_model	s2t_modelcalc_enh_lossbypass_enh_probc                    s   t  sJ t   || _|| _|| _|| _t| jtr d| _	n| jj	| _	| jj
d urH| jj
dkrJt| jtrL| jrAtd d S td d S d S d S d S )NF   z5The permutation issue will be handled by the Enh lossz5The permutation issue will be handled by the CTC loss)r	   super__init__r   r   r   r   
isinstancer   extract_feats_in_collect_statsnum_spkr
   loggingwarning)selfr   r   r   r   	__class__r   r   r       s&   


zESPnetEnhS2TModel.__init__Nspeechspeech_lengthsreturnc              	      s  dv r%d } ddg}|d durtdd |D nO|jd nIfdd	t| jjD }fd
d	t| jjD }|d durOtdd |D n	tdd |D t| jdd tj	 fdd	|D dd}|d durt
dd |D sJ dd |D |dur|d dur|jd |jd   kr|jd   kr|d jd ksn J |j|j|j|d jfn|jd |jd ksJ |j|jfdv rd }d }|dur| dksJ |j|d jd |jd   kr|jd ksn J |d j|j|jfnd}d}|jd }	|dur!|nt|	 |jd  }| dkr9|jd n| jj}
| jjdurS|
| jjksSJ |
| jjfd}| jrdv s`J fdd	t|
D }tj	|dd}|dd| f }|jdd} dd}d\}}|durt| jts|d drd}d}n|d drd}d}nd}d}| jsd}| jr|r|st | jkrd}d}d}|s| j||d |
i\}}}}|s| j||||||\}}}}|d }|durt||}n|g}|d dur|dddf }|dur,|ddd| f }t| jtrp|du rG| |||d|\}}}n"| tj|dd|t|tj|dddtj|dd\}}}| |d!< nOt| jt r| |d |||d ||\}}}| |d"< n0t| jtr| j|! |||d | d#| d$d%\}}}| |d&< n
t"t#| j d'|dur|| }n|}|dur| nd|d(< |dur| nd|d)< t$|||	f|j%\}}}|||fS )*a  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, ) default None for chunk interator,
                                      because the chunk-iterator does not
                                      have the speech_lengths returned.
                                      see in
                                      espnet2/iterators/chunk_iter_factory.py
            For Enh+ASR task:
                text_spk1: (Batch, Length)
                text_spk2: (Batch, Length)
                ...
                text_spk1_lengths: (Batch,)
                text_spk2_lengths: (Batch,)
                ...
            For other tasks:
                text: (Batch, Length) default None just to keep the argument order
                text_lengths: (Batch,)
                    default None for the same reason as speech_lengths
        texttext_lengthsNr   c                 s       | ]}|  V  qd S r   max.0ref_lengthsr   r   r   	<genexpr>`       
z,ESPnetEnhS2TModel.forward.<locals>.<genexpr>r   c                       g | ]} d  |d  qS )z
text_spk{}r   formatr.   spkkwargsr   r   
<listcomp>f   s    z-ESPnetEnhS2TModel.forward.<locals>.<listcomp>c                    s"   g | ]}  d |d dqS )ztext_spk{}_lengthsr   N)getr4   r5   r7   r   r   r9   j   s    c                 s   r*   r   r+   r-   r   r   r   r0   q   r1   c                 s   s    | ]}|j d  V  qdS r   Nshape)r.   r(   r   r   r   r0   u   s    	ignore_idc                    s*   g | ]}t j|d |jd  f dqS )r   r   )value)Fpadr=   )r.   ref)r>   text_length_maxr   r   r9   y   s       dimc                 s   s    | ]	}|  d kV  qdS r;   rF   r-   r   r   r   r0      s    c                 s   s    | ]}|j V  qd S r   r<   r-   r   r   r   r0      s    
src_textsrc_text_lengths   speech_ref1c                    r2   )zspeech_ref{}r   r3   r5   r7   r   r   r9      s    .utt_id)FFCLEANTREALFr   loss_asrloss_stbottleneck_featsbottleneck_feats_lengths)r%   r&   
spk_labelsspk_labels_lengthsrQ   rR   	loss_diarz is not supported yet.lossloss_enh)&r:   r,   r=   ranger   r   getattrr   torchstackallrG   onesintr   unbindr   r   endswithtrainingrandomr   forward_enhanceforward_lossr   sort_by_permr
   asr_pit_losscatrepeatlendetachr   cloneNotImplementedErrortyper   device)r"   r%   r&   r8   r(   text_ref_lengthstext_refrH   rI   
batch_sizer   
speech_refrL   bypass_enh_flagskip_enhloss_flagrW   perm
speech_prefeature_mixfeature_preothers_loss_s2tstatsweightrV   r   )r>   r8   rD   r   forwardA   sX  






$


	

 









zESPnetEnhS2TModel.forwardc           	      K   s   d|v r|d }| dd }n
|d }| dd }| jr3| jj||||fi |}|d |d }}ntd| j  ||}}||dS )	Nr(   r)   	text_spk1text_spk1_lengthsfeatsfeats_lengthszkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )r   r   )r:   r   r   collect_featsr    r!   )	r"   r%   r&   r8   r(   r)   retr   r   r   r   r   r   >  s,   

zESPnetEnhS2TModel.collect_featsc           
         s^    j |\}}}}t|}| j jksJ | j jft fdd|D  \}}	||	fS )zFrontend + Encoder. Note that this method is used by asr_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        c                    s   g | ]	} j |qS r   )r   encode)r.   spr"   r&   r   r   r9   q  s    z,ESPnetEnhS2TModel.encode.<locals>.<listcomp>)r   rc   ri   r   zip)
r"   r%   r&   rv   rw   rx   ry   r   encoder_outencoder_out_lensr   r   r   r   ^  s   zESPnetEnhS2TModel.encoder   c           	      C   sH   | j ||d|i\}}}}| j|||d|d\}}|||fS )zFrontend + Encoder. Note that this method is used by diar_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            num_spk: int
        r   rQ   rR   )r   rc   r   r   r:   )	r"   r%   r&   r   rv   rz   ry   r   r   r   r   r   encode_diarv  s   
zESPnetEnhS2TModel.encode_diarr   r   ys_padys_pad_lensc                 C   s   | j ||||S )a,  Compute negative log likelihood(nll) from transformer-decoder

        Normally, this function is called in batchify_nll.

        Args:
            encoder_out: (Batch, Length, Dim)
            encoder_out_lens: (Batch,)
            ys_pad: (Batch, Length)
            ys_pad_lens: (Batch,)
        )r   nll)r"   r   r   r   r   r   r   r   r     s   zESPnetEnhS2TModel.nllc              	      s    j jd u r
tdt % tj fddt jjD dd} 	|\}}W d    n1 s6w   Y  t
|  tjddttjddtjdd\}}	}
||	|
fS )Nz-CTC must be used to determine the permutationc              	      s:   g | ] t j fd dtjjD ddqS )c              	      s*   g | ]}j   | | qS r   )r   _calc_batch_ctc_lossr.   r)hr"   r%   r&   r(   r)   r   r   r9     s    z=ESPnetEnhS2TModel.asr_pit_loss.<locals>.<listcomp>.<listcomp>r   rF   )rZ   r[   rX   r   r   r.   r"   r%   r&   r(   r)   r   r   r9     s    
	z2ESPnetEnhS2TModel.asr_pit_loss.<locals>.<listcomp>rE   rF   r   )r   ctc
ValueErrorrZ   no_gradr[   rX   r   r   permutation_invariant_trainingr   re   rg   rh   ri   )r"   r%   r&   r(   r)   loss0perm_detailmin_lossrV   r|   r}   r   r   r   rf     s&   



zESPnetEnhS2TModel.asr_pit_lossc                    sH   t tj fddtD dd}| |\}}| |fS )a]  The basic permutation loss function.

        Args:
            ref (List[torch.Tensor]): [(batch, ...), ...] x n_spk
            inf (List[torch.Tensor]): [(batch, ...), ...]
            criterion (function): Loss function
            perm: (batch)
        Returns:
            loss: torch.Tensor: (batch)
            perm: list[(num_spk)]
        c                    s2   g | ] t j fd dtD ddqS )c                    s   g | ]} |  qS r   r   r   )	criterionr   infrC   r   r   r9     s    zBESPnetEnhS2TModel._permutation_loss.<locals>.<listcomp>.<listcomp>r   rF   )rZ   r[   rX   r   r   r   r   rC   r   r   r9     s    $z7ESPnetEnhS2TModel._permutation_loss.<locals>.<listcomp>rE   rF   )ri   rZ   r[   rX   r   mean)r"   rC   r   r   ru   lossesr   r   r   r   r   _permutation_loss  s   z#ESPnetEnhS2TModel._permutation_lossr   c           
      C   s   g g }}|j  }t|D ]Y\}}zt|\}}W n2 tyL }	 z&t|	dkrGtddg}|t	
||||f  || W Y d}	~	q d}	~	ww |t	
||||f  |t	j|t	j|jd q|t	|fS )zCompute  PIT loss.

        Args:
            losses (torch.Tensor): (batch, nref, nhyp)
        Returns:
            perm: list: (batch, n_spk)
            loss: torch.Tensor: (batch)
        zcost matrix is infeasibler   r   N)dtypern   )datacpu	enumerater   r   strnparrayappendrZ   r   	as_tensorlongrn   r[   )
r"   r   hyp_permmin_perm_loss
losses_cpubb_lossrow_indcol_inderrr   r   r   r     s&   
	


z0ESPnetEnhS2TModel.permutation_invariant_traininginherite_enh_attrsinherite_s2t_attrsc                 C   sj   t  sJ t|dkr|D ]}t| |t| j|d  qt|dkr1|D ]}t| |t| j|d  q#d S d S )Nr   )r	   ri   setattrrY   r   r   )r"   r   r   attrr   r   r   inherite_attributes  s   
z%ESPnetEnhS2TModel.inherite_attributes)Tr   r   )__name__
__module____qualname____doc__r   r   r
   r   r   boolfloatr   rZ   Tensorr   r   r   r~   r   r   r^   r   r   batchify_nllrf   r   r   r   r   __classcell__r   r   r#   r   r      s    $
 ~
 



""r   )T)'r    rb   
contextlibr   typingr   r   r   r   numpyr   rZ   torch.nn.functionalnn
functionalrA   packaging.versionr   Vscipy.optimizer   	typeguardr	   espnet2.asr.espnet_modelr
   espnet2.diar.espnet_modelr   espnet2.enh.espnet_modelr   espnet2.st.espnet_modelr    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s*    