o
    iR                     @   sR  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z. ee	j/edkrd dl0m1Z1 nedddZ1G dd de&Z2dS )    N)contextmanager)DictListOptionalTupleUnion)parse)check_argument_types)CTC)
AbsDecoder)
AbsEncoder)AbsFrontend)AbsPostEncoder)AbsPreEncoder)
AbsSpecAug)ErrorCalculatorTransducer)get_transducer_task_io)AbsNormalize)force_gatherable)AbsESPnetModel)ErrorCalculator)th_accuracy)add_sos_eos)LabelSmoothingLossz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   L/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/espnet_model.pyr       s   
r   c                +       sJ  e Zd ZdZ										dAd	ed
eeedf ee f de	e
 de	e de	e de	e dede	e dedede	ejj dedededededededededef* fdd Zd!ejd"ejd#ejd$ejd%eejeeejf ejf f
d&d'Zd!ejd"ejd#ejd$ejd%eeejf f
d(d)Zd!ejd"ejd%eejejf fd*d+Zd!ejd"ejd%eejejf fd,d-Zd.ejd/ejd0ejd1ejd%ejf
d2d3Z	4dBd.ejd/ejd0ejd1ejd5ef
d6d7Zd.ejd/ejd0ejd1ejfd8d9Z d.ejd/ejd0ejd1ejfd:d;Z!d.ejd/ejd<ejfd=d>Z"d!ejd"ejd#ejd$ejfd?d@Z#  Z$S )CESPnetASRModelz*CTC-attention hybrid Encoder-Decoder model      ?        FT<space><blank>
vocab_size
token_list.frontendspecaug	normalize
preencoderencoderpostencoderdecoderctcjoint_network
ctc_weightinterctc_weight	ignore_id
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankextract_feats_in_collect_statsc              	      s  t  sJ d|  krdksJ | J |d|  kr$dk s)J | J |t   d| _|d | _|d | _|| _|| _|| _|| _	|
 | _|| _|| _|| _|| _|| _|| _t| jdshd| j_| jjrxtj|| j | j_|d u| _d | _| jrddlm} |	| _|| _|| jdd| _|s|rt |	||||||d	| _!n3d | _!| jdkrt"|||||| _n!|dkrd | _n|	| _t#||||d
| _$|s|rt"|||||| _|dkrd | _%n|
| _%|| _&d S )Nr          ?r      interctc_use_conditioningF)RNNTLoss)blankfastemit_lambda)r4   r5   )sizepadding_idx	smoothingnormalize_length)'r	   super__init__blank_idsoseosr$   r1   r/   r0   copyr%   r&   r'   r(   r)   r+   r*   hasattrr;   torchnnLinearoutput_sizeconditioning_layeruse_transducer_decodererror_calculatorwarprnnt_pytorchr<   r,   r.   criterion_transducerr   error_calculator_transr   r   criterion_attr-   r8   )selfr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r<   	__class__r   r   rD   (   s   
$$











zESPnetASRModel.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }| ||\}}d}	t|tr[|d }	|d }d\}
}}}d\}}d\}}}t }| jdkr| ||||\}}|dur|	 nd|d< ||d	< d}| j
dkr|	dur|	D ](\}}| ||||\}}|| }|dur|	 nd|d
|< ||d|< q|t|	 }d| j
 | | j
|  }| jr| |||\}}}|dur|| j|  }n|}|dur|	 nd|d< ||d< ||d< nI| jdkr| ||||\}
}}}| jdkr |
}n| jdkr)|}n| j| d| j |
  }|
dur>|
	 nd|d< ||d< ||d< ||d< |	 |d< t|||f|j\}}}|||fS )a  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
            kwargs: "utt_id" is among the input.
        r:   r   N)NNNNNN)NNNr    loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}loss_transducercer_transducerwer_transducerr9   loss_attacccerwerloss)dimshapemaxencode
isinstancetupledictr/   _calc_ctc_lossdetachr0   formatlenrO   _calc_transducer_loss_calc_att_lossr   device)rU   rX   rY   rZ   r[   kwargs
batch_sizeencoder_outencoder_out_lensintermediate_outsrc   acc_attcer_attwer_attr^   r_   r`   ra   rb   statsloss_interctc	layer_idxintermediate_outloss_iccer_icrg   weightr   r   r   forward   s   






zESPnetASRModel.forwardc                 K   s>   | j r| ||\}}ntd| j   ||}}||dS )NzkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)r8   _extract_featsloggingwarning)rU   rX   rY   rZ   r[   rv   r   r   r   r   r   collect_feats  s   

zESPnetASRModel.collect_featsc           	      C   sd  t d- | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}W d   n1 s4w   Y  | jdurF| ||\}}| jjrW| j||| jd\}}}n	| ||\}}}d}t	|t
ro|d }|d }| jdur|| ||\}}|d|dksJ | |df|d| ksJ | | f|dur||f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FN)r-   r:   r   )r   r   r'   trainingr(   r)   r*   r;   r-   rl   rm   r+   r?   rj   )	rU   rX   rY   r   r   rx   ry   _rz   r   r   r   rk   .  sD   
	



zESPnetASRModel.encodec                 C   sb   |  dksJ |j|d d d | f }| jd ur(| ||\}}||fS ||}}||fS )Nr:   )rh   ri   rj   r&   )rU   rX   rY   r   r   r   r   r   r   i  s   

zESPnetASRModel._extract_featsrx   ry   ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	|d}
|d}tjjj	|
d||
d| jdd}|
|
d}|jdd}|d|
ksLJ |S )a,  Compute negative log likelihood(nll) from transformer-decoder

        Normally, this function is called in batchify_nll.

        Args:
            encoder_out: (Batch, Length, Dim)
            encoder_out_lens: (Batch,)
            ys_pad: (Batch, Length)
            ys_pad_lens: (Batch,)
        r:   r      r!   none)ignore_index	reductionrh   )r   rF   rG   r1   r,   r?   rJ   rK   
functionalcross_entropyviewsum)rU   rx   ry   r   r   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   rw   decoder_num_classnllr   r   r   r   |  s"   


zESPnetASRModel.nlld   rw   c                 C   s   | d}||kr| ||||}nIg }d}	 t|| |}	|||	ddddf }
|||	 }|||	ddf }|||	 }| |
|||}|| |	}||krUnqt|}| d|ksdJ |S )a3  Compute negative log likelihood(nll) from transformer-decoder

        To avoid OOM, this fuction seperate the input into batches.
        Then call nll for each batch and combine and return results.
        Args:
            encoder_out: (Batch, Length, Dim)
            encoder_out_lens: (Batch,)
            ys_pad: (Batch, Length)
            ys_pad_lens: (Batch,)
            batch_size: int, samples each batch contain when computing nll,
                        you may change this to avoid OOM or increase
                        GPU memory usage
        r   TN)r?   r   minappendrJ   cat)rU   rx   ry   r   r   rw   	total_numr   	start_idxend_idxbatch_encoder_outbatch_encoder_out_lensbatch_ys_padbatch_ys_pad_lens	batch_nllr   r   r   batchify_nll  s2   


zESPnetASRModel.batchify_nllc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr:   r!   )ignore_labelr]   r   )r   rF   rG   r1   r,   rT   r   r   r$   r   rP   argmaxcpu)rU   rx   ry   r   r   r   r   r   r   r   rc   r{   r|   r}   ys_hatr   r   r   rt     s    
zESPnetASRModel._calc_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r-   r   rP   r   datar   )rU   rx   ry   r   r   r^   r_   r   r   r   r   ro     s   zESPnetASRModel._calc_ctc_losslabelsc                 C   s   t ||| j| jd\}}}}| j|j | |}| |d|d}	| |	|||}
d\}}| j	sB| j
durB| 
||\}}|
||fS )a  Compute Transducer loss.

        Args:
            encoder_out: Encoder output sequences. (B, T, D_enc)
            encoder_out_lens: Encoder output sequences lengths. (B,)
            labels: Label ID sequences. (B, L)

        Return:
            loss_transducer: Transducer loss value.
            cer_transducer: Character error rate for Transducer.
            wer_transducer: Word Error Rate for Transducer.

        )r1   rE   r   r:   r]   N)r   r1   rE   r,   
set_deviceru   r.   	unsqueezerR   r   rS   )rU   rx   ry   r   
decoder_intargett_lenu_lenr   	joint_outr`   ra   rb   r   r   r   rs     s.   

z$ESPnetASRModel._calc_transducer_lossc           	      C   s   | j d u rd S | dksJ |j|jd |jd   kr-|jd   kr-|jd ks:n J |j|j|j|jf|d d d | f }| ||\}}t|trW|d }| j j}d| j _|  ||||}|| j _|S )Nr:   r   F)r-   rh   ri   rj   rk   rl   rm   reduce)	rU   rX   rY   rZ   r[   rx   ry   	do_reducer^   r   r   r   _calc_batch_ctc_loss2  s*   


z#ESPnetASRModel._calc_batch_ctc_loss)
r   r    r!   r    FTTr"   r#   T)r   )%__name__
__module____qualname____doc__intr   r   strr   r   r   r   r   r   r   r   r   r
   rJ   rK   ModulefloatboolrD   Tensorr   r   r   rk   r   r   r   rt   ro   rs   r   __classcell__r   r   rV   r   r   %   s*   	

r
 

;

,
/
 

0r   )T)3r   
contextlibr   typingr   r   r   r   r   rJ   packaging.versionr   V	typeguardr	   espnet2.asr.ctcr
   espnet2.asr.decoder.abs_decoderr   espnet2.asr.encoder.abs_encoderr   !espnet2.asr.frontend.abs_frontendr   'espnet2.asr.postencoder.abs_postencoderr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.asr.specaug.abs_specaugr   'espnet2.asr.transducer.error_calculatorr   espnet2.asr_transducer.utilsr   espnet2.layers.abs_normalizer    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   espnet.nets.e2e_asr_commonr   &espnet.nets.pytorch_backend.nets_utilsr   3espnet.nets.pytorch_backend.transformer.add_sos_eosr   <espnet.nets.pytorch_backend.transformer.label_smoothing_lossr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s6    