o
    i?                     @   sF  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z% d dl&m$Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- ee	j.edkrd dl/m0Z0 nedddZ0G dd de"Z1dS )    N)contextmanager)DictListOptionalTupleUnion)parse)check_argument_types)CTC)
AbsDecoder)
AbsEncoder)AbsFrontend)AbsPostEncoder)AbsPreEncoder)
AbsSpecAug)AbsNormalize)force_gatherable)AbsESPnetModel)ErrorCalculator)th_accuracy)add_sos_eos)LabelSmoothingLossz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   K/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/st/espnet_model.pyr      s   
r   c                5       sD  e Zd ZdZ												d?ded	eeed
f ee f de	e
 de	e de	e de	e dede	e dede	e de	e de	e de	e de	eeed
f ee f  dededededededededed ed!ed"ef4 fd#d$Zd%ejd&ejd'ejd(ejd)e	ej d*e	ej d+eejeeejf ejf fd,d-Zd%ejd&ejd'ejd(ejd)e	ej d*e	ej d+eeejf fd.d/Zd%ejd&ejd+eejejf fd0d1Zd%ejd&ejd+eejejf fd2d3Z	d@d4ejd5ejd6ejd7ejd8ef
d9d:Zd4ejd5ejd6ejd7ejfd;d<Zd4ejd5ejd6ejd7ejfd=d>Z  ZS )AESPnetSTModelz*CTC-attention hybrid Encoder-Decoder model        FT<space><blank>
vocab_size
token_list.frontendspecaug	normalize
preencoderencoderpostencoderdecoderextra_asr_decoderextra_mt_decoderctcsrc_vocab_sizesrc_token_list
asr_weight	mt_weightmtlalpha	ignore_id
lsm_weightlength_normalized_loss
report_cer
report_werreport_bleu	sym_space	sym_blankextract_feats_in_collect_statsc                    s  t  sJ d|  krdk sJ d J dd|  kr$dk s)J d J dd|  kr6dks;J d J dt   |d | _|d | _|d | _|d | _|| _|| _|| _	|| _
|| _|| _| | _|| _|| _|| _|| _|| _|| _|	| _t||||d| _t||||d| _| j
dkr|d usJ d	| jdkr|| _| jdk r|
| _n|
d urtd
| | jdkr|| _n|d urtd| |rt||||| _ nd | _ |s|r|d usJ d	t!|||||| _"nd | _"|| _#d S )Nr         ?zasr_weight should be [0.0, 1.0)zmt_weight should be [0.0, 1.0)zmtlalpha should be [0.0, 1.0]   )sizepadding_idx	smoothingnormalize_lengthr   z9Missing src_token_list, cannot add asr module to st modelzBNot using extra_asr_decoder because mtlalpha is set as {} (== 1.0)z@Not using extra_mt_decoder because mt_weight is set as {} (== 0))$r	   super__init__soseossrc_sossrc_eosr!   r-   r2   r/   r0   r1   copyr"   r#   r$   r%   r&   r(   r'   r)   r   criterion_stcriterion_asrr,   r*   loggingwarningformatr+   MTErrorCalculatormt_error_calculatorASRErrorCalculatorasr_error_calculatorr:   )selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   	__class__r   r   rB   '   s   
$$$













zESPnetSTModel.__init__speechspeech_lengthstexttext_lengthssrc_textsrc_text_lengthsreturnc                 K   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|dur`|  dksBJ |j|jd |jd   krU|jd ks`n J |j|j|jf|jd }|ddd| f }|dur|ddd| f }| ||\}	}
| j|	|
||dd\}}}| jdkr|dusJ d| jdkr| jdkr| |	|
||\}}nd\}}| jdkr| jdk r| |	|
||\}}}}nd	\}}}}| j	dkr| j|	|
||d
d\}}nd\}}| j}|}|dkr|}n|dkr|}n
|| d| |  }| j	| }d| j | j	 | | j|  | j	|  }t
| t|tur5| n|t|turA| n|| |||||||d}t|||f|j\}}}|||fS )aQ  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch,)
            text: (Batch, Length)
            text_lengths: (Batch,)
            src_text: (Batch, length)
            src_text_lengths: (Batch,)
            kwargs: "utt_id" is among the input.
        r<   r   NT)stz*missing source text for asr sub-task of ST)r   Nr;   )r   NNNFr   )lossloss_asrloss_mtloss_stacc_asracc_mtacccer_ctccerwerbleu)dimshapemaxencode_calc_mt_att_lossr/   r1   _calc_ctc_loss_calc_asr_att_lossr0   dictdetachtypefloatr   device)rQ   rT   rU   rV   rW   rX   rY   kwargs
batch_sizeencoder_outencoder_out_lensloss_st_att
acc_st_attbleu_st_attloss_asr_ctccer_asr_ctcloss_asr_attacc_asr_attcer_asr_attwer_asr_attloss_mt_att
acc_mt_attasr_ctc_weightr_   r]   r^   r\   statsweightr   r   r   forward   s   
*








zESPnetSTModel.forwardc           
      K   s>   | j r| ||\}}	ntd| j   ||}}	||	dS )NzkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)r:   _extract_featsrJ   rK   )
rQ   rT   rU   rV   rW   rX   rY   rs   r   r   r   r   r   collect_feats  s   


zESPnetSTModel.collect_featsc                 C   s  t d- | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}W d   n1 s4w   Y  | jdurF| ||\}}| ||\}}}| jdur\| ||\}}|d|dksqJ | |df|d|	 ksJ | |	 f||fS )zFrontend + Encoder. Note that this method is used by st_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FNr   r<   )
r   r   r$   trainingr%   r&   r'   r(   r=   ri   )rQ   rT   rU   r   r   ru   rv   _r   r   r   rj   .  s0   
	


zESPnetSTModel.encodec                 C   sb   |  dksJ |j|d d d | f }| jd ur(| ||\}}||fS ||}}||fS )Nr<   )rg   rh   ri   r#   )rQ   rT   rU   r   r   r   r   r   r   ]  s   

zESPnetSTModel._extract_featsru   rv   ys_padys_pad_lensr[   c                 C   s   t || j| j| j\}}|d }|r| ||||\}	}
n
| ||||\}	}
| |	|}t|	d| j	|| jd}| j
sB| jd u rEd }n|	jdd}| | | }|||fS )Nr<   r   ignore_labelrg   )r   rC   rD   r2   r)   r+   rH   r   viewr!   r   rN   argmaxcpu)rQ   ru   rv   r   r   r[   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   loss_attacc_attbleu_attys_hatr   r   r   rk   p  s(   

zESPnetSTModel._calc_mt_att_lossc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr<   r   r   )NNr   )r   rE   rF   r2   r*   rI   r   r   r-   r   rP   r   r   )rQ   ru   rv   r   r   r   r   r   r   r   r   r   cer_attwer_attr   r   r   r   rm     s$   
z ESPnetSTModel._calc_asr_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r,   r   rP   r   datar   )rQ   ru   rv   r   r   loss_ctcrc   r   r   r   r   rl     s   zESPnetSTModel._calc_ctc_loss)r   r   r   r   r   FTTTr   r    TT) __name__
__module____qualname____doc__intr   r   strr   r   r   r   r   r   r   r   r   r
   rq   boolrB   torchTensorr   r   r   rj   r   rk   rm   rl   __classcell__r   r   rR   r   r   $   s   	
w	
z	

/

&
"r   r   )2rJ   
contextlibr   typingr   r   r   r   r   r   packaging.versionr   V	typeguardr	   espnet2.asr.ctcr
   espnet2.asr.decoder.abs_decoderr   espnet2.asr.encoder.abs_encoderr   !espnet2.asr.frontend.abs_frontendr   'espnet2.asr.postencoder.abs_postencoderr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.asr.specaug.abs_specaugr   espnet2.layers.abs_normalizer    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   espnet.nets.e2e_asr_commonr   rO   espnet.nets.e2e_mt_commonrM   &espnet.nets.pytorch_backend.nets_utilsr   3espnet.nets.pytorch_backend.transformer.add_sos_eosr   <espnet.nets.pytorch_backend.transformer.label_smoothing_lossr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s4    