o
    i]                     @   s,  d Z ddlZddlZddlZddlZddlZddlmZ ddlZddl	Z	ddl
ZddlZddlZddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZmZmZ dd	lmZmZmZ dd
lm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( dZ)G dd dej*Z+G dd de&ej,j-Z.dS )z<RNN sequence-to-sequence speech translation model (pytorch).    N)groupby)reporter)label_smoothing_dist)CTC)lecun_normal_init_parametersset_forget_bias_to_one)get_subsamplepad_list	to_deviceto_torch_tensor)"add_arguments_rnn_attention_common add_arguments_rnn_decoder_common add_arguments_rnn_encoder_common)att_for)decoder_for)encoder_for)STInterface)fill_missing_argsi'  c                   @   s   e Zd ZdZdd ZdS )ReporterzA chainer reporter wrapper.c                 C   s   t d|i|  t d|i|  t d|i|  t d|i|  t d|i|  t d|i|  t d|i|  t d|i|  t d	|	i|  t d
|
i|  tdt|  t d|i|  dS )zReport at every step.loss_asrloss_mtloss_stacc_asracc_mtacccer_ctccerwerbleuz	mtl loss:lossN)r   reportlogginginfostr)selfr   r   r   r   r   r   r   r   r   r   mtl_loss r&   V/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/pytorch_backend/e2e_st.pyr    1   s   zReporter.reportN)__name__
__module____qualname____doc__r    r&   r&   r&   r'   r   .   s    r   c                       s   e Zd ZdZedd Zedd Zedd Zedd	 Zd
d Z	 fddZ
dd Zdd Zdd Zdd Zdd Zdd Zd%ddZd%ddZdd  Zd!d" Zd#d$ Z  ZS )&E2EzE2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    c                 C   s"   t |  t |  t |  | S )zAdd arguments.)r,   encoder_add_argumentsattention_add_argumentsdecoder_add_arguments)parserr&   r&   r'   add_argumentsW   s   


zE2E.add_argumentsc                 C      |  d}t|}| S )zAdd arguments for the encoder.zE2E encoder setting)add_argument_groupr   r0   groupr&   r&   r'   r-   _      
zE2E.encoder_add_argumentsc                 C   r2   )z Add arguments for the attention.zE2E attention setting)r3   r   r4   r&   r&   r'   r.   f   r6   zE2E.attention_add_argumentsc                 C   r2   )zAdd arguments for the decoder.zE2E decoder setting)r3   r   r4   r&   r&   r'   r/   m   r6   zE2E.decoder_add_argumentsc                 C   s   | j jtt| j S )zGet total subsampling factor.)encconv_subsampling_factorintnpprod	subsampler$   r&   r&   r'   get_total_subsampling_factort   s   z E2E.get_total_subsampling_factorc                    s  t t|   tjj|  t|| j}|j| _|j	| _	|j
| _
d| j  kr.dk s3J d J dd| j	  krAdk sFJ d J dd| j
  krTdksYJ d J d|j| _|j| _t|dd|_|j| _|j| _|j| _|j| _t | _|d | _|d | _d	| _t|d
dd| _|jrtj|jrt !d|j  t"||j|jd}nd}t|dd| _#t|dd| _$t%||| j| _&t'|| _(t)||| j| j| j(|| _*d| _+d| _,d| _-| jd	kr| j
dkrt.||j/|j0|j1dd| _+| j
dk rt'|| _,t23|}d|_4t)||| j| j| j,|| _-| j	d	krEtjj5||j6| jd| _7tjj8|j0d| _9t%||j6t:j;|j<d t:j=dd| _>| ?  | jd	krS|j@sW|jAr|jB|jC|jD|jE|jF|jG|jH|jI|j|jdd}tJjKdi || _L|j@| _@|jA| _And| _@d| _A|jMr|jB|jCd	|jE|jF|jG|jH|jI|j|jdd}tJjKdi || _N|jM| _Mnd| _Md| _Hd| _Od| _Pd| _QdS )zConstruct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
                g      ?zasr_weight should be [0.0, 1.0)zmt_weight should be [0.0, 1.0)zmtlalpha should be [0.0, 1.0]	char_listN   r   strnn)modearchzUse label smoothing with )
transcriptmultilingualFreplace_sosT)ctc_typereducelocation)padding_idx)pdtyper<   )	beam_sizepenalty
ctc_weightmaxlenratiominlenratio	lm_weightrnnlmnbestspaceblanktgt_langg    _r&   )Rsuperr,   __init__torchnnModuler   r1   
asr_weight	mt_weightmtlalphaetypeverbosegetattrr@   outdir	sym_spacerY   	sym_blankrZ   r   r   soseospadr   r<   lsm_typeospathisfile
train_jsonr!   r"   r   rG   rH   r   r7   r   attr   decctcatt_asrdec_asrr   eprojsdropout_raterI   copydeepcopyatype	Embeddingeunitsembed_mtDropout
dropout_mtr:   oneselayersint64enc_mtinit_like_chainer
report_cer
report_werrQ   rR   rS   rT   rU   rV   rW   rX   argparse	Namespace
recog_argsreport_bleu
trans_argslogzeror   r   )r$   idimodimargs	labeldistargs_asrr   r   	__class__r&   r'   r]   x   s   &&&









zE2E.__init__c                 C   sL   t |  | jjjjdd tjt	| jj
D ]}t| jj
| j qdS )am  Initialize weight like chainer.

        chainer basically uses LeCun way: W ~ Normal(0, fan_in ** -0.5), b = 0
        pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5)
        however, there are two exceptions as far as I know.
        - EmbedID.W ~ Normal(0, 1)
        - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM)
        r   rA   N)r   rs   embedweightdatanormal_sixmovesrangelendecoderr   bias_ih)r$   ir&   r&   r'   r     s
   	zE2E.init_like_chainerc                    st   j r|ddddf }|ddddf }nd} ||\}}} j||||d\ _ _} |||\ _}	 _}
}} ||\ _	} j
sO jsSd _nd} jj|t|| j j j j rn|d ndd}g }g }dd |D }t|D ]I\}}|| } fdd|D } fd	d|D }d
| jjd}| jjd
}d
| jjd}||dg7 }||dgg7 }qtj||d  _ j}| j d|  j   _d j   j!  j  j  j   j! j	   _"t# j}t# j}t# j	}t# j"}|t$k r1t%&|s1 j'(||||	| j|
|| j|  j"S t)*d|  j"S )aE  E2E forward.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: loss value
        :rtype: torch.Tensor
        Nr   rA   lang_idsr?   c                 S       g | ]}|d  d dd qS r   yseqrA   r&   .0	nbest_hypr&   r&   r'   
<listcomp>R       zE2E.forward.<locals>.<listcomp>c                    &   g | ]}t |d kr jt | qS r   r9   r@   r   idxr=   r&   r'   r   V  s   & c                    r   r   r   r   r=   r&   r'   r   W         d   zloss (=%f) is not correct)+rG   r7   rs   r   r   forward_asrloss_asr_attloss_asr_ctc
forward_mtr   trainingr   r   recognize_beam_batchr^   tensorr   r@   rW   squeezetolist	enumeratejoinreplacerY   rZ   splitnltk
bleu_scorecorpus_bleurc   r   ra   rb   r   floatCTC_LOSS_THRESHOLDmathisnanr   r    r!   warning)r$   xs_padilensys_pad
ys_pad_srctgt_lang_idshs_padhlens_r   r   r   r   r   lpz
nbest_hypslist_of_refshypsy_hatsr   y_haty_trueseq_hatseq_trueseq_hat_textseq_true_textasr_ctc_weightloss_st_dataloss_asr_dataloss_mt_data	loss_datar&   r=   r'   forward  s   







zE2E.forwardc           "         s   ddl }d\}}d}d\}}	d}
 jdkr||||
||	fS  jdk r |||\}}} js js5 jr jdkrH jjdkrH j	
|j}nd}g g g g f\}}}} j|t|| j j j}dd |D }t|D ]l\}}|| } fd	d|D } fd
d|D }d| jjd}| jjd}d| jjd}| }| }|||| |t| |dd}|dd}|||| |t| qp jsdn	tt|t| }	 jsdn	tt|t| } jdkr 	|||} jdurg }  j	|j}t|D ]_\}}!dd t|!D }|| } fdd|D } fdd|D }d| jd}| jd}d| jd}|dd}|dd}t|dkry| |||t|  q| rt| t|  nd}
||||
||	fS )a  Forward pass in the auxiliary ASR task.

        :param torch.Tensor hs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor hlens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: ASR attention loss value
        :rtype: torch.Tensor
        :return: accuracy in ASR attention decoder
        :rtype: float
        :return: ASR CTC loss value
        :rtype: torch.Tensor
        :return: character error rate from CTC prediction
        :rtype: float
        :return: character error rate from attetion decoder prediction
        :rtype: float
        :return: word error rate from attetion decoder prediction
        :rtype: float
        r   N)r?   r?   )NNrA   r?   c                 S   r   r   r&   r   r&   r&   r'   r     r   z#E2E.forward_asr.<locals>.<listcomp>c                    r   r   r   r   r=   r&   r'   r     r   c                    r   r   r   r   r=   r&   r'   r     r   r   r   c                 S   s   g | ]}|d  qS )r   r&   )r   xr&   r&   r'   r     s    c                    r   r   r   r   r=   r&   r'   r     r   c                    r   r   r   r   r=   r&   r'   r     r   )editdistancera   rc   rv   r   r   r   r   rS   rt   log_softmaxr   r   r^   r   r@   rW   r   r   r   rY   rZ   r   appendevalr   r   sumargmaxr   )"r$   r   r   r   r   loss_attloss_ctcr   r   r   r   r   r   r   r   word_edsword_ref_lenschar_edschar_ref_lensnbest_hyps_asrr   r   r   r   r   r   r   r   	hyp_words	ref_words	hyp_chars	ref_charscersyr&   r=   r'   r     s   

	




zE2E.forward_asrc                 C   s   d}d}| j dkr||fS tj|dkdd  }dd |D }t|| j}| | | 	||\}}	}
| 
||	|\}}}
||fS )aN  Forward pass in the auxiliary MT task.

        :param torch.Tensor xs_pad: batch of padded source sequences (B, Tmax, idim)
        :param torch.Tensor ys_pad: batch of padded target sequences (B, Lmax)
        :return: MT loss value
        :rtype: torch.Tensor
        :return: accuracy in MT decoder
        :rtype: float
        r?   r   r   rA   )dimc                 S   s   g | ]}||d k qS r   r&   )r   r   r&   r&   r'   r   	  s    z"E2E.forward_mt.<locals>.<listcomp>)rb   r^   r   cpunumpyr	   rl   r   r   r~   rs   )r$   r   r   r   r   r   ys_srcxs_zero_padr   r   r   r&   r&   r'   r     s   


zE2E.forward_mtc                 C   s   t | jdS )zScorers.)r   )dictrs   r=   r&   r&   r'   scorers  s   zE2E.scorersc                 C   sz   |    |jd g}|dd| jd ddf }t|  }tj||j|jd}|	 
d}| ||\}}}|dS )zEncode acoustic features.

        :param ndarray x: input acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        r   N)devicerO   )r   shaper<   next
parametersr^   	as_tensorr   rO   
contiguous	unsqueezer7   r   )r$   r   r   rM   hhsr   r&   r&   r'   encode  s   
z
E2E.encodeNc                 C   s\   t dt|jd   | |d}t dt|d  | j|d d|||}|S )aI  E2E beam search.

        :param ndarray x: input acoustic feature (T, D)
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        zinput lengths: r   zencoder output lengths: rA   N)	r!   r"   r#   r   r  r  sizers   recognize_beam)r$   r   r   r@   rW   r  r   r&   r&   r'   	translate*  s
   
zE2E.translatec                    s    j }   tjdd |D tjd} fdd|D } fdd|D }t|d} ||\}}	}
tt	t
t|	}	 j||	d|||}|rO   |S )	ao  E2E batch beam search.

        :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace trans_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        c                 s   s    | ]}|j d  V  qdS )r   N)r   r   xxr&   r&   r'   	<genexpr>I  s    z&E2E.translate_batch.<locals>.<genexpr>rN   c                    s(   g | ]}|d d  j d d d f qS )Nr   rP   r
  r=   r&   r'   r   L  s   ( z'E2E.translate_batch.<locals>.<listcomp>c                    s   g | ]}t  t| qS r&   )r
   r   r   r
  r=   r&   r'   r   M  s    r?   N)r   r   r:   fromiterr   r	   r7   r^   r   listmapr9   rs   r   train)r$   xsr   r@   rW   prevr   r   r   r   r   r   r&   r=   r'   translate_batch=  s   

zE2E.translate_batchc           
      C   s   |    t 5 | jr!|ddddf }|ddddf }nd}| ||\}}}| jj||||d}	W d   n1 s@w   Y  |   |	S )ad  E2E attention calculation.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor ys_pad_src:
            batch of padded token id sequence tensor (B, Lmax)
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        Nr   rA   r   )r   r^   no_gradrG   r7   rs   calculate_all_attentionsr  )
r$   r   r   r   r   r   hpadr   r   att_wsr&   r&   r'   r  ]  s   
zE2E.calculate_all_attentionsc           	      C   s~   d}| j dks| jdkr|S |   t  | ||\}}}| j| 	 }W d   n1 s4w   Y  | 
  |S )a  E2E CTC probability calculation.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :param torch.Tensor
            ys_pad_src: batch of padded token id sequence tensor (B, Lmax)
        :return: CTC probability (B, Tmax, vocab)
        :rtype: float ndarray
        Nr   )ra   rc   r   r^   r  r7   rt   softmaxr   r   r  )	r$   r   r   r   r   probsr  r   r   r&   r&   r'   calculate_all_ctc_probs{  s   
zE2E.calculate_all_ctc_probsc                 C   sT   |dd| j d ddf }|jd g}t| ttj|tjd}|  ||fS )z&Subsample speeh frames in the encoder.Nr   rN   )	r<   r   r
   r^   
from_numpyr:   arrayfloat32r  )r$   r   ilenr  r&   r&   r'   subsample_frames  s
   zE2E.subsample_frames)N)r(   r)   r*   r+   staticmethodr1   r-   r.   r/   r>   r]   r   r   r   r   r   r  r	  r  r  r  r  __classcell__r&   r&   r   r'   r,   N   s0    



 ju

 r,   )/r+   r   ry   r!   r   rn   	itertoolsr   chainerr   r   r:   r   r^   r   espnet.nets.e2e_asr_commonr   espnet.nets.pytorch_backend.ctcr   *espnet.nets.pytorch_backend.initializationr   r   &espnet.nets.pytorch_backend.nets_utilsr   r	   r
   r   (espnet.nets.pytorch_backend.rnn.argumentr   r   r   *espnet.nets.pytorch_backend.rnn.attentionsr   (espnet.nets.pytorch_backend.rnn.decodersr   (espnet.nets.pytorch_backend.rnn.encodersr   espnet.nets.st_interfacer   espnet.utils.fill_missing_argsr   r   Chainr   r_   r`   r,   r&   r&   r&   r'   <module>   s4    