"""RNN sequence-to-sequence speech recognition model (pytorch)."""

# NOTE: this text was recovered from a compiled (.pyc) dump of the module;
# docstrings, imports, and string constants below are verbatim, while the
# method bodies are reconstructed from them and may differ from the released
# ESPnet file in minor details.

import argparse
import logging
import math
import os
from itertools import groupby

import chainer
import numpy as np
import six
import torch
from chainer import reporter

from espnet.nets.asr_interface import ASRInterface
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.ctc import ctc_for
from espnet.nets.pytorch_backend.frontends.feature_transform import feature_transform_for
from espnet.nets.pytorch_backend.frontends.frontend import frontend_for
from espnet.nets.pytorch_backend.initialization import lecun_normal_init_parameters
from espnet.nets.pytorch_backend.initialization import set_forget_bias_to_one
from espnet.nets.pytorch_backend.nets_utils import get_subsample, pad_list, to_device, to_torch_tensor
from espnet.nets.pytorch_backend.rnn.argument import (
    add_arguments_rnn_attention_common,
    add_arguments_rnn_decoder_common,
    add_arguments_rnn_encoder_common,
)
from espnet.nets.pytorch_backend.rnn.attentions import att_for
from espnet.nets.pytorch_backend.rnn.decoders import decoder_for
from espnet.nets.pytorch_backend.rnn.encoders import encoder_for
from espnet.nets.scorers.ctc import CTCPrefixScorer
from espnet.utils.fill_missing_args import fill_missing_args

CTC_LOSS_THRESHOLD = 10000


class Reporter(chainer.Chain):
    """A chainer reporter wrapper."""

    def report(self, loss_ctc, loss_att, acc, cer_ctc, cer, wer, mtl_loss):
        """Report at every step."""
        reporter.report({"loss_ctc": loss_ctc}, self)
        reporter.report({"loss_att": loss_att}, self)
        reporter.report({"acc": acc}, self)
        reporter.report({"cer_ctc": cer_ctc}, self)
        reporter.report({"cer": cer}, self)
        reporter.report({"wer": wer}, self)
        logging.info("mtl loss:" + str(mtl_loss))
        reporter.report({"loss": mtl_loss}, self)


class E2E(ASRInterface, torch.nn.Module):
    """E2E module.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options

    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments."""
        E2E.encoder_add_arguments(parser)
        E2E.attention_add_arguments(parser)
        E2E.decoder_add_arguments(parser)
        return parser

    @staticmethod
    def encoder_add_arguments(parser):
        """Add arguments for the encoder."""
        group = parser.add_argument_group("E2E encoder setting")
        group = add_arguments_rnn_encoder_common(group)
        return parser

    @staticmethod
    def attention_add_arguments(parser):
        """Add arguments for the attention."""
        group = parser.add_argument_group("E2E attention setting")
        group = add_arguments_rnn_attention_common(group)
        return parser

    @staticmethod
    def decoder_add_arguments(parser):
        """Add arguments for the decoder."""
        group = parser.add_argument_group("E2E decoder setting")
        group = add_arguments_rnn_decoder_common(group)
        return parser

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        if isinstance(self.enc, torch.nn.ModuleList):
            return self.enc[0].conv_subsampling_factor * int(np.prod(self.subsample))
        return self.enc.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        super(E2E, self).__init__()
        torch.nn.Module.__init__(self)

        # fill missing arguments for compatibility
        args = fill_missing_args(args, self.add_arguments)

        self.mtlalpha = args.mtlalpha
        assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
        self.etype = args.etype
        self.verbose = args.verbose
        # keep char_list on args so the model can be rebuilt from it
        args.char_list = getattr(args, "char_list", None)
        self.char_list = args.char_list
        self.outdir = args.outdir
        self.space = args.sym_space
        self.blank = args.sym_blank
        self.reporter = Reporter()

        # the last token id serves as both <sos> and <eos>
        self.sos = odim - 1
        self.eos = odim - 1

        # subsample info
        self.subsample = get_subsample(args, mode="asr", arch="rnn")

        # label smoothing info
        if args.lsm_type and os.path.isfile(args.train_json):
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(odim, args.lsm_type, transcript=args.train_json)
        else:
            labeldist = None

        if getattr(args, "use_frontend", False):  # use getattr to keep compatibility
            self.frontend = frontend_for(args, idim)
            self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
            idim = args.n_mels
        else:
            self.frontend = None

        # encoder, CTC, attention, and decoder
        self.enc = encoder_for(args, idim, self.subsample)
        self.ctc = ctc_for(args, odim)
        self.att = att_for(args)
        self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        # weight initialization
        self.init_like_chainer()

        # options for beam search used when reporting CER/WER during validation
        if args.report_cer or args.report_wer:
            recog_args = {
                "beam_size": args.beam_size,
                "penalty": args.penalty,
                "ctc_weight": args.ctc_weight,
                "maxlenratio": args.maxlenratio,
                "minlenratio": args.minlenratio,
                "lm_weight": args.lm_weight,
                "rnnlm": args.rnnlm,
                "nbest": args.nbest,
                "space": args.sym_space,
                "blank": args.sym_blank,
            }
            self.recog_args = argparse.Namespace(**recog_args)
            self.report_cer = args.report_cer
            self.report_wer = args.report_wer
        else:
            self.report_cer = False
            self.report_wer = False
        self.rnnlm = None

        self.logzero = -10000000000.0
        self.loss = None
        self.acc = None

    def init_like_chainer(self):
        """Initialize weight like chainer.

        chainer basically uses LeCun way: W ~ Normal(0, fan_in ** -0.5), b = 0
        pytorch basically uses W, b ~ Uniform(-fan_in**-0.5, fan_in**-0.5)
        however, there are two exceptions as far as I know.
        - EmbedID.W ~ Normal(0, 1)
        - LSTM.upward.b[forget_gate_range] = 1 (but not used in NStepLSTM)
        """
        lecun_normal_init_parameters(self)
        # embedding weight ~ Normal(0, 1)
        self.dec.embed.weight.data.normal_(0, 1)
        # forget-gate bias = 1.0 for every decoder LSTM layer
        for i in six.moves.range(len(self.dec.decoder)):
            set_forget_bias_to_one(self.dec.decoder[i].bias_ih)

    def forward(self, xs_pad, ilens, ys_pad):
        """E2E forward.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: loss value
        :rtype: torch.Tensor
        """
        import editdistance

        # 0. Frontend
        if self.frontend is not None:
            hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
            hs_pad, hlens = self.feature_transform(hs_pad, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. Encoder
        hs_pad, hlens, _ = self.enc(hs_pad, hlens)

        # 2. CTC loss
        if self.mtlalpha == 0:
            self.loss_ctc = None
        else:
            self.loss_ctc = self.ctc(hs_pad, hlens, ys_pad)

        # 3. attention loss
        if self.mtlalpha == 1:
            self.loss_att, acc = None, None
        else:
            self.loss_att, acc, _ = self.dec(hs_pad, hlens, ys_pad)
        self.acc = acc

        # 4. compute cer without beam search (greedy CTC decoding)
        if self.mtlalpha == 0 or self.char_list is None:
            cer_ctc = None
        else:
            cers = []
            y_hats = self.ctc.argmax(hs_pad).data
            for i, y in enumerate(y_hats):
                y_hat = [x[0] for x in groupby(y)]
                y_true = ys_pad[i]
                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
                seq_hat_text = "".join(seq_hat).replace(self.space, " ")
                seq_hat_text = seq_hat_text.replace(self.blank, "")
                seq_true_text = "".join(seq_true).replace(self.space, " ")
                hyp_chars = seq_hat_text.replace(" ", "")
                ref_chars = seq_true_text.replace(" ", "")
                if len(ref_chars) > 0:
                    cers.append(editdistance.eval(hyp_chars, ref_chars) / len(ref_chars))
            cer_ctc = sum(cers) / len(cers) if cers else None

        # 5. compute cer/wer with attention beam search (evaluation only)
        if self.training or not (self.report_cer or self.report_wer):
            cer, wer = 0.0, 0.0
        else:
            if self.recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(hs_pad).data
            else:
                lpz = None
            word_eds, word_ref_lens, char_eds, char_ref_lens = [], [], [], []
            nbest_hyps = self.dec.recognize_beam_batch(
                hs_pad, torch.tensor(hlens), lpz, self.recog_args, self.char_list, self.rnnlm
            )
            # remove <sos> and <eos>
            y_hats = [nbest_hyp[0]["yseq"][1:-1] for nbest_hyp in nbest_hyps]
            for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]
                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
                seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, " ")
                seq_hat_text = seq_hat_text.replace(self.recog_args.blank, "")
                seq_true_text = "".join(seq_true).replace(self.recog_args.space, " ")
                hyp_words = seq_hat_text.split()
                ref_words = seq_true_text.split()
                word_eds.append(editdistance.eval(hyp_words, ref_words))
                word_ref_lens.append(len(ref_words))
                hyp_chars = seq_hat_text.replace(" ", "")
                ref_chars = seq_true_text.replace(" ", "")
                char_eds.append(editdistance.eval(hyp_chars, ref_chars))
                char_ref_lens.append(len(ref_chars))
            wer = 0.0 if not self.report_wer else float(sum(word_eds)) / sum(word_ref_lens)
            cer = 0.0 if not self.report_cer else float(sum(char_eds)) / sum(char_ref_lens)

        # 6. interpolate CTC and attention losses with mtlalpha
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = self.loss_att
            loss_att_data = float(self.loss_att)
            loss_ctc_data = None
        elif alpha == 1:
            self.loss = self.loss_ctc
            loss_att_data = None
            loss_ctc_data = float(self.loss_ctc)
        else:
            self.loss = alpha * self.loss_ctc + (1 - alpha) * self.loss_att
            loss_att_data = float(self.loss_att)
            loss_ctc_data = float(self.loss_ctc)

        loss_data = float(self.loss)
        if loss_data < CTC_LOSS_THRESHOLD and not math.isnan(loss_data):
            self.reporter.report(loss_ctc_data, loss_att_data, acc, cer_ctc, cer, wer, loss_data)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        return self.loss

    def scorers(self):
        """Scorers."""
        return dict(decoder=self.dec, ctc=CTCPrefixScorer(self.ctc, self.eos))

    def encode(self, x):
        """Encode acoustic features.

        :param ndarray x: input acoustic feature (T, D)
        :return: encoder outputs
        :rtype: torch.Tensor
        """
        self.eval()
        ilens = [x.shape[0]]

        # subsample frames
        x = x[:: self.subsample[0], :]
        p = next(self.parameters())
        x = torch.as_tensor(x, device=p.device, dtype=p.dtype)
        # make a one-utterance batch to reuse the encoder interface
        x = x.contiguous().unsqueeze(0)

        # 0. Frontend
        if self.frontend is not None:
            enhanced, hlens, mask = self.frontend(x, ilens)
            hs, hlens = self.feature_transform(enhanced, hlens)
        else:
            hs, hlens = x, ilens

        # 1. Encoder
        hs, _, _ = self.enc(hs, hlens)
        return hs.squeeze(0)

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """E2E beam search.

        :param ndarray x: input acoustic feature (T, D)
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        hs = self.encode(x).unsqueeze(0)
        # calculate log P(z_t|X) for CTC scores
        if recog_args.ctc_weight > 0.0:
            lpz = self.ctc.log_softmax(hs)[0]
        else:
            lpz = None

        # decode the single utterance with the attention decoder
        y = self.dec.recognize_beam(hs[0], lpz, recog_args, char_list, rnnlm)
        return y

    def recognize_batch(self, xs, recog_args, char_list, rnnlm=None):
        """E2E batch beam search.

        :param list xs: list of input acoustic feature arrays [(T_1, D), (T_2, D), ...]
        :param Namespace recog_args: argument Namespace containing options
        :param list char_list: list of characters
        :param torch.nn.Module rnnlm: language model module
        :return: N-best decoding results
        :rtype: list
        """
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frames
        xs = [xx[:: self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)

        # 0. Frontend
        if self.frontend is not None:
            enhanced, hlens, mask = self.frontend(xs_pad, ilens)
            hs_pad, hlens = self.feature_transform(enhanced, hlens)
        else:
            hs_pad, hlens = xs_pad, ilens

        # 1. Encoder
        hs_pad, hlens, _ = self.enc(hs_pad, hlens)

        # calculate log P(z_t|X) for CTC scores
        if recog_args.ctc_weight > 0.0:
            lpz = self.ctc.log_softmax(hs_pad)
            normalize_score = False
        else:
            lpz = None
            normalize_score = True

        # 2. Decoder
        hlens = torch.tensor(list(map(int, hlens)))  # make sure hlens is a tensor
        y = self.dec.recognize_beam_batch(
            hs_pad, hlens, lpz, recog_args, char_list, rnnlm, normalize_score=normalize_score
        )

        if prev:
            self.train()
        return y

    def enhance(self, xs):
        """Forward only in the frontend stage.

        :param ndarray xs: input acoustic feature (T, C, F)
        :return: enhanced feature
        :rtype: torch.Tensor
        """
        if self.frontend is None:
            raise RuntimeError("Frontend doesn't exist")
        prev = self.training
        self.eval()
        ilens = np.fromiter((xx.shape[0] for xx in xs), dtype=np.int64)

        # subsample frames
        xs = [xx[:: self.subsample[0], :] for xx in xs]
        xs = [to_device(self, to_torch_tensor(xx).float()) for xx in xs]
        xs_pad = pad_list(xs, 0.0)
        enhanced, hlens, mask = self.frontend(xs_pad, ilens)
        if prev:
            self.train()
        return enhanced.cpu().numpy(), mask.cpu().numpy(), ilens

    def calculate_all_attentions(self, xs_pad, ilens, ys_pad):
        """E2E attention calculation.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax, idim)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: attention weights with the following shape,
            1) multi-head case => attention weights (B, H, Lmax, Tmax),
            2) other case => attention weights (B, Lmax, Tmax).
        :rtype: float ndarray
        """
        self.eval()
        with torch.no_grad():
            # 0. Frontend
            if self.frontend is not None:
                hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
                hs_pad, hlens = self.feature_transform(hs_pad, hlens)
            else:
                hs_pad, hlens = xs_pad, ilens

            # 1. Encoder
            hpad, hlens, _ = self.enc(hs_pad, hlens)

            # 2. Decoder attention weights
            att_ws = self.dec.calculate_all_attentions(hpad, hlens, ys_pad)
        self.train()
        return att_ws

    def calculate_all_ctc_probs(self, xs_pad, ilens, ys_pad):
        """E2E CTC probability calculation.

        :param torch.Tensor xs_pad: batch of padded input sequences (B, Tmax)
        :param torch.Tensor ilens: batch of lengths of input sequences (B)
        :param torch.Tensor ys_pad: batch of padded token id sequence tensor (B, Lmax)
        :return: CTC probability (B, Tmax, vocab)
        :rtype: float ndarray
        """
        probs = None
        if self.mtlalpha == 0:
            return probs

        self.eval()
        with torch.no_grad():
            # 0. Frontend
            if self.frontend is not None:
                hs_pad, hlens, mask = self.frontend(to_torch_tensor(xs_pad), ilens)
                hs_pad, hlens = self.feature_transform(hs_pad, hlens)
            else:
                hs_pad, hlens = xs_pad, ilens

            # 1. Encoder
            hpad, hlens, _ = self.enc(hs_pad, hlens)

            # 2. CTC posterior probabilities
            probs = self.ctc.softmax(hpad).cpu().numpy()
        self.train()
        return probs

    def subsample_frames(self, x):
        """Subsample speech frames in the encoder."""
        x = x[:: self.subsample[0], :]
        ilen = [x.shape[0]]
        h = to_device(self, torch.from_numpy(np.array(x, dtype=np.float32)))
        h.contiguous()
        return h, ilen
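
# The block below is a minimal standalone sketch, not part of the original
# ESPnet module: it only illustrates the multitask-loss interpolation that
# E2E.forward performs, loss = mtlalpha * loss_ctc + (1 - mtlalpha) * loss_att,
# using dummy scalar loss values instead of the real CTC/decoder outputs.
if __name__ == "__main__":
    loss_ctc = torch.tensor(52.3)  # dummy CTC loss value (illustrative only)
    loss_att = torch.tensor(38.1)  # dummy attention loss value (illustrative only)
    for alpha in (0.0, 0.3, 1.0):
        if alpha == 0:
            loss = loss_att  # pure attention training
        elif alpha == 1:
            loss = loss_ctc  # pure CTC training
        else:
            loss = alpha * loss_ctc + (1 - alpha) * loss_att
        print("mtlalpha=%.1f -> mtl loss %.2f" % (alpha, float(loss)))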