o
    ¡¿¯iÃ  ã                   @   s¨   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ dZG dd„ deƒZdS )z<RNN sequence-to-sequence speech recognition model (chainer).é    N)Úreporter)ÚChainerASRInterface)Úctc_for)Úatt_for)Údecoder_for)Úencoder_for)Úlabel_smoothing_dist)ÚE2E)Úget_subsamplei'  c                   @   sr   e Zd ZdZedd„ ƒZdd„ Zddd„Zd	d
„ Zddd„Z	dd„ Z
eddd„ƒZeddd„ƒZeddd„ƒZdS )r	   a?  E2E module for chainer backend.

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        args (parser.args): Training config.
        flag_return (bool): If True, train() would return
            additional metrics in addition to the training
            loss.

    c                 C   s
   t  | ¡S )zAdd arguments.)ÚE2E_pytorchÚadd_arguments)Úparser© r   úW/home/ubuntu/.local/lib/python3.10/site-packages/espnet/nets/chainer_backend/e2e_asr.pyr   &   s   
zE2E.add_argumentsc                 C   s   | j jtt | j¡ƒ S )zGet total subsampling factor.)ÚencÚconv_subsampling_factorÚintÚnpÚprodÚ	subsample)Úselfr   r   r   Úget_total_subsampling_factor+   s   z E2E.get_total_subsampling_factorTc                 C   s*  t j | ¡ |j| _d| j  krdksJ dƒ‚ J dƒ‚|j| _|j| _|j| _|j| _|d | _|d | _	t
|ddd| _|jrTt d|j ¡ t||j|jd}nd	}|  ¡ ( t||| jƒ| _t||ƒ| _t|ƒ| _t||| j| j	| j|ƒ| _W d	  ƒ n1 s…w   Y  d	| _d	| _|| _d	S )
zÂConstruct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        r   é   zmtlalpha must be [0,1]ÚasrÚrnn)ÚmodeÚarchzUse label smoothing with )Ú
transcriptN)ÚchainerÚChainÚ__init__ÚmtlalphaÚetypeÚverboseÚ	char_listÚoutdirÚsosÚeosr
   r   Úlsm_typeÚloggingÚinfor   Ú
train_jsonÚ
init_scoper   r   r   Úctcr   Úattr   ÚdecÚaccÚlossÚflag_return)r   ÚidimÚodimÚargsr2   Ú	labeldistr   r   r   r    /   s2   &


ÿ

ø

zE2E.__init__c           	      C   s0  |   ||¡\}}| jdkrd}n|  ||¡}| jdkr d}d}n|  ||¡\}}|| _| j}|dkr6|| _n|dkr>|| _n|| d| |  | _| jjtk rƒt 	| jj¡sƒt
 d|i| ¡ t
 d|i| ¡ t
 d|i| ¡ t dt| jjƒ ¡ t
 d| ji| ¡ nt d	| jj¡ | jr•| j|||fS | jS )
aô  E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys (chainer.Variable): Batch of padded target features. (B, Lmax, odim)

        Returns:
            float: Loss that calculated by attention and ctc loss.
            float (optional): Ctc loss.
            float (optional): Attention loss.
            float (optional): Accuracy.

        r   Nr   Úloss_ctcÚloss_attr0   z	mtl loss:r1   zloss (=%f) is not correct)r   r!   r-   r/   r0   r1   ÚdataÚCTC_LOSS_THRESHOLDÚmathÚisnanr   Úreportr)   r*   ÚstrÚwarningr2   )	r   ÚxsÚilensÚysÚhsr7   r8   r0   Úalphar   r   r   Úforward]   s2   

zE2E.forwardNc           
   	   C   s  |dd| j d …dd…f }| jj|jd tjd}t | jj|tjd¡}t 	¡ O t 
dd¡8 |  |g|g¡\}}|jdkrK| j |¡jd }nd}| j |d ||||¡}	|	W  d  ƒ W  d  ƒ S 1 slw   Y  W d  ƒ dS 1 s|w   Y  dS )a†  E2E greedy/beam search.

        Args:
            x (chainer.Variable): Input tensor for recognition.
            recog_args (parser.args): Arguments of config file.
            char_list (List[str]): List of Characters.
            rnnlm (Module): RNNLM module defined at `espnet.lm.chainer_backend.lm`.

        Returns:
            List[Dict[str, Any]]: Result of recognition.

        Nr   )ÚdtypeÚtrainFg        )r   ÚxpÚarrayÚshaper   Úint32r   ÚVariableÚfloat32Úno_backprop_modeÚusing_configr   Ú
ctc_weightr-   Úlog_softmaxr9   r/   Úrecognize_beam)
r   ÚxÚ
recog_argsr$   ÚrnnlmÚilenÚhÚ_ÚlpzÚyr   r   r   Ú	recognize“   s   
RñzE2E.recognizec                 C   s"   |   ||¡\}}| j ||¡}|S )ay  E2E attention calculation.

        Args:
            xs (List): List of padded input sequences. [(T1, idim), (T2, idim), ...]
            ilens (np.ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float np.ndarray: Attention weights. (B, Lmax, Tmax)

        )r   r/   Úcalculate_all_attentions)r   r@   rA   rB   rC   Úatt_wsr   r   r   r\   ¶   s   zE2E.calculate_all_attentionsr   c                 C   s   ddl m} || dS )z!Get customconverter of the model.r   )ÚCustomConverter)Úsubsampling_factor)Ú(espnet.nets.chainer_backend.rnn.trainingr^   )r_   r^   r   r   r   Úcustom_converterÇ   s   
zE2E.custom_converteréÿÿÿÿr   c                 C   ó   ddl m} || ||||dS )z Get custom_updater of the model.r   )ÚCustomUpdater)Ú	converterÚdeviceÚ
accum_grad)r`   rd   )ÚitersÚ	optimizerre   rf   rg   rd   r   r   r   Úcustom_updaterÎ   s   
ÿzE2E.custom_updaterc                 C   rc   )z)Get custom_parallel_updater of the model.r   )ÚCustomParallelUpdater)re   Údevicesrg   )r`   rk   )rh   ri   re   rl   rg   rk   r   r   r   Úcustom_parallel_updater×   s   ûzE2E.custom_parallel_updater)T)N)r   )rb   r   )r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ústaticmethodr   r   r    rE   r[   r\   ra   rj   rm   r   r   r   r   r	      s    

.
6#r	   )rq   r)   r;   r   Únumpyr   r   Ú)espnet.nets.chainer_backend.asr_interfacer   Úespnet.nets.chainer_backend.ctcr   Ú*espnet.nets.chainer_backend.rnn.attentionsr   Ú(espnet.nets.chainer_backend.rnn.decodersr   Ú(espnet.nets.chainer_backend.rnn.encodersr   Úespnet.nets.e2e_asr_commonr   Ú#espnet.nets.pytorch_backend.e2e_asrr	   r   Ú&espnet.nets.pytorch_backend.nets_utilsr
   r:   r   r   r   r   Ú<module>   s    