o
    if<                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z! ee
j"edkrpddl#m$Z$ nedddZ$G dd de!Z%dS )zESPnet2 ASR Transducer model.    N)contextmanager)DictListOptionalTupleUnion)parse)check_argument_types)AbsFrontend)
AbsSpecAug)
AbsDecoder)Encoder)JointNetwork)get_transducer_task_io)AbsNormalize)force_gatherable)AbsESPnetModelz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   b/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr_transducer/espnet_transducer_model.pyr      s   
r   c                +       s  e Zd ZdZ												d:d	ed
eeedf ee f de	e
 de	e de	e dedededededededededededededededdf* fd d!Zd"ejd#ejd$ejd%ejdeejeeejf ejf f
d&d'Zd"ejd#ejd$ejd%ejdeeejf f
d(d)Zd"ejd#ejdeejejf fd*d+Zd"ejd#ejdeejejf fd,d-Zd.ejd/ejd0ejd1ejd2ejdeeje	e e	e f fd3d4Zd.ejd0ejd1ejd2ejdejf
d5d6Zd7ejd0ejdejfd8d9Z  ZS );ESPnetASRTransducerModelai  ESPnet2ASRTransducerModel module definition.

    Args:
        vocab_size: Size of complete vocabulary (w/ EOS and blank included).
        token_list: List of token
        frontend: Frontend module.
        specaug: SpecAugment module.
        normalize: Normalization module.
        encoder: Encoder module.
        decoder: Decoder module.
        joint_network: Joint Network module.
        transducer_weight: Weight of the Transducer loss.
        fastemit_lambda: FastEmit lambda value.
        auxiliary_ctc_weight: Weight of auxiliary CTC loss.
        auxiliary_ctc_dropout_rate: Dropout rate for auxiliary CTC loss inputs.
        auxiliary_lm_loss_weight: Weight of auxiliary LM loss.
        auxiliary_lm_loss_smoothing: Smoothing rate for LM loss' label smoothing.
        ignore_id: Initial padding ID.
        sym_space: Space symbol.
        sym_blank: Blank Symbol
        report_cer: Whether to report Character Error Rate during validation.
        report_wer: Whether to report Word Error Rate during validation.
        extract_feats_in_collect_stats: Whether to use extract_feats stats collection.

          ?        <space><blank>FT
vocab_size
token_list.frontendspecaug	normalizeencoderdecoderjoint_networktransducer_weightfastemit_lambdaauxiliary_ctc_weightauxiliary_ctc_dropout_rateauxiliary_lm_loss_weightauxiliary_lm_loss_smoothing	ignore_id	sym_space	sym_blank
report_cer
report_werextract_feats_in_collect_statsreturnNc                    s   t    t s
J || _|| _| | _|| _|| _|| _	|| _
|| _|| _|| _|| _d| _d| _|dk| _|dk| _| jrLtj|j|| _|| _| jr[tj|j|| _|| _|	| _|
| _|| _|| _|| _|| _ || _!dS )z-Construct an ESPnetASRTransducerModel object.Nr   )"super__init__r	   r   r+   copyr   r,   r-   r   r    r!   r"   r#   r$   criterion_transducererror_calculatoruse_auxiliary_ctcuse_auxiliary_lm_losstorchnnLinearoutput_sizectc_linctc_dropout_ratelm_linlm_loss_smoothingr%   r&   r'   r)   r.   r/   r0   )selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   	__class__r   r   r3   9   s<   





z!ESPnetASRTransducerModel.__init__speechspeech_lengthstexttext_lengthsc                 K   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }| ||\}}t||| jd\}	}
}}| j|j | |	}| 	|
d|
d}| |||
||\}}}d\}}| jr| ||
||}| jr| ||
}| j| | j|  | j|  }t| | |dkr| nd|dkr| nd||d}t|||f|j\}}}|||fS )	a  Forward architecture and compute loss(es).

        Args:
            speech: Speech sequences. (B, S)
            speech_lengths: Speech sequences lengths. (B,)
            text: Label ID sequences. (B, L)
            text_lengths: Label ID sequences lengths. (B,)
            kwargs: Contains "utts_id".

        Return:
            loss: Main loss value.
            stats: Task statistics.
            weight: Task weights.

           r   N)r+      )r   r   r   )lossloss_transduceraux_ctc_lossaux_lm_losscer_transducerwer_transducer)dimshapemaxencoder   r+   r#   
set_devicedevicer$   	unsqueeze_calc_transducer_lossr7   _calc_ctc_lossr8   _calc_lm_lossr%   r'   r)   dictdetachr   )rA   rD   rE   rF   rG   kwargs
batch_sizeencoder_outencoder_out_lens
decoder_intargett_lenu_lendecoder_out	joint_out
loss_trans	cer_trans	wer_transloss_ctcloss_lmrJ   statsweightr   r   r   forward~   sp   





z ESPnetASRTransducerModel.forwardc                 K   s>   | j r| ||\}}ntd| j   ||}}||dS )a  Collect features sequences and features lengths sequences.

        Args:
            speech: Speech sequences. (B, S)
            speech_lengths: Speech sequences lengths. (B,)
            text: Label ID sequences. (B, L)
            text_lengths: Label ID sequences lengths. (B,)
            kwargs: Contains "utts_id".

        Return:
            {}: "feats": Features sequences. (B, T, D_feats),
                "feats_lengths": Features sequences lengths. (B,)

        zkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)r0   _extract_featsloggingwarning)rA   rD   rE   rF   rG   r\   rn   ro   r   r   r   collect_feats   s   

z&ESPnetASRTransducerModel.collect_featsc                 C   s   t d- | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}W d   n1 s4w   Y  | ||\}}|d|dksVJ | |df|d| ksiJ | | f||fS )a  Encoder speech sequences.

        Args:
            speech: Speech sequences. (B, S)
            speech_lengths: Speech sequences lengths. (B,)

        Return:
            encoder_out: Encoder outputs. (B, T, D_enc)
            encoder_out_lens: Encoder outputs lengths. (B,)

        FNr   rH   )r   rp   r    trainingr!   r"   sizerR   )rA   rD   rE   rn   ro   r^   r_   r   r   r   rS     s$   

zESPnetASRTransducerModel.encodec                 C   sb   |  dksJ |j|ddd| f }| jdur(| ||\}}||fS ||}}||fS )a?  Extract features sequences and features sequences lengths.

        Args:
            speech: Speech sequences. (B, S)
            speech_lengths: Speech sequences lengths. (B,)

        Return:
            feats: Features sequences. (B, T, D_feats)
            feats_lengths: Features sequences lengths. (B,)

        rH   N)rP   rQ   rR   r   )rA   rD   rE   rn   ro   r   r   r   rp   +  s   

z'ESPnetASRTransducerModel._extract_featsr^   re   ra   rb   rc   c              	   C   s   | j du r(zddlm} |d| jd| _ W n ty'   td td Y nw |  ||||}| jsd| j	s9| j
rd| jdu rWddlm} || j| j| j| j| j| j	| j
d	| _| ||\}	}
||	|
fS |ddfS )
a(  Compute Transducer loss.

        Args:
            encoder_out: Encoder output sequences. (B, T, D_enc)
            joint_out: Joint Network output sequences (B, T, U, D_joint)
            target: Target label ID sequences. (B, L)
            t_len: Encoder output sequences lengths. (B,)
            u_len: Target label ID sequences lengths. (B,)

        Return:
            loss_transducer: Transducer loss value.
            cer_transducer: Character error rate for Transducer.
            wer_transducer: Word Error Rate for Transducer.

        Nr   )RNNTLossmean)	reductionr&   zJwarp-rnnt was not installed.Please consult the installation documentation.rH   )ErrorCalculator)r.   r/   )r5   warprnnt_pytorchrv   r&   ImportErrorrq   errorexitrt   r.   r/   r6   'espnet2.asr_transducer.error_calculatorry   r#   r$   r   r,   r-   )rA   r^   re   ra   rb   rc   rv   rK   ry   rN   rO   r   r   r   rW   E  sD   




z.ESPnetASRTransducerModel._calc_transducer_lossc           	   	   C   s   |  tjjj|| jd}tj|dddd}|dk}||  }tj	j
jdd tjjj||||ddd	}W d
   n1 sBw   Y  ||d }|S )aM  Compute CTC loss.

        Args:
            encoder_out: Encoder output sequences. (B, T, D_enc)
            target: Target label ID sequences. (B, L)
            t_len: Encoder output sequences lengths. (B,)
            u_len: Target label ID sequences lengths. (B,)

        Return:
            loss_ctc: CTC loss value.

        )pr   rH   r   rP   T)deterministicsum)zero_infinityrx   N)r=   r9   r:   
functionaldropoutr>   log_softmax	transposecpubackendscudnnflagsctc_lossru   )	rA   r^   ra   rb   rc   ctc_intarget_mask
ctc_targetri   r   r   r   rX     s$   	z'ESPnetASRTransducerModel._calc_ctc_lossrd   c                 C   s   |  |ddddddf d| j}|dtj}t . | }|| j	| jd   |dk}|
|d}|d|dd| j	  W d   n1 sSw   Y  tjjjtj|dd|dd}|
|dd |d }|S )zCompute LM loss.

        Args:
            decoder_out: Decoder output sequences. (B, U, D_dec)
            target: Target label ID sequences. (B, L)

        Return:
            loss_lm: LM loss value.

        Nr   rH   r   r   none)rx   )r?   viewr   typer9   int64no_gradclonefill_r@   masked_fillscatter_rV   r:   r   kl_divr   r   ru   )rA   rd   ra   
lm_loss_in	lm_target	true_distignorerj   r   r   r   rY     s$   *

z&ESPnetASRTransducerModel._calc_lm_loss)r   r   r   r   r   r   r   r   r   FFT)__name__
__module____qualname____doc__intr   r   strr   r   r
   r   r   r   r   r   floatboolr3   r9   Tensorr   rm   rs   rS   rp   rW   rX   rY   __classcell__r   r   rB   r   r      s    $	
E
_
$
*

A
(r   )T)&r   rq   
contextlibr   typingr   r   r   r   r   r9   packaging.versionr   V	typeguardr	   !espnet2.asr.frontend.abs_frontendr
   espnet2.asr.specaug.abs_specaugr   *espnet2.asr_transducer.decoder.abs_decoderr   &espnet2.asr_transducer.encoder.encoderr   $espnet2.asr_transducer.joint_networkr   espnet2.asr_transducer.utilsr   espnet2.layers.abs_normalizer    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s*    