o
    i/                     @   sz  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 eej2edkrd dl3m4Z4 ned ddZ4G dd deZ5G dd dej6j7Z8dS )!    N)contextmanager)groupby)DictListOptionalTupleUnion)parse)check_argument_types)CTC)
MLMDecoder)
AbsEncoder)ESPnetASRModel)AbsFrontend)AbsPostEncoder)AbsPreEncoder)
AbsSpecAug)AbsNormalize)TokenIDConverter)force_gatherable)
Hypothesis)ErrorCalculator)mask_uniform)th_accuracy)LabelSmoothingLossz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/asr/maskctc_model.pyr   "   s   
r   c                -       sv  e Zd ZdZ												
	d7dedeeedf ee f de	e
 de	e de	e de	e dede	e dedede	ejj dededededededededed ed!ef, fd"d#Zd$ejd%ejd&ejd'ejd(eejeeejf ejf f
d)d*Zd+ejd,ejd-ejd.ejfd/d0Zd+ejd,ejd-ejd.ejd(ejf
d1d2Z	3d8d+ejd,ejd-ejd.ejd4ef
d5d6Z  ZS )9MaskCTCModelz5Hybrid CTC/Masked LM Encoder-Decoder model (Mask-CTC)N      ?        FT<space><blank><mask>
vocab_size
token_list.frontendspecaug	normalize
preencoderencoderpostencoderdecoderctcjoint_network
ctc_weightinterctc_weight	ignore_id
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blanksym_maskextract_feats_in_collect_statsc                    s   t  sJ t jdi d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d| || |d7 }|| _|d | _| | _| `t	||||d| _
d | _|ss|r~t|||||| _d S d S )Nr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r<      )sizepadding_idx	smoothingnormalize_lengthr   )r
   super__init__appendr'   
mask_tokencopyr(   criterion_attr   criterion_mlmerror_calculatorr   )selfr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   	__class__r   r   rC   *   s|   
	





zMaskCTCModel.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s@  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }d\}}d\}	}
t }| ||\}}d}t|trf|d }|d }| jdkr| ||||\}	}
|	dur}|		 nd|d< |
|d< d}| j
dkr|dur|D ](\}}| ||||\}}|| }|dur|	 nd|d|< ||d	|< q|t| }d| j
 |	 | j
|  }	| jd
kr| ||||\}}| jdkr|}n| jd
kr|	}n| j|	 d| j |  }|dur|	 nd|d< ||d< |	 |d< t|||f|j\}}}|||fS )zFrontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
        r=   r   N)NNr"   loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}g      ?loss_mlmacc_mlmloss)dimshapemaxdictencode
isinstancetupler2   _calc_ctc_lossdetachr3   formatlen_calc_mlm_lossr   device)rJ   rM   rN   rO   rP   kwargs
batch_sizerT   rU   rR   rS   statsencoder_outencoder_out_lensintermediate_outsloss_interctc	layer_idxintermediate_outloss_iccer_icrV   weightr   r   r   forwards   sp   







zMaskCTCModel.forwardrg   rh   ys_padys_pad_lensc                 C   sZ   t || j| j| j\}}| ||||\}}| ||}	t|d| j|| jd}
|	|
fS )Nr#   )ignore_label)	r   rE   eosr4   r/   rH   r   viewr'   )rJ   rg   rh   rq   rr   	ys_in_pad
ys_out_paddecoder_out_rT   rU   r   r   r   rb      s   zMaskCTCModel._calc_mlm_lossc                 C      t r   NotImplementedError)rJ   rg   rh   rq   rr   r   r   r   nll   s   zMaskCTCModel.nlld   re   c                 C   rz   r   r{   )rJ   rg   rh   rq   rr   re   r   r   r   batchify_nll   s   zMaskCTCModel.batchify_nll)Nr!   r"   r#   r"   FTTr$   r%   r&   T)r~   )__name__
__module____qualname____doc__intr   r   strr   r   r   r   r   r   r   r   r   r   torchnnModulefloatboolrC   Tensorr   rp   rb   r}   r   __classcell__r   r   rK   r   r    '   s    	

I
d

r    c                       sX   e Zd ZdZdededef fddZdee fdd	Z	d
e
jdee fddZ  ZS )MaskCTCInferencez+Mask-CTC-based non-autoregressive inference	asr_modeln_iterationsthreshold_probabilityc                    s@   t    |j| _|j| _|j| _|| _|| _t|j	d| _
dS )zInitialize Mask-CTC inference)r(   N)rB   rC   r0   r/   mlmrE   r   r   r   r(   	converter)rJ   r   r   r   rK   r   r   rC   	  s   
zMaskCTCInference.__init__idsc                 C   s&   d | j|}|ddddS )N r&   ry   r$    )joinr   
ids2tokensreplace)rJ   r   rO   r   r   r   ids2text  s   zMaskCTCInference.ids2textenc_outrQ   c              	   C   s  | d}t| j|jdd\}}tdd t|d D }t|dk	d}t
d| ||   g }d}t| D ]?\}}	|d ||jd k r|	|d | kr|| |d | k rq|d |  ||< |d7 }||jd k r|	|d | ks]qEtt|}| j}
t|| |
k 	d}t|| |
k	d}t|}tjdt|tjd|j| j }|| | |d |< t
d	| |d   |dksr| j}||kr|dkr|n|}t|d D ]Q}| || dg|| dg\}}|d | jdd\}}t!||| dd }|| |d || < t|d | jk	d}t
d	| |d   q| || dg|| dg\}}|d | j"dd|d |< t
d	| |d   tj#| jg| d  | jg |jd
}t$|dS )zPerform Mask-CTC inferencer   r#   )rW   c                 S   s   g | ]}|d  qS )r   r   ).0xr   r   r   
<listcomp>!  s    z,MaskCTCInference.forward.<locals>.<listcomp>zctc:{}r=   )dtypezmsk:{})rc   )yseq)%	unsqueezer   expr0   log_softmaxrY   stackr   nonzerosqueezelogginginfor`   r   tolist	enumeraterD   rX   item
from_numpynumpyarrayr   ra   zeroslongtorc   rE   r   ranger   r>   topkargmaxtensorr   )rJ   r   	ctc_probsctc_idsy_haty_idx	probs_hatcntiyp_thresmask_idxconfident_idxmask_numy_inKnum_itertpredry   
pred_scorepred_idcandr   r   r   r   rp     sV   


$ $
zMaskCTCInference.forward)r   r   r   r   r    r   r   rC   r   r   r   r   r   rp   r   r   r   rK   r   r     s     r   )T)9r   
contextlibr   	itertoolsr   typingr   r   r   r   r   r   r   packaging.versionr	   V	typeguardr
   espnet2.asr.ctcr   espnet2.asr.decoder.mlm_decoderr   espnet2.asr.encoder.abs_encoderr   espnet2.asr.espnet_modelr   !espnet2.asr.frontend.abs_frontendr   'espnet2.asr.postencoder.abs_postencoderr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.asr.specaug.abs_specaugr   espnet2.layers.abs_normalizer   espnet2.text.token_id_converterr    espnet2.torch_utils.device_funcsr   espnet.nets.beam_searchr   espnet.nets.e2e_asr_commonr   2espnet.nets.pytorch_backend.maskctc.add_mask_tokenr   &espnet.nets.pytorch_backend.nets_utilsr   <espnet.nets.pytorch_backend.transformer.label_smoothing_lossr   __version__torch.cuda.ampr   r    r   r   r   r   r   r   r   <module>   s>     `