o
    iB                     @   sF  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, ee	j-edkrd dl.m/Z/ nedddZ/G dd deZ0dS )    N)contextmanager)DictListOptionalTupleUnion)parse)check_argument_types)CTC)
AbsDecoder)
AbsEncoder)ESPnetASRModel)AbsFrontend)AbsPostEncoder)AbsPreEncoder)
AbsSpecAug)ErrorCalculatorTransducer)AbsNormalize)AbsPostDecoder)force_gatherable)AbsESPnetModel)ErrorCalculator)LabelSmoothingLossz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   L/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/slu/espnet_model.pyr      s   
r   c                5   @   s  e Zd ZdZ																d6d
edeeedf ee f de	e
 de	e de	e de	e dede	e dedede	ejj de	e de	e deeedf ee f dededededededed ed!ed"ed#ed$ef4d%d&Z		d7d'ejd(ejd)ejd*ejd+ejd,ejd-eejeeejf ejf fd.d/Z		d7d'ejd(ejd)ejd*ejd+ejd,ejd-eeejf fd0d1Z		d7d'ejd(ejd2ejd3ejd-eejejf f
d4d5ZdS )8ESPnetSLUModelz*CTC-attention hybrid Encoder-Decoder modelN      ?        FT<space><blank>
vocab_size
token_list.frontendspecaug	normalize
preencoderencoderpostencoderdecoderctcjoint_networkpostdecoderdeliberationencodertranscript_token_list
ctc_weightinterctc_weight	ignore_id
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankextract_feats_in_collect_statstwo_passpre_postencoder_normc              	   C   s8  t  sJ d|  krdksJ | J |d|  kr$dk s)J | J |t|  d| _|d | _|d | _|| _|| _|| _|| _	|
 | _|d urU|
 | _|| _|| _|| _|| _|| _|| _|| _|| _|| _| jd ur| jj| jjkrtj| jj| jj| _|| _t| jdsd| j_| jjrtj|| j | j_ |d u| _!d | _"| j!rddl#m$} |	| _%|| _&|| jdd| _'|s|rt(|	||||||d	| _)n5d | _)| jdkrt*|||||| _"n#|dkrd | _%n|	| _%t+||||d
| _,|s|rt*|||||| _"|dkrd | _-n|
| _-|| _.d S )Nr         ?r      interctc_use_conditioningF)RNNTLoss)blankfastemit_lambda)r6   r7   )sizepadding_idx	smoothingnormalize_length)/r	   r   __init__blank_idsoseosr#   r3   r1   r2   copyr$   r0   r;   r<   r%   r&   r'   r(   r*   r.   r)   _output_sizeoutput_size_dimtorchnnLinearuniform_linearr/   hasattrr?   output_sizeconditioning_layeruse_transducer_decodererror_calculatorwarprnnt_pytorchr@   r+   r-   criterion_transducerr   error_calculator_transr   r   criterion_attr,   r:   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r@   r   r   r   rG   '   s   
$$














zESPnetSLUModel.__init__speechspeech_lengthstexttext_lengths
transcripttranscript_lengthsreturnc                 K   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }| ||||\}	}
d}t|	tr]|	d }|	d }	d\}}}}d\}}d\}}}t }| jdkr| |	|
||\}}|dur|	 nd|d< ||d	< d}| j
dkr|dur|D ](\}}| ||
||\}}|| }|dur|	 nd|d
|< ||d|< q|t| }d| j
 | | j
|  }| jr| |	|
|\}}}|dur|| j|  }n|}|dur|	 nd|d< ||d< ||d< nI| jdkr| |	|
||\}}}}| jdkr"|}n| jdkr+|}n| j| d| j |  }|dur@|	 nd|d< ||d< ||d< ||d< |	 |d< t|||f|j\}}}|||fS )a  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
            kwargs: "utt_id" is among the input.
        r>   r   N)NNNNNN)NNNr   loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}loss_transducercer_transducerwer_transducerr=   loss_attacccerwerloss)dimshapemaxencode
isinstancetupledictr1   _calc_ctc_lossdetachr2   formatlenrU   _calc_transducer_loss_calc_att_lossr   device)r[   r\   r]   r^   r_   r`   ra   kwargs
batch_sizeencoder_outencoder_out_lensintermediate_outsri   acc_attcer_attwer_attrd   re   rf   rg   rh   statsloss_interctc	layer_idxintermediate_outloss_iccer_icrm   weightr   r   r   forward   s   






zESPnetSLUModel.forwardc           
      K   s>   | j r| ||\}}	ntd| j   ||}}	||	dS )NzkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)r:   _extract_featsloggingwarning)
r[   r\   r]   r^   r_   r`   ra   r|   r   r   r   r   r   collect_feats.  s   


zESPnetSLUModel.collect_featstranscript_padtranscript_pad_lensc              
      s   t d-  ||\}} jdur jr ||\}} jdur* ||\}}W d   n1 s4w   Y   jdurF ||\}} jjrW j|| jd\}}}	n	 ||\}}}	d}
t	|t
ro|d }
|d } jdur| ||\}} jdurG jj jjkr |} fdd|D } j|d\}}}}} t|j|jd	t|j|jd	t|j|jd	t|j|jd	}t|j|jd	}|dddt|f }|| }t|}t|jd ||jd
 fj|jd	}tt|D ]/}t||d|| f ||d|| f t|||  |jd
 fj|jd	fd||< q jdurC ||\}}|}|}|d|dks]J | |df|d| ksqJ | | f|
dur|||
f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FN)r,   r>   r   c                    s$   g | ]}d   fdd|D qS ) c                    s"   g | ]}|d kr j t| qS )r    )r0   int).0kr[   r   r   
<listcomp>|  s   " z4ESPnetSLUModel.encode.<locals>.<listcomp>.<listcomp>)join)r   k1r   r   r   r   {  s    z)ESPnetSLUModel.encode.<locals>.<listcomp>   )r{      )r   r   r&   trainingr'   r(   r)   r?   r,   rr   rs   r*   r.   rL   rM   rQ   convert_examples_to_featuresrN   
LongTensortor{   rp   zerosro   rangerx   catr/   rC   )r[   r\   r]   r   r   r   r   r~   r   _r   transcript_listtranscript_input_id_featurestranscript_input_mask_featurestranscript_segment_ids_featuretranscript_position_ids_featureinput_id_lengthbert_encoder_outbert_encoder_lensfinal_encoder_out_lensmax_lensencoder_new_outr   r   r   r   rq   D  s   















zESPnetSLUModel.encode)NNNr   r   r    r   FTTr!   r"   TFFrc   )__name__
__module____qualname____doc__r   r   r   strr   r   r   r   r   r   r   r   r   r
   rN   rO   Moduler   floatboolrG   Tensorr   r   r   rq   r   r   r   r   r   $   s    	


 	
 
	
r   )T)1r   
contextlibr   typingr   r   r   r   r   rN   packaging.versionr   V	typeguardr	   espnet2.asr.ctcr
   espnet2.asr.decoder.abs_decoderr   espnet2.asr.encoder.abs_encoderr   espnet2.asr.espnet_modelr   !espnet2.asr.frontend.abs_frontendr   'espnet2.asr.postencoder.abs_postencoderr   %espnet2.asr.preencoder.abs_preencoderr   espnet2.asr.specaug.abs_specaugr   'espnet2.asr.transducer.error_calculatorr   espnet2.layers.abs_normalizer   'espnet2.slu.postdecoder.abs_postdecoderr    espnet2.torch_utils.device_funcsr   espnet2.train.abs_espnet_modelr   espnet.nets.e2e_asr_commonr   <espnet.nets.pytorch_backend.transformer.label_smoothing_lossr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s4    