o
    iI                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlmZ e
ej e
dkrpd dl!m"Z" nedddZ"e#ddG dd dej$j%Z&dS )    N)contextmanager)DictOptionalTuple)LooseVersion)tables)postprocess_utils)DatadirWriter)force_gatherable)CTCPrefixScorer)LabelSmoothingLoss)LengthBonus)get_transducer_task_io)load_audio_text_image_videoextract_fbank)BeamSearchTransducerz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   R/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/transducer/model.pyr      s   
r   model_classes
Transducerc                7       s  e Zd Z																													dCd
ee dee dee dee dedee dedee dedee dedee dedededededededededed ed!ed"ed#ed$ef6 fd%d&Z	d'e
jd(e
jd)e
jd*e
jd+ee
jeee
jf e
jf f
d,d-Zd'e
jd(e
jd+ee
je
jf fd.d/Zd0e
jd1e
jd2e
jd3e
jd4e
jd+ee
jee ee f fd5d6Zd0e
jd2e
jd3e
jd4e
jd+e
jf
d7d8Zd9e
jd2e
jd+e
jfd:d;Zd<d= Z			dDd>ed?ed@efdAdBZ  ZS )Er   N      ?        P   r         Ffrontendfrontend_confspecaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confjoint_networkjoint_network_conftransducer_weightfastemit_lambdaauxiliary_ctc_weightauxiliary_ctc_dropout_rateauxiliary_lm_loss_weightauxiliary_lm_loss_smoothing
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_lossshare_embeddingc           $         s  t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|i|}| } tj|	}!|!dd|i|
}	|	j}"tj	|}#|#|| |"fi |}d | _
d | _|dk| _|dk| _| jr|tj| || _|| _| jrtj|	j|| _|| _|| _|| _|| _|| _|| _|d ur|n|d | _|d ur|n|d | _|| _|| _|| _|| _|| _ || _!|	| _"|| _#t$||||d| _%|| _&d | _'d | _(d| _)d S )Nr1   r2   r   r   )sizepadding_idx	smoothingnormalize_lengthr   r   )*super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesjoint_network_classescriterion_transducererror_calculatoruse_auxiliary_ctcuse_auxiliary_lm_losstorchnnLinearctc_linctc_dropout_ratelm_linlm_loss_smoothingr+   r,   r-   r/   r4   r5   r6   r2   r3   r   r!   r#   r%   r'   r)   r   criterion_attr8   beam_searchctc
ctc_weight)$selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   kwargsspecaug_classnormalize_classencoder_classencoder_output_sizedecoder_classdecoder_output_sizejoint_network_class	__class__r   r   r?   $   sx   
'


zTransducer.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}t| jdrE| jjdurE| jjj||dd\}}t||| j	d\}	}
}}| j
|j | 
|	|}| |d|d}| |||
||\}}}d\}}| jr| ||
||}| jr| ||
}| j| | j|  | j|  }t| | |d	kr| nd|d	kr| nd||d
}t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   overlap_chunk_cls)
chunk_outs)r3   r   )r   r   r   )lossloss_transduceraux_ctc_lossaux_lm_losscer_transducerwer_transducer)lenr:   shapeencodehasattrr%   rf   remove_chunkr   r3   r'   
set_devicedevicer)   	unsqueeze_calc_transducer_lossrI   _calc_ctc_lossrJ   _calc_lm_lossr+   r-   r/   dictdetachr
   )rV   ra   rb   rc   rd   rW   
batch_sizeencoder_outencoder_out_lens
decoder_intargett_lenu_lendecoder_out	joint_out
loss_trans	cer_trans	wer_transloss_ctcloss_lmrh   statsweightr   r   r   forward   sj   





zTransducer.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}d}t|trI|d }|d }|durS||f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   r   )r   r!   trainingr#   r%   
isinstancetuple)rV   ra   rb   rW   r|   r}   _intermediate_outsr   r   r   rp      s   


zTransducer.encoder|   r   r   r   r   c              
   C   s   | j du r#zddlm} || _ W n ty"   td td Y nw tj|dd}| j ||||d| j	| j
d	d
}| jsn| jsB| jrn| jdu r`ddlm}	 |	| j| j| j| j| j| j| jd| _| |||\}
}||
|fS |ddfS )a(  Compute Transducer loss.

        Args:
            encoder_out: Encoder output sequences. (B, T, D_enc)
            joint_out: Joint Network output sequences (B, T, U, D_joint)
            target: Target label ID sequences. (B, L)
            t_len: Encoder output sequences lengths. (B,)
            u_len: Target label ID sequences lengths. (B,)

        Return:
            loss_transducer: Transducer loss value.
            cer_transducer: Character error rate for Transducer.
            wer_transducer: Word Error Rate for Transducer.

        Nr   )	rnnt_losszJwarp-rnnt was not installed.Please consult the installation documentation.r   r   dimmeanT)	reductionblankr,   gather)ErrorCalculatorTransducer)
report_cer
report_wer)rG   	warp_rnntr   ImportErrorloggingerrorexitrK   log_softmaxr4   r,   r   r   r   rH   funasr.metricsr   r'   r)   
token_list	sym_space	sym_blank)rV   r|   r   r   r   r   RNNTLoss	log_probsri   ErrorCalculatorrl   rm   r   r   r   rv     sH   





z Transducer._calc_transducer_lossc           	   	   C   s   |  tjjj|| jd}tj|dddd}|dk}||  }tj	j
jdd tjjj||||ddd	}W d
   n1 sBw   Y  ||d }|S )aM  Compute CTC loss.

        Args:
            encoder_out: Encoder output sequences. (B, T, D_enc)
            target: Target label ID sequences. (B, L)
            t_len: Encoder output sequences lengths. (B,)
            u_len: Target label ID sequences lengths. (B,)

        Return:
            loss_ctc: CTC loss value.

        )pr   r   r   r   T)deterministicsum)zero_infinityr   N)rN   rK   rL   
functionaldropoutrO   r   	transposecpubackendscudnnflagsctc_lossr:   )	rV   r|   r   r   r   ctc_intarget_mask
ctc_targetr   r   r   r   rw   O  s    	zTransducer._calc_ctc_lossr   c                 C   s   |  |ddddddf d| j}|dtj}t . | }|| j	| jd   |dk}|
|d}|d|dd| j	  W d   n1 sSw   Y  tjjjtj|dd|dd}|
|dd |d }|S )zCompute LM loss.

        Args:
            decoder_out: Decoder output sequences. (B, U, D_dec)
            target: Target label ID sequences. (B, L)

        Return:
            loss_lm: LM loss value.

        Nr   r   r   r   none)r   )rP   viewr2   typerK   int64no_gradclonefill_rQ   masked_fillscatter_ru   rL   r   kl_divr   r   r:   )rV   r   r   
lm_loss_in	lm_target	true_distignorer   r   r   r   rx   u  s    *

 zTransducer._calc_lm_lossc                 K   sz   i }| j d krt| j | jd}|j|d |d}|jtt|d d }||d< t| j| j	|dddd	}|| _
d S )
N)rT   r6   )rT   r   )length_bonusngram	beam_sizer   r   )nbest)rT   r   r6   updaterA   r   rn   r   r'   r)   rS   )rV   rW   scorersrT   r   r   rS   r   r   r   init_beam_search  s"   





zTransducer.init_beam_searchdata_indata_lengthskeyc           !         s  | dddkrtd| dddko jd k}| dddko(| dd d u}td	  jd&i | | d
d _i }t }	t	| j
j| ddd}
t }||	 d|d< t|
| dd j
d\}}t }|| d|d< |   j
j  j
j d |d< |j|d d}|j|d d} ||\}}t|tr|d } j|d dd}|d  j }g }| \}}}t|D ]}t|D ]}\}}d }| dd urt dst| d _ j|d  d }d}t|jtr|j}n|j}tt fdd |}||}| |}t!"|\}}|| |||d!} |#|  |d urId"$||d# || < ||d$ || < ||d% || < qq||fS )'Nr{   r   z!batch decoding is not implementeddecoding_ctc_weightr   gh㈵>	lm_weightlm_filezenable beam_searchr   fsi>  )r   audio_fsz0.3f	load_data	data_typesound)r   r   extract_feati  batch_data_timert   )rt   r   T)is_final
output_dirwriter
best_recogr   c                    s   |  j ko|  jko|  jkS r   )r6   r5   r4   )xrV   r   r   <lambda>  s    z&Transducer.inference.<locals>.<lambda>)r   tokenrc   text_postprocessed r   rc   r   r   )%rA   NotImplementedErrorrT   r   infor   r   timeperf_counterr   r   r   r   r   itemframe_shiftlfr_ntorp   r   r   rS   r:   range	enumeraterq   r	   r   yseqlistfilter
ids2tokenstokens2textr   sentence_postprocessappendjoin)!rV   r   r   r   	tokenizerrW   
is_use_ctc	is_use_lm	meta_datatime1audio_sample_listtime2ra   rb   time3r|   r}   
nbest_hypsresultsbndi	nbest_idxhypibest_writerlast_pos	token_intr   rc   r   r   result_ir   r   r   	inference  s~   	







&zTransducer.inference)NNNNNNNNNNNNr   r   r   r   r   r   r   r   r   r   r   r   r   FF)NNN)__name__
__module____qualname__r   strr   floatintboolr?   rK   Tensorr   r   rp   rv   rw   rx   r   r   r	  __classcell__r   r   r_   r   r   "   s   	
!m
V
$
D
&
%$)T)'r   rK   r   
contextlibr   typingr   r   r   distutils.versionr   funasr.registerr   funasr.utilsr   funasr.utils.datadir_writerr	   funasr.train_utils.device_funcsr
   %funasr.models.transformer.scorers.ctcr   "funasr.losses.label_smoothing_lossr   .funasr.models.transformer.scorers.length_bonusr   *funasr.models.transformer.utils.nets_utilsr   funasr.utils.load_utilsr   r   /funasr.models.transducer.beam_search_transducerr   __version__torch.cuda.ampr   registerrL   Moduler   r   r   r   r   <module>   s,   
