o
    i8h                     @   sX  d dl Z d dlZd dlmZ d dlmZ d dlZd dlm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z)m*Z* d dl+m,Z, eej-edkrd dl.m/Z/ nedddZ/e0ddG dd dej1Z2dS )    N)DictTuple)contextmanager)LooseVersion)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)
Paraformer)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_maskpad_list)load_audio_text_image_videoextract_fbank)sequence_maskz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   M/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/scama/model.pyr   "   s   
r   model_classesSCAMAc                1       s  e Zd ZdZ																								
	
dDdededededededededededededededededededededed ed!ed"ef0 fd#d$Z	d%e
jd&e
jd'e
jd(e
jd)ee
jeee
jf e
jf f
d*d+Zd%e
jd&e
jd)ee
je
jf fd,d-Z	dEd%e
jd&e
jd.ed)ee
je
jf fd/d0ZdEd1d2Zd3e
jd4e
jd5e
jd6e
jfd7d8Z		dFd3e
jd4e
jd5e
jd6e
jfd9d:Zd;d< Z				dGd=efd>d?Zi fd.efd@dAZddddi fd=ed.efdBdCZ  ZS )Hr   z
    Author: Shiliang Zhang, Zhifu Gao, Haoneng Luo, Ming Lei, Jie Gao, Zhijie Yan, Lei Xie
    SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
    https://arxiv.org/abs/2006.01712
    N      ?r           P         Fspecaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf
ctc_weight	predictorpredictor_confpredictor_biaspredictor_weight
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_lossshare_embeddingc           !         s  t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|i|}| }tj|}|d||d|}|dkr^|
d u rTi }
t	d||d|
}	tj
|}|di |}|| _|d urt|n|d | _|d ur|n|d | _|| _|| _|| _|| _|| _|| _|dkrd | _n|| _t||||d| _|dkrd | _n|	| _|| _|| _|| _t|d| _|| _| jrd | j_|| _d | _ d | _!| jj"d urd	d
l#m$}  | | _%|dd| _&d S d S )Nr2   )r3   encoder_output_sizer   )odimr;   r!         ?)sizepadding_idx	smoothingnormalize_length)rA   r   ),build_scama_mask_for_cross_attention_decoderdecoder_attention_chunk_typechunkr   )'super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr   predictor_classesr5   r6   r7   r3   r4   r-   r#   r%   r'   r)   r   criterion_attr+   r.   r1   r0   r   criterion_prer:   embedr9   beam_searcherror_calculatoroverlap_chunk_cls funasr.models.scama.chunk_utilisrB   /build_scama_mask_for_cross_attention_decoder_fnrC   )!selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   kwargsspecaug_classnormalize_classencoder_classr;   decoder_classpredictor_classrB   	__class__r   r   rF   /   sx   
zSCAMA.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  | d}t| dkr|dddf }t| dkr%|dddf }|jd }| jj| j|}| j|||d\}	}
d\}}d}t	 }| j
dkrq| jjj|	|
dd\}}| ||||\}}|duri| nd|d	< ||d
< | |	|
||\}}}}}| j
dkr||| j  }n| j
| d| j
 |  || j  }|dur| nd|d< ||d< ||d< ||d< |dur|  nd|d< t| |d< | jr|| j  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        decoding_indr!   Nr   )indNNr   )
chunk_outsloss_ctccer_ctcloss_attacccerwerloss_preloss)rH   lenr>   shaper'   rS   random_choicetrainingencodedictr-   remove_chunk_calc_ctc_lossdetach_calc_att_predictor_lossr1   cputorchcloner9   r0   sumr   device)rV   r_   r`   ra   rb   rW   rd   
batch_sizere   encoder_outencoder_out_lensrh   ri   rn   statsencoder_out_ctcencoder_out_lens_ctcrj   acc_attcer_attwer_attro   weightr   r   r   forward   sR   





zSCAMA.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}t|trC|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r#   rs   r%   r'   
isinstancetuple)rV   r_   r`   rW   r   r   _r   r   r   rt      s   


zSCAMA.encodecachec                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | jj|||d d\}}}t|trH|d }|t	|
dgfS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr'   )r   r   r!   )r   r#   rs   r%   r'   forward_chunkr   r   r{   tensorr>   )rV   r_   r`   r   rW   r   r   r   r   r   r   encode_chunk
  s   



zSCAMA.encode_chunkc                 K   s"   | dd}| jj||d |dS )Nis_finalFr'   )r   )rH   r.   r   )rV   r   r   r   rW   r   r   r   r   calc_predictor_chunk*  s   zSCAMA.calc_predictor_chunkr   r   ys_padys_pad_lensc                 C   s  t || j| j| j\}}|d }t||d|j|jdd d d d d f }d }	| jj	d urN| jj	j
d |j|dd}	| jj	jd |j|dd}
||
 }| j|||| j|	|d\}}}}| j||\}}| jj	j}d}|}| jj	j}| jj	jd |j|dd}| j||d|||| jd |	|||| jd}| j||||||d\}}| ||}t|d| j|| jd	}| |||}| js| jd u rd
\}}n|jdd}| | | \}}|||||fS )Nr!   maxlendtyper~   r   r~   r   r4   mask_chunk_predictortarget_label_lengthpredictor_alignmentsencoder_sequence_length
chunk_sizeencoder_chunk_sizeattention_chunk_center_biasattention_chunk_sizeattention_chunk_typesteppredictor_mask_chunk_hoppingdecoder_att_look_back_factormask_shift_att_chunk_decodertarget_lengthis_training)
chunk_maskpre_acoustic_embedsr    )ignore_labelrf   )dim)r   r6   r7   r4   r   r>   r   r~   r'   rS   get_mask_chunk_predictorget_mask_shfit_chunkr.   gen_frame_alignmentschunk_size_pad_shift_cur decoder_att_look_back_factor_cur get_mask_shift_att_chunk_decoderrU   rC   rs   r)   rN   r	   viewr3   rO   type_asrR   argmaxrz   )rV   r   r   r   r   	ys_in_pad
ys_out_pad
ys_in_lensencoder_out_maskr   mask_shfit_chunkr   pre_token_length
pre_alphasr   r   predictor_alignments_lenr   r   r   r   r   
scama_maskdecoder_outrj   r   rn   r   r   ys_hatr   r   r   ry   /  s   



zSCAMA._calc_att_predictor_lossc                 C   s  d\}}t ||d|j|jdd d d d d f }d }| jjjd |j|dd}| jjjd |j|dd}	||	 }| j|||| j	||d\}
}}}| j
||\}}| jjj}d}|}| jjj}| jjjd |j|dd}| j||d|||| jd ||||| jd}|
||||fS )Nrf   r!   r   r   r   r   r   )r   r>   r   r~   r'   rS   r   r   r.   r4   r   r   r   r   rU   rC   rs   )rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   calc_predictor_mask  st   	
zSCAMA.calc_predictor_maskc              
   K   s   ddl m} ddlm} ddlm} i }| jd kr'|| j| jd}|j|d |	d}|j| j
|t|d d }||d	< td
|	dd |	dd|	dd|	dd|	ddd}	||	dd|	|| j| jt||| jd
krtd ndd}
|
| _d S )Nr   )BeamSearchScamaStreaming)CTCPrefixScorer)LengthBonus)r+   r7   )r+   
token_list)r)   length_bonusngramr=   decoding_ctc_weightr   	lm_weightngram_weightpenalty)r)   r+   lmr   r   	beam_sizer"   full)r   weightsscorersr6   r7   r3   r   pre_beam_score_key)funasr.models.scama.beam_searchr   %funasr.models.transformer.scorers.ctcr   .funasr.models.transformer.scorers.length_bonusr   r+   r7   updaterH   r)   rp   ru   r6   r-   rQ   )rV   rW   r   r   r   r   r+   r   r   r   rQ   r   r   r   init_beam_search  s@   








zSCAMA.init_beam_searchkeyc                    s  | di }|j|d d}|j|d d} j|||| ddd\}}	t|tr.|d }d|vr< j|}
|
|d<  j||	|| ddd}|d |d	 |d
 |d f\}}}}| 	 }t
|d	k rjg S | }}| ddr|| dd7 }td|| dd } j|d d |t|t||d}||d< |d  j }g }|D ]2}d}t|jtr|jd	| }q|jd	|  }tt fdd|}||}|}|| q|S )Nr   r~   r~   r   F)r   r   r   running_hypsr!   r"      token_num_relax   )xr   r   r   minlenr   r    c                    s   |  j ko|  jko|  jkS r   )r7   r6   r5   )r   rV   r   r   <lambda>O  s    z&SCAMA.generate_chunk.<locals>.<lambda>)rH   tor   r   r   rQ   init_hypr   roundlongr{   maxintnbestyseqlisttolistfilter
ids2tokensextend)rV   r_   r`   r   	tokenizerfrontendrW   r   r   r   r   predictor_outsr   r   alphaspre_peak_indexr   r   
nbest_hypsresultshyplast_pos	token_inttokenresult_ir   r   r   generate_chunk  sl   	


	
zSCAMA.generate_chunkc                 K   s   | dd}| dg d}| dd}| dd}d}|d	 d
 }|d d |d d  }	dt|d|fj|dt|dfj|d||dd t||d |d  |	fj|ddd	}
|
|d< d |d |d}||d< i |d< td|d< |S )Nr~   cudar   r   
   r   encoder_chunk_look_backr   decoder_chunk_look_backr!   r(   rK   frontend_confn_melslfr_mr   Fr"   )		start_idx
cif_hidden
cif_alphasr   r   
last_chunkoptfeats
tail_chunkr'   )decode_fsmnr   r  r   r)   r   prev_samples)rH   r{   zerosr   empty)rV   r   rW   r~   r   r   r   r   enc_output_size
feats_dimscache_encodercache_decoderr   r   r   
init_cache]  s:   zSCAMA.init_cachec           "      K   s  | dddko| jd k}| dddko| dd d u}	| jd u r6td | jd$i | | dd| _t|d	krE| j|fi | i }
| d
g d}t	|d d }t
 }d| ddi}t||j| dd| dd||d}|d }t
 }|| d|
d< t|dksJ dt|d |d	 f}t	t|| t	| }t	t|| dt	|  }g }t|D ]\}|o||d k|d< ||| |d |  }t|g| dd||d |d d\}}t
 }|| d|
d< |  |j |j d |
d< | j||f||||d|}|| qt|\}}|d	 |d}|g}|d	kr5|| d  ntd	|d< |rH| j|fi | | drot| d} | d d  }!d!||!d" |d	 < ||!d# |d	 < ||
fS )%Nr   r   gh㈵>r   lm_filezenable beam_searchr   r!   r   r   r   i  r   Ffsi>  	data_typesound)r  audio_fsr  r   r   z0.3f	load_datazbatch_size must be set 1r  r   )r  r   r   r   extract_feati  batch_data_time)r   r   r   r   )r   ra   
output_dir
best_recog r   ra   r   )rH   r+   rQ   logginginfor   r   rp   r  r   timeperf_counterr   r  r{   catranger   r}   itemframe_shiftlfr_nr   r   r   sentence_postprocessr  r
   join)"rV   data_indata_lengthsr   r   r   r   rW   
is_use_ctc	is_use_lm	meta_datar   chunk_stride_samplestime1cfgaudio_sample_list	_is_finaltime2audio_samplenmtokensiaudio_sample_ir_   r`   time3tokens_itext_postprocessedr   r   resultwriteribest_writerr   r   r   	inference  s   





	&zSCAMA.inference)NNNNNNNNNNr   NNr   r   r   r    r    r   r!   r"   r   FFr   rf   )NNNN)__name__
__module____qualname____doc__strru   floatr   boolrF   r{   Tensorr   r   r   rt   r   r   ry   r   r   r   r   r  rA  __classcell__r   r   r]   r   r   '   s   	
q
M
!

 
a
I6
U*)T)3r!  r{   torch.nnnntorch.functional
functionalFr  typingr   r   
contextlibr   distutils.versionr   funasr.registerr   funasr.models.ctc.ctcr   funasr.utilsr   funasr.metrics.compute_accr	   funasr.utils.datadir_writerr
   funasr.models.paraformer.modelr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   funasr.models.scama.utilsr   __version__torch.cuda.ampr   registerModuler   r   r   r   r   <module>   s8   
