o
    i[f                     @   s2  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% e	ej&e	dkrd dl'm(Z( nedddZ(e)ddG dd deZ*dS )    N)DictTuple)contextmanager)LooseVersion)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)
Paraformer)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_maskpad_list)load_audio_text_image_videoextract_fbankz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   \/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/paraformer_streaming/model.pyr       s   
r   model_classesParaformerStreamingc                       s0  e Zd ZdZ fddZdejdejdejdejdeejee	ejf ejf f
d	d
Z
	d(dejdejdedeejejf fddZdejdejdejdejfddZ	d(ddZdd Zd(ddZdd Z	d(ddZi fdefdd Z				d)d!efd"d#Zddddi fd!edefd$d%Zd&d' Z  ZS )*r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    c                    sj   t  j|i | |dd| _d | _t| jdr1| jjd ur3ddlm	} || _
|dd| _d S d S d S )Nsampling_ratiog?overlap_chunk_clsr   ),build_scama_mask_for_cross_attention_decoderdecoder_attention_chunk_typechunk)super__init__getr   
scama_maskhasattrencoderr    funasr.models.scama.chunk_utilisr   /build_scama_mask_for_cross_attention_decoder_fnr   )selfargskwargsr   	__class__r   r   r"   -   s   
zParaformerStreaming.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  | d}t| dkr|dddf }t| dkr%|dddf }|jd }t| jdrD| jj| j|}| j	|||d\}	}
n| 	||\}	}
d\}}d}t
 }| jdkrt| jdrm| jjj|	|
dd	\}}n|	|
}}| ||||\}}|dur| nd|d
< ||d< | |	|
||\}}}}}}| jdkr||| j  }n| j| d| j |  || j  }|dur| nd|d< |dur| nd|d< ||d< ||d< ||d< |dur|  nd|d< t| |d< | jr|| j  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        decoding_ind   Nr   r   )indNN        
chunk_outsloss_ctccer_ctcloss_attpre_loss_attacccerwerloss_preloss)r#   lensizeshaper%   r&   r   random_choicetrainingencodedict
ctc_weightremove_chunk_calc_ctc_lossdetach_calc_att_predictor_losspredictor_weightcputorchclonelength_normalized_losspredictor_biassumr   device)r)   r.   r/   r0   r1   r+   r3   
batch_sizer5   encoder_outencoder_out_lensr:   r;   rA   statsencoder_out_ctcencoder_out_lens_ctcr<   acc_attcer_attwer_attr=   rB   weightr   r   r   forwardE   s\   





zParaformerStreaming.forwardNcachec                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | jj|||d d\}}}t|trH|d }|t	|
dgfS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr&   rb   r   r4   )r   specaugrG   	normalizer&   forward_chunk
isinstancetuplerQ   tensorrD   )r)   r.   r/   rb   r+   rX   rY   _r   r   r   encode_chunk   s   



z ParaformerStreaming.encode_chunkrX   rY   ys_padys_pad_lensc                 C   s  t ||ddd d d d d f  |j}| jdkr-t|| j| j| j\}}|| j }d }| j	j
d urU| j	j
jd |j|dd}| j	j
jd |j|dd}|| }| j|||| j||d\}	}
}}| j||\}}d }| j	j
d ur| jdkr| j	j
j}d}|}| j	j
j}| j	j
jd |j|dd}| j||d|||| jd ||||| jd}n| j	j
d ur| j	j
j||d d\}}d }d }| jd	kr| jr| |||||	|\}}}n| |||||	|\}}n|	}| |||||}|d |d }}|d u r|}| ||}t|d
| j|| jd}| | |
|
}| js%| j!d u r*d\}}n|j"d
d}| !|# |# \}}||||||fS )Nr4   maxlenr   rV   rW   	ignore_idmask_chunk_predictortarget_label_lengthr    predictor_alignmentsencoder_sequence_length
chunk_sizeencoder_chunk_sizeattention_chunk_center_biasattention_chunk_sizeattention_chunk_typesteppredictor_mask_chunk_hoppingdecoder_att_look_back_factormask_shift_att_chunk_decodertarget_lengthis_trainingr8   r7   )ignore_labelr6   dim)$r   rD   torV   rT   r   soseosrr   r&   r   get_mask_chunk_predictorget_mask_shfit_chunk	predictorgen_frame_alignmentsr   chunk_size_pad_shift_cur decoder_att_look_back_factor_cur get_mask_shift_att_chunk_decoderr(   rG   rK   r   use_1st_decoder_losssampler_with_gradsamplerdecodercriterion_attr	   view
vocab_sizecriterion_pretype_aserror_calculatorargmaxrP   )r)   rX   rY   rl   rm   encoder_out_maskrj   rs   mask_shfit_chunkpre_acoustic_embedspre_token_length
pre_alphasrv   predictor_alignments_lenr$   ry   rz   r{   r   r   decoder_out_1str=   sematic_embedsdecoder_outsdecoder_outr<   r]   rA   r^   r_   ys_hatr   r   r   rN      s   $





	
	

z,ParaformerStreaming._calc_att_predictor_lossc                 C   s  t || dd d d d d f  |j}||d d d d df  }| jr.| jjj| }	n| j|}	t	
  | |||||}
|
d |
d }}|d}|| j}|d}||k|@ d}t	|}| \}}t|D ],}|| ||    | j  }|dkr|| jdt	|| d |  dd qs|d}|| d}|d|j}W d    n1 sw   Y  || d|	|d }|| || fS )Nrn   r   r4   r   )r   indexvalueF   )r   maxr   rV   share_embeddingr   output_layerr`   embedrQ   no_gradr   nerr   rU   	ones_likerD   rangefloatr   longscatter_randpermcudaeqmasked_fill	unsqueeze)r)   rX   rY   rl   rm   r   
chunk_masktgt_maskys_pad_maskedys_pad_embedr   r   rj   pred_tokensnonpad_positionsseq_lenssame_num
input_maskbszseq_lenli
target_numinput_mask_expand_dimr   r   r   r   r   0  sJ   $







zParaformerStreaming.samplerc                 C   sP  t ||ddd d d d d f  |j}d }| jjd ur?| jjjd |j|dd}| jjjd |j|dd}|| }| j|d || j	|d d\}}}}	| j
|| jjdkr]|d n|\}
}d }| jjd ur| jdkr| jjj}d}|}| jjj}| jjjd |j|dd}| j|
|d|||| jd |||d | jd}|| _||||	fS )	Nr4   rn   r   rp   rq   r7   r    ru   )r   rD   r   rV   r&   r   r   r   r   rr   r   tail_thresholdr   r   r   r   r(   rG   r$   )r)   rX   rY   r   rs   r   r   r   r   pre_peak_indexrv   r   r$   ry   rz   r{   r   r   r   r   r   calc_predictor^  sj   $

z"ParaformerStreaming.calc_predictorc                 K   s"   | dd}| jj||d |dS )Nis_finalFr&   )r   )r#   r   rf   )r)   rX   rY   rb   r+   r   r   r   r   calc_predictor_chunk  s   z(ParaformerStreaming.calc_predictor_chunkc                 C   s2   |  ||||| j}|d }tj|dd}||fS )Nr   r   r   )r   r$   rQ   log_softmax)r)   rX   rY   r   rm   r   r   r   r   r   cal_decoder_with_predictor  s   z.ParaformerStreaming.cal_decoder_with_predictorc                 C   s.   | j |||d }|}tj|dd}||fS )Nr   r   r   )r   rf   rQ   r   )r)   rX   rY   r   rm   rb   r   r   r   r   r    cal_decoder_with_predictor_chunk  s   z4ParaformerStreaming.cal_decoder_with_predictor_chunkc                 K   s   | dg d}| dd}| dd}d}|d d }|d	 d
 |d	 d  }dt|d|ft|df||dd t||d |d  |fdd	}	|	|d< d |d |d}
|
|d< i |d< td|d< |S )Nrx   r   
      encoder_chunk_look_backr   decoder_chunk_look_backr4   encoder_confoutput_sizefrontend_confn_melslfr_mFr   )		start_idx
cif_hidden
cif_alphasrx   r   
last_chunkoptfeats
tail_chunkr&   )decode_fsmnr   r   rx   r   frontendprev_samples)r#   rQ   zerosempty)r)   rb   r+   rx   r   r   rW   enc_output_size
feats_dimscache_encodercache_decoderr   r   r   
init_cache  s4   zParaformerStreaming.init_cachekeyc           "   	      s\  | di }|j|d d}|j|d d} j|||| ddd\}}	t|tr.|d } j||	|| ddd}
|
d |
d |
d	 |
d
 f\}}}}|  }t	|dk r\g S  j
||	|||d}|d |d }}g }| \}}}t|d ttfr|d }t|D ]}||d |	| d d f }||d || d d f } jd urÈ j||| dd| ddd}|d  j }n.|jdd}|j	ddd }tj|dd}tj jg|   jg |jd}t||dg}t|D ]5\}}d}t|jtr
|jd| }n	|jd|  }tt fdd|}||} | }!||! qq|S )Nrb   rV   )rV   r   F)rb   r   r   r4   r      rc   maxlenratior7   minlenratio)x	am_scoresr   r   r   r   )yseqscorec                    s   |  j ko|  jko|  jkS r   )r   r   blank_id)r   r)   r   r   <lambda>  s    z4ParaformerStreaming.generate_chunk.<locals>.<lambda>)r#   r   rk   rg   rh   r   roundr   rQ   r   r   rD   listr   beam_searchnbestr   rU   ri   r   tolistr   rV   r   	enumerater   filter
ids2tokensextend)"r)   r.   r/   r   	tokenizerr   r+   rb   rX   rY   predictor_outsr   r   alphasr   r   r   rm   resultsbndir   r   
nbest_hypsr   r   	nbest_idxhyplast_pos	token_inttokenresult_ir   r   r   generate_chunk  st   	





$
z"ParaformerStreaming.generate_chunkc           !      K   sL  | dddko| jd k}| dddko| dd d u}	| jd u r:|	s&|r:td | jd*i | | dd| _t|d	krI| j|fi | i }
| d
g d}t	|d d }t
 }d| ddi}t||j| dd| dd||d}|d }t
 }|| d|
d< t|dksJ dt|d |d	 f}t	t|| t	| }t	t|| dt	|  }g }t|D ]}|o||d k|d< ||| |d |  }|d rt|dk rd|d d< |d d }tj|jd gtjd|j}nt|g| dd||d |d d\}}t
 }|| d|
d< |  |j |j d  |
d!< | j||f||||d"|}|| qt|\}}|d	 |d#}|g}|d	kr`|| d  ntd	|d< |rs| j|fi | | d$rt | d%st!| d$| _"| j"d d& } d'#|| d( |d	 < || d) |d	 < ||
fS )+Ndecoding_ctc_weightr7   gh㈵>	lm_weightlm_filezenable beam_searchr   r4   r   rx   r   i  r   Ffsi>  	data_typesound)r  audio_fsr  r   rb   z0.3f	load_datazbatch_size must be set 1r   Tr&   r   r   )dtyper   )r  r   rb   r   extract_feati  batch_data_time)r   r   rb   r   )r   r0   
output_dirwriter
best_recog r  r0   r   )$r#   ctcr   logginginfoinit_beam_searchr   rC   r   inttimeperf_counterr   r  rQ   catr   ri   rE   int64r   rV   r   rU   itemframe_shiftlfr_nr  r   r   sentence_postprocessr   r%   r
   r  join)!r)   data_indata_lengthsr   r   r   rb   r+   
is_use_ctc	is_use_lm	meta_datarx   chunk_stride_samplestime1cfgaudio_sample_list	_is_finaltime2audio_sampler  mtokensr  audio_sample_ir.   r/   time3tokens_itext_postprocessedrj   r  resultibest_writerr   r   r   	inference,  s   




	&zParaformerStreaming.inferencec                 K   s"   ddl m} |dd| i|}|S )Nr4   )export_rebuild_modelmodelr   )export_metarA  )r)   r+   rA  modelsr   r   r   export  s   zParaformerStreaming.exportr   )NNNN)__name__
__module____qualname____doc__r"   rQ   Tensorr   r   strra   rI   rk   rN   r   r   r   r   r   r   r   r  r@  rE  __classcell__r   r   r,   r   r   %   sx    
V
 
 
.
>
$
[
f)T)+r#  rQ   r  typingr   r   
contextlibr   distutils.versionr   funasr.registerr   funasr.models.ctc.ctcr   funasr.utilsr   funasr.metrics.compute_accr	   funasr.utils.datadir_writerr
   funasr.models.paraformer.modelr   funasr.models.paraformer.searchr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   __version__torch.cuda.ampr   registerr   r   r   r   r   <module>   s2   
