o
    im9                     @   sB  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dlm'Z' eej(edkrd dl)m*Z* nedddZ*e+ddG dd deZ,dS )    N)contextmanager)LooseVersion)DictListOptionalTuple)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)
Paraformer)
Hypothesis)force_gatherable)add_sos_eos)ts_prediction_lfr6_standard)make_pad_maskpad_list)load_audio_text_image_videoextract_fbank)	to_devicez1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   X/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/bicif_paraformer/model.pyr       s   
r   model_classesBiCifParaformerc                       s   e Zd ZdZ fddZdejdejdejdejfdd	Zdejdejdejdejfd
dZdd Z	dd Z
dejdejdejdejdeejeeejf ejf f
ddZ				ddefddZdd Z  ZS )r   a7  
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paper1: FunASR: A Fundamental End-to-End Speech Recognition Toolkit
    https://arxiv.org/abs/2305.11013
    Paper2: Achieving timestamp prediction while recognizing with non-autoregressive end-to-end ASR model
    https://arxiv.org/abs/2301.12343
    c                    s   t  j|i | d S r   )super__init__)selfargskwargs	__class__r   r   r   /   s   zBiCifParaformer.__init__encoder_outencoder_out_lensys_padys_pad_lensc           	      C   s   t ||ddd d d d d f  |j}| jdkr-t|| j| j| j\}}|| j }| j	|||| jd\}}}}}| 
|||}|S N   maxlen	ignore_id)r   sizetodevicepredictor_biasr   soseosr.   	predictorcriterion_pretype_as)	r    r%   r&   r'   r(   encoder_out_mask_pre_token_length2	loss_pre2r   r   r   _calc_pre2_loss6   s   $


zBiCifParaformer._calc_pre2_lossc                 C   sT  t ||ddd d d d d f  |j}| jdkr-t|| j| j| j\}}|| j }| j	|||| jd\}}}}	}d }
| j
dkrO| |||||\}}
n|}| ||||}|d |d }}|
d u rh|}
| ||}t|
d| j|| jd}| |||}| js| jd u rd\}}n|
jdd	}| | | \}}|||||fS )
Nr*   r+   r-           r   )ignore_labelNNdim)r   r/   r0   r1   r2   r   r3   r4   r.   r5   sampling_ratiosamplerdecodercriterion_attr   view
vocab_sizer6   r7   trainingerror_calculatorargmaxcpu)r    r%   r&   r'   r(   r8   r9   pre_acoustic_embedspre_token_lengthpre_peak_indexdecoder_out_1stsematic_embedsdecoder_outsdecoder_outloss_attacc_attloss_precer_attwer_attys_hatr   r   r   _calc_att_lossL   s>   $






zBiCifParaformer._calc_att_lossc           	      C   sX   t ||ddd d d d d f  |j}| j|d || jd\}}}}}||||fS r)   )r   r/   r0   r1   r5   r.   )	r    r%   r&   r8   rM   rN   alphasrO   r:   r   r   r   calc_predictor~   s   $zBiCifParaformer.calc_predictorc           	      C   sR   t ||ddd d d d d f  |j}| j|||\}}}}||||fS )Nr*   r+   )r   r/   r0   r1   r5   get_upsample_timestamp)	r    r%   r&   	token_numr8   	ds_alphasds_cif_peak	us_alphasus_peaksr   r   r   calc_predictor_timestamp   s   $z(BiCifParaformer.calc_predictor_timestampspeechspeech_lengthstexttext_lengthsreturnc                 K   s  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d\}	}
d}t }| jdkrU| ||||\}	}
|	durM|	 nd|d< |
|d< | ||||\}}}}}| 	||||}| jdkr~||| j
  || j
 d  }n| j|	 d| j |  || j
  || j
 d  }|dur| nd|d	< ||d
< ||d< ||d< |dur|  nd|d< |  |d< t| |d< | jrt|| j  }t|||f|j\}}}|||fS )zFrontend + Encoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r*   Nr   r@   r=   loss_ctccer_ctcg      ?rT   acccerwerrV   r;   loss)lenr/   shapeencodedict
ctc_weight_calc_ctc_lossdetachrZ   r<   predictor_weightrL   torchclonelength_normalized_lossintr2   sumr   r1   )r    rd   re   rf   rg   r"   
batch_sizer%   r&   ri   rj   rV   statsrT   rU   rW   rX   r;   rn   weightr   r   r   forward   sZ   



zBiCifParaformer.forwardNkeyc           3   
      sH  | dddko jd k}| dddko| dd d u} jd u r:|s&|r:td  jd,i | | dd _i }	t }
t	||j
| d	d
d}t }||
 d|	d< t|| dd|d\}}t }|| d|	d< |  |j |j d |	d< |j|d d}|j|d d} ||\}}t|tr|d } ||}|d |d |d |d f\}}}}|  }t|dk rg S  ||||}|d |d }} |||\}}}}g }| \}}} t|D ],}!||!d ||! d d f }"||!d ||! d d f }# jd ur. j|"|#| dd| ddd}$|$d  j }$n.|#jdd}%|#jddd }&tj|&dd}&tj jg|%    j!g |%j"d}%t#|%|&dg}$t$|$D ]\}'}(d })| dd urt% d s|t&| d _' j'|'d  d! })d}*t|(j(t)r|(j(d|* }+n	|(j(d|*   }+t)t* fd"d#|+}+|d ur|+|+},|,|,}-t-||! d ||! d  ||! d ||! d  t..|,| d$dd%\}}.t/0|,|.\}/}0}1||! |/|0d&}2|)d urd'1|,|)d( ||! < |0|)d) ||! < |/|)d* ||! < n||! |+d+}2|2|2 q`q||	fS )-Ndecoding_ctc_weightr=   gh㈵>	lm_weightlm_filezenable beam_searchnbestr*   fsi>  )r   audio_fsz0.3f	load_data	data_typesound)r   frontendextract_feati  batch_data_timer1   )r1   r         maxlenratiominlenratio)x	am_scoresr   r   r>   rA   )yseqscore
output_dirwriter
best_recogc                    s   |  j ko|  jko|  jkS r   )r4   r3   blank_id)r   r    r   r   <lambda>N  s    z+BiCifParaformer.inference.<locals>.<lambda>
begin_time)
vad_offset)r   rf   	timestamp tokenr   rf   )r   	token_intr   )3getctcbeam_searchlogginginfoinit_beam_searchr   timeperf_counterr   r   r   r{   itemframe_shiftlfr_nr0   rq   
isinstancetupler\   roundlongrw   maxcal_decoder_with_predictorrc   r/   rangerK   tensorr3   tolistr4   r1   r   	enumeratehasattrr   r   r   listfilter
ids2tokenstokens2textr   copyr
   sentence_postprocessjoinappend)3r    data_indata_lengthsr   	tokenizerr   r"   
is_use_ctc	is_use_lm	meta_datatime1audio_sample_listtime2rd   re   time3r%   r&   predictor_outsrM   rN   r[   rO   rR   rS   r(   r9   ra   rb   resultsbndir   r   
nbest_hypsr   r   	nbest_idxhypibest_writerlast_posr   r   rf   r   text_postprocessedtime_stamp_postprocessed
word_listsresult_ir   r   r   	inference   s   
	



$






4zBiCifParaformer.inferencec                 K   s2   ddl m} d|vrd|d< |dd| i|}|S )Nr*   )export_rebuild_modelmax_seq_leni   modelr   )export_metar   )r    r"   r   modelsr   r   r   exports  s
   zBiCifParaformer.export)NNNN)__name__
__module____qualname____doc__r   rw   Tensorr<   rZ   r\   rc   r   r   strr   r   r   r   __classcell__r   r   r#   r   r   %   sV    

2		
Q
 )T)-r   r   rw   r   
contextlibr   distutils.versionr   typingr   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.utils.datadir_writerr   funasr.models.paraformer.modelr   funasr.models.paraformer.searchr   funasr.train_utils.device_funcsr   +funasr.models.transformer.utils.add_sos_eosr   funasr.utils.timestamp_toolsr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   r   __version__torch.cuda.ampr   registerr   r   r   r   r   <module>   s4   
