o
    ihh                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( e)ddG dd dej*j+Z,dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)	to_device)DatadirWriter)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eosadd_sos_and_eos)make_pad_mask)ts_prediction_lfr6_standard)load_audio_text_image_videoextract_fbankmodel_classesEParaformerc                5       s  e Zd ZdZ																						
					
	dIdee dee dedee dedee dedee dedee dedee dededededededed ed!e	d"ed#ed$ed%e	d&e	f4 fd'd(Z
d)ejd*ejd+ejd,ejd-eejeeejf ejf f
d.d/Zd)ejd*ejd-eejejf fd0d1Zd2d3 Zd4d5 Zd6ejd7ejd8ejd9ejfd:d;Zd<d= Zd>d? Zd6ejd7ejd8ejd9ejfd@dAZdBdC Z				dJdDefdEdFZdGdH Z  ZS )Kr   a  
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    Author: Kun Zou, chinazoukun@gmail.com
    E-Paraformer: A Faster and Better Parallel Transformer for Non-autoregressive End-to-End Mandarin Speech Recognition
    https://www.isca-archive.org/interspeech_2024/zou24_interspeech.pdf
    N      ?P   r                 F皙?Tspecaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf	predictorpredictor_conf
ctc_weight
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_losspredictor_weightpredictor_biassampling_ratioshare_embeddinguse_1st_decoder_lossc           "         s  t    |d urtj|}|d	i |}|d ur'tj|}|d	i |}tj|}|d	d|i|}| }|d urNtj|} | d	||d|}|dkrb|
d u rXi }
t	d	||d|
}	|d urstj
|}!|!d	i |}|| _|d ur||n|d | _|d ur|n|d | _|| _|| _|| _|| _|| _|| _|dkrd | _n|| _t||||d| _|rt||||d| _|dkrd | _n|	| _|| _|| _|| _|| _t|d| _|| _| jrd | j_ || _!|| _"d | _#d | _$d S )
Nr.   )r/   encoder_output_sizer   )odimr;   r         ?)sizepadding_idx	smoothingnormalize_length)rA    )%super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr	   predictor_classesr1   r2   r3   r/   r0   r-   r!   r#   r%   r'   r   criterion_attcriterion_att_1str)   r+   r6   r7   r8   r   criterion_prer9   embedr:   r5   beam_searcherror_calculator)"selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   kwargsspecaug_classnormalize_classencoder_classr;   decoder_classpredictor_class	__class__rB   T/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/e_paraformer/model.pyrD   )   s   
'
zEParaformer.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d\}	}
d}t }| jdkrU| ||||\}	}
|	durM|	 nd|d< |
|d< | ||||\}}}}}}| jdkrp||| j	  }n| j|	 d| j |  || j	  }|dur||7 }|dur| nd|d< |dur| nd|d	< ||d
< ||d< ||d< |dur| 
 nd|d< t| |d< ||d< | jr|| j  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   NNr   loss_ctccer_ctcloss_attpre_loss_attacccerwerloss_preloss
batch_size)lenr>   shapeencodedictr-   _calc_ctc_lossdetach_calc_att_lossr6   cputorchcloner5   r7   sumr   device)rR   r\   r]   r^   r_   rS   rk   encoder_outencoder_out_lensrb   rc   ri   statsrd   acc_attcer_attwer_attre   rj   weightrB   rB   r[   forward   sP   



zEParaformer.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}t|trC|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r!   trainingr#   r%   
isinstancetuple)rR   r\   r]   rS   rx   ry   _rB   rB   r[   rn      s   


zEParaformer.encodec                 C   sV   t ||ddd d d d d f  |j}| j|d || jd\}}}}||||fS )Nr   maxlenr0   )r   r>   torw   r+   r0   )rR   rx   ry   encoder_out_maskpre_acoustic_embedspre_token_lengthalphaspre_peak_indexrB   rB   r[   calc_predictor  s   $
zEParaformer.calc_predictorc                 C   s.   |  ||||}|d }tj|dd}||fS )Nr   r   dim)r'   rt   log_softmax)rR   rx   ry   sematic_embedsys_pad_lensdecoder_outsdecoder_outrB   rB   r[   cal_decoder_with_predictor%  s   z&EParaformer.cal_decoder_with_predictorrx   ry   ys_padr   c                 C   s  t ||ddd d d d d f  |j}| jdkr-t|| j| j| j\}}|| j }| jdkrCt	|| j| j| j\}}|| j }| j
|||| jd\}}}}	d }
d }| jdkru| jri| |||||\}}
n| |||||\}}
n|}| ||||}|d |d }}|
d u r|}
| jr| |
|}| ||}t|
d| j|| jd}| |||}| js| jd u rd	\}}n|
jdd
}| | | \}}||||||fS )Nr   r   r   r   r   r   r   )ignore_labelra   r   )r   r>   r   rw   r7   r   r2   r3   r0   r   r+   r8   r:   sampler_with_gradsamplerr'   rM   rL   r   viewr/   rN   type_asr   rQ   argmaxrs   )rR   rx   ry   r   r   r   r   r   r   r   decoder_out_1stre   r   r   r   rd   r{   ri   r|   r}   ys_hatrB   rB   r[   rr   .  sR   $










zEParaformer._calc_att_lossc                 C   s  t || dd d d d d f  |j}||d d d d df  }| jr.| jjj| }n| j|}t	
  | ||||}	|	d |	d }
}|
d}|| j}|d}||k|@ d}t	|}| \}}t|D ].}|| ||    | j  }|dkr|| jdt	|| d | |jdd qr|d}|| d}|d|j}W d    n1 sw   Y  || d||d }|| |
| fS Nr   r   r   r   )r   indexvalueFr   )r   maxr   rw   r9   r'   output_layerr~   rO   rt   no_gradr   ner0   rv   	ones_liker>   rangefloatr8   longscatter_randpermeqmasked_fill	unsqueezerR   rx   ry   r   r   r   tgt_maskys_pad_maskedys_pad_embedr   r   r   pred_tokensnonpad_positionsseq_lenssame_num
input_maskbszseq_lenli
target_numinput_mask_expand_dimr   rB   rB   r[   r   m  sN   $





zEParaformer.samplerc                 C   s  t || dd d d d d f  |j}||d d d d df  }| jr.| jjj| }n| j|}| ||||}	|	d |	d }
}|
	d}|
| j}|d}||k|@ d}t|}| \}}t|D ].}|| ||    | j  }|dkr|| jdt|| d | |jdd qm|d}|| d}|d|j}|| d||d }|| |
| fS r   )r   r   r   rw   r9   r'   r   r~   rO   r   r   r0   rv   rt   r   r>   r   r   r8   r   r   r   r   r   r   r   rB   rB   r[   r     sJ   $




zEParaformer.sampler_with_gradc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r)   r   rQ   r   datars   )rR   rx   ry   r   r   rb   rc   r   rB   rB   r[   rp     s   zEParaformer._calc_ctc_lossc              
   K   s   ddl m} ddlm} ddlm} i }| jd kr'|| j| jd}|j|d |	d}|j|t
|d d }||d	< td
|	d |	dd|	dd|	dd|	ddd}	||	dd|	|| j| jt
||| jd
krqd ndd}
|
| _d S )Nr   )BeamSearchPara)CTCPrefixScorer)LengthBonus)r)   r3   )r)   
token_list)length_bonusngramr=   decoding_ctc_weightr   	lm_weightngram_weightpenalty)r'   r)   lmr   r   	beam_sizer   full)r   weightsscorersr2   r3   r/   r   pre_beam_score_key)funasr.models.paraformer.searchr   %funasr.models.transformer.scorers.ctcr   .funasr.models.transformer.scorers.length_bonusr   r)   r3   updaterF   rl   ro   r2   r-   rP   )rR   rS   r   r   r   r   r)   r   r   r   rP   rB   rB   r[   init_beam_search  s>   








zEParaformer.init_beam_searchkeyc           1   
      s  | dddko jd k}| dddko| dd d u}| dd}	 jd u r@|s,|r@td  jd1i | | d	d
 _i }
t|tj	rw| dddkrw||}}t
|jdk rg|d d d d d f }|d urq|d}nQ|jd
 }nKt }t||j| dd| dd|d}t }|| d|
d< t|| dd|d\}}t }|| d|
d< |  |j |j d |
d< |j|d d}|j|d d}| ddr| } ||\}}t|tr|d } ||}|d |d
 |d |d f\}}}}|  }t|d
k rg S  ||||}|d |d
 }}g }| \}}}t|d t tfr=|d }t
||k rH|| }t!|D ]4}||d || d d f } ||d || d d f }! jd ur j| |!| dd| ddd }"|"d  j }"n.|!j"dd!}#|!jdd!d }$tj|$dd!}$tj# j$g|#%   j&g |#j'd}#t(|#|$d"g}"t)|"D ]\}%}&d }'| d#d urt* d$st+| d# _, j,|%d
  d% }'d}(t|&j-t r|&j-d
|( })n	|&j-d
|( % })t t. fd&d'|)})|d urr|/|)}*|0|*}+|	rEt1|| || t22|*| d(dd
d)\},}-t*|d*s<t34|*|-\}+}.}/|| |+|.d+}0nt*|d*sRt34|*\}+}/|| |+d,}0|'d urqd-5|*|'d. || < |+|'d/ || < n|| |)d0}0|6|0 qqL||
fS )2Nr   r   gh㈵>r   lm_filepred_timestampFzenable beam_searchnbestr   	data_typesoundfbank   r   fsi>  )r   audio_fsr   	tokenizerz0.3f	load_data)r   frontendextract_feati  batch_data_timerw   )rw   fp16r   r   maxlenratiominlenratio)x	am_scoresr   r   r   )yseqscore
output_dirwriter
best_recogc                    s   |  j ko|  jko|  jkS )N)r3   r2   r1   )r   rR   rB   r[   <lambda>u  s    z'EParaformer.inference.<locals>.<lambda>
begin_time)
vad_offsetupsample_ratebpemodel)r   r^   	timestamp)r   r^    tokenr^   )r   	token_intrB   )7rF   r)   rP   logginginfor   r   r   rt   Tensorrl   rm   squeezetimeperf_counterr   r   r   rv   itemframe_shiftlfr_nr   halfrn   r   r   roundr   r   r   r>   listr   r   tensorr2   tolistr3   rw   r   	enumeratehasattrr   r   r   filter
ids2tokenstokens2textr   copyr
   sentence_postprocessjoinappend)1rR   data_indata_lengthsr   r   r   rS   
is_use_ctc	is_use_lmr   	meta_datar\   r]   time1audio_sample_listtime2time3rx   ry   predictor_outsr   r   r   r   r   r   r   resultsbndir   r   
nbest_hypsr   r   	nbest_idxhypibest_writerlast_posr   r   text_postprocessedtimestamp_strr   time_stamp_postprocessedr   result_irB   r   r[   	inference  s   









$





1zEParaformer.inferencec                 K   s2   ddl m} d|vrd|d< |dd| i|}|S )Nr   )export_rebuild_modelmax_seq_leni   modelrB   )export_metar'  )rR   rS   r'  modelsrB   rB   r[   export  s
   zEParaformer.export)NNNNNNNNNNNNr   r   r   r   r   r   r   r   Fr   r   r    FT)NNNN)__name__
__module____qualname____doc__r   strr   r   intboolrD   rt   r   r   r   rn   r   r   rr   r   r   rp   r   r  r&  r,  __classcell__rB   rB   rY   r[   r      s   	
 # 
H

	
?((
2
 )-r   r
  rt   r   torch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.datadir_writerr   r   r   &funasr.models.paraformer.cif_predictorr   r   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   r   *funasr.models.transformer.utils.nets_utilsr   funasr.utils.timestamp_toolsr   funasr.utils.load_utilsr   r   registernnModuler   rB   rB   rB   r[   <module>   s,   
