o
    i\                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' e(ddG dd dej)j*Z+dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)	to_device)DatadirWriter)
Hypothesis)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_mask)ts_prediction_lfr6_standard)load_audio_text_image_videoextract_fbankmodel_classes
Paraformerc                5       s  e Zd ZdZ																						
					
	
dFdee dee dedee dedee dedee dedee dedee dedededededededed e	d!ed"ed#ed$e	d%e	f4 fd&d'Z
d(ejd)ejd*ejd+ejd,eejeeejf ejf f
d-d.Zd(ejd)ejd,eejejf fd/d0Zd1d2 Zd3d4 Zd5ejd6ejd7ejd8ejfd9d:Zd;d< Zd5ejd6ejd7ejd8ejfd=d>Zd?d@ Z				dGdAefdBdCZdDdE Z  ZS )Hr   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    N      ?P   r                 F皙?specaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf	predictorpredictor_conf
ctc_weight
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_losspredictor_weightpredictor_biassampling_ratioshare_embeddinguse_1st_decoder_lossc           "         s  t    |d urtj|}|d	i |}|d ur'tj|}|d	i |}tj|}|d	d|i|}| }|d urNtj|} | d	||d|}|dkrb|
d u rXi }
t	d	||d|
}	|d urstj
|}!|!d	i |}|| _|d ur||n|d | _|d ur|n|d | _|| _|| _|| _|| _|| _|| _|dkrd | _n|| _t||||d| _|dkrd | _n|	| _|| _|| _|| _|| _t|d| _|| _| jrd | j_|| _ || _!d | _"d | _#d S )
Nr-   )r.   encoder_output_sizer   )odimr:   r         ?)sizepadding_idx	smoothingnormalize_length)r@    )$super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr	   predictor_classesr0   r1   r2   r.   r/   r,   r    r"   r$   r&   r   criterion_attr(   r*   r5   r6   r7   r   criterion_prer8   embedr9   r4   beam_searcherror_calculator)"selfr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   kwargsspecaug_classnormalize_classencoder_classr:   decoder_classpredictor_class	__class__rA   R/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/paraformer/model.pyrC   %   st   
'
zParaformer.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d\}	}
d}t }| jdkrU| ||||\}	}
|	durM|	 nd|d< |
|d< | ||||\}}}}}}| jdkrp||| j	  }n| j|	 d| j |  || j	  }|dur| nd|d< |dur| nd|d	< ||d
< ||d< ||d< |dur| 
 nd|d< t| |d< ||d< | jr|| j  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   NNr   loss_ctccer_ctcloss_attpre_loss_attacccerwerloss_preloss
batch_size)lenr=   shapeencodedictr,   _calc_ctc_lossdetach_calc_att_lossr5   cputorchcloner4   r6   sumr   device)rP   rZ   r[   r\   r]   rQ   ri   encoder_outencoder_out_lensr`   ra   rg   statsrb   acc_attcer_attwer_attrc   rh   weightrA   rA   rY   forward   sL   



zParaformer.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}t|trC|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r    trainingr"   r$   
isinstancetuple)rP   rZ   r[   rQ   rv   rw   _rA   rA   rY   rl      s   


zParaformer.encodec                 C   sV   t ||ddd d d d d f  |j}| j|d || jd\}}}}||||fS )Nr   maxlenr/   )r   r=   toru   r*   r/   )rP   rv   rw   encoder_out_maskpre_acoustic_embedspre_token_lengthalphaspre_peak_indexrA   rA   rY   calc_predictor  s   $
zParaformer.calc_predictorc                 C   s.   |  ||||}|d }tj|dd}||fS )Nr   r   dim)r&   rr   log_softmax)rP   rv   rw   sematic_embedsys_pad_lensdecoder_outsdecoder_outrA   rA   rY   cal_decoder_with_predictor  s   z%Paraformer.cal_decoder_with_predictorrv   rw   ys_padr   c                 C   sX  t ||ddd d d d d f  |j}| jdkr-t|| j| j| j\}}|| j }| j	|||| jd\}}}}	d }
d }| j
dkrP| |||||\}}
n|}| ||||}|d |d }}|
d u ri|}
| ||}t|
d| j|| jd}| |||}| js| jd u rd\}}n|
jdd	}| | | \}}||||||fS )
Nr   r   r   r   r   r   )ignore_labelr_   r   )r   r=   r   ru   r6   r   r1   r2   r/   r*   r7   samplerr&   rK   r   viewr.   rL   type_asr~   rO   argmaxrq   )rP   rv   rw   r   r   r   r   r   r   r   decoder_out_1strc   r   r   r   rb   ry   rg   rz   r{   ys_hatrA   rA   rY   rp     s@   $






zParaformer._calc_att_lossc                 C   s  t || dd d d d d f  |j}||d d d d df  }| jr.| jjj| }n| j|}t	
  | ||||}	|	d |	d }
}|
d}|| j}|d}||k|@ d}t	|}| \}}t|D ].}|| ||    | j  }|dkr|| jdt	|| d | |jdd qr|d}|| d}|d|j}W d    n1 sw   Y  || d||d }|| |
| fS )Nr   r   r   r   )r   indexvalueFr   )r   maxr   ru   r8   r&   output_layerr|   rM   rr   no_gradr   ner/   rt   	ones_liker=   rangefloatr7   longscatter_randpermeqmasked_fill	unsqueeze)rP   rv   rw   r   r   r   tgt_maskys_pad_maskedys_pad_embedr   r   r   pred_tokensnonpad_positionsseq_lenssame_num
input_maskbszseq_lenli
target_numinput_mask_expand_dimr   rA   rA   rY   r   S  sN   $





zParaformer.samplerc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r(   r~   rO   r   datarq   )rP   rv   rw   r   r   r`   ra   r   rA   rA   rY   rn   {  s   zParaformer._calc_ctc_lossc              
   K   s   ddl m} ddlm} ddlm} i }| jd kr'|| j| jd}|j|d |	d}|j|t
|d d }||d	< td
|	d |	dd|	dd|	dd|	ddd}	||	dd|	|| j| jt
||| jd
krqd ndd}
|
| _d S )Nr   )BeamSearchPara)CTCPrefixScorer)LengthBonus)r(   r2   )r(   
token_list)length_bonusngramr<   decoding_ctc_weightr   	lm_weightngram_weightpenalty)r&   r(   lmr   r   	beam_sizer   full)r   weightsscorersr1   r2   r.   r   pre_beam_score_key)funasr.models.paraformer.searchr   %funasr.models.transformer.scorers.ctcr   .funasr.models.transformer.scorers.length_bonusr   r(   r2   updaterE   rj   rm   r1   r,   rN   )rP   rQ   r   r   r   r   r(   r   r   r   rN   rA   rA   rY   init_beam_search  s>   








zParaformer.init_beam_searchkeyc           1   
      s  | dddko jd k}| dddko| dd d u}| dd}	 jd u r@|s,|r@td  jd1i | | d	d
 _i }
t|tj	rw| dddkrw||}}t
|jdk rg|d d d d d f }|d urq|d}nQ|jd
 }nKt }t||j| dd| dd|d}t }|| d|
d< t|| dd|d\}}t }|| d|
d< |  |j |j d |
d< |j|d d}|j|d d}| ddr| } ||\}}t|tr|d } ||}|d |d
 |d |d f\}}}}|  }t|d
k rg S  ||||}|d |d
 }}g }| \}}}t|d t tfr=|d }t
||k rH|| }t!|D ]4}||d || d d f } ||d || d d f }! jd ur j| |!| dd| ddd }"|"d  j }"n.|!j"dd!}#|!jdd!d }$tj|$dd!}$tj# j$g|#%   j&g |#j'd}#t(|#|$d"g}"t)|"D ]\}%}&d }'| d#d urt* d$st+| d# _, j,|%d
  d% }'d}(t|&j-t r|&j-d
|( })n	|&j-d
|( % })t t. fd&d'|)})|d urr|/|)}*|0|*}+|	rEt1|| || t22|*| d(dd
d)\},}-t*|d*s<t34|*|-\}+}.}/|| |+|.d+}0nt*|d*sRt34|*\}+}/|| |+d,}0|'d urqd-5|*|'d. || < |+|'d/ || < n|| |)d0}0|6|0 qqL||
fS )2Nr   r   gh㈵>r   lm_filepred_timestampFzenable beam_searchnbestr   	data_typesoundfbank   r   fsi>  )r   audio_fsr   	tokenizerz0.3f	load_data)r   frontendextract_feati  batch_data_timeru   )ru   fp16r   r   maxlenratiominlenratio)x	am_scoresr   r   r   )yseqscore
output_dirwriter
best_recogc                    s   |  j ko|  jko|  jkS )N)r2   r1   r0   )r   rP   rA   rY   <lambda>3  s    z&Paraformer.inference.<locals>.<lambda>
begin_time)
vad_offsetupsample_ratebpemodel)r   r\   	timestamp)r   r\    tokenr\   )r   	token_intrA   )7rE   r(   rN   logginginfor   r   r   rr   Tensorrj   rk   squeezetimeperf_counterr   r   r   rt   itemframe_shiftlfr_nr   halfrl   r   r   roundr   r   r   r=   listr   r   tensorr1   tolistr2   ru   r   	enumeratehasattrr   r   r   filter
ids2tokenstokens2textr   copyr
   sentence_postprocessjoinappend)1rP   data_indata_lengthsr   r   r   rQ   
is_use_ctc	is_use_lmr   	meta_datarZ   r[   time1audio_sample_listtime2time3rv   rw   predictor_outsr   r   r   r   r   r   r   resultsbndir   r   
nbest_hypsr   r   	nbest_idxhypibest_writerlast_posr   r   text_postprocessedtimestamp_strr   time_stamp_postprocessedr   result_irA   r   rY   	inference  s   









$





1zParaformer.inferencec                 K   s2   ddl m} d|vrd|d< |dd| i|}|S )Nr   )export_rebuild_modelmax_seq_leni   modelrA   )export_metar"  )rP   rQ   r"  modelsrA   rA   rY   exportV  s
   zParaformer.export)NNNNNNNNNNNNr   r   r   r   r   r   r   r   Fr   r   r   FF)NNNN)__name__
__module____qualname____doc__r   strr   r   intboolrC   rr   r   r   r}   rl   r   r   rp   r   rn   r   r   r!  r'  __classcell__rA   rA   rW   rY   r      s   	
 # 
G

	
4(
2
 ),r   r  rr   r   torch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.datadir_writerr   r   r   &funasr.models.paraformer.cif_predictorr   r   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   funasr.utils.timestamp_toolsr   funasr.utils.load_utilsr   r   registernnModuler   rA   rA   rA   rY   <module>   s,   
