o
    iw                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dl m!Z!m"Z" d dl#m$Z$ e%ddG dd dej&j'Z(dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)th_accuracy)DatadirWriter)mae_loss)force_gatherable)LabelSmoothingLoss)add_sos_eos)make_pad_maskpad_list)load_audio_text_image_videoextract_fbank)sequence_maskmodel_classesUniASRc                O       s  e Zd ZdZ																																						
			d[dedededededededededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0ed1ed2efN fd3d4Z	d5e
jd6e
jd7e
jd8e
jd9ee
jeee
jf e
jf f
d:d;Zd5e
jd6e
jd7e
jd8e
jd9eee
jf f
d<d=Zd5e
jd6e
jfd>d?Zd@e
jdAe
jd5e
jd6e
jfdBdCZd@e
jdAe
jdDe
jdEe
jd9e
jf
dFdGZ	Hd\d@e
jdAe
jdDe
jdEe
jdIef
dJdKZd@e
jdAe
jdDe
jdEe
jfdLdMZd@e
jdAe
jdDe
jdEe
jfdNdOZd@e
jdAe
jdDe
jdEe
jfdPdQZ		d]d@e
jdAe
jdDe
jdEe
jfdRdSZ		d]d@e
jdAe
jdDe
jdEe
jfdTdUZdVdW Z				d^dXefdYdZZ  ZS )_r   z;
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Nr                 ?chunkP         Fspecaugspecaug_conf	normalizenormalize_confencoderencoder_confencoder2encoder2_confdecoderdecoder_confdecoder2decoder2_conf	predictorpredictor_confpredictor_biaspredictor_weight
predictor2predictor2_confpredictor2_biaspredictor2_weightctcctc_conf
ctc_weightctc2	ctc2_confctc2_weightdecoder_attention_chunk_typedecoder_attention_chunk_type2stride_conv_confloss_weight_model1
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_lossshare_embeddingc)           4         st  t    |d urtj|}*|*di |}|d ur'tj|}+|+di |}tj|},|,dd| i|}| }-tj|	}.|.d|!|-d|
}	tj	|}/|/di |}ddl
m}0 |0di || |- | |- d}| }1tj|},|,dd|1i|}| }2tj|}.|.d|!|2d|}tj	|}/|/di |}|#| _|$| _|%| _|!| _|"| _|| _|| _|| _|| _|| _d | _|	| _d | _d | _t|!|"|&|'d| _|| _|| _t|'d| _|)dd	| _ | jj!d urdd
l"m#}3 |3| _$|| _%|| _&|| _'|| _|| _(|| _)|| _*|| _+|| _,| j&j!d ur$dd
l"m#}3 |3| _-|| _*|'| _.|)dd| _/|)dd| _0d | _1d S )Nr=   )r>   encoder_output_sizer   )Conv1dSubsampling)idimodim)sizepadding_idx	smoothingnormalize_length)rM    encoder1_encoder2_joint_trainingT),build_scama_mask_for_cross_attention_decoderenable_maas_finetuneFfreeze_encoder2 )2super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classespredictor_classes+funasr.models.transformer.utils.subsamplingrG   r@   rA   rB   r>   r?   r5   r8   r   r!   r#   error_calculatorr'   r3   r6   r   criterion_attr+   r.   r   criterion_prerN   overlap_chunk_cls funasr.models.scama.chunk_utilisrO   /build_scama_mask_for_cross_attention_decoder_fnr9   r%   r)   r/   r2   r:   stride_convr<   0build_scama_mask_for_cross_attention_decoder_fn2rD   rP   rQ   beam_search)4selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   rc   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   kwargsspecaug_classnormalize_classencoder_classrF   decoder_classpredictor_classrG   stride_conv_output_sizeencoder2_output_sizerO   	__class__rR   N/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/uniasr/model.pyrT       s   
,

zUniASR.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s  | dd}t| dkr|dddf }t| dkr&|dddf }|jd }| jj| j|}| jrWt	
  | j|||d\}	}
}W d   n1 sQw   Y  n| j|||d\}	}
}d\}}}}d\}}t }d}d\}}}| jd	kr	| jrt	
 B | |
|||\}}}}}||| j  }|dur| nd|d
< ||d< ||d< ||d< |dur|  nd|d< W d   n1 sw   Y  n;| |
|||\}}}}}||| j  }|dur| nd|d
< ||d< ||d< ||d< |dur|  nd|d< |}| jdk r| jr7t	
  | j|
||	||d\}
}W d   n	1 s1w   Y  n| j|
||	||d\}
}d}t|
trS|
d }|
d }
| |
|||\}}}}}||| j  }|durp| nd|d< ||d< ||d< ||d< |dur|  nd|d< |}|| j |d| j   }t	| |d< t	| |d< t	| |d< | jrt|d  }t|||f|j\}}}|||fS )a  Frontend + Encoder + Decoder + Calc loss
        Args:
                        speech: (Batch, Length, ...)
                        speech_lengths: (Batch, )
                        text: (Batch, Length)
                        text_lengths: (Batch,)
        decoding_indNr   r   indNNNNNN)r   r   r   r   loss_attacccerwerloss_pre      ?	loss_att2acc2cer2wer2	loss_pre2loss1loss2loss)rV   lenrJ   shaper#   r`   random_choicetrainingrP   torchno_gradencodedictr<   _calc_att_predictor_lossr.   detachcpurQ   encode2
isinstancetuple_calc_att_predictor_loss2r2   clonerD   intsumr   device)rf   rr   rs   rt   ru   rg   rw   
batch_sizery   
speech_rawencoder_outencoder_out_lensr|   acc_attcer_attwer_attloss_ctccer_ctcstatsr   r   r   r   intermediate_outsweightrR   rR   rq   forward   s   








zUniASR.forwardc                 C   s>   | j r| ||\}}ntd| j   ||}}||dS )NzkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)extract_feats_in_collect_stats_extract_featsloggingwarning)rf   rr   rs   rt   ru   r   r   rR   rR   rq   collect_feats0  s   

zUniASR.collect_featsc           	      K   s   | dd}td% | jdur| jr| ||\}}| jdur(| ||\}}W d   n1 s2w   Y  | |j}| j|||d\}}}t	|t
rS|d }|||fS )Frontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                        speech: (Batch, Length, ...)
                        speech_lengths: (Batch, )
        ry   r   FNrx   )rV   r   r   r   r!   r   tor   r#   r   r   )	rf   rr   rs   rg   ry   r   r   r   _rR   rR   rq   r   C  s   

	

zUniASR.encoder   r   c           
      K   s   | dd}| jjj||dd\}}tj||fdd}|}| jdur*| ||\}}| js5| }| }| j	|||d\}}}	t
|trI|d }||fS )r   ry   r   N
chunk_outsr   dimrx   )rV   r#   r`   remove_chunkr   catrc   rN   r   r%   r   r   )
rf   r   r   rr   rs   rg   ry   encoder_out_rmencoder_out_lens_rmr   rR   rR   rq   r   a  s"   


zUniASR.encode2ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	|d}
|d}tjjj	|
d||
d| jdd}|
|
d}|jdd}|d|
ksLJ |S )aZ  Compute negative log likelihood(nll) from transformer-decoder
        Normally, this function is called in batchify_nll.
        Args:
                        encoder_out: (Batch, Length, Dim)
                        encoder_out_lens: (Batch,)
                        ys_pad: (Batch, Length)
                        ys_pad_lens: (Batch,)
        r   r   r   r   none)ignore_index	reductionr   )r   rA   rB   r?   r'   rJ   r   nn
functionalcross_entropyviewr   )rf   r   r   r   r   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   r   decoder_num_classnllrR   rR   rq   r     s"   


z
UniASR.nlld   r   c                 C   s   | d}||kr| ||||}nIg }d}	 t|| |}	|||	ddddf }
|||	 }|||	ddf }|||	 }| |
|||}|| |	}||krUnqt|}| d|ksdJ |S )a  Compute negative log likelihood(nll) from transformer-decoder
        To avoid OOM, this fuction seperate the input into batches.
        Then call nll for each batch and combine and return results.
        Args:
                        encoder_out: (Batch, Length, Dim)
                        encoder_out_lens: (Batch,)
                        ys_pad: (Batch, Length)
                        ys_pad_lens: (Batch,)
                        batch_size: int, samples each batch contain when computing nll,
                                                                        you may change this to avoid OOM or increase
                                                                        GPU memory usage
        r   TN)rJ   r   minappendr   r   )rf   r   r   r   r   r   	total_numr   	start_idxend_idxbatch_encoder_outbatch_encoder_out_lensbatch_ys_padbatch_ys_pad_lens	batch_nllrR   rR   rq   batchify_nll  s2   


zUniASR.batchify_nllc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr   r   ignore_labelr{   r   )r   rA   rB   r?   r'   r^   r   r   r>   r   r]   argmaxr   )rf   r   r   r   r   r   r   r   r   r   r|   r   r   r   ys_hatrR   rR   rq   _calc_att_loss  s   
zUniASR._calc_att_lossc                 C     t || j| j| j\}}|d }t||d|j|jdd d d d d f }d }	| jj	d urN| jj	j
d |j|dd}	| jj	jd |j|dd}
||
 }| j|||| j|	|d\}}}}| j||\}}d }| jj	d ur| jdkr| jj	j}d}|}| jj	j}| jj	jd |j|dd}| j||d|||| jd |	|||| jd}n| jj	d ur| jj	j||d d\}}| j||||||d	\}}| ||}t|d
| j|| jd}| |||}| js| jd u rd\}}n|jd
d}| | | \}}|||||fS Nr   maxlendtyper   r   r   r   r?   mask_chunk_predictortarget_label_lengthr   predictor_alignmentsencoder_sequence_length
chunk_sizeencoder_chunk_sizeattention_chunk_center_biasattention_chunk_sizeattention_chunk_typesteppredictor_mask_chunk_hoppingdecoder_att_look_back_factormask_shift_att_chunk_decodertarget_lengthis_trainingr   )
chunk_maskpre_acoustic_embedsr   r   r{   r   )r   rA   rB   r?   r   rJ   r   r   r#   r`   get_mask_chunk_predictorget_mask_shfit_chunkr+   gen_frame_alignmentsr9   chunk_size_pad_shift_cur decoder_att_look_back_factor_cur get_mask_shift_att_chunk_decoderrb   r   r   r'   r^   r   r   r>   r_   type_asr]   r   r   rf   r   r   r   r   r   r   r   encoder_out_maskr   mask_shfit_chunkr   pre_token_length
pre_alphasr   r   predictor_alignments_len
scama_maskr   r   r   r   r   r   r|   r   r   r   r   r   rR   rR   rq   r        





zUniASR._calc_att_predictor_lossc                 C   r   r   )r   rA   rB   r?   r   rJ   r   r   r%   r`   r   r   r/   r   r:   r   r   r   rd   r   r   r)   r^   r   r   r>   r_   r   r]   r   r   r   rR   rR   rq   r   \  r  z UniASR._calc_att_predictor_loss2c                 C   d  d\}}t ||d|j|jdd d d d d f }d }| jjd urB| jjjd |j|dd}| jjjd |j|dd}	||	 }| j|||| j	||d\}
}}}| j
||\}}d }| jjd ur| jdkr| jjj}d}|}| jjj}| jjjd |j|dd}| j||d|||| jd ||||| jd}n| jjd ur| jjj||d d	\}}|
||||fS 
Nr{   r   r   r   r   r   r   r   r   )r   rJ   r   r   r#   r`   r   r   r+   r?   r   r9   r   r   r   rb   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   rR   rR   rq   calc_predictor_mask     	


zUniASR.calc_predictor_maskc                 C   r  r  )r   rJ   r   r   r%   r`   r   r   r/   r?   r   r:   r   r   r   rd   r   r   r  rR   rR   rq   calc_predictor_mask2  r  zUniASR.calc_predictor_mask2c              
   K   s  ddl m} ddlm} ddlm} |dd}|dkr | j}n| j}i }| j	d kr8|| j	| j
d}|j|d |d	}	|j||t|	d
 d }
|
|d< td|dd |dd|dd|dd|ddd}||dd||| j| j
t|	|	| jdkrd ndd}|| _d S )Nr   )BeamSearchScama)CTCPrefixScorer)LengthBonusdecoding_modemodel1)r3   rB   )r3   
token_list)r'   length_bonusngramr   decoding_ctc_weightr   	lm_weightngram_weightpenalty)r'   r3   lmr  r  	beam_size   full)r  weightsscorersrA   rB   r>   r  pre_beam_score_key) funasr.models.uniasr.beam_searchr
  %funasr.models.transformer.scorers.ctcr  .funasr.models.transformer.scorers.length_bonusr  rV   r'   r)   r3   rB   updater   r   rA   r5   re   )rf   rg   r
  r  r  r  r'   r  r3   r  r  r  re   rR   rR   rq   init_beam_searche  sH   








zUniASR.init_beam_searchkeyc           $   	   K   s
  | dd}| dd}|dkrd}	d}
n|dkrd	}	d
}
nd}	d
}
| jd u r=td | jd&d|
i| | dd	| _i }t|tjrn| dddkrn||}}t	|j
dk rd|d d d d d f }|d u rm|j
d	 }nKt }t||j| dd| dd|d}t }|| d|d< t|| dd|d\}}t }|| d|d< |  |j |j d |d< |j|d d}|j|d d}| j|d d}| j|||	d\}}}|
dkr| ||}n| j|||||	d\}}| ||}|d }|d	 }|d }|  | }td|  | }| j|d ||ddt|t|d }|d | j }g }|D ]I}d!}t|jtrJ|jd	| } n	|jd	|  } ttd"d# | } | | }!|!|!}"t"|d$sst#$|!\}"}|d |"d%}#|%|# q7||fS )'Ndecoding_modelnormaltoken_num_relaxr  fastr   r  offliner   model2zenable beam_searchr  nbest	data_typesoundfbank   fsi>  )r.  audio_fsr*  	tokenizerz0.3f	load_data)r*  frontendextract_feati  batch_data_timer   )r   rx      r   )xr  r   maxlenratiominlenratior   minlenr   c                 S   s   | dkS )Nr   rR   )r6  rR   rR   rq   <lambda>  s    z"UniASR.inference.<locals>.<lambda>bpemodel)r"  rt   rR   )&rV   re   r   infor!  r)  r   r   Tensorr   r   timeperf_counterr   r.  r   r   itemframe_shiftlfr_nr   r   r   r  r   r	  maxr   yseqlisttolistfilter
ids2tokenstokens2texthasattrr
   sentence_postprocessr   )$rf   data_indata_lengthsr"  r0  r2  rg   r#  r%  rw   r  	meta_datarr   rs   time1audio_sample_listtime2time3r   r   r   r   predictor_outsr  r   r   r   r9  
nbest_hypsresultshyplast_pos	token_inttokentext_postprocessedresult_irR   rR   rq   	inference  s   













zUniASR.inference)(NNNNNNNNNNNNNNr   r   NNr   r   NNr   NNr   r   r   NNr   r   r   r   r   r   r   r   FF)r   r{   rz   )__name__
__module____qualname____doc__strr   r   floatboolrT   r   r=  r   r   r   r   r   r   r   r   r   r   r   r  r	  r!  rE  r\  __classcell__rR   rR   ro   rq   r      s   	
 !"#$%&'() 
w


&
*
.

e
i
V
R5))r>  r   r   torch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.metrics.compute_accr   funasr.utils.datadir_writerr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   funasr.models.scama.utilsr   registerr   Moduler   rR   rR   rR   rq   <module>   s&   
