o
    i3H                     @   s   d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ e ddG dd de
j!Z"dS )    N)UnionDictListTupleOptional)autocast)LabelSmoothingLoss)CTC)add_sos_eos)th_accuracy)force_gatherable)load_audio_text_image_videoextract_fbank)postprocess_utils)DatadirWriter)tablesmodel_classesLCBNetc                E       s  e Zd ZdZ																											
									dIdededededededededededededededededed ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.ed/ed0efD fd1d2Z	d3e
jd4e
jd5e
jd6e
jd7ee
jeee
jf e
jf f
d8d9Zd3e
jd4e
jd7ee
je
jf fd:d;Zd<e
jd=e
jd>e
jd?e
jfd@dAZd<e
jd=e
jd>e
jd?e
jfdBdCZdDdE Z				dJdFefdGdHZ  ZS )Kr   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    LCB-NET: LONG-CONTEXT BIASING FOR AUDIO-VISUAL SPEECH RECOGNITION
    https://arxiv.org/abs/2401.06390
    N      ?              TP   r      F<space><blank>specaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_conftext_encodertext_encoder_confbias_predictorbias_predictor_conffusion_encoderfusion_encoder_confctcctc_conf
ctc_weightinterctc_weight
select_numselect_lengthinsert_blank
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankshare_embeddingc#           ,         s  t    |d urtj|}$|$d
i |}|d ur'tj|}%|%d
i |}tj|}&|&d
d|i|}| }'tj|	}(|(d
d|i|
}	tj|})|)d
i |}tj|}*|*d
i |}|d urwtj|}+|+d
||'d|}|dkr|d u ri }t	d
||'d|}|| _
|d | _|d | _|| _|| _|| _|| _|| _|| _|	| _|| _|| _|| _|| _|| _t| jdsd| j_| jjrtj|| j | j_|| _|dkrd | _ n|| _ t!||||d	| _"d | _#|dkrd | _$n|| _$|"| _%| j%rd | j _&|| _'d | _(d S )Nr2   )r3   encoder_output_sizer   )odimr?   r   interctc_use_conditioningF      ?)sizepadding_idx	smoothingnormalize_length ))super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr	   r5   r6   r7   r3   r4   r-   r   r   r!   r%   r)   r'   r/   r0   r1   hasattrrA   torchnnLinearconditioning_layerr.   r#   r   criterion_atterror_calculatorr+   r>   embedr9   beam_search),selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   kwargsspecaug_classnormalize_classencoder_classr?   text_encoder_classfusion_encoder_classbias_predictor_classdecoder_class	__class__rG   N/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/lcbnet/model.pyrI   #   s   
*



zLCBNet.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s,  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d}	t|tr<|d }	|d }d\}
}}}d\}}t }| jdkrh| ||||\}}|dur`|	 nd|d< ||d< d}| j
dkr|	dur|	D ](\}}| ||||\}}|| }|dur|	 nd|d	|< ||d
|< qu|t |	 }d| j
 | | j
|  }| ||||\}
}}}| jdkr|
}n| jdkr|}n| j| d| j |
  }|
dur|
	 nd|d< ||d< ||d< ||d< t|	 |d< | jrt|d  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   NNNNNNr   loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}rB   loss_attacccerwerloss)lenrC   shapeencode
isinstancetupledictr-   _calc_ctc_lossdetachr.   format_calc_att_lossrQ   cloner9   intsumr   device)rY   re   rf   rg   rh   rZ   
batch_sizeencoder_outencoder_out_lensintermediate_outsrn   acc_attcer_attwer_attrl   rm   statsloss_interctc	layer_idxintermediate_outloss_iccer_icrr   weightrG   rG   rd   forward   s`   





zLCBNet.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | jjrB| j||| jd\}}}n	| ||\}}}d}t|trZ|d }|d }|durd||f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr+   r   r   )	r   r   trainingr   r!   rA   r+   rv   rw   )rY   re   rf   rZ   r   r   _r   rG   rG   rd   ru     s"   



zLCBNet.encoder   r   ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr   r   )ignore_labelrk   )dim)r
   r6   r7   r4   r#   rU   r   viewr3   r   rV   argmaxcpu)rY   r   r   r   r   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   rn   r   r   r   ys_hatrG   rG   rd   r|   (  s   
zLCBNet._calc_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r+   r   rV   r   datar   )rY   r   r   r   r   rl   rm   r   rG   rG   rd   ry   F  s   zLCBNet._calc_ctc_lossc              
   K   s   ddl m} ddlm} ddlm} i }| jd kr'|| j| jd}|j|d |	d}|j| j
|t|d d }||d	< td
|	dd |	dd|	dd|	dd|	ddd}	||	dd|	|| j| jt||| jd
krtd ndd}
|
| _d S )Nr   )
BeamSearch)CTCPrefixScorer)LengthBonus)r+   r7   r   
token_list)r#   length_bonusngramrB   decoding_ctc_weightg333333?	lm_weightr   ngram_weightpenalty)r#   r+   lmr   r   	beam_size   full)r   weightsscorersr6   r7   r3   r   pre_beam_score_key) funasr.models.transformer.searchr   %funasr.models.transformer.scorers.ctcr   .funasr.models.transformer.scorers.length_bonusr   r+   r7   updaterK   r#   rs   rx   r6   r-   rX   )rY   rZ   r   r   r   r   r+   r   r   r   rX   rG   rG   rd   init_beam_searchW  s@   








zLCBNet.init_beam_searchkeyc           (         s  | dddkrtd jd u r%td  jd)i | | dd _i }t|tj	rV| dddkrV||}}	t
|jd	k rL|d d d d d f }|	d u rU|jd }	n]t }
t||j| d
d| dd|d}t }||
 d|d< |d }t
|dkr|d }nddgg}t|| dd|d\}}	t }|| d|d< d}|	  | d |d< |j|d d}|	j|d d}	 ||	\}}t|tr|d }dd |D }t|j|d d}|jdgtj|ddj|d d} ||\}}} |d |d \}}}}|| } j|d | dd| ddd}|d  j }g }| \}}}t|D ]}t|D ]\}} d }!| dd urdt d sZt| d _  j |d  d! }!d"}"t| j!t"ru| j!d|" }#n	| j!d|" # }#t"t$ fd#d$|#}#|%|#}$|&|$}%t'(|$\}&}|| |$|&d%}'|)|' |!d urd&*|$|!d' || < |&|!d( || < q>q8||fS )*Nr   r   z!batch decoding is not implementedzenable beam_searchnbest	data_typesoundfbankr   fsi>  )r   audio_fsr   	tokenizerz0.3f	load_datar   i&  )r   frontendextract_feat
   i  batch_data_timer   )r   c                 S   s   g | ]	}d d |D qS )c                 S   s    g | ]}|d kr|d n|qS )r   r   rG   ).0xrG   rG   rd   
<listcomp>  s     z/LCBNet.inference.<locals>.<listcomp>.<listcomp>rG   )r   sublistrG   rG   rd   r     s    z$LCBNet.inference.<locals>.<listcomp>)dtype
fill_valuemaxlenratior   minlenratio)r   r   r   
output_dirwriter
best_recogr   c                    s   |  j ko|  jko|  jkS )N)r7   r6   r5   )r   rY   rG   rd   <lambda>  s    z"LCBNet.inference.<locals>.<lambda>)r   tokenrg    r   rg   rG   )+rK   NotImplementedErrorrX   logginginfor   r   rv   rQ   Tensorrs   rt   timeperf_counterr   r   r   r   itemtoru   rw   tensornew_fulllongrC   r%   r)   range	enumeraterP   r   r   yseqlisttolistfilter
ids2tokenstokens2textr   sentence_postprocessappendjoin)(rY   data_indata_lengthsr   r   r   rZ   	meta_datare   rf   time1sample_listtime2audio_sample_listocr_sample_listtime3frame_shiftr   r   ocr_list_newocrocr_lengthsocr_lensr   
fusion_out
nbest_hypsresultsbndi	nbest_idxhypibest_writerlast_pos	token_intr   rg   text_postprocessedresult_irG   r   rd   	inference  s   

















!zLCBNet.inference)"NNNNNNNNNNNNNNNNr   r   r   r   Tr   r   r   r   r   r   r   FTTr   r   Frj   )__name__
__module____qualname____doc__strrx   floatr~   boolrI   rQ   r   r   r   r   ru   r|   ry   r   r   r  __classcell__rG   rG   rb   rd   r      s,   	
 !"$ 
^
#

0)#r   typingr   r   r   r   r   r   rQ   torch.nnrR   torch.cuda.ampr   "funasr.losses.label_smoothing_lossr   funasr.models.ctc.ctcr	   +funasr.models.transformer.utils.add_sos_eosr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.load_utilsr   r   funasr.utilsr   funasr.utils.datadir_writerr   funasr.registerr   registerModuler   rG   rG   rG   rd   <module>   s"   
