o
    iW@                     @   s   d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ e ddG dd de
j!Z"dS )    N)UnionDictListTupleOptional)autocast)LabelSmoothingLoss)CTC)add_sos_eos)th_accuracy)force_gatherable)load_audio_text_image_videoextract_fbank)postprocess_utils)DatadirWriter)tablesmodel_classesTransformerc                3       s  e Zd ZdZ																					
					
d?dedededededededededededededededededed ed!ed"ed#ed$ed%ed&ef2 fd'd(Z	d)e
jd*e
jd+e
jd,e
jd-ee
jeee
jf e
jf f
d.d/Zd)e
jd*e
jd-ee
je
jf fd0d1Zd2e
jd3e
jd4e
jd5e
jfd6d7Zd2e
jd3e
jd4e
jd5e
jfd8d9Zd:d; Z				d@d<efd=d>Z  ZS )Ar   z*CTC-attention hybrid Encoder-Decoder modelN      ?        P   r         FT<space><blank>specaugspecaug_conf	normalizenormalize_confencoderencoder_confdecoderdecoder_confctcctc_conf
ctc_weightinterctc_weight
input_size
vocab_size	ignore_idblank_idsoseos
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankshare_embeddingc                     s  t    |d urtj|}|d
i |}|d ur'tj|}|d
i |}tj|}|d
d|i|}| }|d urNtj|}|d
||d|}|dkrb|
d u rXi }
t	d
||d|
}	|| _
|d urk|n|d | _|d urv|n|d | _|| _|| _|| _|| _|| _|| _t| jdsd| j_| jjrtj|| j | j_|| _|dkrd | _n|| _t||||d	| _d | _|dkrd | _n|	| _|| _| jrd | j_ || _!d | _"d S )Nr(   )r)   encoder_output_sizer   )odimr5   r   interctc_use_conditioningF      ?)sizepadding_idx	smoothingnormalize_length )#super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizedecoder_classesr	   r+   r,   r-   r)   r*   r&   r   r   r    hasattrr7   torchnnLinearconditioning_layerr'   r"   r   criterion_atterror_calculatorr$   r4   embedr/   beam_search) selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   kwargsspecaug_classnormalize_classencoder_classr5   decoder_class	__class__r=   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/transformer/model.pyr?      sp   
!

zTransformer.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s,  t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d}	t|tr<|d }	|d }d\}
}}}d\}}t }| jdkrh| ||||\}}|dur`|	 nd|d< ||d< d}| j
dkr|	dur|	D ](\}}| ||||\}}|| }|dur|	 nd|d	|< ||d
|< qu|t |	 }d| j
 | | j
|  }| ||||\}
}}}| jdkr|
}n| jdkr|}n| j| d| j |
  }|
dur|
	 nd|d< ||d< ||d< ||d< t|	 |d< | jrt|d  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   NNNNNNr   loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}r8   loss_attacccerwerloss)lenr9   shapeencode
isinstancetupledictr&   _calc_ctc_lossdetachr'   format_calc_att_lossrG   cloner/   intsumr   device)rO   rX   rY   rZ   r[   rP   
batch_sizeencoder_outencoder_out_lensintermediate_outsra   acc_attcer_attwer_attr_   r`   statsloss_interctc	layer_idxintermediate_outloss_iccer_icre   weightr=   r=   rW   forward   s`   





zTransformer.forwardc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | jjrB| j||| jd\}}}n	| ||\}}}d}t|trZ|d }|d }|durd||f|fS ||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr$   r   r   )	r   r   trainingr   r    r7   r$   ri   rj   )rO   rX   rY   rP   ru   rv   _rw   r=   r=   rW   rh      s"   


zTransformer.encoderu   rv   ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	| ||}
t|d| j|| jd}| j	s5| j
d u r:d\}}n|jdd}| 
| | \}}|
|||fS )Nr   r   )ignore_labelr^   )dim)r
   r,   r-   r*   r"   rK   r   viewr)   r   rL   argmaxcpu)rO   ru   rv   r   r   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   ra   rx   ry   rz   ys_hatr=   r=   rW   ro   	  s   
zTransformer._calc_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r$   r   rL   r   datar   )rO   ru   rv   r   r   r_   r`   r   r=   r=   rW   rl   '  s   zTransformer._calc_ctc_lossc              
   K   s   ddl m} ddlm} ddlm} i }| jd kr'|| j| jd}|j|d |	d}|j| j
|t|d d }||d	< td
|	dd |	dd|	dd|	dd|	ddd}	||	dd|	|| j| jt||| jd
krtd ndd}
|
| _d S )Nr   )
BeamSearch)CTCPrefixScorer)LengthBonus)r$   r-   r   
token_list)r"   length_bonusngramr8   decoding_ctc_weightr   	lm_weightr   ngram_weightpenalty)r"   r$   lmr   r   	beam_size
   full)r   weightsscorersr,   r-   r)   r   pre_beam_score_key) funasr.models.transformer.searchr   %funasr.models.transformer.scorers.ctcr   .funasr.models.transformer.scorers.length_bonusr   r$   r-   updaterA   r"   rf   rk   r,   r&   rN   )rO   rP   r   r   r   r   r$   r   r   r   rN   r=   r=   rW   init_beam_search8  s@   








zTransformer.init_beam_searchkeyc                     s  | dddkrtd jd u r%td  jd$i | | dd _i }t|tj	rV| dddkrV||}}	t
|jd	k rL|d d d d d f }|	d u rU|jd }	nKt }
t||j| d
d| dd|d}t }||
 d|d< t|| dd|d\}}	t }|| d|d< |	  |j |j d |d< |j|d d}|	j|d d}	 ||	\}}t|tr|d } j|d | dd| ddd}|d  j }g }| \}}}t|D ]}t|D ]\}}d }| dd urt ds
t| d _ j|d  d }d}t|jtr%|jd| }n	|jd|   }tt! fdd|}|"|}|#|}t$%|\}}|| ||d }|&| |d urod!'||d" || < ||d# || < qq||fS )%Nrt   r   z!batch decoding is not implementedzenable beam_searchnbest	data_typesoundfbank   fsi>  )r   audio_fsr   	tokenizerz0.3f	load_data)r   frontendextract_feati  batch_data_timers   )rs   r   maxlenratior   minlenratio)xr   r   
output_dirwriter
best_recogr   c                    s   |  j ko|  jko|  jkS )N)r-   r,   r+   )r   rO   r=   rW   <lambda>  s    z'Transformer.inference.<locals>.<lambda>)r   tokenrZ    r   rZ   r=   )(rA   NotImplementedErrorrN   logginginfor   r   ri   rG   Tensorrf   rg   timeperf_counterr   r   r   rr   itemframe_shiftlfr_ntorh   rj   r9   range	enumeraterF   r   r   yseqlisttolistfilter
ids2tokenstokens2textr   sentence_postprocessappendjoin) rO   data_indata_lengthsr   r   r   rP   	meta_datarX   rY   time1audio_sample_listtime2time3ru   rv   
nbest_hypsresultsbndi	nbest_idxhypibest_writerlast_pos	token_intr   rZ   text_postprocessedr   result_ir=   r   rW   	inferencee  s   















!zTransformer.inference)NNNNNNNNNNr   r   r   r   r   r   r   r   r   FTTr   r   Fr]   )__name__
__module____qualname____doc__strrk   floatrq   boolr?   rG   r   r   r   r   rh   ro   rl   r   r   r   __classcell__r=   r=   rU   rW   r      s    	
k
]
'

0)#r   typingr   r   r   r   r   r   rG   torch.nnrH   torch.cuda.ampr   "funasr.losses.label_smoothing_lossr   funasr.models.ctc.ctcr	   +funasr.models.transformer.utils.add_sos_eosr
   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.utils.load_utilsr   r   funasr.utilsr   funasr.utils.datadir_writerr   funasr.registerr   registerModuler   r=   r=   r=   rW   <module>   s"    
