o
    i"                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ e ddG dd dej!j"Z#dS )    N)autocast)UnionDictListTupleOptional)tables)CTC)postprocess_utils)DatadirWriter)mae_loss)force_gatherable)add_sos_eos)make_pad_mask)ts_prediction_lfr6_standard)load_audio_text_image_videoextract_fbankmodel_classesMonotonicAlignerc                       s   e Zd ZdZ											d!dedee dee d	ed
ee dedee dedee dedef fddZ	de
jde
jde
jde
jdee
jeee
jf e
jf f
ddZdd Zde
jde
jdee
je
jf fddZ				d"defdd Z  ZS )#r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Achieving timestamp prediction while recognizing with non-autoregressive end-to-end ASR model
    https://arxiv.org/abs/2301.12343
    P   Nr   F
input_sizespecaugspecaug_conf	normalizenormalize_confencoderencoder_conf	predictorpredictor_confpredictor_biaslength_normalized_lossc                    s   t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|i|}| }tj|}|di |	}|| _	|| _
|| _|| _t|d| _|
| _d S )Nr   )normalize_length )super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizepredictor_classesr   r   r   r   r   criterion_prer   )selfr   r   r   r   r   r   r   r   r   r   r    kwargsspecaug_classnormalize_classencoder_classencoder_output_sizepredictor_class	__class__r"   Y/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/monotonic_aligner/model.pyr$       s$   

zMonotonicAligner.__init__speechspeech_lengthstexttext_lengthsreturnc                 C   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }|ddd| f }| ||\}}t||dddddddf  |j}| jdkrt	|ddd\}	}|| j }| j
|||dd\}	}	}	}	}
| ||
|
}|}t }|dur|  nd|d< t| |d	< t|||f|j\}}}|||fS )
zFrontend + Encoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
           r   Nmaxlen   )	ignore_idloss_preloss)dimshapemaxencoder   sizetodevicer   r   r   r+   type_asdictdetachcputorchcloner   )r,   r6   r7   r8   r9   
batch_sizeencoder_outencoder_out_lensencoder_out_mask_pre_token_length2rA   rB   statsweightr"   r"   r5   forwardC   s0   :
$


zMonotonicAligner.forwardc           	      C   sR   t ||ddd d d d d f  |j}| j|||\}}}}||||fS )Nr;   r<   )r   rG   rH   rI   r   get_upsample_timestamp)	r,   rQ   rR   	token_numrS   	ds_alphasds_cif_peak	us_alphasus_peaksr"   r"   r5   calc_predictor_timestampv   s   $z)MonotonicAligner.calc_predictor_timestampc                 K   s   t d% | jdur| jr| ||\}}| jdur"| ||\}}W d   n1 s,w   Y  | ||\}}}t|trC|d }||fS )zEncoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        FNr   )r   r   trainingr   r   
isinstancetuple)r,   r6   r7   r-   rQ   rR   rT   r"   r"   r5   rF      s   


zMonotonicAligner.encodekeyc           !      K   s  i }t  }t||j|dd|dd|d\}	}
t  }|| d|d< t|	|dd|d\}}t  }|| d|d	< |  |j |j	 d
 |d< |j
|d d}|j
|d d}| ||\}}tdd |
D 
|j}| j|||d\}}}}g }d }|dd urt| dst|d| _| jd }tt|||
D ]O\}\}}}||}t|d || d  |d || d  t|\}}t||\}}}|| ||d} ||  |r||d || < ||d || < q||fS )Nfsi>  	data_typesound)rd   audio_fsre   	tokenizerz0.3f	load_data)re   frontendextract_feati  batch_data_timerI   )rI   c                 S   s   g | ]}t |d  qS )r;   )len).0ir"   r"   r5   
<listcomp>   s    z.MonotonicAligner.inference.<locals>.<listcomp>)rZ   
output_dirwritertp_res   )rc   r8   	timestamptimestamp_listtimestamp_str)timeperf_counterr   rd   r&   r   sumitemframe_shiftlfr_nrH   rF   rN   tensorrI   r_   hasattrr   rr   	enumeratezip
ids2tokensr   copyr
   sentence_postprocessappend)!r,   data_indata_lengthsrc   rh   rj   r-   	meta_datatime1
audio_listtext_token_int_listtime2r6   r7   time3rQ   rR   r9   rT   r]   r^   resultsibest_writerro   us_alphaus_peak	token_inttokenrw   ru   text_postprocessedtime_stamp_postprocessedresult_ir"   r"   r5   	inference   sl   	









zMonotonicAligner.inference)r   NNNNNNNNr   F)NNNN)__name__
__module____qualname____doc__intr   strr   boolr$   rN   Tensorr   rX   r_   rF   listr   __classcell__r"   r"   r3   r5   r      s|    	
#
3	
 )$rx   r   rN   torch.cuda.ampr   typingr   r   r   r   r   funasr.registerr   funasr.models.ctc.ctcr	   funasr.utilsr
   funasr.utils.datadir_writerr   &funasr.models.paraformer.cif_predictorr   funasr.train_utils.device_funcsr   +funasr.models.transformer.utils.add_sos_eosr   *funasr.models.transformer.utils.nets_utilsr   funasr.utils.timestamp_toolsr   funasr.utils.load_utilsr   r   registernnModuler   r"   r"   r"   r5   <module>   s"   
