o
    i#                     @   s   d dl Z d dlmZmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ ed
dG dd de
jZdS )    N)UnionDictListTupleOptional)CTC)force_gatherable)load_audio_text_image_videoextract_fbank)postprocess_utils)DatadirWriter)tables)
Hypothesismodel_classesr   c                       s  e Zd ZdZ														d+d	ed
ededededededededededededef fddZde	j
de	j
de	j
de	j
dee	j
eee	j
f e	j
f f
ddZde	j
de	j
dee	j
e	j
f fd d!Zd"e	j
d#e	j
d$e	j
d%e	j
fd&d'Z				d,d(efd)d*Z  ZS )-Transformerz*CTC-attention hybrid Encoder-Decoder modelNP   r         Fspecaugspecaug_conf	normalizenormalize_confencoderencoder_confctc_conf
input_size
vocab_size	ignore_idblank_idsoseoslength_normalized_lossc                    s   t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|i|}| }|d u r@i }td|	|d|}|| _	|d urS|n|	d | _
|d ur^|n|	d | _|	| _|
| _|| _|| _|| _d | _|| _|| _d S )Nr   )odimencoder_output_sizer    )super__init__r   specaug_classesgetnormalize_classesencoder_classesoutput_sizer   r   r    r!   r   r   r   r   r   error_calculatorctcr"   )selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   kwargsspecaug_classnormalize_classencoder_classr$   r.   	__class__r%   K/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/ctc/model.pyr'      s0   

zTransformer.__init__speechspeech_lengthstexttext_lengthsreturnc                 K   s   t | dkr|dddf }t | dkr |dddf }|jd }| ||\}}d\}	}
t }| ||||\}	}
|	}t| |d< | j	rTt
|d  }t|||f|j\}}}|||fS )zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   )NNloss)lensizeshapeencodedict_calc_ctc_losstorchclonedetachr"   intsumr   device)r/   r7   r8   r9   r:   r0   
batch_sizeencoder_outencoder_out_lensloss_ctccer_ctcstatsr<   weightr%   r%   r6   forwardF   s"   

zTransformer.forwardc                 K   sR   | j dur| jr|  ||\}}| jdur| ||\}}| ||\}}||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        N)r   trainingr   r   )r/   r7   r8   r0   rJ   rK   r%   r%   r6   r@   s   s   
zTransformer.encoderJ   rK   ys_padys_pad_lensc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r.   rQ   r-   argmaxdatacpu)r/   rJ   rK   rR   rS   rL   rM   ys_hatr%   r%   r6   rB      s   zTransformer._calc_ctc_losskeyc           #         s&  | dddkrtdi }t|tjr=| dddkr=||}}	t|jdk r3|d d d d d f }|	d u r<|jd }	nKt }
t	||j
| dd	| dd|d
}t }||
 d|d< t|| dd|d\}}	t }|| d|d< |	  |j |j d |d< |j|d d}|	j|d d}	 ||	\}}t|tr|d } j|}g }| \}}}t|d ttfr|d }t||k r|| }t|D ]}||d || d d f }|jdd}tj|dd}tj jg|   jg |jd}t|dg}t |D ]\}}d }| dd ur1t! ds't"| d _# j#|d  d }d}t|j$trB|j$d| }n	|j$d|  }tt% fdd|}|&|}|'|}t()|\} }!|| || d}"|*|" |d urd+||d || < | |d || < qq||fS ) NrI   r   z!batch decoding is not implemented	data_typesoundfbank   fsi>  )r^   audio_fsrZ   	tokenizerz0.3f	load_data)rZ   frontendextract_feati  batch_data_timerH   )rH   r   r   )dim)yseq
output_dirwriter
best_recogc                    s   |  j ko|  jko|  jkS )N)r!   r    r   )xr/   r%   r6   <lambda>   s    z'Transformer.inference.<locals>.<lambda>)rY   tokenr9    rm   r9   ),r)   NotImplementedError
isinstancerC   Tensorr=   r?   timeperf_counterr	   r^   r
   rG   itemframe_shiftlfr_ntor@   tupler.   log_softmaxr>   listrangerU   unique_consecutivetensorr    tolistr!   rH   r   	enumeratehasattrr   rh   rf   filter
ids2tokenstokens2textr   sentence_postprocessappendjoin)#r/   data_indata_lengthsrY   r`   rb   r0   	meta_datar7   r8   time1audio_sample_listtime2time3rJ   rK   
ctc_logitsresultsbndirj   rf   
nbest_hyps	nbest_idxhypibest_writerlast_pos	token_intrm   r9   text_postprocessed_result_ir%   rk   r6   	inference   s   







$



!zTransformer.inference)NNNNNNNr   r   r   r   r   r   F)NNNN)__name__
__module____qualname____doc__strrA   rF   boolr'   rC   rq   r   r   rP   r@   rB   rz   r   __classcell__r%   r%   r4   r6   r      s    	
1
-

r   )loggingtypingr   r   r   r   r   rr   rC   torch.nnnnfunasr.models.ctc.ctcr   funasr.train_utils.device_funcsr   funasr.utils.load_utilsr	   r
   funasr.utilsr   funasr.utils.datadir_writerr   funasr.registerr   funasr.models.paraformer.searchr   registerModuler   r%   r%   r%   r6   <module>   s    
