o
    i)I                     @   sd  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlZd dlm  mZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, eej-edkrd dl.m/Z/ nedddZ/G dd de,Z0dS )    N)contextmanager)LooseVersion)Dict)List)Optional)Tuple)Union)AbsNormalize)LabelSmoothingLossNllLoss)CTC)
AbsDecoder)
AbsEncoder)AbsFrontend)AbsPostEncoder)AbsPreEncoder)
AbsSpecAug)add_sos_eos)ErrorCalculator)th_accuracy)force_gatherable)FunASRModelz1.6.0)autocastTc                 c   s    d V  d S )N )enabledr   r   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sa_asr/e2e_sa_asr.pyr   #   s   
r   c                +       s8  e Zd ZdZ											dCd	ed
edeeedf ee f de	e
 de	e de	e dedejjdededededededededededededef* fdd Zd!ejd"ejd#ejd$ejd%ejd&ejd'ejd(ejd)eejeeejf ejf fd*d+Zd!ejd"ejd#ejd$ejd)eeejf f
d,d-Zd!ejd"ejd)eejejf fd.d/Zd!ejd"ejd)eejejf fd0d1Zd2ejd3ejd4ejd5ejd)ejf
d6d7Z	8dDd2ejd3ejd4ejd5ejd9ef
d:d;Zd<ejd=ejd3ejd4ejd5ejd%ejd>ejd'ejd(ejfd?d@Zd2ejd3ejd4ejd5ejfdAdBZ  Z S )E
SAASRModelz*CTC-attention hybrid Encoder-Decoder model      ?        FT<space><blank>
vocab_sizemax_spk_num
token_list.frontendspecaug	normalizeasr_encoderspk_encoderdecoderctc
spk_weight
ctc_weightinterctc_weight	ignore_id
lsm_weightlength_normalized_loss
report_cer
report_wer	sym_space	sym_blankextract_feats_in_collect_statsc                    s\  d|  krdksJ | J |d|  krdk s$J | J |t    d| _d| _d| _|| _|| _|| _|| _|| _	|| _
| | _|| _|| _|| _|| _|| _t| jdsbd| j_| jjrrtj|| j | j_d | _|dkr}d | _n|	| _t||||d| _t|||d	| _|s|rt|||||| _|dkrd | _ n|
| _ || _!d S )
Nr         ?r         interctc_use_conditioningF)sizepadding_idx	smoothingnormalize_length)r;   r<   r>   )"super__init__blank_idsoseosr"   r#   r/   r,   r-   r.   copyr$   r%   r&   r'   r(   r)   hasattrr:   torchnnLinearoutput_sizeconditioning_layererror_calculatorr*   r
   criterion_attr   criterion_spkr   r+   r6   )selfr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   	__class__r   r   r@   +   s^   $$




zSAASRModel.__init__speechspeech_lengthstexttext_lengthsprofileprofile_lengthstext_idtext_id_lengthsreturnc	                 C   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }	|ddd| f }| ||\}
}}d}t|
tr\|
d }|
d }
d\}}}}}}d\}}t }| jdkrz| |
|||\}}d}| j	dkr|dur|D ](\}}| ||||\}}|| }|dur|
 nd|d|< ||d|< q|t| }d| j	 | | j	|  }| jd	kr| |
||||||||	\}}}}}}| jdkr|}n| jd	kr|}n| j| d| j |  }| jdkr|}n| j| d| j |  }t|
 |
 |dur|
 nd|dur$|
 nd|dur.|
 nd|||||d

}t|||	f|j\}}}|||fS )a$  Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
            profile: (Batch, Length, Dim)
            profile_lengths: (Batch,)
        r8   r   N)NNNNNNNNr   zloss_interctc_layer{}zcer_interctc_layer{}r7   )
lossloss_asrloss_attloss_ctcloss_spkaccacc_spkcerwercer_ctc)dimshapemaxencode
isinstancetupledictr-   _calc_ctc_lossr.   detachformatlen_calc_att_lossr,   r   device)rN   rQ   rR   rS   rT   rU   rV   rW   rX   
batch_sizeasr_encoder_outencoder_out_lensspk_encoder_outintermediate_outsr]   r_   acc_attra   cer_attwer_attr^   rd   statsloss_interctc	layer_idxintermediate_outloss_iccer_icr\   r[   weightr   r   r   forward   s   :







zSAASRModel.forwardc                 C   s>   | j r| ||\}}ntd| j   ||}}||dS )NzkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)r6   _extract_featsloggingwarning)rN   rQ   rR   rS   rT   r   r   r   r   r   collect_feats   s   

zSAASRModel.collect_featsc                 C   s  t d1 | ||\}}| }| jdur!| jr!| ||\}}| jdur.| ||\}}W d   n1 s8w   Y  | jjrN| j||| jd\}}}n	| ||\}}}d}	t	|t
rf|d }	|d }| ||d }
|
d|dkrtj|
dd|ddd	dd}n|
}|d|dksJ | |df|d| ksJ | | f|d|dksJ | |df|	dur||	f||fS |||fS )
zFrontend + Encoder. Note that this method is used by asr_inference.py

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FN)r+   r8   r   r   nearest)r;   mode)r   r   cloner&   trainingr'   r(   r:   r+   ri   rj   r)   r;   Finterpolate	transposerg   )rN   rQ   rR   r   r   	feats_rawencoder_outrt   _rv   encoder_out_spk_oriencoder_out_spkr   r   r   rh   	  sN   
	


zSAASRModel.encodec                 C   sb   |  dksJ |j|d d d | f }| jd ur(| ||\}}||fS ||}}||fS )Nr8   )re   rf   rg   r%   )rN   rQ   rR   r   r   r   r   r   r   F  s   

zSAASRModel._extract_featsr   rt   ys_padys_pad_lensc                 C   s   t || j| j| j\}}|d }| ||||\}}	|d}
|d}tjjj	|
d||
d| jdd}|
|
d}|jdd}|d|
ksLJ |S )a,  Compute negative log likelihood(nll) from transformer-decoder

        Normally, this function is called in batchify_nll.

        Args:
            encoder_out: (Batch, Length, Dim)
            encoder_out_lens: (Batch,)
            ys_pad: (Batch, Length)
            ys_pad_lens: (Batch,)
        r8   r   r9   r   none)ignore_index	reductionre   )r   rB   rC   r/   r*   r;   rF   rG   
functionalcross_entropyviewsum)rN   r   rt   r   r   	ys_in_pad
ys_out_pad
ys_in_lensdecoder_outr   rr   decoder_num_classnllr   r   r   r   Y  s"   


zSAASRModel.nlld   rr   c                 C   s   | d}||kr| ||||}nIg }d}	 t|| |}	|||	ddddf }
|||	 }|||	ddf }|||	 }| |
|||}|| |	}||krUnqt|}| d|ksdJ |S )a3  Compute negative log likelihood(nll) from transformer-decoder

        To avoid OOM, this fuction seperate the input into batches.
        Then call nll for each batch and combine and return results.
        Args:
            encoder_out: (Batch, Length, Dim)
            encoder_out_lens: (Batch,)
            ys_pad: (Batch, Length)
            ys_pad_lens: (Batch,)
            batch_size: int, samples each batch contain when computing nll,
                        you may change this to avoid OOM or increase
                        GPU memory usage
        r   TN)r;   r   minappendrF   cat)rN   r   rt   r   r   rr   	total_numr   	start_idxend_idxbatch_encoder_outbatch_encoder_out_lensbatch_ys_padbatch_ys_pad_lens	batch_nllr   r   r   batchify_nll  s2   


zSAASRModel.batchify_nllrs   ru   profile_lensc
              	   C   s  t || j| j| j\}
}|d }| ||||
|||\}}}|d}d| j| f}tj||ddd}| 	||}| 
t||}t|d| j|| jd}t|d| j|| jd}| jsd| jd u rid\}}n|jdd}| | | \}}||||||fS )	Nr8   r   r   constant)r   value)ignore_labelrZ   r   )r   rB   rC   r/   r*   r;   r#   r   padrL   rM   rF   logr   r   r"   r   rK   argmaxcpu)rN   rs   ru   rt   r   r   rU   r   rW   rX   r   r   r   r   weights_no_padr   spk_num_no_padr   weightsr]   r_   ra   rw   rx   ry   ys_hatr   r   r   rp     s>   



zSAASRModel._calc_att_lossc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r+   r   rK   r   datar   )rN   r   rt   r   r   r^   rd   r   r   r   r   rl     s   zSAASRModel._calc_ctc_loss)r   r   r   r   r   FTTr    r!   T)r   )!__name__
__module____qualname____doc__intr   r   strr   r   r   r   r	   r   rF   rG   Moduler   r   floatboolr@   Tensorr   r   r   rh   r   r   r   rp   rl   __classcell__r   r   rO   r   r   (   s.   	
Y	

r

=

,
/	

?r   )T)1r   
contextlibr   distutils.versionr   typingr   r   r   r   r   rF   torch.nn.functionalrG   r   r   funasr.layers.abs_normalizer	   "funasr.losses.label_smoothing_lossr
   r   funasr.models.ctcr   !funasr.models.decoder.abs_decoderr   !funasr.models.encoder.abs_encoderr   funasr.frontends.abs_frontendr   )funasr.models.postencoder.abs_postencoderr   'funasr.models.preencoder.abs_preencoderr   !funasr.models.specaug.abs_specaugr   +funasr.models.transformer.utils.add_sos_eosr   funasr.metricsr   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.models.base_modelr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s:   