o
    i+&                     @   sR  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddlm
Z
 dd	lmZ ddlZdd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( eej)edkrddl*m+Z+ nedddZ+G dd de(Z,dS ) z*
Author: Speech Lab, Alibaba Group, China
    N)contextmanager)LooseVersion)Dict)List)Optional)Tuple)Union)AbsNormalize)LabelSmoothingLoss)CTC)
AbsDecoder)
AbsEncoder)AbsFrontend)AbsPostEncoder)AbsPreEncoder)
AbsSpecAug)add_sos_eos)ErrorCalculator)th_accuracy)force_gatherable)FunASRModelz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   P/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/xvector/e2e_sv.pyr   %   s   
r   c                       s0  e Zd ZdZdedeeedf ee f de	e
 de	e de	e de	e d	ed
e	e dejjdef fddZdejdejdejdejdeejeeejf ejf f
ddZdejdejdejdejdeeejf f
ddZdejdejdeejejf fddZdejdejdeejejf fddZ  ZS )ESPnetSVModelz*CTC-attention hybrid Encoder-Decoder model
vocab_size
token_list.frontendspecaug	normalize
preencoderencoderpostencoderpooling_layerdecoderc                    sN   t    || _| | _|| _|| _|| _|| _|| _	|| _
|	| _|
| _d S r   )super__init__r   copyr   r   r    r!   r"   r$   r#   r%   r&   )selfr   r   r   r    r!   r"   r#   r$   r%   r&   	__class__r   r   r(   -   s   


zESPnetSVModel.__init__speechspeech_lengthstexttext_lengthsreturnc                 C   s  |  dksJ |j|jd |jd   kr&|jd   kr&|jd ks3n J |j|j|j|jf|jd }|ddd| f }| ||\}}d}t|tr[|d }|d }d\}	}
}}d\}}d\}}}t }| jdkr| ||||\}}|dur|	 nd|d< ||d	< d}| j
dkr|dur|D ](\}}| ||||\}}|| }|dur|	 nd|d
|< ||d|< q|t| }d| j
 | | j
|  }| jr| |||\}}}|dur|| j|  }n|}|dur|	 nd|d< ||d< ||d< nI| jdkr| ||||\}	}
}}| jdkr |	}n| jdkr)|}n| j| d| j |	  }|	dur>|		 nd|d< |
|d< ||d< ||d< t|	 |d< t|||f|j\}}}|||fS )zFrontend + Encoder + Decoder + Calc loss
        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)
           r   N)NNNN)NN)NNNg        loss_ctccer_ctczloss_interctc_layer{}zcer_interctc_layer{}loss_transducercer_transducerwer_transducerg      ?loss_attacccerwerloss)dimshapemaxencode
isinstancetupledict
ctc_weight_calc_ctc_lossdetachinterctc_weightformatlenuse_transducer_decoder_calc_transducer_loss_calc_att_losstorchcloner   device)r*   r-   r.   r/   r0   
batch_sizeencoder_outencoder_out_lensintermediate_outsr8   acc_attcer_attwer_attr3   r4   r5   r6   r7   statsloss_interctc	layer_idxintermediate_outloss_iccer_icr<   weightr   r   r   forwardI   s   :





zESPnetSVModel.forwardc                 C   s>   | j r| ||\}}ntd| j   ||}}||dS )NzkGenerating dummy stats for feats and feats_lengths, because encoder_conf.extract_feats_in_collect_stats is )featsfeats_lengths)extract_feats_in_collect_stats_extract_featsloggingwarning)r*   r-   r.   r/   r0   r_   r`   r   r   r   collect_feats   s   

zESPnetSVModel.collect_featsc                 C   s   t d- | ||\}}| jdur| jr| ||\}}| jdur*| ||\}}W d   n1 s4w   Y  | jdurF| ||\}}| ||\}}| jdur[| ||\}}||fS )zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
        FN)r   rb   r    trainingr!   r"   r#   r$   )r*   r-   r.   r_   r`   rQ   rR   r   r   r   r@      s   



zESPnetSVModel.encodec                 C   sb   |  dksJ |j|d d d | f }| jd ur(| ||\}}||fS ||}}||fS )Nr2   )r=   r>   r?   r   )r*   r-   r.   r_   r`   r   r   r   rb      s   

zESPnetSVModel._extract_feats)__name__
__module____qualname____doc__intr   r   strr   r   r   r   r	   r   r   r   rM   nnModuler   r(   Tensorr   r^   re   r@   rb   __classcell__r   r   r+   r   r   *   sv    	

x

"r   )T)-rj   rc   
contextlibr   distutils.versionr   typingr   r   r   r   r   rM   funasr.layers.abs_normalizer	   "funasr.losses.label_smoothing_lossr
   funasr.models.ctcr   !funasr.models.decoder.abs_decoderr   !funasr.models.encoder.abs_encoderr   funasr.frontends.abs_frontendr   )funasr.models.postencoder.abs_postencoderr   'funasr.models.preencoder.abs_preencoderr   !funasr.models.specaug.abs_specaugr   +funasr.models.transformer.utils.add_sos_eosr   funasr.metricsr   funasr.metrics.compute_accr   funasr.train_utils.device_funcsr   funasr.models.base_modelr   __version__torch.cuda.ampr   r   r   r   r   r   <module>   s:    