o
    ߥi                     @   s   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	d
lmZmZ d	dlmZ ejejejdG dd deZG dd de	jZ dS )    N)DictOptionalTuple)Models)
TorchModel)Tensor)MODELS)update_conf)Tasks   )
GlobalCMVNload_kaldi_cmvn)FSMN)module_namec                       sp   e Zd ZdZ					ddededed	ed
edee f fddZ	dd Z
deeef fddZdd Z  ZS )FSMNDecoratorz? A decorator of FSMN for integrating into modelscope framework N  '
  F	model_dir	cmvn_filebackbone	input_dim
output_dimtrainingc           	         sZ   t  j|g|R i | d| _d| _|r | ||||| _dS |tj|dd| _dS )a  initialize the fsmn model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            cmvn_file (str): cmvn file
            backbone (dict): params related to backbone
            input_dim (int): input dimension of network
            output_dim (int): output dimension of network
            training (bool): training or inference mode
        Nzconfig.yaml)model_workspaceconfig_path)super__init__model	model_cfg
init_modelospathjoin)	selfr   r   r   r   r   r   argskwargs	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/kws/nearfield/model.pyr      s   

zFSMNDecorator.__init__c                 C   s   t | dr| j  d S d S )Ntmp_dir)hasattrr*   cleanupr#   r(   r(   r)   __del__<   s   
zFSMNDecorator.__del__returnc                 C   s$   | j dur|dur| j |S | jS )zP
        Args:
            input (torch.Tensor): Input tensor (B, T, D)
        N)r   forwardr   )r#   inputr(   r(   r)   r0   @   s   zFSMNDecorator.forwardc                 C   s   |d urt |\}}tt| t| }nd }d}d }	|d }
|d }|d }|d }|d }|d }|d }|d	 }|d
 }t||
|||||||||}d }d }t|||||	|||}|S )N   input_affine_dim
num_layers
linear_dimproj_dim
left_orderright_orderleft_strideright_strideoutput_affine_dim)r   r   torch
from_numpyfloatr   KWSModel)r#   r   r   r   r   meanistdglobal_cmvn
hidden_dimpreprocessingr3   r4   r5   r6   r7   r8   r9   r:   r;   
classifier
activation	kws_modelr(   r(   r)   r   J   s8   

zFSMNDecorator.init_model)NNr   r   F)__name__
__module____qualname____doc__strdictintr   boolr   r.   r   r   r0   r   __classcell__r(   r(   r&   r)   r      s.    !
r   c                       s   e Zd ZdZdedededeej deej dejdejd	ejf fd
dZdd Z	dd Z
ejdddejdfdejdejdeejejf fddZdd Z  ZS )r?   a  Our model consists of four parts:
    1. global_cmvn: Optional, (idim, idim)
    2. preprocessing: feature dimension projection, (idim, hdim)
    3. backbone: backbone or feature extractor of the whole network, (hdim, hdim)
    4. classifier: output layer or classifier of KWS model, (hdim, odim)
    5. activation:
        nn.Sigmoid for wakeup word
        nn.Identity for speech command dataset
    idimodimhdimrB   rD   r   rE   rF   c	           	         s>   t    || _|| _|| _|| _|| _|| _|| _|| _	dS )ab  
        Args:
            idim (int): input dimension of network
            odim (int): output dimension of network
            hdim (int): hidden dimension of network
            global_cmvn (nn.Module): cmvn for input feature, (idim, idim)
            preprocessing (nn.Module): feature dimension projection, (idim, hdim)
            backbone (nn.Module): backbone or feature extractor of the whole network, (hdim, hdim)
            classifier (nn.Module): output layer or classifier of KWS model, (hdim, odim)
            activation (nn.Module): nn.Identity for training, nn.Sigmoid for inference
        N)
r   r   rQ   rR   rS   rB   rD   r   rE   rF   )	r#   rQ   rR   rS   rB   rD   r   rE   rF   r&   r(   r)   r   w   s   

zKWSModel.__init__c                 C   s
   | j  S N)r   to_kaldi_netr-   r(   r(   r)   rU      s   
zKWSModel.to_kaldi_netc                 C   s   | j |S rT   )r   to_pytorch_net)r#   
kaldi_filer(   r(   r)   rV      s   zKWSModel.to_pytorch_netr   )dtypexin_cacher/   c                 C   sh   | j d ur
|  |}| jd ur| |}| ||\}}| jd ur&| |}| jd ur0| |}||fS rT   )rB   rD   r   rE   rF   )r#   rY   rZ   	out_cacher(   r(   r)   r0      s   







zKWSModel.forwardc                 C   s"   | j d ur
| j   | j  d S rT   )rD   fuse_modulesr   r-   r(   r(   r)   r\      s   

zKWSModel.fuse_modules)rH   rI   rJ   rK   rN   r   nnModuler   rU   rV   r<   zerosr>   r   r   r0   r\   rP   r(   r(   r&   r)   r?   l   s<    
	 
r?   )!r    systempfiletypingr   r   r   r<   torch.nnr]   modelscope.metainfor   modelscope.modelsr   modelscope.models.baser   modelscope.models.builderr   "modelscope.utils.audio.audio_utilsr	   modelscope.utils.constantr
   cmvnr   r   fsmnr   register_modulekeyword_spotting"speech_kws_fsmn_char_ctc_nearfieldr   r^   r?   r(   r(   r(   r)   <module>   s(   T