o
    ߥi                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlmZ d dl	m
  mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ G d
d dejZejejej dG dd deZ!dS )    N)AnyDict)Models)MODELS
TorchModel)CAMPPlus)
DenseLayer)ERes2Net)Tasks)create_devicec                       s,   e Zd Z			d fdd	Zdd Z  ZS )	LinearClassifierr        c                    s`   t    t | _tjdd| _t|D ]}| jt	||dd |}qtj
||dd| _d S )NT)inplace)bias)super__init__nn
ModuleListblocksReLU	nonlinearrangeappendr   Linearlinear)self	input_dim
num_blocks	inter_dimout_neurons_	__class__ k/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/lanuage_recognition_eres2net.pyr      s   

zLinearClassifier.__init__c                 C   s,   |  |}| jD ]}||}q| |}|S )N)r   r   r   )r   xlayerr$   r$   r%   forward(   s
   



zLinearClassifier.forward)r   r   r   )__name__
__module____qualname__r   r(   __classcell__r$   r$   r"   r%   r      s    r   )module_namec                       sF   e Zd ZdZdeeef f fddZdd Zdd Z	d	d
 Z
  ZS )LanguageRecognitionERes2NetzA speech language recognition model using the ERes2Net architecture as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _| jd | _| jd | _t|d | _t	| j| jd| _
t| jt| jd d| _|d	 }|d
 }| || | j
| j | j| j | j
  | j  d S )N	embed_dimchannels	fbank_dimsample_ratedevice)r0   
m_channels	languages)r   r    pretrained_encoderpretrained_backend)r   r   r/   r0   r5   feature_dimr3   r   r4   r	   encoderr   lenbackend_load_check_pointtoeval)r   	model_dirr/   argskwargsr7   r8   r"   r$   r%   r   :   s*   
z$LanguageRecognitionERes2Net.__init__c                 C   s   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}| | }| d}||fS )N   r      zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarraytorch
from_numpyr;   shape	unsqueeze_extract_featurer:   r>   r4   r<   detachcpuargmax)r   audiofeatureembsscoresoutputr$   r$   r%   r(   U   s   


z#LanguageRecognitionERes2Net.forwardc                 C   sX   g }|D ] }t j|d| j| jd}||jddd }||d qt|}|S )Nr   )num_mel_binssample_frequencyT)dimkeepdim)	KaldifbankrL   r9   r3   meanr   rI   cat)r   rQ   featuresaurR   r$   r$   r%   rM   c   s   
z,LanguageRecognitionERes2Net._extract_featurec                 C   sT   | j tjtj| j|tdd | j	tjtj| j|tdd d S )NrO   )map_location)
r:   load_state_dictrI   loadospathjoinr@   r4   r<   )r   r7   r8   r$   r$   r%   r=   o   s   z-LanguageRecognitionERes2Net._load_check_point)r)   r*   r+   __doc__r   strr   r   r(   rM   r=   r,   r$   r$   r"   r%   r.   1   s    r.   )"rc   typingr   r   numpyrG   rI   torch.nnr   torchaudio.compliance.kaldi
compliancekaldirZ   modelscope.metainfor   modelscope.modelsr   r    modelscope.models.audio.sv.DTDNNr   'modelscope.models.audio.sv.DTDNN_layersr   #modelscope.models.audio.sv.ERes2Netr	   modelscope.utils.constantr
   modelscope.utils.devicer   Moduler   register_modulespeech_language_recognitioneres2net_lrer.   r$   r$   r$   r%   <module>   s$   