o
    ߥi&                     @   s   d dl Z d dlmZmZ d dlZd dlZd dlmZ d dl	m
  mZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ G d	d
 d
ejZejejejdG dd deZdS )    N)AnyDict)Models)MODELS
TorchModel)CAMPPlus)
DenseLayer)Tasks)create_devicec                       s,   e Zd Z			d fdd	Zdd Z  ZS )	LinearClassifierr        c                    s`   t    t | _tjdd| _t|D ]}| jt	||dd |}qtj
||dd| _d S )NT)inplace)bias)super__init__nn
ModuleListblocksReLU	nonlinearrangeappendr   Linearlinear)self	input_dim
num_blocks	inter_dimout_neurons_	__class__ h/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/audio/sv/lanuage_recognition_model.pyr      s   

zLinearClassifier.__init__c                 C   s,   |  |}| jD ]}||}q| |}|S )N)r   r   r   )r   xlayerr#   r#   r$   forward'   s
   



zLinearClassifier.forward)r   r   r   )__name__
__module____qualname__r   r'   __classcell__r#   r#   r!   r$   r      s    r   )module_namec                       sF   e Zd ZdZdeeef f fddZdd Zdd Z	d	d
 Z
  ZS )LanguageRecognitionCAMPPluszA speech language recognition model using the CAM++ architecture as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    model_configc                    s   t  j||g|R i | || _| jd | _| jd | _| jd | _t|d | _t| j| j| _	t
| jt| jd d| _|d }|d }| || | j	| j | j| j | j	  | j  d S )	Nemb_size	fbank_dimsample_ratedevice	languages)r   r   pretrained_encoderpretrained_backend)r   r   r.   r/   feature_dimr1   r
   r2   r   encoderr   lenbackend_load_check_pointtoeval)r   	model_dirr.   argskwargsr4   r5   r!   r#   r$   r   9   s$   
z$LanguageRecognitionCAMPPlus.__init__c                 C   s   t |tjrt|}t|jdkr|d}t|jdks"J d| |}| 	|
| j}| | }| d}||fS )N   r      zFmodelscope error: the shape of input audio to model needs to be [N, T])
isinstancenpndarraytorch
from_numpyr8   shape	unsqueeze_extract_featurer7   r;   r2   r9   detachcpuargmax)r   audiofeatureembsscoresoutputr#   r#   r$   r'   R   s   


z#LanguageRecognitionCAMPPlus.forwardc                 C   sX   g }|D ] }t j|d| j| jd}||jddd }||d qt|}|S )Nr   )num_mel_binssample_frequencyT)dimkeepdim)	KaldifbankrI   r6   r1   meanr   rF   cat)r   rN   featuresaurO   r#   r#   r$   rJ   `   s   
z,LanguageRecognitionCAMPPlus._extract_featurec                 C   sT   | j tjtj| j|tdd | j	tjtj| j|tdd d S )NrL   )map_location)
r7   load_state_dictrF   loadospathjoinr=   r2   r9   )r   r4   r5   r#   r#   r$   r:   l   s   z-LanguageRecognitionCAMPPlus._load_check_point)r(   r)   r*   __doc__r   strr   r   r'   rJ   r:   r+   r#   r#   r!   r$   r-   0   s    r-   ) r`   typingr   r   numpyrD   rF   torch.nnr   torchaudio.compliance.kaldi
compliancekaldirW   modelscope.metainfor   modelscope.modelsr   r    modelscope.models.audio.sv.DTDNNr   'modelscope.models.audio.sv.DTDNN_layersr   modelscope.utils.constantr	   modelscope.utils.devicer
   Moduler   register_modulespeech_language_recognitioncampplus_lrer-   r#   r#   r#   r$   <module>   s"   