o
    ॵi!                     @   s   d dl Z d dlZd dlmZmZmZmZ d dlZd dl	m   m
Z d dlZd dlmZ d dlmZ d dlmZ d dlmZmZ G dd deZd	d
 ZG dd dZeejG dd deZdS )    N)AnyDictTupleUnion)File)Preprocessor)PREPROCESSORS)FieldsModeKeysc                       sR   e Zd ZdZejfdedef fddZdeee	f deee	f fdd	Z
  ZS )
AudioBrainPreprocessorzA preprocessor takes audio file path and reads it into tensor

    Args:
        takes: the audio file field name
        provides: the tensor field name
        mode: process mode, default 'inference'
    takesprovidesc                    sB   t t| j|g|R i | || _|| _dd l}|jjj| _d S )Nr   )superr   __init__r   r   speechbraindataio
read_audio)selfr   r   modeargskwargssb	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/audio.pyr      s
   zAudioBrainPreprocessor.__init__datareturnc                 C   s   |  || j }||| j< |S N)r   r   r   )r   r   resultr   r   r   __call__&   s   
zAudioBrainPreprocessor.__call__)__name__
__module____qualname____doc__r
   	INFERENCEstrr   r   r   r    __classcell__r   r   r   r   r      s    *r   c                 C   s   t | ddd}| }|d}|d|}|d|}tj||d | tjdd	}|d
}|d|}|d|}tj||d | tjdd	}|  ||fS )Nrzutf-8)encodingAddShift[]    )dtypesepRescale)openreadfindnp
fromstringfloat32close)filenamefpall_strpos1pos2pos3meanscaler   r   r   load_kaldi_feature_transform,   s   

rA   c                   @   s0   e Zd ZdZ			dddZdd Zd	d
 ZdS )Featurez%Extract feat from one utterance.
    specNFc                 C   s   || _ || _|d |d  d | _|d |d  d | _tj| jdd| _d| _|durNtj	
|rNtd|  t|\}}t|| _t|| _d	| _|rg| j | _| jri| j | _| j | _dS dS dS )
aF  

        Args:
            fbank_config (dict):
            feat_type (str):
                raw: do nothing
                fbank: use kaldi.fbank
                spec: Real/Imag
                logpow: log(1+|x|^2)
            mvn_file (str): the path of data file for mean variance normalization
            cuda:
        frame_lengthsample_frequencyi  frame_shiftF)periodicNzloading mvn file: T)fbank_config	feat_typen_fft
hop_lengthtorchhamming_windowwindowmvnospathexistsprintrA   
from_numpyshiftr@   cuda)r   rH   rI   mvn_filerV   rU   r@   r   r   r   r   ?   s6   zFeature.__init__c              	   C   s   | j dkr|S | j dkr-ddlm  m} t|jdkr!|d}|j|fi | j}|S | j dkrUt	j
|d | j| j| j| jdd	d
}t	j|j|jgdddd}|S | j dkr|t	j
|| j| j| j| jdd	d
}t	|d }t	d| dd}|S )zm

        Args:
            utt: in [-32768, 32767] range

        Returns:
             [..., T, F]
        rawfbankr   Nr-   rC   i   FT)centerreturn_complexdimlogpow   )rI   torchaudio.compliance.kaldi
compliancekaldilenshape	unsqueezerY   rH   rL   stftrJ   rK   rN   catrealimagpermuteabslog)r   uttrd   featrC   abspowr   r   r   computee   sB   
	



zFeature.computec                 C   s   | j r|| j }|| j }|S r   )rO   rU   r@   )r   rp   r   r   r   	normalize   s   

zFeature.normalize)rC   NF)r!   r"   r#   r$   r   rr   rs   r   r   r   r   rB   ;   s    
&(rB   c                   @   sN   e Zd ZdZdd Zdeeeee	f f deee	f fddZ
edd	 Zd
S )LinearAECAndFbanki>  c                 C   sT   dd l }d| j | _|d | _t|d |d |d | _| | _|d dk| _d S )	Nr   i   linear_aec_delayrH   rI   rO   mask_onnearend_mic)	MinDAECSAMPLE_RATEtrunc_lengthru   rB   featureloadmitaecmask_on_mic)r   	io_configrx   r   r   r   r      s   

zLinearAECAndFbank.__init__r   r   c                 C   s  t |tr| |d \}}| |d \}}t|}n%| |d \}}| |d \}}d|v r=| |d \}}nt|}| j||\}}}}	tt| j	| g}
t
|
|g}tt|t|t|t|	t|}d}t|| j}||| ||| ||| |	|| ||| f\}}}}	}t }tt|}| j|}tj||gdd}tt|}| j|}tj||gdd}tt|	}	| j|	}tj||gdd}| j|}|durtt|}| jr|}n|}|||d}|S )	u7   Linear filtering the near end mic and far end audio, then extract the feature.

        Args:
            data: Dict with two keys and correspond audios: "nearend_mic" and "farend_speech".

        Returns:
            Dict with two keys and Tensor values: "base" linear filtered audio，and "feature"
        r   r-   rw   farend_speechnearend_speechr]   N)basetargetr{   )
isinstancetupleload_wavr5   
zeros_liker}   do_linear_aeczerosintru   concatenateminre   rz   rL   FloatTensorrT   r7   r{   rr   ri   rs   r~   )r   r   rw   fsr   r   out_micout_ref
out_linearout_echoextra_zerosflenfstartrp   fbank_nearend_micfbank_out_linearfbank_out_echor   out_datar   r   r   r       sT   
	

zLinearAECAndFbank.__call__c                 C   s   dd l }t| trt| } nt| trt| }t|} n
tdt	|  dt
| \}}t|jdkr;td|tjkrH|||tj}|tjtjfS )Nr   zUnsupported input type: .r-   z(modelscope error:The audio must be mono.)librosar   bytesioBytesIOr&   r   r3   	TypeErrortypewavre   rf   
ValueErrorrt   ry   resampleastyper5   r7   )inputsr   
file_bytessample_rater   r   r   r   r      s   



zLinearAECAndFbank.load_wavN)r!   r"   r#   ry   r   r   r   r   r&   r   r    staticmethodr   r   r   r   r   rt      s    *	Crt   )r   rP   typingr   r   r   r   numpyr5   scipy.io.wavfilewavfiler   rL   modelscope.fileior   modelscope.preprocessorsr    modelscope.preprocessors.builderr   modelscope.utils.constantr	   r
   r   rA   rB   register_moduleaudiort   r   r   r   r   <module>   s   
Y