o
    “©i@,  ã                   @   sø  d dl mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl	Z
d dlZd dlmZ d dlZd dlZd dlZd dlm  mZ d dlZd dlZd d	lmZ d d
lmZ d dlmZmZ d dlm Z  d dl!Z!d dl"Z"d dl#Z#dZ$dZ%e ƒ Z&e"j' (¡ rƒdndZ)e!j*de)dZ+e ,d¡Z-e ,d¡Z.e"j/j0ddZ1dd„ Z2G dd„ dƒZ3dd„ Z4dd„ Z5e6dkrzeƒ Z7e7j8de9dd  e7j8d!e9dd  e7 :¡ Z;g g g g g g g g g g g d"œZ<e=eee;j>d#ƒƒƒZ?d$d%„ e?D ƒZ@ejA d&d'¡ZBejA d&d(¡ZCe3eCeBƒZDeeEe?e@ƒƒD ].\ZFZGeG Hd)¡d ZIejJeGdd*d+\ZKZLeKd, eMeNeKƒƒ ZKejJeFdd*d+\ZOZLeOd, eMeNeOƒƒ ZOePeQeOƒeQeKƒƒZReKdeR… ZKeOdeR… ZOeDeFdd-ƒZSe# JeG¡\ZTZUeUdkrkeVjWeUdd.ZXeXeTƒZTe# JeF¡\ZYZUeUdkr‚eVjWeUdd.ZXeXeYƒZYe2eGƒZZe2eFƒZ[e\eeZe[ƒd/ƒZe\eeZe[ƒd/ƒZeTd   ]¡  ¡ eYd   ]¡  ¡ gZ^e-e^dd*d0d1Z_e.dPi e_¤Žj`Z`e"j/jajbe`dd ]¡ Z`e1e`d  e`d2 ƒ c¡ Zde<d3  eeI¡ e<d4  eedeKeOd5ƒ¡ e<d6  eeeKeOdd*d7¡ e<d8  ee5eOeKƒ¡ e<d9  eeSd9 ¡ e<d:  eeSd: ¡ e<d;  eeSd; ¡ e<d<  eeSd< ¡ e<d=  ee¡ e<d>  ee¡ e<d?  eed¡ qe
 fe<¡Zgehd@jie4egd4  j¡ ƒŽ ƒ ehdAjie4egd6  j¡ ƒŽ ƒ ehdBjie4egd8  j¡ ƒŽ ƒ ehdCjie4egd9  j¡ ƒŽ ƒ ehdDjie4egd:  j¡ ƒŽ ƒ ehdEjie4egd;  j¡ ƒŽ ƒ ehdFjie4egd<  j¡ ƒŽ ƒ ehdGjie4egd=  j¡ ƒŽ ƒ ehdHjie4egd>  j¡ ƒŽ ƒ ehdIjie4egd?  j¡ ƒŽ ƒ ejke;jld*dJ emee;jldKƒdLƒZnen od@jie4egd4  j¡ ƒŽ dM ¡ en odAjie4egd6  j¡ ƒŽ dM ¡ en odBjie4egd8  j¡ ƒŽ dM ¡ en odCjie4egd9  j¡ ƒŽ dM ¡ en odDjie4egd:  j¡ ƒŽ dM ¡ en odEjie4egd;  j¡ ƒŽ dM ¡ en odFjie4egd<  j¡ ƒŽ dM ¡ en odGjie4egd=  j¡ ƒŽ dM ¡ en odHjie4egd>  j¡ ƒŽ dM ¡ en odIjie4egd?  j¡ ƒŽ dM ¡ egjpee;jldNƒd-dO dS dS )Qé    )Újoin)Úglob)ÚArgumentParser)Úread)Útqdm)ÚpesqN)Ústoi)Úwer)Úcer)ÚWav2Vec2FeatureExtractorÚWavLMForXVector)ÚEnglishTextNormalizeré€>  g…ëQ¸"@ÚcudaÚcpuzlarge-v3-turbo)Údevicezmicrosoft/wavlm-base-plus-svéÿÿÿÿ)Údimc                 C   s"   t  | ¡}|d  ¡ }t|ƒ}|S )NÚtext)Úwhisper_modelÚ
transcribeÚstripÚ
normalizer)Úwav_pathÚresultÚpred© r   úR/home/ubuntu/.local/lib/python3.10/site-packages/solospeech/metrics/cal_metrics.pyÚasr!   s   
r   c                   @   s0   e Zd Zddd„Zdd
d„Zdd„ Zdd„ ZdS )ÚComputeScoreÚreturnNc                 C   s   t  |¡| _t  |¡| _d S ©N)ÚortÚInferenceSessionÚ	onnx_sessÚp808_onnx_sess)ÚselfÚprimary_model_pathÚp808_model_pathr   r   r   Ú__init__(   s   zComputeScore.__init__éx   é@  é    r   Tc                 C   s<   t jj|||d ||d}|rt j|tjdd d }|jS )Né   )ÚyÚsrÚn_fftÚ
hop_lengthÚn_mels)Úrefé(   )ÚlibrosaÚfeatureÚmelspectrogramÚpower_to_dbÚnpÚmaxÚT)r&   Úaudior2   Ú
frame_sizer1   r/   Úto_dbÚmel_specr   r   r   Úaudio_melspec,   s   zComputeScore.audio_melspecc                 C   s|   |rt  g d¢¡}t  g d¢¡}t  g d¢¡}nt  g d¢¡}t  g d¢¡}t  g d¢¡}||ƒ}||ƒ}	||ƒ}
||	|
fS )N)g~ÎZ!Õu¿gæÔBÉät?gœ¿½¬ãò?g˜Bµ§Ã¼¿)g/¥Ëîà„¿gò­Õ?,œ?g/íÄzÝ!ó?g¹¨X-—*Ï¿)gPùËz©¿g ³7!BVÜ?gojû²Å¿gD²WŠª ï?)g;êYR±¿gö~ñØñ?g•ëz¢§?)g
­Úw=µ¿gc9ðˆó?g%P° zu?)gÀž†ÚÀ¿gFØÇu¿ù?gCy›ÏXÙ¿)r9   Úpoly1d)r&   ÚsigÚbakÚovrÚis_personalized_MOSÚp_ovrÚp_sigÚp_bakÚsig_polyÚbak_polyÚovr_polyr   r   r   Úget_polyfit_val2   s   
zComputeScore.get_polyfit_valc           !      C   sR  t  |¡\}}|}||krt |||¡}n|}t|ƒ}tt| ƒ}	t|ƒ|	k r3t ||¡}t|ƒ|	k s'tt 	t|ƒ| ¡t ƒd }
|}g }g }g }g }g }g }g }t
|
ƒD ]“}|t|| ƒt|t | ƒ… }t|ƒ|	k roqVt |¡ d¡tjd d …f }t | j|d d… d¡ d¡tjd d …d d …f }d|i}d|i}| j d |¡d d d }| j d |¡d d \}}}|  ||||¡\}}}| |¡ | |¡ | |¡ | |¡ | |¡ | |¡ | |¡ qV||| |dœ} |
| d< t |¡| d	< t |¡| d
< t |¡| d< t |¡| d< t |¡| d< t |¡| d< t |¡| d< | S )Nr-   Úfloat32i`ÿÿÿ)r<   Úinput_1r   )ÚfilenameÚ
len_in_secr/   Únum_hopsÚOVRL_rawÚSIG_rawÚBAK_rawÚOVRLÚSIGÚBAKÚP808_MOS)Úsfr   r5   ÚresampleÚlenÚintÚINPUT_LENGTHr9   ÚappendÚfloorÚrangeÚarrayÚastypeÚnewaxisr@   r%   Úrunr$   rL   Úmean)!r&   ÚfpathÚsampling_raterE   ÚaudÚinput_fsÚfsr<   Úactual_audio_lenÚlen_samplesrQ   Úhop_len_samplesÚpredicted_mos_sig_seg_rawÚpredicted_mos_bak_seg_rawÚpredicted_mos_ovr_seg_rawÚpredicted_mos_sig_segÚpredicted_mos_bak_segÚpredicted_mos_ovr_segÚpredicted_p808_mosÚidxÚ	audio_segÚinput_featuresÚp808_input_featuresÚoiÚp808_oiÚp808_mosÚmos_sig_rawÚmos_bak_rawÚmos_ovr_rawÚmos_sigÚmos_bakÚmos_ovrÚ	clip_dictr   r   r   Ú__call__B   s^   ÿ 4





zComputeScore.__call__)r    N)r*   r+   r,   r   T)Ú__name__Ú
__module__Ú__qualname__r)   r@   rL   rƒ   r   r   r   r   r   '   s
    

r   c                 C   s,   | t  | ¡  } t  | ¡}t  | ¡}||fS r!   )r9   Úisnanre   Ústd)Údatare   rˆ   r   r   r   Úmean_stdw   s   

rŠ   c                 C   sè   | j |j ks
J dƒ‚| jdkr!| tjdd…f } |tjdd…f }| tj| ddd } |tj|ddd }tj||  dddtj|d ddd }|| }| | }dt tj|d dd	tj|d dd	 ¡ }t|ƒdkrp|S |d
 S )a  
    Compute Scale-Invariant Signal-to-Distortion Ratio (SI-SDR) using numpy.

    Args:
        estimate (np.ndarray): Estimated signal of shape (num_samples,) or (batch_size, num_samples)
        reference (np.ndarray): Reference (clean) signal of the same shape as estimate.
    
    Returns:
        np.ndarray: SI-SDR value for each signal in the batch (or a single value if single sample).
    z/Estimate and reference must have the same shaper-   Nr   T)ÚaxisÚkeepdimsé   é
   )r‹   r   )ÚshapeÚndimr9   rc   re   ÚsumÚlog10r[   )ÚestimateÚ	referenceÚscaleÚ
projectionÚnoiseÚsdrr   r   r   Úsi_sdr}   s   
(.r™   Ú__main__z
--test_dir)ÚtypeÚdefaultz--output_dir)rO   r   ÚestoiÚsisdrrU   rV   rW   rX   r	   r
   Úsimz
*_pred.wavc                 C   s   g | ]}|  d d¡‘qS )z	_pred.wavz_ref.wav)Úreplace)Ú.0Úitemr   r   r   Ú
<listcomp>­   s    r£   ÚDNSMOSzmodel_v8.onnxzsig_bak_ovr.onnxú/T)r/   ÚmonogÍÌÌÌÌÌì?F)Ú	orig_freqÚnew_freqé   Úpt)rg   ÚpaddingÚreturn_tensorsr-   rO   r   Úwbr   )Úextendedrž   rU   rV   rW   rX   r	   r
   rŸ   u   PESQ: {:.3f} Â± {:.3f}u   ESTOI: {:.3f} Â± {:.3f}u   SISDR: {:.3f} Â± {:.3f}u   OVRL: {:.3f} Â± {:.3f}u   SIG: {:.3f} Â± {:.3f}u   BAK: {:.3f} Â± {:.3f}u   P808_MOS: {:.3f} Â± {:.3f}u   WER: {:.3f} Â± {:.3f}u   CER: {:.3f} Â± {:.3f}u   SIM: {:.3f} Â± {:.3f})Úexist_okz_avg_results.txtÚwÚ
z_results.csv)Úindexr   )qÚos.pathr   r   Úargparser   Ú	soundfiler   r   r   ÚpandasÚpdr5   Úpystoir   Únumpyr9   ÚosÚnumpy.polynomial.polynomialÚ
polynomialÚpolyÚonnxruntimer"   rY   Újiwerr	   Úcalculate_werr
   Úcalculate_cerÚtransformersr   r   Úwhisper.normalizersr   ÚwhisperÚtorchÚ
torchaudioÚSAMPLING_RATEr]   r   r   Úis_availabler   Ú
load_modelr   Úfrom_pretrainedÚxvector_extractorÚxvector_computerÚnnÚCosineSimilarityÚ
cosine_simr   r   rŠ   r™   r„   ÚparserÚadd_argumentÚstrÚ
parse_argsÚargsr‰   ÚsortedÚtest_dirÚnoisy_filesÚclean_filesÚpathr(   r'   ÚdnsmodelÚzipÚ
noisy_fileÚ
clean_fileÚsplitrO   ÚloadÚxÚ_r:   ÚabsÚx_hatÚminr[   ÚlengÚdnsmosÚgt_wavr/   Ú
transformsÚResampleÚ	resamplerÚpred_wavÚgt_asrÚpred_asrÚroundr   r<   ÚinputsÚ
embeddingsÚ
functionalÚ	normalizer¢   rŸ   r^   Ú	DataFrameÚdfÚprintÚformatÚto_numpyÚmakedirsÚ
output_dirÚopenÚlogÚwriteÚto_csvr   r   r   r   Ú<module>   sØ    

P
&


 
          ¦