o
    .wiR@                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlZ	d dl
Z
d dlmZ d dl
mZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZmZ er]er]d dlZd dlZnd	\ZZd
ddgiZdZdededefddZede ej!e"e#ef f fddZ$d3ddZ%G dd dej!Z&G dd dej!Z'G dd dej!Z(G dd dej!Z)G dd  d ej!Z*G d!d" d"ej!Z+G d#d$ d$ej!Z,G d%d& d&e
jj!Z-d'e	j.d(ed)e"e#ef de	j.fd*d+Z/d,ed)e"e#ef de eef fd-d.Z0d/ej!d0edej1fd1d2Z2dS )4    N)	lru_cache)Any)Tensor)adaptive_max_pool2drelusoftmax)pack_padded_sequencepad_packed_sequence)rank_zero_info)_LIBROSA_AVAILABLE_REQUESTS_AVAILABLE)NN)'non_intrusive_speech_quality_assessmentlibrosarequestsz~/.torchmetrics/NISQApredsfsreturnc                 C   s   t rtstdt \}}t|tr|dkrtd| |  | d| j	d }t
|  ||}tt||\}}t  ||||j	d }W d   n1 sXw   Y  || j	dd d S )u6  `Non-Intrusive Speech Quality Assessment`_ (NISQA v2.0) [1], [2].

    .. hint::
        Usingsing this metric requires you to have ``librosa`` and ``requests`` installed. Install as
        ``pip install librosa requests``.

    Args:
        preds: float tensor with shape ``(...,time)``
        fs: sampling frequency of input

    Returns:
        Float tensor with shape ``(...,5)`` corresponding to overall MOS, noisiness, discontinuity, coloration and
        loudness in that order

    Raises:
        ModuleNotFoundError:
            If ``librosa`` or ``requests`` are not installed
        RuntimeError:
            If the input is too short, causing the number of mel spectrogram windows to be zero
        RuntimeError:
            If the input is too long, causing the number of mel spectrogram windows to exceed the maximum allowed

    Example:
        >>> import torch
        >>> from torchmetrics.functional.audio.nisqa import non_intrusive_speech_quality_assessment
        >>> _ = torch.manual_seed(42)
        >>> preds = torch.randn(16000)
        >>> non_intrusive_speech_quality_assessment(preds, 16000)
        tensor([1.0433, 1.9545, 2.6087, 1.3460, 1.7117])

    References:
        - [1] G. Mittag and S. Möller, "Non-intrusive speech quality assessment for super-wideband speech communication
          networks", in Proc. ICASSP, 2019.
        - [2] G. Mittag, B. Naderi, A. Chehadi and S. Möller, "NISQA: A deep CNN-self-attention model for
          multidimensional speech quality prediction with crowdsourced datasets", in Proc. INTERSPEECH, 2021.

    ziNISQA metric requires that librosa and requests are installed. Install as `pip install librosa requests`.r   z9Argument `fs` expected to be a positive integer, but got N)   )r   r   ModuleNotFoundError_load_nisqa_model
isinstanceint
ValueErrorevalreshapeshape_get_librosa_melspeccpunumpy_segment_specstorch
from_numpyno_gradexpand)r   r   modelargsxn_wins r)   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/torchmetrics/functional/audio/nisqa.pyr   B   s   &

r   c                  C   sb   t jt jtd} t j| st  tj| ddd}|d }t	|}|j
|d dd ||fS )zLoad NISQA model and its parameters.

    Returns:
        Tuple ``(model,args)`` where ``model`` is the NISQA model and ``args`` is a dictionary with all its parameters

    	nisqa.tarr   T)map_locationweights_onlyr&   model_state_dict)strict)ospath
expanduserjoin	NISQA_DIRexists_download_weightsr!   load	_NISQADIMload_state_dict)
model_path
checkpointr&   r%   r)   r)   r*   r   {   s   r   c                  C   s   d} t jt}t j|dd t j|d}t j|rdS td|  d|  t	| }t
|d}||j W d   dS 1 sDw   Y  dS )	zDownload NISQA model weights.zNhttps://github.com/gabrielmittag/NISQA/raw/refs/heads/master/weights/nisqa.tarT)exist_okr+   Nzdownloading z to wb)r0   r1   r2   r4   makedirsr3   r5   r
   r   getopenwritecontent)url	nisqa_dirsavetomyfilefr)   r)   r*   r6      s   
"r6   c                       D   e Zd Zdeeef ddf fddZdededefdd	Z  Z	S )
r8   r&   r   Nc                    s6   t    t|| _t|| _t|}t|d| _d S )Nr   )	super__init__
_Framewisecnn_TimeDependencytime_dependency_Pooling_get_clonespool_layers)selfr&   pool	__class__r)   r*   rJ      s
   


z_NISQADIM.__init__r'   r(   c                    s@   |   |  \  fdd| jD }tj|ddS )Nc                    s   g | ]}| qS r)   r)   ).0modr(   r'   r)   r*   
<listcomp>       z%_NISQADIM.forward.<locals>.<listcomp>   dim)rL   rN   rQ   r!   cat)rR   r'   r(   outr)   rX   r*   forward   s   z_NISQADIM.forward
__name__
__module____qualname__dictstrr   rJ   r   r`   __classcell__r)   r)   rT   r*   r8      s    r8   c                       rH   )
rK   r&   r   Nc                       t    t|| _d S N)rI   rJ   	_AdaptCNNr%   rR   r&   rT   r)   r*   rJ         
z_Framewise.__init__r'   r(   c                 C   sN   t ||ddd}| |jd}|j|d}t|ddt| d\}}|S )NTF)batch_firstenforce_sortedr[   )data        )rm   padding_valuetotal_length)r   r%   ro   	unsqueeze_replacer	   r   max)rR   r'   r(   x_packed_r)   r)   r*   r`      s
   z_Framewise.forwardra   r)   r)   rT   r*   rK          rK   c                       s@   e Zd Zdeeef ddf fddZdedefddZ  Z	S )	rj   r&   r   Nc                    s  t    |d | _|d | _|d | _tj|d d| _|d d dkr'd	nd
}tjd|d |d |d| _	t
| j	j| _tj| j	j|d |d |d| _t
| jj| _tj| jj|d |d |d| _t
| jj| _tj| jj|d |d |d| _t
| jj| _tj| jj|d |d |d| _t
| jj| _tj| jj|d |d d |d d fd	d| _t
| jj| _d S )N
cnn_pool_1
cnn_pool_2
cnn_pool_3cnn_dropout)pcnn_kernel_sizer   r[   )r[   r   )r[   r[   cnn_c_out_1)paddingcnn_c_out_2cnn_c_out_3)rI   rJ   pool_1pool_2pool_3nn	Dropout2ddropoutConv2dconv1BatchNorm2dout_channelsbn1conv2bn2conv3bn3conv4bn4conv5bn5conv6bn6)rR   r&   cnn_padrT   r)   r*   rJ      s.   



    z_AdaptCNN.__init__r'   c                 C   s   t | | |}t|| jd}t | | |}t|| jd}| |}t | 	| 
|}| |}t | | |}t|| jd}| |}t | | |}| |}t | | |}|d| jj| jd  S )N)output_sizer   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   viewr   )rR   r'   r)   r)   r*   r`      s   



z_AdaptCNN.forwardra   r)   r)   rT   r*   rj      s    rj   c                       rH   )
rM   r&   r   Nc                    rh   ri   )rI   rJ   _SelfAttentionr%   rk   rT   r)   r*   rJ      rl   z_TimeDependency.__init__r'   r(   c                 C      |  ||S ri   r%   rR   r'   r(   r)   r)   r*   r`         z_TimeDependency.forwardra   r)   r)   rT   r*   rM      rx   rM   c                       sV   e Zd Zdeeef ddf fddZdddZded	ede	eef fd
dZ
  ZS )r   r&   r   Nc                    s`   t    t|}t|d | _t|d |d d  |d | _t||d | _	| 
  d S )Ntd_sa_d_modelr   r{   r   td_sa_num_layers)rI   rJ   _SelfAttentionLayerr   	LayerNormnorm1LinearlinearrP   layers_reset_parameters)rR   r&   encoder_layerrT   r)   r*   rJ      s   
"z_SelfAttention.__init__c                 C   s*   |   D ]}| dkrtj| qd S )Nr[   )
parametersr]   r   initxavier_uniform_)rR   r}   r)   r)   r*   r      s
   z _SelfAttention._reset_parameterssrcr(   c                 C   sJ   |  |}|dd}| |}| jD ]	}|||\}}q|dd|fS )Nr[   r   )r   	transposer   r   )rR   r   r(   outputrW   r)   r)   r*   r`     s   


z_SelfAttention.forwardr   N)rb   rc   rd   re   rf   r   rJ   r   r   tupler`   rg   r)   r)   rT   r*   r      s    
&r   c                       sL   e Zd Zdeeef ddf fddZdededeeef fdd	Z	  Z
S )
r   r&   r   Nc                    s   t    t|d |d |d | _t|d |d | _t|d | _t|d |d | _	t
|d | _t
|d | _t|d | _t|d | _t| _d S )Nr   td_sa_nheadtd_sa_dropouttd_sa_h)rI   rJ   r   MultiheadAttention	self_attnr   linear1Dropoutr   linear2r   r   norm2dropout1dropout2r   
activationrk   rT   r)   r*   rJ     s   

z_SelfAttentionLayer.__init__r   r(   c              	   C   s   t |jd d d d f |d d d f k }| j|||| dd }|| | }| |}| | | | 	|}|| 
| }| |}||fS )Nr   )key_padding_mask)r!   aranger   r   r   r   r   r   r   r   r   r   )rR   r   r(   masksrc2r)   r)   r*   r`     s   ,

z_SelfAttentionLayer.forward)rb   rc   rd   re   rf   r   rJ   r   r   r`   rg   r)   r)   rT   r*   r   
  s    &r   c                       rH   )
rO   r&   r   Nc                    rh   ri   )rI   rJ   
_PoolAttFFr%   rk   rT   r)   r*   rJ   %  rl   z_Pooling.__init__r'   r(   c                 C   r   ri   r   r   r)   r)   r*   r`   )  r   z_Pooling.forwardra   r)   r)   rT   r*   rO   #  rx   rO   c                       rH   )
r   r&   r   Nc                    s^   t    t|d |d | _t|d d| _t|d d| _t| _t	|d | _
d S )Nr   
pool_att_hr[   pool_att_dropout)rI   rJ   r   r   r   r   linear3r   r   r   r   rk   rT   r)   r*   rJ   /  s   
z_PoolAttFF.__init__r'   r(   c              	   C   s   |  | | | |}|dd}t|jd d d d f |d d d f k }td||	d < t
|dd}t||}|d}| |S )N   r[   z-infr\   )r   r   r   r   r   r!   r   r   floatrs   r   bmmsqueezer   )rR   r'   r(   attr   r)   r)   r*   r`   7  s   ,

z_PoolAttFF.forwardra   r)   r)   rT   r*   r   -  s    r   ysrr&   c                 C   s   t ||d  }t ||d  }t * tjddd tjj| |d|d ||dd	d
d|d d|d ddd}W d   n1 sAw   Y  tdd |D S )a  Compute mel spectrogram from waveform using librosa.

    Args:
        y: waveform with shape ``(batch_size,time)``
        sr: sampling rate
        args: dictionary with all NISQA parameters

    Returns:
        Mel spectrogram with shape ``(batch_size,n_mels,n_frames)``

    ms_hop_lengthms_win_lengthignorez-Empty filters detected in mel frequency basis)messageNms_n_ffthannTreflect      ?	ms_n_melsrp   ms_fmaxFslaney)r   r   Sn_fft
hop_length
win_lengthwindowcenterpad_modepowern_melsfminfmaxhtknormc                 S   s   g | ]}t j|d dddqS )r   g-C6?g      T@)refamintop_db)r   amplitude_to_db)rV   mr)   r)   r*   rY   h  s    z(_get_librosa_melspec.<locals>.<listcomp>)	r   warningscatch_warningsfilterwarningsr   featuremelspectrogramnpstack)r   r   r&   r   r   melspecr)   r)   r*   r   B  s.   
r   r'   c           
      C   s  |d }|d }|d }| j d |d  }|dk rtdt|}t|}|d|d }| dddd|ddf d	d} | dddd|f } t|| }||k r^td
t| j d || j d | j d	 f}	| |	ddd|f< |	t	|fS )a   Segment mel spectrogram into overlapping windows.

    Args:
        x: mel spectrogram with shape ``(batch_size,n_mels,n_frames)``
        args: dictionary with all NISQA parameters

    Returns:
        Tuple ``(x_padded,n_wins)```, where ``x_padded`` is the segmented mel spectrogram with shape
        ``(batch_size,max_length,n_mels,seg_length)`` where the second dimension is the number of windows and was
        padded to ``max_length``, and ``n_wins`` is the number of windows and is 0-dimensional

    ms_seg_lengthms_seg_hop_lengthms_max_segmentsr   r[   zInput signal is too short.r   N   zFMaximum number of mel spectrogram windows exceeded. Use shorter audio.)
r   RuntimeErrorr!   r   rs   r   mathceilzerostensor)
r'   r&   
seg_lengthseg_hop
max_lengthr(   idx1idx2idx3x_paddedr)   r)   r*   r    k  s"   

&$r    modulenc                    s   t  fddt|D S )z Create ``n`` copies of a module.c                    s   g | ]}t  qS r)   )copydeepcopy)rV   ir  r)   r*   rY     rZ   z_get_clones.<locals>.<listcomp>)r   
ModuleListrange)r  r  r)   r  r*   rP     s   rP   r   )3r  r   r0   r   	functoolsr   typingr   r   r   r!   torch.nnr   r   torch.nn.functionalr   r   r   torch.nn.utils.rnnr   r	   torchmetrics.utilitiesr
   torchmetrics.utilities.importsr   r   r   r   __doctest_requires__r4   r   r   r   Modulere   rf   r   r6   r8   rK   rj   rM   r   r   rO   r   ndarrayr   r    r  rP   r)   r)   r)   r*   <module>   sF   %
9"
,

&&) 