o
    #i                  
   @   s   d dl mZ d dlZd dlmZ d dlZG dd dejZG dd dejZG dd dejZ	d	e
d
ededede	f
ddZde	fddZdS )    )TupleNc                       @   e Zd ZdZdedef fddZdejdejfdd	Z  Z	S )
AttPoolzAttention-Pooling module that estimates the attention score.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    	input_dimatt_dimc                    s.   t t|   t|d| _t||| _d S )N   )superr   __init__nnLinearlinear1linear2selfr   r   	__class__ _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/torchaudio/models/squim/subjective.pyr	      s   zAttPool.__init__xreturnc                 C   sF   |  |}|dd}tjj|dd}t||d}| |}|S )zApply attention and pooling.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
           r   dim)	r   	transposer
   
functionalsoftmaxtorchmatmulsqueezer   )r   r   attr   r   r   forward   s   


zAttPool.forward
__name__
__module____qualname____doc__intr	   r   Tensorr    __classcell__r   r   r   r   r      s    r   c                       r   )
	PredictorzPrediction module that apply pooling and attention, then predict subjective metric scores.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    r   r   c                    s$   t t|   t||| _|| _d S N)r   r)   r	   r   att_pool_layerr   r   r   r   r   r	   0   s   
zPredictor.__init__r   r   c                 C   sD   |  |}tjj|dd}tjdd| j|jd}|| jdd}|S )a  Predict subjective evaluation metric score.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   r   r      )stepsdevice)	r+   r
   r   r   r   linspacer   r.   sum)r   r   Br   r   r   r    5   s
   
	zPredictor.forwardr!   r   r   r   r   r)   (   s    r)   c                       sp   e Zd ZdZdejdejdejf fddZdejdejd	e	ejejf fd
dZ
dejdejfddZ  ZS )SquimSubjectiveaP  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
    for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adopted from *NORESQA-MOS*
    :cite:`manocha2022speech` which predicts MOS scores given the input speech and a non-matching reference.

    Args:
        ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
        projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
        predictor (torch.nn.Module): Predict the subjective scores.
    	ssl_model	projector	predictorc                    s$   t t|   || _|| _|| _d S r*   )r   r2   r	   r3   r4   r5   )r   r3   r4   r5   r   r   r   r	   P   s   
zSquimSubjective.__init__waveform	referencer   c                    s`   |j d } j d }||k r$|| d }tj fddt|D dd | ddd|f fS )a  Cut or pad the reference Tensor to make it aligned with waveform Tensor.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
                with same dimensions `(batch, time)`.
        r   c                    s   g | ]} qS r   r   ).0_r7   r   r   
<listcomp>e   s    z1SquimSubjective._align_shapes.<locals>.<listcomp>r   N)shaper   catrange)r   r6   r7   
T_waveformT_referencenum_paddingr   r;   r   _align_shapesV   s   

 zSquimSubjective._align_shapesc                 C   sh   |  ||\}}| | j|d d }| | j|d d }tj||fdd}| |}d| S )a  Predict subjective evaluation metric score.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   r8   r   r      )rC   r4   r3   extract_featuresr   r>   r5   )r   r6   r7   concat
score_diffr   r   r   r    h   s   

zSquimSubjective.forward)r"   r#   r$   r%   r
   Moduler	   r   r'   r   rC   r    r(   r   r   r   r   r2   E   s
     
&r2   ssl_typefeat_dimproj_dimr   r   c                 C   s4   t tj|  }t||}t|d |}t|||S )a  Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.

    Args:
        ssl_type (str): Type of self-supervised learning (SSL) models.
            Must be one of ["wav2vec2_base", "wav2vec2_large"].
        feat_dim (int): Feature dimension of the SSL feature representation.
        proj_dim (int): Output dimension of projection layer.
        att_dim (int): Dimension of attention scores.
    r   )getattr
torchaudiomodelsr
   r   r)   r2   )rI   rJ   rK   r   r3   r4   r5   r   r   r   squim_subjective_modelz   s   rO   c                   C   s   t dddddS )zXBuild :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments.wav2vec2_basei       rD   )rI   rJ   rK   r   )rO   r   r   r   r   squim_subjective_base   s   rR   )typingr   r   torch.nnr
   rM   rH   r   r)   r2   strr&   rO   rR   r   r   r   r   <module>   s&     5
