o
    |ۂi/                     @   sH  U d dl Z d dlmZmZmZ d dlZd dlmZ d dlm  m	Z
 dedefddZdedfZeeef ed	< G d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdedededejjfddZ	d&dededededededed ed!ee defd"d#Zdefd$d%ZdS )'    N)ListOptionalTuplexreturnc                 C   s   dddt d|  d    S )a  The metric defined by ITU-T P.862 is often called 'PESQ score', which is defined
    for narrow-band signals and has a value range of [-0.5, 4.5] exactly. Here, we use the metric
    defined by ITU-T P.862.2, commonly known as 'wide-band PESQ' and will be referred to as "PESQ score".

    Args:
        x (float): Narrow-band PESQ score.

    Returns:
        (float): Wide-band PESQ score.
    g+?g@   g;pΈgׁsF@)mathexp)r    r
   d/home/ubuntu/maya3_transcribe/venv/lib/python3.10/site-packages/torchaudio/models/squim/objective.pytransform_wb_pesq_range	   s   r         ?g      @	PESQRangec                       sF   e Zd Zd
deeef ddf fddZdejdejfdd	Z  Z	S )RangeSigmoid        r   	val_ranger   Nc                    s<   t t|   t|trt|dksJ || _t | _	d S )N   )
superr   __init__
isinstancetuplelenr   nnSigmoidsigmoid)selfr   	__class__r
   r   r       s   zRangeSigmoid.__init__r   c                 C   s,   |  || jd | jd   | jd  }|S )Nr   r   )r   r   r   r   outr
   r
   r   forward&   s   (zRangeSigmoid.forward)r   )
__name__
__module____qualname__r   floatr   torchTensorr!   __classcell__r
   r
   r   r   r      s     r   c                       sF   e Zd ZdZddededdf fdd	Zd
ejdejfddZ  Z	S )EncoderzEncoder module that transform 1D waveform to 2D representations.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 512)
        win_len (int, optional): kernel size in the Conv1D layer. (Default: 32)
           feat_dimwin_lenr   Nc                    s,   t t|   tjd|||d dd| _d S )Nr   r   F)stridebias)r   r)   r   r   Conv1dconv1d)r   r,   r-   r   r
   r   r   3   s   zEncoder.__init__r   c                 C   s    |j dd}t| |}|S )a  Apply waveforms to convolutional layer and ReLU layer.

        Args:
            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.

        Returns:
            (torch,Tensor): Feature Tensor with dimensions `(batch, channel, frame)`.
        r   dim)	unsqueezeFrelur1   r   r
   r
   r   r!   8   s   	zEncoder.forward)r*   r+   )
r"   r#   r$   __doc__intr   r&   r'   r!   r(   r
   r
   r   r   r)   +   s    r)   c                       sJ   e Zd Zddededededdf
 fdd	Zd
ejdejfddZ	  Z
S )	SingleRNNr   rnn_type
input_sizehidden_sizedropoutr   Nc                    sR   t t|   || _|| _|| _tt|||d|ddd| _t	|d || _
d S )Nr   T)r=   batch_firstbidirectionalr   )r   r9   r   r:   r;   r<   getattrr   rnnLinearproj)r   r:   r;   r<   r=   r   r
   r   r   G   s   	zSingleRNN.__init__r   c                 C   s   |  |\}}| |}|S N)rA   rC   )r   r   r    _r
   r
   r   r!   Y   s   
zSingleRNN.forward)r   )r"   r#   r$   strr8   r%   r   r&   r'   r!   r(   r
   r
   r   r   r9   F   s    $r9   c                       s   e Zd ZdZ							dd	ed
ededededededdf fddZdejde	ejef fddZ
dejde	ejef fddZdejdedejfddZdejdejfddZ  ZS )DPRNNa  *Dual-path recurrent neural networks (DPRNN)* :cite:`luo2020dual`.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 64)
        hidden_dim (int, optional): Hidden dimension in the RNN layer of DPRNN. (Default: 128)
        num_blocks (int, optional): Number of DPRNN layers. (Default: 6)
        rnn_type (str, optional): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. (Default: "LSTM")
        d_model (int, optional): The number of expected features in the input. (Default: 256)
        chunk_size (int, optional): Chunk size of input for DPRNN. (Default: 100)
        chunk_stride (int, optional): Stride of chunk input for DPRNN. (Default: 50)
    @         LSTM   d   2   r,   
hidden_dim
num_blocksr:   d_model
chunk_sizechunk_strider   Nc           	         s   t t|   || _tg | _tg | _tg | _tg | _	t
|D ].}| jt||| | jt||| | jtjd|dd | j	tjd|dd q&tt||dt | _|| _|| _d S )Nr   g:0yE>)eps)r   rG   r   rP   r   
ModuleListrow_rnncol_rnnrow_normcol_normrangeappendr9   	GroupNorm
SequentialConv2dPReLUconvrR   rS   )	r   r,   rO   rP   r:   rQ   rR   rS   rE   r   r
   r   r   m   s"   

zDPRNN.__init__r   c                 C   sF   |j d }| j| j|| j  | j  }t|| j|| j g}||fS )N)shaperR   rS   r5   pad)r   r   seq_lenrestr    r
   r
   r   	pad_chunk   s   
zDPRNN.pad_chunkc           	      C   s   |  |\}}|j\}}}|d d d d d | j f  ||d| j}|d d d d | jd f  ||d| j}tj||gdd}|||d| jdd }||fS )Nra      r2   r   )	rf   rb   rS   
contiguousviewrR   r&   cat	transpose)	r   r   r    re   
batch_sizer,   rd   	segments1	segments2r
   r
   r   chunking   s   0.zDPRNN.chunkingre   c           	      C   s   |j \}}}}|dd ||d| jd }|d d d d d d d | jf  ||dd d d d | jd f }|d d d d d d | jd f  ||dd d d d d | j f }|| }|dkrw|d d d d d | f }| }|S )Nr   rg   ra   r   )rb   rk   rh   ri   rR   rS   )	r   r   re   rl   r3   rE   r    out1out2r
   r
   r   merging   s   "HJzDPRNN.mergingc                 C   s*  |  |\}}|j\}}}}|}t| j| j| j| jD ]d\}}	}
}|dddd 	|| |d }||}|	|||ddddd }|	|}|| }|dddd 	|| |d }|
|}|	|||ddddd }||}|| }q| 
|}| ||}|dd }|S )Nr   rg   r   r   ra   )ro   rb   ziprV   rX   rW   rY   permuterh   ri   r`   rr   rk   )r   r   re   rl   rE   dim1dim2r    rV   rX   rW   rY   row_inrow_outcol_incol_outr
   r
   r   r!      s$   "& & 

zDPRNN.forward)rH   rI   rJ   rK   rL   rM   rN   )r"   r#   r$   r7   r8   rF   r   r&   r'   r   rf   ro   rr   r!   r(   r
   r
   r   r   rG   `   s<    		rG   c                       s>   e Zd Zd
deddf fddZdejdejfdd	Z  ZS )AutoPoolr   pool_dimr   Nc                    s>   t t|   || _tj|d| _| dtt	
d d S )Nr2   alphar   )r   r{   r   r|   r   Softmaxsoftmaxregister_parameter	Parameterr&   ones)r   r|   r   r
   r   r      s   zAutoPool.__init__r   c                 C   s0   |  t|| j}tjt||| jd}|S )Nr2   )r   r&   mulr}   sumr|   )r   r   weightr    r
   r
   r   r!      s   zAutoPool.forward)r   )	r"   r#   r$   r8   r   r&   r'   r!   r(   r
   r
   r   r   r{      s    r{   c                       sN   e Zd ZdZdejdejdejf fddZdej	de
ej	 fd	d
Z  ZS )SquimObjectivea  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **objective** metric scores
    for speech enhancement (e.g., STOI, PESQ, and SI-SDR).

    Args:
        encoder (torch.nn.Module): Encoder module to transform 1D waveform to 2D feature representation.
        dprnn (torch.nn.Module): DPRNN module to model sequential feature.
        branches (torch.nn.ModuleList): Transformer branches in which each branch estimate one objective metirc score.
    encoderdprnnbranchesc                    s$   t t|   || _|| _|| _d S rD   )r   r   r   r   r   r   )r   r   r   r   r   r
   r   r      s   
zSquimObjective.__init__r   r   c                 C   sz   |j dkrtd|j  d|tj|d dddd d  }| |}| |}g }| jD ]}|||jdd	 q-|S )
z
        Args:
            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.

        Returns:
            List(torch.Tensor): List of score Tenosrs. Each Tensor is with dimension `(batch,)`.
        r   z/The input must be a 2D Tensor. Found dimension .r   T)r3   keepdimg      ?   r2   )	ndim
ValueErrorr&   meanr   r   r   r[   squeeze)r   r   r    scoresbranchr
   r
   r   r!      s   
 


zSquimObjective.forward)r"   r#   r$   r7   r   ModulerU   r   r&   r'   r   r!   r(   r
   r
   r   r   r      s    	"r   rQ   nheadmetricc                 C   s   t j| || d ddd}t }|dkr't t | | t  t | dt }n+|dkrAt t | | t  t | dttd}nt t | | t  t | d}t |||S )	al  Create branch module after DPRNN model for predicting metric score.

    Args:
        d_model (int): The number of expected features in the input.
        nhead (int): Number of heads in the multi-head attention model.
        metric (str): The metric name to predict.

    Returns:
        (nn.Module): Returned module to predict corresponding metric score.
       r   T)r=   r>   stoir   pesq)r   )r   TransformerEncoderLayerr{   r]   rB   r_   r   r   )rQ   r   r   layer1layer2layer3r
   r
   r   _create_branch   s$   



"r   r,   r-   rO   rP   r:   rR   rS   c	                 C   sb   |du r|d }t | |}	t| ||||||}
tt||dt||dt||dg}t|	|
|S )a  Build a custome :class:`torchaudio.models.squim.SquimObjective` model.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module.
        win_len (int): Kernel size in the Encoder module.
        d_model (int): The number of expected features in the input.
        nhead (int): Number of heads in the multi-head attention model.
        hidden_dim (int): Hidden dimension in the RNN layer of DPRNN.
        num_blocks (int): Number of DPRNN layers.
        rnn_type (str): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"].
        chunk_size (int): Chunk size of input for DPRNN.
        chunk_stride (int or None, optional): Stride of chunk input for DPRNN.
    Nr   r   r   sisdr)r)   rG   r   rU   r   r   )r,   r-   rQ   r   rO   rP   r:   rR   rS   r   r   r   r
   r
   r   squim_objective_model  s   



r   c                
   C   s   t dddddddddS )zSBuild :class:`torchaudio.models.squim.SquimObjective` model with default arguments.rL   rH   r   r   rK   G   )r,   r-   rQ   r   rO   rP   r:   rR   )r   r
   r
   r
   r   squim_objective_base;  s   r   rD   )r   typingr   r   r   r&   torch.nnr   torch.nn.functional
functionalr5   r%   r   r   __annotations__r   r   r)   r9   rG   r{   r   r8   rF   modulesr   r   r   r
   r
   r
   r   <module>   sN   
 `()	

&