o
    -iV<                     @   s   d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dlmZm
Z
 g dZG dd dejZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZdS )    N)ListOptionalTuple)nnTensor)ResBlock	MelResNet	Stretch2dUpsampleNetworkWaveRNNc                       s>   e Zd ZdZddeddf fddZdedefd	d
Z  ZS )r   af  ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
       n_freqreturnNc                    sR   t    ttj||dddt|tjddtj||dddt|| _d S )N   Fin_channelsout_channelskernel_sizebiasTinplace)super__init__r   
SequentialConv1dBatchNorm1dReLUresblock_model)selfr   	__class__ R/home/ubuntu/LTX-2/.venv/lib/python3.10/site-packages/torchaudio/models/wavernn.pyr      s   


zResBlock.__init__specgramc                 C   s   |  || S )zPass the input through the ResBlock layer.
        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        )r   r   r#   r!   r!   r"   forward(   s   	zResBlock.forward)r   	__name__
__module____qualname____doc__intr   r   r%   __classcell__r!   r!   r   r"   r      s    r   c                       sP   e Zd ZdZ	ddedededed	ed
df fddZded
efddZ  ZS )r   a  MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    
   r      n_res_blockr   n_hiddenn_outputr   r   Nc                    sh   t     fddt|D }tjtj| |ddt tjddg|tj |ddR  | _d S )	Nc                    s   g | ]}t  qS r!   )r   ).0_r0   r!   r"   
<listcomp>I   s    z&MelResNet.__init__.<locals>.<listcomp>Fr   Tr   r   )r   r   r   )	r   r   ranger   r   r   r   r   melresnet_model)r   r/   r   r0   r1   r   	ResBlocksr   r4   r"   r   D   s   

zMelResNet.__init__r#   c                 C   s
   |  |S )zPass the input through the MelResNet layer.
        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        )r7   r$   r!   r!   r"   r%   S   s   
	zMelResNet.forwardr-   r   r   r   r.   r&   r!   r!   r   r"   r   4   s"    r   c                       s@   e Zd ZdZdededdf fddZdedefd	d
Z  ZS )r	   a  Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    
time_scale
freq_scaler   Nc                    s   t    || _|| _d S N)r   r   r;   r:   )r   r:   r;   r   r!   r"   r   m   s   

zStretch2d.__init__r#   c                 C   s   | | jd | jdS )zPass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        )repeat_interleaver;   r:   r$   r!   r!   r"   r%   s   s   
zStretch2d.forwardr&   r!   r!   r   r"   r	   _   s    r	   c                       sh   e Zd ZdZ					ddee dededed	ed
eddf fddZdedeeef fddZ	  Z
S )r
   a  Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    r-   r   r.   upsample_scalesr/   r   r0   r1   r   r   Nc                    s   t    d}|D ]}||9 }q	|| _|d d | | _t|||||| _t|d| _g }	|D ]2}
t|
d}tj	ddd|
d d fd|
fdd}t
jj|jd|
d d   |	| |	| q/tj|	 | _d S )Nr      r   F)r   r   r   paddingr         ?)r   r   total_scaleindentr   resnetr	   resnet_stretchr   Conv2dtorchinit	constant_weightappendr   upsample_layers)r   r@   r/   r   r0   r1   r   rD   upsample_scale	up_layersscalestretchconvr   r!   r"   r      s$   
	


zUpsampleNetwork.__init__r#   c                 C   sf   |  |d}| |}|d}|d}| |}|ddddd| j| j f }||fS )a  Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        where total_scale is the product of all elements in upsample_scales.
        r   N)rF   	unsqueezerG   squeezerN   rE   )r   r#   resnet_outputupsampling_outputr!   r!   r"   r%      s   



&zUpsampleNetwork.forwardr9   )r'   r(   r)   r*   r   r+   r   r   r   r%   r,   r!   r!   r   r"   r
      s.    "r
   c                       s   e Zd ZdZ							ddee deded	ed
ededededededdf fddZdededefddZe	j
jddedee deeee f fddZ  ZS )r   aW  WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
    based on the implementation from `fatchord/WaveRNN <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wavernn>`__
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    r-      r.   r   r@   	n_classes
hop_lengthr/   n_rnnn_fcr   r   r0   r1   r   Nc                    s:  t    || _|d r|d n|d | _|| _|
d | _|| _|| _tt	
| j| _d}|D ]}||9 }q0|| jkrFtd| d| t||||	|
|| _t|| j d || _tj||dd| _tj|| j |dd| _tjdd| _tjdd| _t|| j || _t|| j || _t|| j| _d S )	NrA   r      z/Expected: total_scale == hop_length, but found z != T)batch_firstr   )r   r   r   _padr[   n_auxrZ   rY   r+   mathlog2n_bits
ValueErrorr
   upsampler   LinearfcGRUrnn1rnn2r   relu1relu2fc1fc2fc3)r   r@   rY   rZ   r/   r[   r\   r   r   r0   r1   rD   rO   r   r!   r"   r      s,   



zWaveRNN.__init__waveformr#   c                    s  | ddkrtd| ddkrtd|d|d}}| d}tjd| j|j|jd}tjd| j|j|jd} |\}}|	dd}|	dd} fddt
d	D }|d
d
d
d
|d |d f }|d
d
d
d
|d |d f }	|d
d
d
d
|d |d f }
|d
d
d
d
|d |d f }tj|d||gdd} |}|} ||\}}|| }|}tj||	gdd} ||\}}|| }tj||
gdd} |} |}tj||gdd} |} |} |}|dS )a  Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        r   z*Require the input channel of waveform is 1z*Require the input channel of specgram is 1r   )dtypedevicerA   c                    s   g | ]} j | qS r!   r`   r2   ir   r!   r"   r5   .  s    z#WaveRNN.forward.<locals>.<listcomp>r.   N   r]   r>   dim)sizerd   rU   rI   zerosr[   rq   rr   re   	transposer6   catrT   rg   ri   rj   rm   rk   rn   rl   ro   )r   rp   r#   
batch_sizeh1h2auxaux_idxa1a2a3a4xresr3   r!   rv   r"   r%     sB   
""""






zWaveRNN.forwardlengthsc                    s  |j }|j}tjj|jjf}|\} |dur#|jj }g }|	 \}}}tj
d|jf||d}	tj
d|jf||d}
tj
|df||d} fddtdD }t|D ]|ddddf }fdd|D \}}}}tj|||gdd}|}|d|	\}}	||	d	  }tj||gdd}|d|
\}}
||
d	  }tj||gdd}t|}tj||gdd}t|}|}tj|dd}t|d }d
| d
j d  d }|| q^t|dd
d	|fS )a  Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``waveforms``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        Nr   )rr   rq   c                    s6   g | ]} d d j | j |d  d d f qS )Nr   rs   rt   )r   r   r!   r"   r5   x  s   6 z!WaveRNN.infer.<locals>.<listcomp>r]   c                    s"   g | ]}|d d d d  f qS r<   r!   )r2   a)ru   r!   r"   r5   ~  s   " rx   r   rA   rC   )rr   rq   rI   r   
functionalpadr_   re   rD   rz   r{   r[   r6   r}   rg   ri   rT   rj   Frelurm   rn   ro   softmaxmultinomialfloatrc   rM   stackpermute)r   r#   r   rr   rq   outputb_sizer3   seq_lenr   r   r   	aux_splitm_ta1_ta2_ta3_ta4_tinplogits	posteriorr!   )r   ru   r   r"   inferK  s@   

zWaveRNN.infer)r-   rX   rX   r.   r   r   r   r<   )r'   r(   r)   r*   r   r+   r   r   r%   rI   jitexportr   r   r   r,   r!   r!   r   r"   r      sF    %	
*92r   )ra   typingr   r   r   rI   torch.nn.functionalr   r   r   r   __all__Moduler   r   r	   r
   r   r!   r!   r!   r"   <module>   s    	#+!G