o
    -ÑiV<  ã                   @   s¬   d dl Z d dlmZmZmZ d dlZd dlm  mZ	 d dlmZm
Z
 g d¢ZG dd„ dejƒZG dd„ dejƒZG d	d
„ d
ejƒZG dd„ dejƒZG dd„ dejƒZdS )é    N)ÚListÚOptionalÚTuple)ÚnnÚTensor)ÚResBlockÚ	MelResNetÚ	Stretch2dÚUpsampleNetworkÚWaveRNNc                       s>   e Zd ZdZddeddf‡ fdd„Zdedefd	d
„Z‡  ZS )r   af  ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
    é€   Ún_freqÚreturnNc                    sR   t ƒ  ¡  t tj||dddt |¡tjddtj||dddt |¡¡| _d S )Né   F©Úin_channelsÚout_channelsÚkernel_sizeÚbiasT©Úinplace)ÚsuperÚ__init__r   Ú
SequentialÚConv1dÚBatchNorm1dÚReLUÚresblock_model)Úselfr   ©Ú	__class__© úR/home/ubuntu/LTX-2/.venv/lib/python3.10/site-packages/torchaudio/models/wavernn.pyr      s   


ûzResBlock.__init__Úspecgramc                 C   s   |   |¡| S )zéPass the input through the ResBlock layer.
        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        )r   ©r   r#   r!   r!   r"   Úforward(   s   	zResBlock.forward)r   ©	Ú__name__Ú
__module__Ú__qualname__Ú__doc__Úintr   r   r%   Ú__classcell__r!   r!   r   r"   r      s    r   c                       sP   e Zd ZdZ	ddedededed	ed
df‡ fdd„Zded
efdd„Z‡  ZS )r   a  MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    é
   r   é   Ún_res_blockr   Ún_hiddenÚn_outputr   r   Nc                    sh   t ƒ  ¡  ‡ fdd„t|ƒD ƒ}tjtj|ˆ |ddt ˆ ¡tjddg|¢tjˆ |dd‘R Ž | _d S )	Nc                    s   g | ]}t ˆ ƒ‘qS r!   )r   )Ú.0Ú_©r0   r!   r"   Ú
<listcomp>I   s    z&MelResNet.__init__.<locals>.<listcomp>Fr   Tr   r   )r   r   r   )	r   r   Úranger   r   r   r   r   Úmelresnet_model)r   r/   r   r0   r1   r   Ú	ResBlocksr   r4   r"   r   D   s   

ýüûzMelResNet.__init__r#   c                 C   s
   |   |¡S )zÿPass the input through the MelResNet layer.
        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        )r7   r$   r!   r!   r"   r%   S   s   
	zMelResNet.forward©r-   r   r   r   r.   r&   r!   r!   r   r"   r   4   s"    ÿÿÿÿÿÿþr   c                       s@   e Zd ZdZdededdf‡ fdd„Zdedefd	d
„Z‡  ZS )r	   a‘  Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    Ú
time_scaleÚ
freq_scaler   Nc                    s   t ƒ  ¡  || _|| _d S ©N)r   r   r;   r:   )r   r:   r;   r   r!   r"   r   m   s   

zStretch2d.__init__r#   c                 C   s   |  | jd¡  | jd¡S )zþPass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        éþÿÿÿéÿÿÿÿ)Úrepeat_interleaver;   r:   r$   r!   r!   r"   r%   s   s   
zStretch2d.forwardr&   r!   r!   r   r"   r	   _   s    r	   c                       sh   e Zd ZdZ					ddee dededed	ed
eddf‡ fdd„Zdedeeef fdd„Z	‡  Z
S )r
   añ  Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    r-   r   r.   Úupsample_scalesr/   r   r0   r1   r   r   Nc                    sÔ   t ƒ  ¡  d}|D ]}||9 }q	|| _|d d | | _t|||||ƒ| _t|dƒ| _g }	|D ]2}
t|
dƒ}tj	ddd|
d d fd|
fdd}t
jj |jd|
d d  ¡ |	 |¡ |	 |¡ q/tj|	Ž | _d S )Nr   é   r   F)r   r   r   Úpaddingr   ç      ð?)r   r   Útotal_scaleÚindentr   Úresnetr	   Úresnet_stretchr   ÚConv2dÚtorchÚinitÚ	constant_ÚweightÚappendr   Úupsample_layers)r   r@   r/   r   r0   r1   r   rD   Úupsample_scaleÚ	up_layersÚscaleÚstretchÚconvr   r!   r"   r   ‘   s$   
	

ÿ
zUpsampleNetwork.__init__r#   c                 C   sf   |   |¡ d¡}|  |¡}| d¡}| d¡}|  |¡}| d¡dd…dd…| j| j …f }||fS )a¿  Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        where total_scale is the product of all elements in upsample_scales.
        r   N)rF   Ú	unsqueezerG   ÚsqueezerN   rE   )r   r#   Úresnet_outputÚupsampling_outputr!   r!   r"   r%   °   s   


&zUpsampleNetwork.forwardr9   )r'   r(   r)   r*   r   r+   r   r   r   r%   r,   r!   r!   r   r"   r
   €   s.    ùþýüûúùø"r
   c                       s¨   e Zd ZdZ							ddee deded	ed
ededededededdf‡ fdd„Zdededefdd„Ze	j
jddedee deeee f fdd„ƒZ‡  ZS )r   aW  WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
    based on the implementation from `fatchord/WaveRNN <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wavernn>`__
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    r-   é   r.   r   r@   Ú	n_classesÚ
hop_lengthr/   Ún_rnnÚn_fcr   r   r0   r1   r   Nc                    s:  t ƒ  ¡  || _|d r|d n|d | _|| _|
d | _|| _|| _tt	 
| j¡ƒ| _d}|D ]}||9 }q0|| jkrFtd|› d|› ƒ‚t||||	|
|ƒ| _t || j d |¡| _tj||dd| _tj|| j |dd| _tjdd| _tjdd| _t || j |¡| _t || j |¡| _t || j¡| _d S )	NrA   r   é   z/Expected: total_scale == hop_length, but found z != T)Úbatch_firstr   )r   r   r   Ú_padr[   Ún_auxrZ   rY   r+   ÚmathÚlog2Ún_bitsÚ
ValueErrorr
   Úupsampler   ÚLinearÚfcÚGRUÚrnn1Úrnn2r   Úrelu1Úrelu2Úfc1Úfc2Úfc3)r   r@   rY   rZ   r/   r[   r\   r   r   r0   r1   rD   rO   r   r!   r"   r   è   s,   



zWaveRNN.__init__Úwaveformr#   c                    s  |  d¡dkrtdƒ‚|  d¡dkrtdƒ‚| d¡| d¡}}|  d¡}tjd|ˆ j|j|jd}tjd|ˆ j|j|jd}ˆ  |¡\}}| 	dd¡}| 	dd¡}‡ fdd„t
d	ƒD ƒ}|d
d
…d
d
…|d |d …f }|d
d
…d
d
…|d |d …f }	|d
d
…d
d
…|d |d …f }
|d
d
…d
d
…|d |d …f }tj| d¡||gdd}ˆ  |¡}|}ˆ  ||¡\}}|| }|}tj||	gdd}ˆ  ||¡\}}|| }tj||
gdd}ˆ  |¡}ˆ  |¡}tj||gdd}ˆ  |¡}ˆ  |¡}ˆ  |¡}| d¡S )a  Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        r   z*Require the input channel of waveform is 1z*Require the input channel of specgram is 1r   )ÚdtypeÚdevicerA   c                    s   g | ]}ˆ j | ‘qS r!   ©r`   ©r2   Úi©r   r!   r"   r5   .  s    z#WaveRNN.forward.<locals>.<listcomp>r.   Né   r]   r>   ©Údim)Úsizerd   rU   rI   Úzerosr[   rq   rr   re   Ú	transposer6   ÚcatrT   rg   ri   rj   rm   rk   rn   rl   ro   )r   rp   r#   Ú
batch_sizeÚh1Úh2ÚauxÚaux_idxÚa1Úa2Úa3Úa4ÚxÚresr3   r!   rv   r"   r%     sB   
""""





zWaveRNN.forwardÚlengthsc                    s  |j }|j}tjj |ˆjˆjf¡}ˆ |¡\}‰ |dur#|ˆjj }g }| 	¡ \}}}tj
d|ˆjf||d}	tj
d|ˆjf||d}
tj
|df||d}‡ ‡fdd„tdƒD ƒ}t|ƒD ]œ‰|dd…dd…ˆf }‡fdd„|D ƒ\}}}}tj|||gdd}ˆ |¡}ˆ | d¡|	¡\}}	||	d	  }tj||gdd}ˆ | d¡|
¡\}}
||
d	  }tj||gdd}t ˆ |¡¡}tj||gdd}t ˆ |¡¡}ˆ |¡}tj|dd}t |d¡ ¡ }d
| d
ˆj d  d }| |¡ q^t |¡ dd
d	¡|fS )a¾  Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``waveforms``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        Nr   )rr   rq   c                    s6   g | ]}ˆ d d …ˆj | ˆj |d  …d d …f ‘qS )Nr   rs   rt   )r   r   r!   r"   r5   x  s   6 z!WaveRNN.infer.<locals>.<listcomp>r]   c                    s"   g | ]}|d d …d d …ˆ f ‘qS r<   r!   )r2   Úa)ru   r!   r"   r5   ~  s   " rx   r   rA   rC   )rr   rq   rI   r   Ú
functionalÚpadr_   re   rD   rz   r{   r[   r6   r}   rg   ri   rT   rj   ÚFÚrelurm   rn   ro   ÚsoftmaxÚmultinomialÚfloatrc   rM   ÚstackÚpermute)r   r#   r‰   rr   rq   ÚoutputÚb_sizer3   Úseq_lenr   r€   r‡   Ú	aux_splitÚm_tÚa1_tÚa2_tÚa3_tÚa4_tÚinpÚlogitsÚ	posteriorr!   )r   ru   r   r"   ÚinferK  s@   

zWaveRNN.infer)r-   rX   rX   r.   r   r   r   r<   )r'   r(   r)   r*   r   r+   r   r   r%   rI   ÚjitÚexportr   r   r    r,   r!   r!   r   r"   r   Ç   sF    %õþýüûúùø	÷
öõô*92r   )ra   Útypingr   r   r   rI   Útorch.nn.functionalr   r‹   r   r   Ú__all__ÚModuler   r   r	   r
   r   r!   r!   r!   r"   Ú<module>   s    	#+!G