o
    i7                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ ddl	m
Z
 G dd dejZG d	d
 d
ejZG dd deZdd ZdddZG dd dejZG dd dejZG dd deZdd ZG dd dejZdS )    N)List)RotaryPositionalEmbeddings)ResidualFSQ   TransformerBlockc                	       sL   e Zd ZdZ	ddedededef fddZd	ejd
ejfddZ	  Z
S )ISTFTa  
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    samen_fft
hop_length
win_lengthpaddingc                    sL   t    |dvrtd|| _|| _|| _|| _t|}| 	d| d S )N)centerr	   #Padding must be 'center' or 'same'.window)
super__init__
ValueErrorr   r
   r   r   torchhann_windowregister_buffer)selfr
   r   r   r   r   	__class__ P/home/ubuntu/.local/lib/python3.10/site-packages/neucodec/codec_decoder_vocos.pyr      s   

zISTFT.__init__specreturnc                 C   sV  | j dkrtj|| j| j| j| jddS | j dkr"| j| j d }ntd| dks0J d|j	\}}}tj
j|| jd	d
d}|| jddddf  }|d	 | j | j }tjjj|d	|fd	| jfd	| jfddddd|| f }| j d	|dd	d}	tjjj|	d	|fd	| jfd	| jfd ||  }
|
dk sJ ||
 }|S )a  
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.

        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                            N is the number of frequency bins, and T is the number of time frames.

        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        r   T)r   r	      r      zExpected a 3D tensor as inputr   backward)dimnormN)output_sizekernel_sizestrider   gdy=)r   r   istftr
   r   r   r   r   r!   shapefftirfftnn
functionalfoldsquareexpand	transposesqueezeall)r   r   padBNTifftr#   y	window_sqwindow_enveloper   r   r   forward'   sJ   

zISTFT.forwardr	   __name__
__module____qualname____doc__intstrr   r   Tensorr;   __classcell__r   r   r   r   r      s    r   c                   @   &   e Zd ZdZdejdejfddZdS )FourierHeadz'Base class for inverse fourier modules.xr   c                 C      t d)aJ  
        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        -Subclasses must implement the forward method.NotImplementedErrorr   rH   r   r   r   r;   d   s   	zFourierHead.forwardNr>   r?   r@   rA   r   rD   r;   r   r   r   r   rG   a       rG   c                	       sJ   e Zd ZdZddedededef fddZd	ejd
ejfddZ	  Z
S )	ISTFTHeada  
    ISTFT Head module for predicting STFT complex coefficients.

    Args:
        dim (int): Hidden dimension of the model.
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames, which should align with
                          the resolution of the input features.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    r	   r!   r
   r   r   c                    s8   t    |d }tj||| _t||||d| _d S )Nr   )r
   r   r   r   )r   r   r   r+   Linearoutr   r'   )r   r!   r
   r   r   out_dimr   r   r   r   |   s   
zISTFTHead.__init__rH   r   c                 C   s|   |  |}|dd}|jddd\}}t|}tj|dd}t|}t|}||d|   }| |}|	d|fS )ay  
        Forward pass of the ISTFTHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        r   r   r!   g      Y@)maxy              ?)
rR   r0   chunkr   expclipcossinr'   	unsqueeze)r   rH   x_predmagpr8   Saudior   r   r   r;      s   




zISTFTHead.forwardr<   r=   r   r   r   r   rP   p   s     rP   c                 C   s   | t |  S N)r   sigmoid)rH   r   r   r   nonlinearity   s   rc       c                 C   s   t jj|| dddS )Nư>T)
num_groupsnum_channelsepsaffine)r   r+   	GroupNorm)in_channelsrf   r   r   r   	Normalize   s   rl   c                       s0   e Zd Zdddd fdd
Zd	ddZ  ZS )
ResnetBlockNFi   )out_channelsconv_shortcuttemb_channelsc                   s   t    || _|d u r|n|}|| _|| _t|| _tjj	||dddd| _
|dkr3tj||| _t|| _tj|| _tjj	||dddd| _| j| jkrp| jrbtjj	||dddd| _d S tjj	||dddd| _d S d S )Nr   r   )r$   r%   r   r   )r   r   rk   rn   use_conv_shortcutrl   norm1r   r+   Conv1dconv1rQ   	temb_projnorm2Dropoutdropoutconv2ro   nin_shortcut)r   rk   rn   ro   rx   rp   r   r   r   r      s2   
	





zResnetBlock.__init__c                 C   s   |}|  |}t|}| |}|d ur'|| t|d d d d d d f  }| |}t|}| |}| |}| j| jkrQ| j	rL| 
|}|| S | |}|| S ra   )rr   rc   rt   ru   rv   rx   ry   rk   rn   rq   ro   rz   )r   rH   tembhr   r   r   r;      s    

&




zResnetBlock.forwardra   )r>   r?   r@   r   r;   rE   r   r   r   r   rm      s    $rm   c                   @   rF   )BackbonezeBase class for the generator's backbone. It preserves the same temporal resolution across all layers.rH   r   c                 K   rI   )ai  
        Args:
            x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
                        C denotes output features, and L is the sequence length.

        Returns:
            Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
                    and H denotes the model dimension.
        rJ   rK   )r   rH   kwargsr   r   r   r;      s   
zBackbone.forwardNrN   r   r   r   r   r}      rO   r}   c                       s8   e Zd ZdZd fdd	Zdejd	ejfd
dZ  ZS )VocosBackbonea  
    Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization

    Args:
        input_channels (int): Number of input features channels.
        dim (int): Hidden dimension of the model.
        intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
        num_layers (int): Number of ConvNeXtBlock layers.
        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
                                                None means non-conditional model. Defaults to None.
             @   c           
         s   t    tjddd| _d| _}d}t||| j|dt||| j|dg}tj| | _|}t	|d fdd	t
|D }tj| | _tjd
d| _t||| j|dt||| j|dg}	tj|	 | _d S )N   r   )r$   r   r   g?)rk   rn   rp   rx   rT   c                    s   g | ]	}t  d qS ))r!   n_headsrotary_embedr   ).0_heads
hidden_dimtime_rotary_embedr   r   
<listcomp>%  s    z*VocosBackbone.__init__.<locals>.<listcomp>re   )rh   )r   r   r+   rs   embedtemb_chrm   
Sequential	prior_netr   rangetransformers	LayerNormfinal_layer_normpost_net)
r   r   depthr   pos_meb_dimblock_inrx   r   transformer_blocksr   r   r   r   r   	  sP   

zVocosBackbone.__init__rH   r   c                 C   sf   | dd}| |}| |}| dd}| |}| dd}| |}| dd}| |}|S )Nr   r   )r0   r   r   r   r   r   rM   r   r   r   r;   >  s   




zVocosBackbone.forward)r   r   r   r   )	r>   r?   r@   rA   r   r   rD   r;   rE   r   r   r   r   r      s    5r   c                 C   s6   t | tjrtjj| jdd tj| jd d S d S )Ng{Gz?)stdr   )
isinstancer+   rs   inittrunc_normal_weight	constant_biasmr   r   r   init_weightsK  s   r   c                       s   e Zd Z													
	d  fdd	Zd!ddZdd Zdd Zdd Zdd Zdd Z	dd Z
dd Zdd Z  ZS )"CodecDecoderVocosr   r   r   r   @  r            ?F @  c                    s\   t    || _t|g ddd| _t||||d| _t|| jd | jdd| _| 	  d S )N)   r   r   r   r   r   r   r   r   )r!   levelsnum_quantizers)r   r   r   r   r   r	   )r!   r
   r   r   )
r   r   r   r   	quantizerr   backbonerP   headreset_parameters)r   r   r   r   r   r   vq_num_quantizersvq_dimvq_commit_weightvq_weight_initvq_full_commit_losscodebook_sizecodebook_dimr   r   r   r   R  s   

zCodecDecoderVocos.__init__Tc                 C   sj   |du r%| ddd}| |\}}| ddd}| ddd}||d fS | |}| |\}}||fS )NTr   r   r   )permuter   r   r   )r   rH   vqqr   r   r   r   r;   u  s   

zCodecDecoderVocos.forwardc                 C   s   | j  | _ | j |}|S ra   )r   evalvq2embr   r   rH   r   r   r   r     s   zCodecDecoderVocos.vq2embc                 C   s   | j  | _ | j  }|S ra   )r   r   get_emb)r   embsr   r   r   r     s   
zCodecDecoderVocos.get_embc                 C   s$   |d d d d d f }|  |}|S ra   modelr   r   r   r   inference_vq  s   
zCodecDecoderVocos.inference_vqc                 C   s$   |  |\}}}}| |}|d fS ra   )r   r   )r   rH   r   lossperpr   r   r   inference_0  s   
zCodecDecoderVocos.inference_0c                 C   s   |  |}|d fS ra   r   rM   r   r   r   	inference  s   
zCodecDecoderVocos.inferencec                 C      dd }|  | dS )z:Remove weight normalization module from all of the layers.c                 S   s*   z
t jj|  W d S  ty   Y d S w ra   )r   r+   utilsremove_weight_normr   r   r   r   r   _remove_weight_norm  s
   zACodecDecoderVocos.remove_weight_norm.<locals>._remove_weight_normNapply)r   r   r   r   r   r     s   z$CodecDecoderVocos.remove_weight_normc                 C   r   )z9Apply weight normalization module from all of the layers.c                 S   s.   t | tjst | tjrtjj|  d S d S ra   )r   r+   rs   ConvTranspose1dr   r   weight_normr   r   r   r   _apply_weight_norm  s   z?CodecDecoderVocos.apply_weight_norm.<locals>._apply_weight_normNr   )r   r   r   r   r   apply_weight_norm  s   z#CodecDecoderVocos.apply_weight_normc                 C   s   |  t d S ra   )r   r   )r   r   r   r   r     s   z"CodecDecoderVocos.reset_parameters)r   r   r   r   r   r   r   r   FFr   r   )T)r>   r?   r@   r   r;   r   r   r   r   r   r   r   r   rE   r   r   r   r   r   Q  s.    
#	r   )rd   )r   torch.nnr+   typingr   torchtune.modulesr   vector_quantize_pytorchr   bs_roformer5r   Moduler   rG   rP   rc   rl   rm   r}   r   r   r   r   r   r   r   <module>   s     V4
<P