o
    ̳iV                     @   s  d dl Z d dlZd dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlT ddlmZ d dlmZ ddlmZmZ dd	lmZ d d
lmZ d dlmZ d dlmZmZ G dd dejZG dd dejZG dd deZdd Zd,ddZG dd dejZ G dd dejZ!d-ddZ"G dd  d ejZ#G d!d" d"e#Z$d#d$ Z%G d%d& d&ejZ&G d'd( d(ejZ'd)d* Z(e)d+kre(  dS dS ).    N   )
ResidualVQ)WNConv1dDecoderBlockResLSTM)*)activations)Optional)ConvNeXtBlockAdaLayerNormTransformerBlock)RotaryPositionalEmbeddings)ResidualFSQ)Module
ModuleListc                	       J   e Zd ZdZddedededef fddZd	ejd
ejfddZ	  Z
S )ISTFTa  
    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
    See issue: https://github.com/pytorch/pytorch/issues/62323
    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
    The NOLA constraint is met as we trim padded samples anyway.

    Args:
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames.
        win_length (int): The size of window frame and STFT filter.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    samen_fft
hop_length
win_lengthpaddingc                    sL   t    |dvrtd|| _|| _|| _|| _t|}| 	d| d S )N)centerr   #Padding must be 'center' or 'same'.window)
super__init__
ValueErrorr   r   r   r   torchhann_windowregister_buffer)selfr   r   r   r   r   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/xcodec2/vq/codec_decoder_vocos.pyr       s   

zISTFT.__init__specreturnc                 C   sV  | j dkrtj|| j| j| j| jddS | j dkr"| j| j d }ntd| dks0J d|j	\}}}tj
j|| jd	d
d}|| jddddf  }|d	 | j | j }tjjj|d	|fd	| jfd	| jfddddd|| f }| j d	|dd	d}	tjjj|	d	|fd	| jfd	| jfd ||  }
|
dk sJ ||
 }|S )a  
        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.

        Args:
            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
                            N is the number of frequency bins, and T is the number of time frames.

        Returns:
            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
        r   T)r   r      r      zExpected a 3D tensor as inputr   backward)dimnormN)output_sizekernel_sizestrider   gdy=)r   r   istftr   r   r   r   r   r,   shapefftirfftnn
functionalfoldsquareexpand	transposesqueezeall)r"   r'   padBNTifftr.   y	window_sqwindow_enveloper%   r%   r&   forward+   s0   

zISTFT.forwardr   __name__
__module____qualname____doc__intstrr   r   TensorrF   __classcell__r%   r%   r#   r&   r      s     r   c                   @   &   e Zd ZdZdejdejfddZdS )FourierHeadz'Base class for inverse fourier modules.xr(   c                 C      t d)aJ  
        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        -Subclasses must implement the forward method.NotImplementedErrorr"   rS   r%   r%   r&   rF   \   s   	zFourierHead.forwardNrI   rJ   rK   rL   r   rO   rF   r%   r%   r%   r&   rR   Y       rR   c                	       r   )	ISTFTHeada  
    ISTFT Head module for predicting STFT complex coefficients.

    Args:
        dim (int): Hidden dimension of the model.
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames, which should align with
                          the resolution of the input features.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
    r   r,   r   r   r   c                    s8   t    |d }tj||| _t||||d| _d S )Nr)   )r   r   r   r   )r   r   r   r6   Linearoutr   r2   )r"   r,   r   r   r   out_dimr#   r%   r&   r   t   s   
zISTFTHead.__init__rS   r(   c                 C   s|   |  |}|dd}|jddd\}}t|}tj|dd}t|}t|}||d|   }| |}|	d|fS )ay  
        Forward pass of the ISTFTHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        r   r)   r,   g      Y@)maxy              ?)
r]   r;   chunkr   expclipcossinr2   	unsqueeze)r"   rS   x_predmagprC   Saudior%   r%   r&   rF   z   s   




zISTFTHead.forwardrG   rH   r%   r%   r#   r&   r[   h   s     r[   c                 C   s   | t |  S N)r   sigmoid)rS   r%   r%   r&   nonlinearity   s   rn       c                 C   s   t jj|| dddS )Nư>T)
num_groupsnum_channelsepsaffine)r   r6   	GroupNorm)in_channelsrq   r%   r%   r&   	Normalize   s   rw   c                       s0   e Zd Zdddd fdd
Zd	ddZ  ZS )
ResnetBlockNFi   )out_channelsconv_shortcuttemb_channelsc                   s   t    || _|d u r|n|}|| _|| _t|| _tjj	||dddd| _
|dkr3tj||| _t|| _tj|| _tjj	||dddd| _| j| jkrp| jrbtjj	||dddd| _d S tjj	||dddd| _d S d S )Nr*   r   r/   r0   r   r   )r   r   rv   ry   use_conv_shortcutrw   norm1r   r6   Conv1dconv1r\   	temb_projnorm2Dropoutdropoutconv2rz   nin_shortcut)r"   rv   ry   rz   r   r{   r#   r%   r&   r      sN   


zResnetBlock.__init__c                 C   s   |}|  |}t|}| |}|d ur'|| t|d d d d d d f  }| |}t|}| |}| |}| j| jkrQ| j	rL| 
|}|| S | |}|| S rl   )r~   rn   r   r   r   r   r   rv   ry   r}   rz   r   )r"   rS   tembhr%   r%   r&   rF      s    

&




zResnetBlock.forwardrl   rI   rJ   rK   r   rF   rP   r%   r%   r#   r&   rx      s
    &rx   c                       s$   e Zd Z fddZdd Z  ZS )	AttnBlockc                    s~   t    || _t|| _tjj||dddd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _d S )Nr   r   r|   )r   r   rv   rw   r-   r   r6   r   qkvproj_out)r"   rv   r#   r%   r&   r      s6   

zAttnBlock.__init__c           
      C   s   |}|  |}| |}| |}| |}|j\}}}|ddd}t||}	|	t|d  }	tj	j
j|	dd}	|	ddd}	t||	}| |}|| S )Nr   r)   r   g      r_   )r-   r   r   r   r3   permuter   bmmrM   r6   r7   softmaxr   )
r"   rS   h_r   r   r   bcr   w_r%   r%   r&   rF      s   




zAttnBlock.forwardr   r%   r%   r#   r&   r      s    r   vanillac                 C   sB   |dv sJ d| dt d| d|  d |dkrt| S d S )N)r   linearnonez
attn_type z unknownzmaking attention of type 'z' with z in_channelsr   )printr   )rv   	attn_typer%   r%   r&   	make_attn  s
   r   c                   @   rQ   )BackbonezeBase class for the generator's backbone. It preserves the same temporal resolution across all layers.rS   r(   c                 K   rT   )ai  
        Args:
            x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
                        C denotes output features, and L is the sequence length.

        Returns:
            Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
                    and H denotes the model dimension.
        rU   rV   )r"   rS   kwargsr%   r%   r&   rF     s   
zBackbone.forwardNrY   r%   r%   r%   r&   r     rZ   r   c                       s:   e Zd ZdZ	d fdd	Zdejd	ejfd
dZ  ZS )VocosBackbonea  
    Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization

    Args:
        input_channels (int): Number of input features channels.
        dim (int): Hidden dimension of the model.
        intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
        num_layers (int): Number of ConvNeXtBlock layers.
        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
                                                None means non-conditional model. Defaults to None.
             @   c           
         s   t    tjddd| _d| _}d}t||| j|dt||| j|dg}tj| | _|}t	|d fdd	t
|D }tj| | _tjd
d| _t||| j|dt||| j|dg}	tj|	 | _d S )N   r*   )r/   r   r   g?)rv   ry   r{   r   r_   c                    s   g | ]	}t  d qS ))r,   n_headsrotary_embedr   ).0_heads
hidden_dimtime_rotary_embedr%   r&   
<listcomp>L  s    z*VocosBackbone.__init__.<locals>.<listcomp>rp   )rs   )r   r   r6   r   embedtemb_chrx   
Sequential	prior_netr   rangetransformers	LayerNormfinal_layer_normpost_net)
r"   r   depthr   pos_meb_dimblock_inr   r   transformer_blocksr   r#   r   r&   r   4  s8   

zVocosBackbone.__init__rS   r(   c                 C   sf   | dd}| |}| |}| dd}| |}| dd}| |}| dd}| |}|S )Nr   r)   )r;   r   r   r   r   r   rX   r%   r%   r&   rF   \  s   




zVocosBackbone.forward)r   r   r   r   )	rI   rJ   rK   rL   r   r   rO   rF   rP   r%   r%   r#   r&   r   &  s
    (r   c                 C   s6   t | tjrtjj| jdd tj| jd d S d S )Ng{Gz?)stdr   )
isinstancer6   r   inittrunc_normal_weight	constant_biasmr%   r%   r&   init_weightsn  s   r   c                       s   e Zd Z													
	d  fdd	Zd!ddZdd Zdd Zdd Zdd Zdd Z	dd Z
dd Zdd Z  ZS )"CodecDecoderVocosr   r   r   r   @  r            ?F @  c                    s\   t    || _t|g ddd| _t||||d| _t|| jd | jdd| _| 	  d S )N)   r   r   r   r   r   r   r   r   )r,   levelsnum_quantizersr   r   r   r   r   r   r,   r   r   r   )
r   r   r   r   	quantizerr   backboner[   headreset_parametersr"   r   r   r   r   r   vq_num_quantizersvq_dimvq_commit_weightvq_weight_initvq_full_commit_losscodebook_sizecodebook_dimr#   r%   r&   r   t  s   
zCodecDecoderVocos.__init__Tc                 C   sj   |du r%| ddd}| |\}}| ddd}| ddd}||d fS | |}| |\}}||fS )NTr   r)   r   )r   r   r   r   )r"   rS   vqr   r   r%   r%   r&   rF     s   

zCodecDecoderVocos.forwardc                 C      | j  | _ | j |}|S rl   r   evalvq2embr"   r   rS   r%   r%   r&   r        zCodecDecoderVocos.vq2embc                 C      | j  | _ | j  }|S rl   r   r   get_embr"   embsr%   r%   r&   r        
zCodecDecoderVocos.get_embc                 C   $   |d d d d d f }|  |}|S rl   modelr   r%   r%   r&   inference_vq     
zCodecDecoderVocos.inference_vqc                 C   $   |  |\}}}}| |}|d fS rl   r   r   r"   rS   r   lossperpr%   r%   r&   inference_0     
zCodecDecoderVocos.inference_0c                 C      |  |}|d fS rl   r   rX   r%   r%   r&   	inference     
zCodecDecoderVocos.inferencec                 C      dd }|  | dS ):Remove weight normalization module from all of the layers.c                 S   *   z
t jj|  W d S  ty   Y d S w rl   r   r6   utilsremove_weight_normr   r   r%   r%   r&   _remove_weight_norm  
   zACodecDecoderVocos.remove_weight_norm.<locals>._remove_weight_normNapplyr"   r   r%   r%   r&   r        z$CodecDecoderVocos.remove_weight_normc                 C   r   )9Apply weight normalization module from all of the layers.c                 S   .   t | tjst | tjrtjj|  d S d S rl   r   r6   r   ConvTranspose1dr   r   weight_normr   r%   r%   r&   _apply_weight_norm     z?CodecDecoderVocos.apply_weight_norm.<locals>._apply_weight_normNr   r"   r  r%   r%   r&   apply_weight_norm     z#CodecDecoderVocos.apply_weight_normc                 C      |  t d S rl   r   r   r"   r%   r%   r&   r        z"CodecDecoderVocos.reset_parameters)r   r   r   r   r   r   r   r   FFr   r   TrI   rJ   rK   r   rF   r   r   r   r   r   r   r
  r   rP   r%   r%   r#   r&   r   s  s.    
)	r   c                       s   e Zd Z													d fd
d	Zd ddZdd Zdd Zdd Zdd Zdd Z	dd Z
dd Zdd Z  ZS )!CodecDecoderVocos_transposer   r   r   r   r   r   r   Fr   c                    s   t    || _t||||d||	|
d| _t||||d| _tt	 tj
||dddddt	 tj
||ddd| _t|| jd | jd	d
| _|   d S )Nr)   )r   r,   r   r   threshold_ema_dead_code
commitmentweight_initfull_commit_lossr   r*   r   )rv   ry   r/   r0   r   output_padding)rv   ry   r/   r   r   r   r   )r   r   r   r   r   r   r   r6   r   GELUr  inverse_mel_convr[   r   r   r   r#   r%   r&   r     sB   
z$CodecDecoderVocos_transpose.__init__Tc                 C   sB   |du r|  |\}}}|||fS | |}| |\}}||fS )NT)r   r   r   )r"   rS   r   r   commit_lossr   r%   r%   r&   rF     s   

z#CodecDecoderVocos_transpose.forwardc                 C   r   rl   r   r   r%   r%   r&   r     r   z"CodecDecoderVocos_transpose.vq2embc                 C   r   rl   r   r   r%   r%   r&   r   "  r   z#CodecDecoderVocos_transpose.get_embc                 C   r   rl   r   r   r%   r%   r&   r   '  r   z(CodecDecoderVocos_transpose.inference_vqc                 C   r   rl   r   r   r%   r%   r&   r   ,  r   z'CodecDecoderVocos_transpose.inference_0c                 C   r   rl   r   rX   r%   r%   r&   r   1  r   z%CodecDecoderVocos_transpose.inferencec                 C   r   )r   c                 S   r   rl   r   r   r%   r%   r&   r   9  r   zKCodecDecoderVocos_transpose.remove_weight_norm.<locals>._remove_weight_normNr   r   r%   r%   r&   r   6  r  z.CodecDecoderVocos_transpose.remove_weight_normc                 C   r   )r  c                 S   r  rl   r  r   r%   r%   r&   r  D  r  zICodecDecoderVocos_transpose.apply_weight_norm.<locals>._apply_weight_normNr   r	  r%   r%   r&   r
  A  r  z-CodecDecoderVocos_transpose.apply_weight_normc                 C   r  rl   r  r  r%   r%   r&   r   J  r  z,CodecDecoderVocos_transpose.reset_parameters)r   r   r   r   r   r   r   r   FFr   r   r  r  r%   r%   r#   r&   r    s.    
7		r  c                  C   s   t t j r	dnd} td|   t | }td d}d}d}t |||| }td|j  |	  t 
  ||d	d
}td td|j  d}W d    d S 1 s]w   Y  d S )NcudacpuzUsing device: zModel initialized.r)   r   2   zDummy input shape: F)r   z
Forward pass without VQ:zOutput shape: r   )r   devicer  is_availabler   r  torandnr3   r   no_grad)r  r   
batch_sizerv   sequence_lengthdummy_inputoutput_no_vqr   r%   r%   r&   mainP  s    
"r'  __main__)ro   )r   )*sysnumpynpr   torch.nnr6   residual_vqr   moduler   r   r   alias_free_torch r   typingr	   r
   r   bs_roformer5r   torchtune.modulesr   vector_quantize_pytorchr   r   r   r   rR   r[   rn   rw   rx   r   r   r   r   r   r   r  r'  rI   r%   r%   r%   r&   <module>   s>    H0
=
1Hit-
