o
    پiY                     @   s$  d dl Z d dlmZ d dlmZ d dlZd dlm  mZ	 d dl
mZ d dlmZ d dlmZ d dlmZ ejjdd	 ZG d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeZdS )    N)bisect_right)Union)	rearrange)nn)DacVAEConfig)DiagonalGaussianDistributionc                 C   sN   | j }| |d |d d} | |d  t||  d  } | |} | S )Nr      g&.>   )shapereshape
reciprocaltorchsinpow)xalphar    r   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/multimodal_gen/runtime/models/vaes/dac.pysnake   s
   $
r   c                       s$   e Zd Z fddZdd Z  ZS )Snake1dc                    s$   t    ttd|d| _d S )Nr   )super__init__r   	Parameterr   onesr   )selfchannels	__class__r   r   r      s   
zSnake1d.__init__c                 C   s   t || jS N)r   r   r   r   r   r   r   forward#   s   zSnake1d.forward)__name__
__module____qualname__r   r!   __classcell__r   r   r   r   r      s    r   c                       sN   e Zd ZdZdededef fddZdd Zd	d
 Zdd Zdd Z	  Z
S )VectorQuantizea  
    Implementation of VQ similar to Karpathy's repo:
    https://github.com/karpathy/deep-vector-quantization
    Additionally uses following tricks from Improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    	input_dimcodebook_sizecodebook_dimc                    sL   t    || _|| _tj||dd| _tj||dd| _t||| _	d S )Nr   kernel_size)
r   r   r(   r)   r   Conv1din_projout_proj	Embeddingcodebook)r   r'   r(   r)   r   r   r   r   3   s   
zVectorQuantize.__init__c                 C   s|   |  |}| |\}}tj|| ddddg}tj|| ddddg}|||   }| |}|||||fS )aI  Quantize the input tensor using a fixed codebook and return the corresponding codebook vectors.

        Args:
            z (torch.Tensor): Input tensor with shape ``[B, D, T]``.

        Returns:
            tuple: A tuple containing:
                - z_q (torch.Tensor): Quantized continuous representation with shape ``[B, D, T]``.
                - commitment_loss (torch.Tensor): Commitment loss scalar to train encoder to predict
                  vectors closer to codebook entries.
                - codebook_loss (torch.Tensor): Codebook loss scalar to update the codebook.
                - indices (torch.Tensor): Codebook indices (quantized discrete representation) with shape ``[B, T]``.
                - z_e (torch.Tensor): Projected latents (continuous representation before quantization) with shape ``[B, D, T]``.
        none)	reductionr   r
   )r-   decode_latentsFmse_lossdetachmeanr.   )r   zz_ez_qindicescommitment_losscodebook_lossr   r   r   r!   <   s   

zVectorQuantize.forwardc                 C   s   t || jjS r   )r4   	embeddingr0   weightr   embed_idr   r   r   
embed_code[   s   zVectorQuantize.embed_codec                 C   s   |  |ddS )Nr   r
   )rB   	transposer@   r   r   r   decode_code^   s   zVectorQuantize.decode_codec                 C   s   t |d}| jj}t|}t|}|djdddd| |   |djddd  }t | dd d|	dd}| 
|}||fS )	Nzb d t -> (b t) dr
   r   T)keepdimz(b t) -> b tr   )b)r   r0   r?   r4   	normalizer   sumtmaxsizerD   )r   latents	encodingsr0   distr;   r:   r   r   r   r3   a   s   


 
zVectorQuantize.decode_latents)r"   r#   r$   __doc__intr   r!   rB   rD   r3   r%   r   r   r   r   r&   '   s    	r&   c                       sz   e Zd ZdZ					ddeded	ed
eeef def
 fddZddefddZ	de
jfddZde
jfddZ  ZS )ResidualVectorQuantizezg
    Introduced in SoundStream: An end2end neural audio codec
    https://arxiv.org/abs/2107.03312
       	                 r'   n_codebooksr(   r)   quantizer_dropoutc                    s   t    t tr fddt|D  || _ | _| _dg}| jD ]}||d |  q$t	|| _
t fddt|D | _|| _d S )Nc                    s   g | ]} qS r   r   ).0_)r)   r   r   
<listcomp>   s    z3ResidualVectorQuantize.__init__.<locals>.<listcomp>r   r	   c                    s   g | ]
}t  | qS r   )r&   )rY   ir)   r(   r'   r   r   r[      s    )r   r   
isinstancerP   rangerW   r)   r(   appendtuple_codebook_dim_offsetsr   
ModuleList
quantizersrX   )r   r'   rW   r(   r)   rX   dim_offsetsdimr   r]   r   r   z   s    




zResidualVectorQuantize.__init__Nn_quantizersc                 C   s  d}|}d}d}g }g }|du r| j }| j}	| jr|jd }
|j}tj|
f| j d |tjd}| jdkrVtj	d| j d |
f|d}t
|
| j }|dkrV|d| |d|< t|	D ]<\}}||\}}}}}||k }|||ddddf   }|| }|||  7 }|||  7 }|| || qZn6t|	D ]1\}}||kr n(||\}}}}}|| }|| }|| 7 }|| 7 }|| || qtj|dd}tj|dd}|||||fS )a  Quantize the input tensor using a fixed set of codebooks and return the corresponding codebook vectors.

        Args:
            z (torch.Tensor): Input tensor with shape ``[B, D, T]``.
            n_quantizers (int, optional): Number of quantizers to use. If ``None``,
                all quantizers are used. When ``n_quantizers`` < ``self.n_codebooks``,
                quantizer dropout is applied. Note: if ``self.quantizer_dropout`` > 0
                and in training mode, this argument is ignored and a random number of
                quantizers is used.

        Returns:
            tuple: A tuple containing:
                - z_q (torch.Tensor): Quantized continuous representation with shape ``[B, D, T]``.
                - codes (torch.Tensor): Codebook indices for each codebook with shape ``[B, N, T]``
                  (quantized discrete representation of input).
                - latents (torch.Tensor): Projected latents with shape ``[B, N*D, T]``
                  (continuous representation before quantization).
                - commitment_loss (torch.Tensor): Commitment loss scalar to train encoder to predict
                  vectors closer to codebook entries.
                - codebook_loss (torch.Tensor): Codebook loss scalar to update the codebook.
        r   Nr   )devicedtype)rh   rf   )rW   rd   trainingr   rh   r   fulllongrX   randintrP   	enumerater7   r`   stackcat)r   r8   rg   r:   residualr<   r=   codebook_indicesrL   rd   
batch_sizerh   dropout	n_dropoutr\   	quantizerz_q_icommitment_loss_icodebook_loss_i	indices_iz_e_imaskcodesr   r   r   r!      sn   



zResidualVectorQuantize.forwardr~   c                 C   sx   d}g }|j d }t|D ]$}| j| |dd|ddf }|| | j| |}|| }q|tj|dd|fS )a  Reconstruct the continuous representation from quantized codes.

        Args:
            codes (torch.Tensor): Quantized discrete representation with shape ``[B, N, T]``.

        Returns:
            tuple: A tuple containing:
                - z_q (torch.Tensor): Quantized continuous representation with shape ``[B, D, T]``.
                - z_p (torch.Tensor): Concatenated latent space representation with shape ``[B, N*D, T]``.
                - codes (torch.Tensor): Original input codebook indices with shape ``[B, N, T]``.
        rV   r   Nrj   )r   r_   rd   rD   r`   r.   r   rq   )r   r~   r:   z_prW   r\   z_p_irx   r   r   r   
from_codes   s   
"

z!ResidualVectorQuantize.from_codesrL   c                 C   s   d}g }g }| j }t||jd d }t|D ]8}|| ||d  }}	| j| |dd||	ddf \}
}||
 || | j| |
}|| }q|tj	|ddtj
|ddfS )a.  Reconstruct the continuous representation from unquantized latents.

        Args:
            latents (torch.Tensor): Continuous representation after projection with shape ``[B, N*D, T]``.

        Returns:
            tuple: A tuple containing:
                - z_q (torch.Tensor): Quantized representation of full-projected space with shape ``[B, D, T]``.
                - z_p (torch.Tensor): Quantized representation of latent space with shape ``[B, N*D, T]``.
                - codes (torch.Tensor): Codebook indices with shape ``[B, N, T]``.
        r   r   Nrj   )rb   r   r   r_   rd   r3   r`   r.   r   rq   rp   )r   rL   r:   r   r~   dimsrW   r\   jkr   codes_irx   r   r   r   from_latents  s   *


z#ResidualVectorQuantize.from_latents)rR   rS   rT   rU   rV   r   )r"   r#   r$   rO   rP   r   listfloatr   r!   r   Tensorr   r   r%   r   r   r   r   rQ   t   s*    
ZrQ   c                       0   e Zd Zd	dedef fddZdd Z  ZS )
ResidualUnit   r   rf   dilationc              
      sP   t    d| d }tt|tj||d||dt|tj||dd| _d S )N   r
      )r+   r   paddingr   r*   )r   r   r   
Sequentialr   r,   block)r   rf   r   padr   r   r   r   %  s   

zResidualUnit.__init__c                 C   sD   |  |}|jd |jd  d }|dkr|d|| f }|| S )Nr	   r
   r   .)r   r   )r   r   yr   r   r   r   r!   /  s
   
zResidualUnit.forwardr   r   r"   r#   r$   rP   r   r!   r%   r   r   r   r   r   $  s    
r   c                       r   )
EncoderBlockr   r   rf   stridec                    sp   t    tt|d ddt|d ddt|d ddt|d tj|d |d| |t|d d| _	d S )Nr
   r   r      rS   )r+   r   r   )
r   r   r   r   r   r   r,   mathceilr   )r   rf   r   r   r   r   r   8  s   


zEncoderBlock.__init__c                 C   
   |  |S r   r   r    r   r   r   r!   H     
zEncoderBlock.forwardr   r   r   r   r   r   r   7  s    r   c                       s>   e Zd Zdg ddfdededef fddZdd	 Z  ZS )
Encoder@   )r
      rU   rU   d_modelstridesd_latentc              	      s   t    tjd|dddg| _|D ]}|d9 }|  jt||dg7  _q|  jt|tj||dddg7  _tj| j | _|| _d S )Nr   r   r   r+   r   r
   )r   )	r   r   r   r,   r   r   r   r   enc_dim)r   r   r   r   r   r   r   r   r   M  s   

zEncoder.__init__c                 C   r   r   r   r    r   r   r   r!   f  r   zEncoder.forward)r"   r#   r$   rP   r   r   r!   r%   r   r   r   r   r   L  s    r   c                       s4   e Zd Zddededef fddZd	d
 Z  ZS )DecoderBlockr   rU   r   r'   
output_dimr   c                    sb   t    tt|tj||d| |t|d |d dt|ddt|ddt|dd| _	d S )Nr
   )r+   r   r   output_paddingr   r   r   rS   )
r   r   r   r   r   ConvTranspose1dr   r   r   r   )r   r'   r   r   r   r   r   r   k  s   




zDecoderBlock.__init__c                 C   r   r   r   r    r   r   r   r!   |  r   zDecoderBlock.forward)r   rU   r   r   r   r   r   r   r   j  s    r   c                       s.   e Zd Z	ddef fddZdd Z  ZS )Decoderr   d_outc           
         s   t    tj||dddg}t|D ]\}}|d|  }|d|d   }	|t||	|g7 }q|t|	tj|	|dddt g7 }tj| | _	d S )Nr   r   r   r
   r   )
r   r   r   r,   ro   r   r   Tanhr   model)
r   input_channelr   ratesr   layersr\   r   r'   r   r   r   r   r     s   
zDecoder.__init__c                 C   r   r   )r   r    r   r   r   r!     r   zDecoder.forward)r   r   r   r   r   r   r     s    r   c                       s   e Zd Zdef fddZedd Zedd Zedd	 Z	d
d Z
	ddejdefddZdejfddZ		ddejdedefddZ  ZS )DACconfigc                    s  t    |j| _|j| _|j| _|j| _|j| _t|j| _	|j
| _
|jd u r4|jdt|j  }n|j}|| _|jrFt|j|j|| _|jsd|j| _|j| _|j| _t||j|j|j|jd| _ntj|d| d| _tj||d| _|jrt||j|j| _| | j d S )Nr
   )r'   rW   r(   r)   rX   r   ) r   r   
continuousdecoder_dimdecoder_ratesencoder_dimencoder_ratesr   prod
hop_lengthsample_rate
latent_dimlenload_encoderr   encoderrW   r(   r)   rQ   rX   rw   r   r   r,   
quant_convpost_quant_convload_decoderr   decoderapplyinit_weights)r   r   r   r   r   r   r     sD   


zDAC.__init__c                 C   s6   t | tjrtjj| jdd tj| jd d S d S )Ng{Gz?)stdr   )r^   r   r,   inittrunc_normal_r?   	constant_bias)mr   r   r   r     s   zDAC.init_weightsc                 C      t |  jS r   )next
parametersri   r   r   r   r   ri        z	DAC.dtypec                 C   r   r   )r   r   rh   r   r   r   r   rh     r   z
DAC.devicec                 C   sV   |d u r| j }|| j ksJ |jd }t|| j | j | }tj|d|f}|S )Nr	   r   )r   r   r   r   r   r   
functionalr   )r   
audio_datar   length	right_padr   r   r   
preprocess  s   
zDAC.preprocessNr   rg   c                 C   sT   |  |}| js| ||\}}}}}n| |}t|}d\}}}}|||||fS )a  Encode audio data into latent representations.

        This method processes audio through the encoder network and optionally applies
        vector quantization (in VQ mode) or projects to a Gaussian distribution (in
        continuous mode) to produce latent representations.

        Args:
            audio_data (torch.Tensor): Audio data to encode, with shape ``[B, 1, T]``.
            n_quantizers (int, optional): Number of quantizers to use. If ``None``,
                all quantizers are used. Only applicable in VQ mode (``continuous=False``).

        Returns:
            tuple: A tuple containing:
                - z (torch.Tensor): Encoded representation. In VQ mode, this is the
                  quantized continuous representation with shape ``[B, D, T]``. In
                  continuous mode, this is a ``DiagonalGaussianDistribution`` object.
                - codes (torch.Tensor or None): Codebook indices with shape ``[B, N, T]``
                  in VQ mode, ``None`` in continuous mode.
                - latents (torch.Tensor or None): Projected latents with shape ``[B, N*D, T]``
                  in VQ mode, ``None`` in continuous mode.
                - commitment_loss (torch.Tensor): Commitment loss scalar.
                - codebook_loss (torch.Tensor): Codebook loss scalar.

        Note:
            In continuous mode, the encoded representation is projected through a
            quantization convolution layer and wrapped in a ``DiagonalGaussianDistribution``
            for VAE training.
        )NNr   r   )r   r   rw   r   r   )r   r   rg   r8   r~   rL   r<   r=   r   r   r   encode  s   
!
z
DAC.encoder8   c                 C   s,   | j s
| |}|S | |}| |}|S )a  Decode latent representations back to audio waveforms.

        This method takes latent representations (either quantized from VQ mode or sampled
        from the posterior in continuous mode) and reconstructs the corresponding audio
        through the decoder network.

        Args:
            z (torch.Tensor): Latent representation to decode, with shape ``[B, D, T]``.
                In VQ mode (``continuous=False``), this is the quantized continuous
                representation. In continuous mode (``continuous=True``), this is sampled
                from the posterior distribution.

        Returns:
            torch.Tensor: Decoded audio data with shape ``[B, 1, T']``. The output length
            T' is determined by the decoder's upsampling rates and may differ from the
            input temporal dimension T.

        Note:
            In continuous mode (``continuous=True``), the input is first passed through
            a post-quantization convolution layer before being fed to the decoder.
        )r   r   r   )r   r8   audior   r   r   decode  s   


z
DAC.decoder   c                 C   s   |j d }| ||}| js-| ||\}}}}}	| |}
|
dd|f |||||	dS | ||\}}}}}| }| |}
|jdd}| }|
dd|f ||dS )a-  Model forward pass.

        Args:
            audio_data (torch.Tensor): Audio to encode, shape [B, 1, T].
            sample_rate (int, optional): Sample rate in Hz. Defaults to
                ``self.sample_rate`` when ``None``.
            n_quantizers (int, optional): Number of quantizers to use. When ``None``,
                all quantizers are used. Only used in VQ mode (``continuous=False``).

        Returns:
            dict: A dictionary containing different keys depending on the mode:

            **VQ Mode (``continuous=False``):**
                - "audio" (torch.Tensor): Decoded audio, shape [B, 1, length].
                - "z" (torch.Tensor): Quantized continuous representation, shape [B, D, T].
                - "codes" (torch.Tensor): Codebook indices, shape [B, N, T].
                - "latents" (torch.Tensor): Projected latents, shape [B, N*D, T].
                - "vq/commitment_loss" (torch.Tensor): Commitment loss.
                - "vq/codebook_loss" (torch.Tensor): Codebook loss.

            **Continuous Mode (``continuous=True``):**
                - "audio" (torch.Tensor): Decoded audio, shape [B, 1, length].
                - "z" (torch.Tensor): Latent representation, shape [B, D, T].
                - "kl_loss" (torch.Tensor): KL divergence loss (for VAE training).
        r	   .N)r   r8   r~   rL   zvq/commitment_losszvq/codebook_loss)r   r
   )r   )r   r8   kl_loss)r   r   r   r   r   sampleklr7   )r   r   r   rg   r   r8   r~   rL   r<   r=   r   	posteriorrZ   r   r   r   r   r!   5  s.   

	
zDAC.forwardr   )NN)r"   r#   r$   r   r   staticmethodr   propertyri   rh   r   r   r   rP   r   r   r!   r%   r   r   r   r   r     s6    0



-!r   )r   bisectr   typingr   r   torch.nn.functionalr   r   r4   einopsr   -sglang.multimodal_gen.configs.models.vaes.dacr   0sglang.multimodal_gen.runtime.models.vaes.commonr   jitscriptr   Moduler   r&   rQ   r   r   r   r   r   r   
EntryClassr   r   r   r   <module>   s.   
	M 1  T