o
    ۷i@                     @   sD  d dl Z d dlmZ d dlZd dlZd dlmZ d dlm	Z	 ddl
mZmZ ddlmZ ddlmZ ddlmZ d	d
lmZ ddlmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd deZeG dd deZeG dd deZG dd dejZ G dd dejZ!G dd  d eeeZ"dS )!    N)	dataclass)weight_norm   )ConfigMixinregister_to_config)
BaseOutput)apply_forward_hook)randn_tensor   )
ModelMixin   )AutoencoderMixinc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )Snake1dz;
    A 1-dimensional Snake activation function module.
    Tc                    sP   t    ttd|d| _ttd|d| _d| j_d| j_|| _	d S )Nr   T)
super__init__nn	Parametertorchzerosalphabetarequires_gradlogscale)self
hidden_dimr   	__class__ g/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/autoencoders/autoencoder_oobleck.pyr   #   s   

zSnake1d.__init__c                 C   s~   |j }| js	| jnt| j}| js| jnt| j}||d |d d}||d  t|| 	d  }||}|S )Nr   r   g&.>r
   )
shaper   r   r   expr   reshape
reciprocalsinpow)r   hidden_statesr    r   r   r   r   r   forward,   s   $
zSnake1d.forwardT__name__
__module____qualname____doc__r   r'   __classcell__r   r   r   r   r      s    	r   c                       s4   e Zd ZdZd
dedef fddZdd	 Z  ZS )OobleckResidualUnitza
    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
       r   	dimensiondilationc                    s^   t    d| d }t|| _ttj||d||d| _t|| _ttj||dd| _	d S )N   r
      )kernel_sizer2   paddingr   )r5   )
r   r   r   snake1r   r   Conv1dconv1snake2conv2)r   r1   r2   padr   r   r   r   =   s   


zOobleckResidualUnit.__init__c                 C   sb   |}|  | |}| | |}|jd |jd  d }|dkr+|d|| f }|| }|S )aq  
        Forward pass through the residual unit.

        Args:
            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
                Input tensor .

        Returns:
            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`)
                Input tensor after passing through the residual unit.
        r   r
   r   .)r9   r7   r;   r:   r    )r   hidden_stateoutput_tensorr6   r   r   r   r'   F   s   zOobleckResidualUnit.forward)r0   r   r*   r+   r,   r-   intr   r'   r.   r   r   r   r   r/   8   s    	r/   c                       0   e Zd ZdZddef fddZdd Z  ZS )	OobleckEncoderBlockz&Encoder block used in Oobleck encoder.r   stridec              
      sj   t    t|dd| _t|dd| _t|dd| _t|| _tt	j
||d| |t|d d| _d S )Nr   r2   r   	   r
   r5   rC   r6   )r   r   r/   	res_unit1	res_unit2	res_unit3r   r7   r   r   r8   mathceilr9   r   	input_dim
output_dimrC   r   r   r   r   `   s   

 
zOobleckEncoderBlock.__init__c                 C   s2   |  |}| |}| | |}| |}|S N)rG   rH   r7   rI   r9   r   r=   r   r   r   r'   k   s
   


zOobleckEncoderBlock.forwardr   r?   r   r   r   r   rB   ]   s    rB   c                       rA   )	OobleckDecoderBlockz&Decoder block used in Oobleck decoder.r   rC   c              
      sj   t    t|| _ttj||d| |t|d d| _	t
|dd| _t
|dd| _t
|dd| _d S )Nr
   rF   r   rD   r   rE   )r   r   r   r7   r   r   ConvTranspose1drJ   rK   conv_t1r/   rG   rH   rI   rL   r   r   r   r   w   s   

	zOobleckDecoderBlock.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rO   )r7   rT   rG   rH   rI   rP   r   r   r   r'      s   




zOobleckDecoderBlock.forwardrQ   r?   r   r   r   r   rR   t   s    rR   c                   @   sd   e Zd ZddejdefddZddejdB dejfd	d
Zddd dejfddZ	dejfddZ
dS )#OobleckDiagonalGaussianDistributionF
parametersdeterministicc                 C   sV   || _ |jddd\| _| _tj| jd | _| j| j | _t	
| j| _|| _d S )Nr
   r   )dimg-C6?)rV   chunkmeanscaler   
functionalsoftplusstdvarr   loglogvarrW   )r   rV   rW   r   r   r   r      s   
z,OobleckDiagonalGaussianDistribution.__init__N	generatorreturnc                 C   s0   t | jj|| jj| jjd}| j| j|  }|S )N)rb   devicedtype)r	   rZ   r    rV   rd   re   r^   )r   rb   samplexr   r   r   rf      s   z*OobleckDiagonalGaussianDistribution.sampleotherc                 C   s   | j r	tdgS |d u r | j| j | j | j d d S t| j|j d|j }| j|j }| j|j }|| | d }|d }|S )Ng        g      ?r   r
   )rW   r   TensorrZ   r_   ra   sumr%   )r   rh   normalized_diff	var_ratiologvar_diffklr   r   r   rn      s   &z&OobleckDiagonalGaussianDistribution.klc                 C   s   | j S rO   )rZ   r   r   r   r   mode   s   z(OobleckDiagonalGaussianDistribution.mode)FrO   )r*   r+   r,   r   ri   boolr   	Generatorrf   rn   rp   r   r   r   r   rU      s
    rU   c                   @   s   e Zd ZU dZded< dS )AutoencoderOobleckOutputar  
    Output of AutoencoderOobleck encoding method.

    Args:
        latent_dist (`OobleckDiagonalGaussianDistribution`):
            Encoded outputs of `Encoder` represented as the mean and standard deviation of
            `OobleckDiagonalGaussianDistribution`. `OobleckDiagonalGaussianDistribution` allows for sampling latents
            from the distribution.
    rU   latent_distN)r*   r+   r,   r-   __annotations__r   r   r   r   rs      s   
 
rs   c                   @   s   e Zd ZU dZejed< dS )OobleckDecoderOutputz
    Output of decoding method.

    Args:
        sample (`torch.Tensor` of shape `(batch_size, audio_channels, sequence_length)`):
            The decoded output sample from the last layer of the model.
    rf   N)r*   r+   r,   r-   r   ri   ru   r   r   r   r   rv      s   
 rv   c                       (   e Zd ZdZ fddZdd Z  ZS )OobleckEncoderzOobleck Encoderc           	   	      s   t    |}dg| }ttj||ddd| _g | _t|D ]\}}|  jt|||  |||d   |dg7  _qt	| j| _||d  }t
|| _ttj||ddd| _d S )Nr   r4   r   r5   r6   rM   rN   rC   r   )r   r   r   r   r8   r9   block	enumeraterB   
ModuleListr   r7   r;   )	r   encoder_hidden_sizeaudio_channelsdownsampling_ratioschannel_multiplesstridesstride_indexrC   d_modelr   r   r   r      s"   




zOobleckEncoder.__init__c                 C   6   |  |}| jD ]}||}q| |}| |}|S rO   r9   r{   r7   r;   )r   r=   moduler   r   r   r'         




zOobleckEncoder.forwardr)   r   r   r   r   rx          rx   c                       rw   )OobleckDecoderzOobleck Decoderc                    s   t    |}dg| }ttj|||d  ddd| _g }t|D ] \}}	|t||t||   ||t|| d   |	dg7 }q"t	|| _
|}
t|
| _ttj||dddd| _d S )	Nr   r   r4   r   ry   rz   F)r5   r6   bias)r   r   r   r   r8   r9   r|   rR   lenr}   r{   r   r7   r;   )r   channelsinput_channelsr   upsampling_ratiosr   r   r{   r   rC   rN   r   r   r   r     s"   

 
zOobleckDecoder.__init__c                 C   r   rO   r   )r   r=   layerr   r   r   r'     r   zOobleckDecoder.forwardr)   r   r   r   r   r      r   r   c                       s   e Zd ZdZdZdZedg dg dddddf fd	d
	Ze	dde	j
dedeee B fddZdde	j
dedee	j
B fddZe	dde	jdedee	jB fddZ			dde	j
dedede	jdB dee	j
B f
ddZ  ZS )AutoencoderOoblecka  
    An autoencoder for encoding waveforms into latents and decoding latent representations into waveforms. First
    introduced in Stable Audio.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        encoder_hidden_size (`int`, *optional*, defaults to 128):
            Intermediate representation dimension for the encoder.
        downsampling_ratios (`list[int]`, *optional*, defaults to `[2, 4, 4, 8, 8]`):
            Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder.
        channel_multiples (`list[int]`, *optional*, defaults to `[1, 2, 4, 8, 16]`):
            Multiples used to determine the hidden sizes of the hidden layers.
        decoder_channels (`int`, *optional*, defaults to 128):
            Intermediate representation dimension for the decoder.
        decoder_input_channels (`int`, *optional*, defaults to 64):
            Input dimension for the decoder. Corresponds to the latent dimension.
        audio_channels (`int`, *optional*, defaults to 2):
            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 44100):
            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
    F   )r
      r      r   )r   r
   r   r   r0   @   r
   iD  c                    st   t    || _|| _|| _|d d d | _tt|| _	|| _
t||||d| _t|||| j|d| _d| _d S )Nr   )r~   r   r   r   )r   r   r   r   r   F)r   r   r~   r   decoder_channelsr   r@   npprod
hop_lengthsampling_raterx   encoderr   decoderuse_slicing)r   r~   r   r   r   decoder_input_channelsr   r   r   r   r   r   B  s*   

zAutoencoderOobleck.__init__Trg   return_dictrc   c                    s^    j r|jd dkr fdd|dD }t|}n |}t|}|s*|fS t|dS )a  
        Encode a batch of images into latents.

        Args:
            x (`torch.Tensor`): Input batch of images.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

        Returns:
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        r   r   c                    s   g | ]}  |qS r   )r   ).0x_slicero   r   r   
<listcomp>x  s    z-AutoencoderOobleck.encode.<locals>.<listcomp>)rt   )r   r    splitr   catr   rU   rs   )r   rg   r   encoded_slicesh	posteriorr   ro   r   encodeg  s   

zAutoencoderOobleck.encodezc                 C   s   |  |}|s
|fS t|dS )Nrf   )r   rv   )r   r   r   decr   r   r   _decode  s   

zAutoencoderOobleck._decodeNc                    sX    j r|jd dkr fdd|dD }t|}n |j}|s'|fS t|dS )a  
        Decode a batch of images.

        Args:
            z (`torch.Tensor`): Input batch of latent vectors.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether to return a [`~models.vae.OobleckDecoderOutput`] instead of a plain tuple.

        Returns:
            [`~models.vae.OobleckDecoderOutput`] or `tuple`:
                If return_dict is True, a [`~models.vae.OobleckDecoderOutput`] is returned, otherwise a plain `tuple`
                is returned.

        r   r   c                    s   g | ]}  |jqS r   )r   rf   )r   z_slicero   r   r   r     s    z-AutoencoderOobleck.decode.<locals>.<listcomp>r   )r   r    r   r   r   r   rf   rv   )r   r   r   rb   decoded_slicesdecodedr   ro   r   decode  s   
zAutoencoderOobleck.decoderf   sample_posteriorrb   c           	      C   sJ   |}|  |j}|r|j|d}n| }| |j}|s |fS t|dS )ah  
        Args:
            sample (`torch.Tensor`): Input sample.
            sample_posterior (`bool`, *optional*, defaults to `False`):
                Whether to sample from the posterior.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`OobleckDecoderOutput`] instead of a plain tuple.
        )rb   r   )r   rt   rf   rp   r   rv   )	r   rf   r   r   rb   rg   r   r   r   r   r   r   r'     s   
zAutoencoderOobleck.forwardr(   )TN)FTN)r*   r+   r,   r-    _supports_gradient_checkpointing_supports_group_offloadingr   r   r   r   ri   rq   rs   tuplerU   r   rv   r   FloatTensorr   rr   r'   r.   r   r   r   r   r   &  s\    $
 r   )#rJ   dataclassesr   numpyr   r   torch.nnr   torch.nn.utilsr   configuration_utilsr   r   utilsr   utils.accelerate_utilsr   utils.torch_utilsr	   modeling_utilsr   vaer   Moduler   r/   rB   rR   objectrU   rs   rv   rx   r   r   r   r   r   r   <module>   s0   %(((