o
    pi                     @   s<  d dl mZ d dlmZmZ d dlZd dlZd dlm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ eG dd deZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de Z!G dd de	jZ"G dd de	jZ#dS )    )	dataclass)OptionalTupleN   )
BaseOutputis_torch_version)randn_tensor   )get_activation)SpatialNorm)AutoencoderTinyBlockUNetMidBlock2Dget_down_blockget_up_blockc                   @   s.   e Zd ZU dZejed< dZeej	 ed< dS )DecoderOutputz
    Output of decoding method.

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The decoded output sample from the last layer of the model.
    sampleNcommit_loss)
__name__
__module____qualname____doc__torchTensor__annotations__r   r   FloatTensor r   r   _/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/autoencoders/vae.pyr   !   s   
 
r   c                       s|   e Zd ZdZ									dd	ed
edeedf deedf dedededef fddZde	j
de	j
fddZ  ZS )Encodera  
    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
            The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
            options.
        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        double_z (`bool`, *optional*, defaults to `True`):
            Whether to double the number of output channels for the last block.
    r   DownEncoderBlock2D@   r	       siluTin_channelsout_channelsdown_block_types.block_out_channelslayers_per_blocknorm_num_groupsact_fndouble_zc
                    s  t    || _tj||d dddd| _tg | _|d }
t|D ])\}}|
}|| }
|t	|d k}t
|| j||
| dd|||
d d}| j| q#t|d d|dd|d |d |	d		| _tj|d |dd
| _t | _|rud| n|}tj|d |ddd| _d| _d S )Nr   r      kernel_sizestridepaddingư>)

num_layersr$   r%   add_downsample
resnet_epsdownsample_paddingresnet_act_fnresnet_groupsattention_head_dimtemb_channelsdefault	r$   r4   r6   output_scale_factorresnet_time_scale_shiftr8   r7   r9   add_attentionnum_channels
num_groupsepsr	   r0   F)super__init__r(   nnConv2dconv_in
ModuleListdown_blocks	enumeratelenr   appendr   	mid_block	GroupNormconv_norm_outSiLUconv_actconv_outgradient_checkpointing)selfr$   r%   r&   r'   r(   r)   r*   r+   mid_block_add_attentionoutput_channelidown_block_typeinput_channelis_final_block
down_blockconv_out_channels	__class__r   r   rF   G   sZ   


zEncoder.__init__r   returnc                 C   s   |  |}| jrP| jrPdd }tddr4| jD ]}tjjj|||dd}qtjjj|| j|dd}n+| jD ]}tjj|||}q7tjj|| j|}n| jD ]}||}qS| |}| 	|}| 
|}| |}|S )z*The forward method of the `Encoder` class.c                        fdd}|S )Nc                         |  S Nr   inputsmoduler   r   custom_forward      zFEncoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr   rh   ri   r   rg   r   create_custom_forward      z.Encoder.forward.<locals>.create_custom_forward>=1.11.0Fuse_reentrant)rI   trainingrU   r   rK   r   utils
checkpointrO   rQ   rS   rT   )rV   r   rl   r]   r   r   r   forward   s*   










zEncoder.forward)	r   r   r   r    r	   r"   r#   TT)r   r   r   r   intr   strboolrF   r   r   ru   __classcell__r   r   r_   r   r   /   s:    

	Er   c                       s   e Zd ZdZ										dd
ededeedf deedf dedededef fddZ	ddej	de
ej	 dej	fddZ  ZS )Decodera  
    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        norm_type (`str`, *optional*, defaults to `"group"`):
            The normalization type to use. Can be either `"group"` or `"spatial"`.
    r   UpDecoderBlock2Dr    r	   r"   r#   groupTr$   r%   up_block_types.r'   r(   r)   r*   	norm_typec
                    sL  t    || _tj||d dddd| _tg | _|dkr!|nd }
t|d d|d|dkr0dn||d ||
|	d		| _	t
t|}|d
 }t|D ].\}}|}|| }|t|d k}t|| jd ||d | d||||
|d}| j| |}qI|dkrt|d
 |
| _ntj|d
 |dd| _t | _tj|d
 |ddd| _d| _d S )Nr:   r   r,   r-   spatialr1   r}   r;   r<   r   r2   r$   r%   prev_output_channeladd_upsampler4   r6   r7   r8   r9   r>   r@   rD   F)rE   rF   r(   rG   rH   rI   rJ   	up_blocksr   rO   listreversedrL   rM   r   rN   r   rQ   rP   rR   rS   rT   rU   )rV   r$   r%   r~   r'   r(   r)   r*   r   rW   r9   reversed_block_out_channelsrX   rY   up_block_typer   r\   up_blockr_   r   r   rF      sd   


zDecoder.__init__Nr   latent_embedsra   c                 C   s4  |  |}tt| j j}| jrh| jrhdd }tddrEt	j
jj|| j||dd}||}| jD ]}t	j
jj||||dd}q4n9t	j
j|| j||}||}| jD ]}t	j
j||||}qYn| ||}||}| jD ]}|||}qv|du r| |}n| ||}| |}| |}|S )z*The forward method of the `Decoder` class.c                    rb   )Nc                     rc   rd   r   re   rg   r   r   ri   )  rj   zFDecoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr   rk   r   rg   r   rl   (  rm   z.Decoder.forward.<locals>.create_custom_forwardrn   ro   Frp   N)rI   nextiterr   
parametersdtyperr   rU   r   r   rs   rt   rO   torQ   rS   rT   )rV   r   r   upscale_dtyperl   r   r   r   r   ru     sJ   



	





zDecoder.forward)	r   r   r{   r    r	   r"   r#   r}   Trd   r   r   r   r   rv   r   rw   rF   r   r   r   ru   ry   r   r   r_   r   rz      sH    

	Orz   c                       sD   e Zd ZdZdededdf fddZdejdejfd	d
Z  Z	S )UpSamplea&  
    The `UpSample` layer of a variational autoencoder that upsamples its input.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
    r$   r%   ra   Nc                    s0   t    || _|| _tj||dddd| _d S )N   r	   r,   r-   )rE   rF   r$   r%   rG   ConvTranspose2ddeconv)rV   r$   r%   r_   r   r   rF   i  s   
zUpSample.__init__xc                 C   s   t |}| |}|S )z+The forward method of the `UpSample` class.)r   relur   )rV   r   r   r   r   ru   s  s   

zUpSample.forward
r   r   r   r   rv   rF   r   r   ru   ry   r   r   r_   r   r   ^  s    

r   c                       sV   e Zd ZdZ			ddedededed	d
f
 fddZddejd	ejfddZ  Z	S )MaskConditionEncoderz)
    used in AsymmetricAutoencoderKL
             in_chout_chres_chr/   ra   Nc              
      s  t    g }|dkr.|d }|d }||kr|}|dkr|}|||f |d9 }|dksg }|D ]	\}}	||	 q2||d d  g }
|}tt|D ]+}|| }|dks]|dkrk|
tj||dddd n|
tj||dddd |}qOtj|
 | _d S )Nr,   r	   r:   r   r   r-   r   )	rE   rF   rN   rangerM   rG   rH   
Sequentiallayers)rV   r   r   r   r/   channelsin_ch_r%   _in_ch_out_chr   lout_ch_r_   r   r   rF     s2   

zMaskConditionEncoder.__init__r   c                 C   sJ   i }t t| jD ]}| j| }||}||tt|j< t|}q	|S )z7The forward method of the `MaskConditionEncoder` class.)r   rM   r   rw   tupleshaper   r   )rV   r   maskoutr   layerr   r   r   ru     s   
zMaskConditionEncoder.forward)r   r   r   rd   r   r   r   r_   r   r   z  s"     %r   c                       s   e Zd ZdZ								dd	ed
edeedf deedf dedededef fddZ			ddej	de
ej	 de
ej	 de
ej	 dej	f
ddZ  ZS )MaskConditionDecodera  The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
    decoder with a conditioner on the mask and masked image.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
        block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        norm_type (`str`, *optional*, defaults to `"group"`):
            The normalization type to use. Can be either `"group"` or `"spatial"`.
    r   r{   r    r	   r"   r#   r}   r$   r%   r~   .r'   r(   r)   r*   r   c	                    sb  t    || _tj||d dddd| _tg | _|dkr!|nd }	t|d d|d|dkr0dn||d ||	d	| _	t
t|}
|
d
 }t|D ].\}}|}|
| }|t|d k}t|| jd ||d | d||||	|d}| j| |}qHt||d
 |d d| _|dkrt|d
 |	| _ntj|d
 |dd| _t | _tj|d
 |ddd| _d| _d S )Nr:   r   r,   r-   r   r1   r}   r;   )r$   r4   r6   r=   r>   r8   r7   r9   r   r   )r   r   r   r@   rD   F)rE   rF   r(   rG   rH   rI   rJ   r   r   rO   r   r   rL   rM   r   rN   r   condition_encoderr   rQ   rP   rR   rS   rT   rU   )rV   r$   r%   r~   r'   r(   r)   r*   r   r9   r   rX   rY   r   r   r\   r   r_   r   r   rF     sl   


zMaskConditionDecoder.__init__Nzimager   r   ra   c                 C   st  |}|  |}tt| j j}| jr*| jr*dd }tddrt	j
jj|| j||dd}||}|durQ|durQd| | }t	j
jj|| j||dd}	| jD ]8}
|dur|dur|	tt|j }tjj||jd	d d
d}|| |d|   }t	j
jj||
||dd}qT|dur|dur|| |	tt|j d|   }nt	j
j|| j||}||}|dur|durd| | }t	j
j|| j||}	| jD ]8}
|dur|dur|	tt|j }tjj||jd	d d
d}|| |d|   }t	j
j||
||}q|dur)|dur)|| |	tt|j d|   }ns| ||}||}|durK|durKd| | }| ||}	| jD ]3}
|dur{|dur{|	tt|j }tjj||jd	d d
d}|| |d|   }|
||}qN|dur|dur|| |	tt|j d|   }|du r| |}n| ||}| |}| |}|S )z7The forward method of the `MaskConditionDecoder` class.c                    rb   )Nc                     rc   rd   r   re   rg   r   r   ri   &  rj   zSMaskConditionDecoder.forward.<locals>.create_custom_forward.<locals>.custom_forwardr   rk   r   rg   r   rl   %  rm   z;MaskConditionDecoder.forward.<locals>.create_custom_forwardrn   ro   Frp   Nr,   nearest)sizemode)rI   r   r   r   r   r   rr   rU   r   r   rs   rt   rO   r   r   rw   r   r   rG   
functionalinterpolaterQ   rS   rT   )rV   r   r   r   r   r   r   rl   masked_imageim_xr   sample_mask_r   r   r   ru     s   



"

"

"


zMaskConditionDecoder.forward)r   r   r{   r    r	   r"   r#   r}   )NNNr   r   r   r_   r   r     sR    

	Tr   c                       s   e Zd ZdZ				ddededed	ed
edef fddZde	j
de	j
fddZde	j
de	j
fddZde	jdee	je	jef fddZde	j
deedf de	jfddZ  ZS )VectorQuantizerz
    Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
    multiplications and allows for post-hoc remapping of indices.
    NrandomFTn_evq_embed_dimbetaunknown_indexsane_index_shapelegacyc                    s   t    || _|| _|| _|| _t| j| j| _| jj	j
d| j d| j  || _| jd urk| dtt| j |  | jjd | _|| _| jdkrY| j| _| jd | _td| j d| j d	| j d
 n|| _|| _d S )Ng            ?usedr   extrar,   z
Remapping z indices to z indices. Using z for unknown indices.)rE   rF   r   r   r   r   rG   	Embedding	embeddingweightdatauniform_remapregister_bufferr   tensornploadr   r   re_embedr   printr   )rV   r   r   r   r   r   r   r   r_   r   r   rF     s.   




zVectorQuantizer.__init__indsra   c                 C   s   |j }t|dksJ ||d d}| j|}|d d d d d f |d k }|d}|ddk }| jdkrOt	j
d| j|| j dj|jd||< n| j||< ||S )	Nr,   r   r:   )NN.r	   r   )r   )device)r   rM   reshaper   r   longargmaxsumr   r   randintr   r   )rV   r   ishaper   matchnewunknownr   r   r   remap_to_used  s   "

(

zVectorQuantizer.remap_to_usedc                 C   s   |j }t|dksJ ||d d}| j|}| j| jj d kr,d||| jj d k< t|d d d f |j d dg d d f d|}||S )Nr,   r   r:   )r   rM   r   r   r   r   r   gather)rV   r   r   r   backr   r   r   unmap_to_all  s   2
zVectorQuantizer.unmap_to_allr   c                 C   sN  | dddd }|d| j}tjt|| jjdd}| ||j	}d }d }| j
sH| jt| | d  t||  d  }nt| | d | jt||  d   }|||   }| dddd }| jd ur||j	d d}| |}|dd}| jr||j	d |j	d |j	d }|||||ffS )Nr   r	   r   r,   r:   dim)permute
contiguousviewr   r   argmincdistr   r   r   r   r   meandetachr   r   r   r   )rV   r   z_flattenedmin_encoding_indicesz_q
perplexitymin_encodingslossr   r   r   ru     s$   42

 zVectorQuantizer.forwardindicesr   .c                 C   sb   | j d ur||d d}| |}|d}| |}|d ur/||}|dddd }|S )Nr   r:   r   r,   r	   )r   r   r   r   r   r   r   )rV   r   r   r   r   r   r   get_codebook_entry  s   




z"VectorQuantizer.get_codebook_entry)Nr   FT)r   r   r   r   rv   floatrw   rx   rF   r   
LongTensorr   r   r   r   ru   r   ry   r   r   r_   r   r     s.    %"
*"r   c                   @   s   e Zd ZddejdefddZddeej dejfd	d
Z	ddd dejfddZ
g dfdejdeedf dejfddZdejfddZdS )DiagonalGaussianDistributionFr   deterministicc                 C   s   || _ tj|ddd\| _| _t| jdd| _|| _td| j | _t| j| _	| jrAtj
| j| j j| j jd | _	| _d S d S )Nr	   r,   r   g      >g      4@      ?)r   r   )r   r   chunkr   logvarclampr   expstdvar
zeros_liker   r   )rV   r   r   r   r   r   rF      s   z%DiagonalGaussianDistribution.__init__N	generatorra   c                 C   s0   t | jj|| jj| jjd}| j| j|  }|S )N)r   r   r   )r   r   r   r   r   r   r   )rV   r   r   r   r   r   r   r     s   z#DiagonalGaussianDistribution.sampleotherc                 C   s   | j r	tdgS |d u r%dtjt| jd| j d | j g dd S dtjt| j|j d|j | j|j  d | j |j g dd S )N        r   r	   r   r,   r	   r   r   )r   r   r   r   powr   r   r   )rV   r   r   r   r   kl  s&   
zDiagonalGaussianDistribution.klr  r   dims.c                 C   sR   | j r	tdgS tdtj }dtj|| j t|| j	 d| j
  |d S )Nr  g       @r   r	   r   )r   r   r   r   logpir   r   r  r   r   )rV   r   r  logtwopir   r   r   nll*  s    z DiagonalGaussianDistribution.nllc                 C   s   | j S rd   )r   )rV   r   r   r   r   3  s   z!DiagonalGaussianDistribution.mode)Frd   )r   r   r   r   r   rx   rF   r   	Generatorr   r  r   rv   r	  r   r   r   r   r   r     s    *	r   c                
       s\   e Zd ZdZdededeedf deedf def
 fdd	Zd
ej	dej	fddZ
  ZS )EncoderTinya  
    The `EncoderTiny` layer is a simpler version of the `Encoder` layer.

    Args:
        in_channels (`int`):
            The number of input channels.
        out_channels (`int`):
            The number of output channels.
        num_blocks (`Tuple[int, ...]`):
            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
            use.
        block_out_channels (`Tuple[int, ...]`):
            The number of output channels for each block.
        act_fn (`str`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
    r$   r%   
num_blocks.r'   r*   c                    s   t    g }t|D ]7\}}|| }	|dkr$|tj||	ddd n|tj|	|	ddddd t|D ]}
|t|	|	| q6q|tj|d |ddd tj| | _	d| _
d S )	Nr   r   r,   r.   r0   r	   F)r.   r0   r/   biasr:   )rE   rF   rL   rN   rG   rH   r   r   r   r   rU   )rV   r$   r%   r  r'   r*   r   rY   	num_blockrA   _r_   r   r   rF   I  s,   

zEncoderTiny.__init__r   ra   c                 C   sp   | j r+| jr+dd }tddrtjjj|| j|dd}|S tjj|| j|}|S | |dd}|S )	z.The forward method of the `EncoderTiny` class.c                    rb   )Nc                     rc   rd   r   re   rg   r   r   ri   r  rj   zJEncoderTiny.forward.<locals>.create_custom_forward.<locals>.custom_forwardr   rk   r   rg   r   rl   q  rm   z2EncoderTiny.forward.<locals>.create_custom_forwardrn   ro   Frp   r,   r	   )	rr   rU   r   r   rs   rt   r   adddivrV   r   rl   r   r   r   ru   m  s   
zEncoderTiny.forwardr   r   r   r   rv   r   rw   rF   r   r   ru   ry   r   r   r_   r   r  7  s    

$r  c                       sd   e Zd ZdZdededeedf deedf deded	ef fd
dZdej	dej	fddZ
  ZS )DecoderTinya  
    The `DecoderTiny` layer is a simpler version of the `Decoder` layer.

    Args:
        in_channels (`int`):
            The number of input channels.
        out_channels (`int`):
            The number of output channels.
        num_blocks (`Tuple[int, ...]`):
            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
            use.
        block_out_channels (`Tuple[int, ...]`):
            The number of output channels for each block.
        upsampling_scaling_factor (`int`):
            The scaling factor to use for upsampling.
        act_fn (`str`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
    r$   r%   r  .r'   upsampling_scaling_factorr*   upsample_fnc              
      s   t    tj||d dddt|g}t|D ]?\}	}
|	t|d k}||	 }t|
D ]}|t	||| q,|sD|tj
||d |sH|n|}|tj||dd|d qtj| | _d| _d S )Nr   r   r,   r  )scale_factorr   )r.   r0   r  F)rE   rF   rG   rH   r
   rL   rM   r   rN   r   Upsampler   r   rU   )rV   r$   r%   r  r'   r  r*   r  r   rY   r  r\   rA   r  conv_out_channelr_   r   r   rF     s.   



zDecoderTiny.__init__r   ra   c                 C   s~   t |d d }| jr2| jr2dd }tddr&t jjj|| j|dd}nt jj|| j|}n| |}|d	d	S )
z.The forward method of the `DecoderTiny` class.r   c                    rb   )Nc                     rc   rd   r   re   rg   r   r   ri     rj   zJDecoderTiny.forward.<locals>.create_custom_forward.<locals>.custom_forwardr   rk   r   rg   r   rl     rm   z2DecoderTiny.forward.<locals>.create_custom_forwardrn   ro   Frp   r	   r,   )
r   tanhrr   rU   r   rs   rt   r   mulsubr  r   r   r   ru     s   

zDecoderTiny.forwardr  r   r   r_   r   r    s$    

)r  )$dataclassesr   typingr   r   numpyr   r   torch.nnrG   rs   r   r   utils.torch_utilsr   activationsr
   attention_processorr   unets.unet_2d_blocksr   r   r   r   r   Moduler   rz   r   r   r   r   objectr   r  r  r   r   r   r   <module>   s0     &5 Wz8L