o
    ۷iƄ                     @   sZ  d dl mZ d dlZd dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ eG d
d deZeG dd deZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd deZG dd deZG dd dejZ G d d! d!ejZ!G d"d# d#Z"dS )$    )	dataclassN   )
BaseOutput)randn_tensor   )get_activation)SpatialNorm)AutoencoderTinyBlockUNetMidBlock2Dget_down_blockget_up_blockc                   @   s   e Zd ZU dZejed< dS )EncoderOutputz
    Output of encoding method.

    Args:
        latent (`torch.Tensor` of shape `(batch_size, num_channels, latent_height, latent_width)`):
            The encoded latent.
    latentN)__name__
__module____qualname____doc__torchTensor__annotations__ r   r   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/autoencoders/vae.pyr       s   
 r   c                   @   s.   e Zd ZU dZejed< dZejdB ed< dS )DecoderOutputz
    Output of decoding method.

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`):
            The decoded output sample from the last layer of the model.
    sampleNcommit_loss)	r   r   r   r   r   r   r   r   FloatTensorr   r   r   r   r   -   s   
 
r   c                       s|   e Zd ZdZ									dd	ed
edeedf deedf dedededef fddZde	j
de	j
fddZ  ZS )Encodera  
    The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        down_block_types (`tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
            The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
            options.
        block_out_channels (`tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        double_z (`bool`, *optional*, defaults to `True`):
            Whether to double the number of output channels for the last block.
    r   DownEncoderBlock2D@   r       siluTin_channelsout_channelsdown_block_types.block_out_channelslayers_per_blocknorm_num_groupsact_fndouble_zc
                    s  t    || _tj||d dddd| _tg | _|d }
t|D ])\}}|
}|| }
|t	|d k}t
|| j||
| dd|||
d d}| j| q#t|d d|dd|d |d |	d		| _tj|d |dd
| _t | _|rud| n|}tj|d |ddd| _d| _d S )Nr   r      kernel_sizestridepaddingư>)

num_layersr#   r$   add_downsample
resnet_epsdownsample_paddingresnet_act_fnresnet_groupsattention_head_dimtemb_channelsdefault	r#   r3   r5   output_scale_factorresnet_time_scale_shiftr7   r6   r8   add_attentionnum_channels
num_groupsepsr   r/   F)super__init__r'   nnConv2dconv_in
ModuleListdown_blocks	enumeratelenr   appendr
   	mid_block	GroupNormconv_norm_outSiLUconv_actconv_outgradient_checkpointing)selfr#   r$   r%   r&   r'   r(   r)   r*   mid_block_add_attentionoutput_channelidown_block_typeinput_channelis_final_block
down_blockconv_out_channels	__class__r   r   rE   S   sZ   


zEncoder.__init__r   returnc                 C   s   |  |}t r | jr | jD ]}| ||}q| | j|}n| jD ]}||}q#| |}| |}| |}| 	|}|S )z*The forward method of the `Encoder` class.)
rH   r   is_grad_enabledrT   rJ   _gradient_checkpointing_funcrN   rP   rR   rS   )rU   r   r\   r   r   r   forward   s   







zEncoder.forward)	r   r   r   r   r   r!   r"   TT)r   r   r   r   inttuplestrboolrE   r   r   rc   __classcell__r   r   r^   r   r   ;   s:    

	Er   c                       s   e Zd ZdZ										dd
ededeedf deedf dedededef fddZ	ddej	dej	dB dej	fddZ
  ZS )Decodera  
    The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        up_block_types (`tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
        block_out_channels (`tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        norm_type (`str`, *optional*, defaults to `"group"`):
            The normalization type to use. Can be either `"group"` or `"spatial"`.
    r   UpDecoderBlock2Dr   r   r!   r"   groupTr#   r$   up_block_types.r&   r'   r(   r)   	norm_typec
                    sL  t    || _tj||d dddd| _tg | _|dkr!|nd }
t|d d|d|dkr0dn||d ||
|	d		| _	t
t|}|d
 }t|D ].\}}|}|| }|t|d k}t|| jd |||| d||||
|d}| j| |}qI|dkrt|d
 |
| _ntj|d
 |dd| _t | _tj|d
 |ddd| _d| _d S )Nr9   r   r+   r,   spatialr0   rl   r:   r;   r   r1   r#   r$   prev_output_channeladd_upsampler3   r5   r6   r7   r8   r=   r?   rC   F)rD   rE   r'   rF   rG   rH   rI   	up_blocksr
   rN   listreversedrK   rL   r   rM   r   rP   rO   rQ   rR   rS   rT   )rU   r#   r$   rm   r&   r'   r(   r)   rn   rV   r8   reversed_block_out_channelsrW   rX   up_block_typerq   r[   up_blockr^   r   r   rE      sd   


zDecoder.__init__Nr   latent_embedsr`   c                 C   s   |  |}t r"| jr"| | j||}| jD ]	}| |||}qn| ||}| jD ]}|||}q+|du r=| |}n| ||}| |}| 	|}|S )z*The forward method of the `Decoder` class.N)
rH   r   ra   rT   rb   rN   rs   rP   rR   rS   )rU   r   ry   rx   r   r   r   rc     s   




zDecoder.forward)	r   r   rj   r   r   r!   r"   rl   TNr   r   r   r   rd   re   rf   rE   r   r   rc   rh   r   r   r^   r   ri      sH    

	Ori   c                       sD   e Zd ZdZdededdf fddZdejdejfd	d
Z  Z	S )UpSamplea&  
    The `UpSample` layer of a variational autoencoder that upsamples its input.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
    r#   r$   r`   Nc                    s0   t    || _|| _tj||dddd| _d S )N   r   r+   r,   )rD   rE   r#   r$   rF   ConvTranspose2ddeconv)rU   r#   r$   r^   r   r   rE   E  s   
zUpSample.__init__xc                 C   s   t |}| |}|S )z+The forward method of the `UpSample` class.)r   relur   rU   r   r   r   r   rc   O  s   

zUpSample.forward
r   r   r   r   rd   rE   r   r   rc   rh   r   r   r^   r   r|   :  s    

r|   c                       sV   e Zd ZdZ			ddedededed	d
f
 fddZddejd	ejfddZ  Z	S )MaskConditionEncoderz)
    used in AsymmetricAutoencoderKL
             in_chout_chres_chr.   r`   Nc              
      s  t    g }|dkr.|d }|d }||kr|}|dkr|}|||f |d9 }|dksg }|D ]	\}}	||	 q2||d d  g }
|}tt|D ]+}|| }|dks]|dkrk|
tj||dddd n|
tj||dddd |}qOtj|
 | _d S )Nr+   r   r9   r   r   r,   r}   )	rD   rE   rM   rangerL   rF   rG   
Sequentiallayers)rU   r   r   r   r.   channelsin_ch_r$   _in_ch_out_chr   lout_ch_r^   r   r   rE   [  s2   

zMaskConditionEncoder.__init__r   c                 C   sJ   i }t t| jD ]}| j| }||}||tt|j< t|}q	|S )z7The forward method of the `MaskConditionEncoder` class.)r   rL   r   rf   re   shaper   r   )rU   r   maskoutr   layerr   r   r   rc     s   
zMaskConditionEncoder.forward)r   r   r   rz   r   r   r   r^   r   r   V  s"     %r   c                       s   e Zd ZdZ								dd	ed
edeedf deedf dedededef fddZ			ddej	dej	dB dej	dB dej	dB dej	f
ddZ
  ZS )MaskConditionDecodera  The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
    decoder with a conditioner on the mask and masked image.

    Args:
        in_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            The number of output channels.
        up_block_types (`tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
            The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
        block_out_channels (`tuple[int, ...]`, *optional*, defaults to `(64,)`):
            The number of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2):
            The number of layers per block.
        norm_num_groups (`int`, *optional*, defaults to 32):
            The number of groups for normalization.
        act_fn (`str`, *optional*, defaults to `"silu"`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
        norm_type (`str`, *optional*, defaults to `"group"`):
            The normalization type to use. Can be either `"group"` or `"spatial"`.
    r   rj   r   r   r!   r"   rl   r#   r$   rm   .r&   r'   r(   r)   rn   c	                    sb  t    || _tj||d dddd| _tg | _|dkr!|nd }	t|d d|d|dkr0dn||d ||	d	| _	t
t|}
|
d
 }t|D ].\}}|}|
| }|t|d k}t|| jd ||d | d||||	|d}| j| |}qHt||d
 |d d| _|dkrt|d
 |	| _ntj|d
 |dd| _t | _tj|d
 |ddd| _d| _d S )Nr9   r   r+   r,   ro   r0   rl   r:   )r#   r3   r5   r<   r=   r7   r6   r8   r   rp   )r   r   r   r?   rC   F)rD   rE   r'   rF   rG   rH   rI   rs   r
   rN   rt   ru   rK   rL   r   rM   r   condition_encoderr   rP   rO   rQ   rR   rS   rT   )rU   r#   r$   rm   r&   r'   r(   r)   rn   r8   rv   rW   rX   rw   rq   r[   rx   r^   r   r   rE     sl   


zMaskConditionDecoder.__init__Nzimager   ry   r`   c                 C   s(  |}|  |}tt| j j}t r| jr| 	| j
||}||}|dur;|dur;d| | }| 	| j||}| jD ]2}	|duri|duri|tt|j }
tjj||jdd dd}|| |
d|   }| 	|	||}q>|dur|dur|| |tt|j d|   }nl| 
||}||}|dur|durd| | }| ||}| jD ]0}	|dur|dur|tt|j }
tjj||jdd dd}|| |
d|   }|	||}q|dur|dur|| |tt|j d|   }|du r| |}n| ||}| |}| |}|S )z7The forward method of the `MaskConditionDecoder` class.Nr+   nearest)sizemode)rH   nextiterrs   
parametersdtyper   ra   rT   rb   rN   tor   rf   re   r   rF   
functionalinterpolaterP   rR   rS   )rU   r   r   r   ry   r   upscale_dtypemasked_imageim_xrx   sample_mask_r   r   r   rc     sR   


"

"


zMaskConditionDecoder.forward)r   r   rj   r   r   r!   r"   rl   )NNNr{   r   r   r^   r   r     sR    

	Tr   c                       s   e Zd ZdZ				ddededed	ed
edef fddZde	j
de	j
fddZde	j
de	j
fddZde	jdee	je	jef fddZde	j
deedf de	jfddZ  ZS )VectorQuantizerz
    Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
    multiplications and allows for post-hoc remapping of indices.
    NrandomFTn_evq_embed_dimbetaunknown_indexsane_index_shapelegacyc                    s   t    || _|| _|| _|| _t| j| j| _| jj	j
d| j d| j  || _| jd urk| dtt| j |  | jjd | _|| _| jdkrY| j| _| jd | _td| j d| j d	| j d
 n|| _|| _d S )Ng            ?usedr   extrar+   z
Remapping z indices to z indices. Using z for unknown indices.)rD   rE   r   r   r   r   rF   	Embedding	embeddingweightdatauniform_remapregister_bufferr   tensornploadr   r   re_embedr   printr   )rU   r   r   r   r   r   r   r   r^   r   r   rE   >  s.   




zVectorQuantizer.__init__indsr`   c                 C   s   |j }t|dksJ ||d d}| j|}|d d d d d f |d k }|d}|ddk }| jdkrOt	j
d| j|| j dj|jd||< n| j||< ||S )	Nr+   r   r9   )NN.r   r   )r   )device)r   rL   reshaper   r   longargmaxsumr   r   randintr   r   )rU   r   ishaper   matchnewunknownr   r   r   remap_to_usedc  s   "

(

zVectorQuantizer.remap_to_usedc                 C   s   |j }t|dksJ ||d d}| j|}| j| jj d kr,d||| jj d k< t|d d d f |j d dg d d f d|}||S )Nr+   r   r9   )r   rL   r   r   r   r   r   gather)rU   r   r   r   backr   r   r   unmap_to_allq  s   2
zVectorQuantizer.unmap_to_allr   c                 C   sN  | dddd }|d| j}tjt|| jjdd}| ||j	}d }d }| j
sH| jt| | d  t||  d  }nt| | d | jt||  d   }|||   }| dddd }| jd ur||j	d d}| |}|dd}| jr||j	d |j	d |j	d }|||||ffS )Nr   r   r   r+   r9   dim)permute
contiguousviewr   r   argmincdistr   r   r   r   r   meandetachr   r   r   r   )rU   r   z_flattenedmin_encoding_indicesz_q
perplexitymin_encodingslossr   r   r   rc   {  s$   42

 zVectorQuantizer.forwardindicesr   .c                 C   sb   | j d ur||d d}| |}|d}| |}|d ur/||}|dddd }|S )Nr   r9   r   r+   r   )r   r   r   r   r   r   r   )rU   r   r   r   r   r   r   get_codebook_entry  s   




z"VectorQuantizer.get_codebook_entry)Nr   FT)r   r   r   r   rd   floatrf   rg   rE   r   
LongTensorr   r   r   re   rc   r   rh   r   r   r^   r   r   5  s.    %"
*"r   c                   @   s   e Zd ZddejdefddZddejdB dejfd	d
Zddd dejfddZ	g dfdejde
edf dejfddZdejfddZdS )DiagonalGaussianDistributionFr   deterministicc                 C   s   || _ tj|ddd\| _| _t| jdd| _|| _td| j | _t| j| _	| jrAtj
| j| j j| j jd | _	| _d S d S )Nr   r+   r   g      >g      4@      ?)r   r   )r   r   chunkr   logvarclampr   expstdvar
zeros_liker   r   )rU   r   r   r   r   r   rE     s   z%DiagonalGaussianDistribution.__init__N	generatorr`   c                 C   s0   t | jj|| jj| jjd}| j| j|  }|S )N)r   r   r   )r   r   r   r   r   r   r   )rU   r   r   r   r   r   r   r     s   z#DiagonalGaussianDistribution.sampleotherc                 C   s   | j r	tdgS |d u r%dtjt| jd| j d | j g dd S dtjt| j|j d|j | j|j  d | j |j g dd S )N        r   r   r   r+   r   r   r   )r   r   r   r   powr   r   r   )rU   r   r   r   r   kl  s&   
zDiagonalGaussianDistribution.klr   r   dims.c                 C   sR   | j r	tdgS tdtj }dtj|| j t|| j	 d| j
  |d S )Nr   g       @r   r   r   )r   r   r   r   logpir   r   r   r   r   )rU   r   r   logtwopir   r   r   nll  s    z DiagonalGaussianDistribution.nllc                 C      | j S rz   )r   rU   r   r   r   r        z!DiagonalGaussianDistribution.mode)Frz   )r   r   r   r   r   rg   rE   	Generatorr   r   re   rd   r   r   r   r   r   r   r     s    *	r   c                   @   sH   e Zd ZdejfddZddejdB dejfddZdejfd	d
ZdS )IdentityDistributionr   c                 C   s
   || _ d S rz   r   )rU   r   r   r   r   rE     s   
zIdentityDistribution.__init__Nr   r`   c                 C   r   rz   r   )rU   r   r   r   r   r     r   zIdentityDistribution.samplec                 C   r   rz   r   r   r   r   r   r     r   zIdentityDistribution.moderz   )	r   r   r   r   r   rE   r   r   r   r   r   r   r   r     s    r   c                
       s\   e Zd ZdZdededeedf deedf def
 fdd	Zd
ej	dej	fddZ
  ZS )EncoderTinya  
    The `EncoderTiny` layer is a simpler version of the `Encoder` layer.

    Args:
        in_channels (`int`):
            The number of input channels.
        out_channels (`int`):
            The number of output channels.
        num_blocks (`tuple[int, ...]`):
            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
            use.
        block_out_channels (`tuple[int, ...]`):
            The number of output channels for each block.
        act_fn (`str`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
    r#   r$   
num_blocks.r&   r)   c                    s   t    g }t|D ]7\}}|| }	|dkr$|tj||	ddd n|tj|	|	ddddd t|D ]}
|t|	|	| q6q|tj|d |ddd tj| | _	d| _
d S )	Nr   r   r+   r-   r/   r   F)r-   r/   r.   biasr9   )rD   rE   rK   rM   rF   rG   r   r	   r   r   rT   )rU   r#   r$   r  r&   r)   r   rX   	num_blockr@   _r^   r   r   rE     s,   

zEncoderTiny.__init__r   r`   c                 C   s:   t  r| jr| | j|}|S | |dd}|S )z.The forward method of the `EncoderTiny` class.r+   r   )r   ra   rT   rb   r   adddivr   r   r   r   rc   (  s
   zEncoderTiny.forwardr{   r   r   r^   r   r     s    

$r   c                       sd   e Zd ZdZdededeedf deedf deded	ef fd
dZdej	dej	fddZ
  ZS )DecoderTinya  
    The `DecoderTiny` layer is a simpler version of the `Decoder` layer.

    Args:
        in_channels (`int`):
            The number of input channels.
        out_channels (`int`):
            The number of output channels.
        num_blocks (`tuple[int, ...]`):
            Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
            use.
        block_out_channels (`tuple[int, ...]`):
            The number of output channels for each block.
        upsampling_scaling_factor (`int`):
            The scaling factor to use for upsampling.
        act_fn (`str`):
            The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
    r#   r$   r  .r&   upsampling_scaling_factorr)   upsample_fnc              
      s   t    tj||d dddt|g}t|D ]?\}	}
|	t|d k}||	 }t|
D ]}|t	||| q,|sD|tj
||d |sH|n|}|tj||dd|d qtj| | _d| _d S )Nr   r   r+   r  )scale_factorr   )r-   r/   r  F)rD   rE   rF   rG   r   rK   rL   r   rM   r	   Upsampler   r   rT   )rU   r#   r$   r  r&   r	  r)   r
  r   rX   r  r[   r@   r  conv_out_channelr^   r   r   rE   H  s.   



zDecoderTiny.__init__r   r`   c                 C   sJ   t |d d }t  r| jr| | j|}n| |}|ddS )z.The forward method of the `DecoderTiny` class.r   r   r+   )r   tanhra   rT   rb   r   mulsubr   r   r   r   rc   q  s
   
zDecoderTiny.forwardr{   r   r   r^   r   r  4  s$    

)r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
AutoencoderMixinc                 C   (   t | dstd| jj dd| _dS )a  
        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        
use_tilingz*Tiling doesn't seem to be implemented for .TN)hasattrNotImplementedErrorr_   r   r  r   r   r   r   enable_tiling  s   

zAutoencoderMixin.enable_tilingc                 C   
   d| _ dS )z
        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
        decoding in one step.
        FN)r  r   r   r   r   disable_tiling     
zAutoencoderMixin.disable_tilingc                 C   r  )z
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        use_slicingz+Slicing doesn't seem to be implemented for r  TN)r  r  r_   r   r  r   r   r   r   enable_slicing  s   

zAutoencoderMixin.enable_slicingc                 C   r  )z
        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
        decoding in one step.
        FN)r  r   r   r   r   disable_slicing  r  z AutoencoderMixin.disable_slicingN)r   r   r   r  r  r  r  r   r   r   r   r    s
    
	r  )#dataclassesr   numpyr   r   torch.nnrF   utilsr   utils.torch_utilsr   activationsr   attention_processorr   unets.unet_2d_blocksr	   r
   r   r   r   r   Moduler   ri   r|   r   r   r   objectr   r   r   r  r  r   r   r   r   <module>   s4   y 5 +z8BK