o
    Gi                     @   s   d dl Z d dlmZ ddlmZmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ G dd	 d	ejZG d
d de
eZdS )    N   )ConfigMixinregister_to_config)DecoderOutputVectorQuantizer)
ModelMixin)VQEncoderOutput)apply_forward_hookc                       s(   e Zd ZdZ fddZdd Z  ZS )MixingResidualBlockz=
    Residual block with mixing used by Paella's VQ-VAE.
    c              	      s   t    tj|ddd| _ttdtj||d|d| _tj|ddd| _	tt
||t t
||| _tjtddd	| _d S )
NFgư>)elementwise_affineeps   r   )kernel_sizegroups   T)requires_grad)super__init__nn	LayerNormnorm1
SequentialReplicationPad2dConv2d	depthwisenorm2LinearGELUchannelwise	Parametertorchzerosgammas)selfinp_channels	embed_dim	__class__ k/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.pyr       s   
zMixingResidualBlock.__init__c              	   C   s   | j }| |ddddddddd|d   |d  }|| ||d   }| |ddddddddd|d   |d  }|| |dddddddd|d   }|S )Nr      r   r         )r"   r   permuter   r   r   )r#   xmodsx_tempr(   r(   r)   forward0   s   66.zMixingResidualBlock.forward)__name__
__module____qualname____doc__r   r1   __classcell__r(   r(   r&   r)   r
      s    r
   c                       s   e Zd ZdZe									d!d	ed
edededededededef fddZed"de	j
dedefddZe	d#de	j
dededee	j
B fddZd"de	j
dedee	j
B fdd Z  ZS )$PaellaVQModela]  VQ-VAE model from Paella model.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
    implements for all the model (such as downloading or saving, etc.)

    Parameters:
        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
        up_down_scale_factor (int, *optional*, defaults to 2): Up and Downscale factor of the input image.
        levels  (int, *optional*, defaults to 2): Number of levels in the model.
        bottleneck_blocks (int, *optional*, defaults to 12): Number of bottleneck blocks in the model.
        embed_dim (int, *optional*, defaults to 384): Number of hidden channels in the model.
        latent_channels (int, *optional*, defaults to 4): Number of latent channels in the VQ-VAE model.
        num_vq_embeddings (int, *optional*, defaults to 8192): Number of codebook vectors in the VQ-VAE.
        scale_factor (float, *optional*, defaults to 0.3764): Scaling factor of the latent space.
    r   r*        r+       绸?in_channelsout_channelsup_down_scale_factorlevelsbottleneck_blocksr%   latent_channelsnum_vq_embeddingsscale_factorc
              
      s  t     fddtt|D }
tt|tj||d  |
d dd| _g }t|D ])}|dkrG|	tj|
|d  |
| dddd t
|
| |
| d }|	| q.|	ttj|
d	 |dd
dt| tj| | _t||d
dd| _ttj||
d	 ddg}t|D ]F}t|dkr|ndD ]}t
|
|d |  |
|d |  d }|	| q||d k r|	tj|
|d |  |
|d |  dddd qtj| | _ttj|
d ||d  ddt|| _d S )Nc                    s   g | ]} d |  qS )r*   r(   ).0ir%   r(   r)   
<listcomp>Z   s    z*PaellaVQModel.__init__.<locals>.<listcomp>r*   r   r   )r   r+   )r   stridepaddingF)r   biasg      ?)vq_embed_dimlegacybeta)r   r   reversedranger   r   PixelUnshuffler   in_blockappendr
   BatchNorm2ddown_blocksr   
vquantizerConvTranspose2d	up_blocksPixelShuffle	out_block)r#   r<   r=   r>   r?   r@   r%   rA   rB   rC   c_levelsrU   rE   blockrX   jr&   rF   r)   r   K   sJ   
&&"
zPaellaVQModel.__init__Tr.   return_dictreturnc                 C   s(   |  |}| |}|s|fS t|dS )N)latents)rR   rU   r   )r#   r.   r^   hr(   r(   r)   encode   s
   


zPaellaVQModel.encodera   force_not_quantizec                 C   sB   |s|  |\}}}n|}| |}| |}|s|fS t|dS )Nsample)rV   rX   rZ   r   )r#   ra   rc   r^   quant_r.   decr(   r(   r)   decode   s   


zPaellaVQModel.decodere   c                 C   s0   |}|  |j}| |j}|s|fS t|dS )z
        Args:
            sample (`torch.Tensor`): Input sample.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
        rd   )rb   r`   ri   re   r   )r#   re   r^   r.   ra   rh   r(   r(   r)   r1      s   
zPaellaVQModel.forward)	r   r   r*   r*   r8   r9   r+   r:   r;   )T)TT)r2   r3   r4   r5   r   intfloatr   r	   r    Tensorboolr   rb   r   ri   r1   r6   r(   r(   r&   r)   r7   9   sZ    	
7	(r7   )r    torch.nnr   configuration_utilsr   r   models.autoencoders.vaer   r   models.modeling_utilsr   models.vq_modelr   utils.accelerate_utilsr	   Moduler
   r7   r(   r(   r(   r)   <module>   s   