o
    Gi?F                     @   s>  d dl mZ d dlZd dlmZ ddlmZmZ ddlmZm	Z	 ddl
mZ dd	lmZmZ dd
lmZmZ ddlmZ e	eZeG dd deZG dd dejZG dd deeeZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!dS )     )	dataclassN)nn   )ConfigMixinregister_to_config)
BaseOutputlogging   )AttentionMixin)	AttentionAttnProcessor)TimestepEmbedding	Timesteps)
ModelMixinc                   @   s   e Zd ZU dZejed< dS )Kandinsky3UNetOutputNsample)__name__
__module____qualname__r   torchTensor__annotations__ r   r   Z/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/unets/unet_kandinsky3.pyr      s   
 r   c                       $   e Zd Z fddZdd Z  ZS )Kandinsky3EncoderProjc                    s,   t    tj||dd| _t|| _d S )NF)bias)super__init__r   Linearprojection_linear	LayerNormprojection_norm)selfencoder_hid_dimcross_attention_dim	__class__r   r   r   %   s   
zKandinsky3EncoderProj.__init__c                 C   s   |  |}| |}|S N)r    r"   )r#   xr   r   r   forward*   s   

zKandinsky3EncoderProj.forwardr   r   r   r   r*   __classcell__r   r   r&   r   r   $   s    r   c                       s~   e Zd Ze								dded	ed
ededeee B deedf deee B def fddZdd ZdddZ  Z	S )Kandinsky3UNet          @   r   i  i   r/   i      in_channelstime_embedding_dimgroupsattention_head_dimlayers_per_blockblock_out_channels.r%   r$   c	                    s  t    d}	d}
d}d}|}|d d }t|ddd| _t||| _t| || _tj	||ddd	| _
t| | _|gt| }tt|d d
 |dd  } fdd|D }t||g }|||g}tt|}g }t|| _tg | _tt|g|R  D ]1\}\\}}}}}|| jd k}||| jd kr|nd | jt||||||||	|
|| qtg | _ttt|g|R  D ]#\}\\}}}}}|dk}| jt|| |||||||	|
|| qt||| _t | _tj	||ddd	| _d S )Nr.   r	   )FTTTr   F   )flip_sin_to_cosdownscale_freq_shiftr   kernel_sizepaddingc                    s   g | ]}|r nd qS r(   r   ).0is_existr%   r   r   
<listcomp>X   s    z+Kandinsky3UNet.__init__.<locals>.<listcomp>) r   r   r   	time_projr   time_embeddingKandinsky3AttentionPoolingadd_time_conditionr   Conv2dconv_inr   encoder_hid_projlistziplenmapreversed
num_levels
ModuleListdown_blocks	enumerateappendKandinsky3DownSampleBlock	up_blocksKandinsky3UpSampleBlockpop	GroupNormconv_norm_outSiLUconv_act_outconv_out)r#   r4   r5   r6   r7   r8   r9   r%   r$   expansion_ratiocompression_ratioadd_cross_attentionadd_self_attentionout_channelsinit_channelshidden_dimsin_out_dims	text_dims
num_blockslayer_paramsrev_layer_paramscat_dimslevelin_dimout_dimres_block_numtext_dimself_attentiondown_sample	up_sampler&   rC   r   r   1   s   




zKandinsky3UNet.__init__c                 C   s   |  t  dS )ze
        Disables custom attention processors and sets the default attention implementation.
        N)set_attn_processorr   )r#   r   r   r   set_default_attn_processor   s   z)Kandinsky3UNet.set_default_attn_processorNTc                 C   s  |d urd| |j d }|d}t|s.t|tr tjntj}tj	|g||j
d}nt|jdkr=|d   |j
}||jd }| | |j}| |}| |}|d urc| |||}g }	| |}t| jD ]\}
}|||||}|
| jd kr|	| qot| jD ]\}
}|
dkrtj||	 gdd}|||||}q| |}| |}| |}|s|fS t|dS )Nr:   g     )dtypedevicer   dim)r   )torv   	unsqueezer   	is_tensor
isinstancefloatfloat32int32tensorrw   rN   shapeexpandrE   rF   rK   rH   rJ   rT   rS   rQ   rU   rW   catrY   r[   r]   r^   r   )r#   r   timestepencoder_hidden_statesencoder_attention_maskreturn_dictrv   time_embed_input
time_embedhidden_statesrl   rr   rs   r   r   r   r*      s>   









zKandinsky3UNet.forward)r.   r/   r0   r1   r   r2   r3   r3   )NNT)
r   r   r   r   inttupler   ru   r*   r,   r   r   r&   r   r-   0   s:    


	]r-   c                       8   e Zd Z								d fdd		Zdd
dZ  ZS )rX   Nr   r0   r1   r.   r	   Tc              
      s@  t    d |r
dnd d d ggd gd g|d   }|| |fg||fg|d   ||fg }g }g }g }|| _|| _|rJ|t||d |||	 n|t  t||D ]5\\}}}|t	|||||
| |d ury|t||||||	 n|t  |t	|||||
 qVt
|| _t
|| _t
|| _d S )NTr.   r:   r	   r   r   rq   context_dimrU   Kandinsky3AttentionBlockr   IdentityrM   Kandinsky3ResNetBlockrR   
attentions
resnets_inresnets_out)r#   r4   cat_dimrc   time_embed_dimr   rh   r6   head_dimr_   r`   rs   rq   up_resolutionshidden_channelsr   r   r   
in_channelout_channelup_resolutionr&   r   r   r      sF   
*z Kandinsky3UpSampleBlock.__init__c           	      C   sr   t | jdd  | j| jD ]\}}}|||}| jd ur$||||||}|||}q| jr7| jd |||d}|S )Nr:   r   
image_mask)rM   r   r   r   r   rq   	r#   r)   r   contextcontext_maskr   	attention	resnet_in
resnet_outr   r   r   r*      s   $

zKandinsky3UpSampleBlock.forwardNr   r0   r1   r.   r	   TTNNNr+   r   r   r&   r   rX      s    :rX   c                       r   )rV   Nr   r0   r1   r.   r	   Tc              
      s2  t    g }g }g }|| _|| _|r |t||d ||| n|t  d gd g|d  d d |
r6dnd d gg }||fg||fg|d   }t||D ]5\\}}}|t	|||||	 |d urq|t|||||| n|t  |t	|||||	| qOt
|| _t
|| _t
|| _d S )Nr.   r:   Fr   )r#   r4   rc   r   r   rh   r6   r   r_   r`   rr   rq   r   r   r   r   r   r   r   r   r&   r   r   r     s@   
*z"Kandinsky3DownSampleBlock.__init__c           	      C   sr   | j r| jd |||d}t| jdd  | j| jD ]\}}}|||}| jd ur1||||||}|||}q|S )Nr   r   r:   )rq   r   rM   r   r   r   r   r   r   r   r*   >  s   $

z!Kandinsky3DownSampleBlock.forwardr   r   r+   r   r   r&   r   rV     s    7rV   c                       r   )Kandinsky3ConditionalGroupNormc                    sb   t    tj||dd| _tt t|d| | _| jd j	j
  | jd jj
  d S )NF)affiner	   r:   )r   r   r   rZ   norm
Sequentialr\   r   context_mlpweightdatazero_r   )r#   r6   normalized_shaper   r&   r   r   r   K  s
   
z'Kandinsky3ConditionalGroupNorm.__init__c                 C   s\   |  |}tt|jdd  D ]}|d}q|jddd\}}| ||d  | }|S )Nr	   r@   r:   rx   g      ?)r   rangerN   r   r{   chunkr   )r#   r)   r   _scaleshiftr   r   r   r*   R  s   
z&Kandinsky3ConditionalGroupNorm.forwardr+   r   r   r&   r   r   J  s    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	Kandinsky3Blockr   r0   Nc                    s   t    t|||| _t | _|d ur"|r"tj||ddd| _nt	 | _t
|dk}tj||||d| _|d urI|sItj||ddd| _d S t	 | _d S )Nr	   r>   strider:   r=   )r   r   r   
group_normr   r\   
activationConvTranspose2drs   r   r   rI   
projectionrr   )r#   r4   rc   r   r>   norm_groupsr   r?   r&   r   r   r   ^  s   


zKandinsky3Block.__init__c                 C   s8   |  ||}| |}| |}| |}| |}|S r(   )r   r   rs   r   rr   )r#   r)   r   r   r   r   r*   o  s   



zKandinsky3Block.forward)r   r0   Nr+   r   r   r&   r   r   ]  s    r   c                       s2   e Zd Zddddg f fdd	Zdd Z  ZS )	r   r0   r	   r.   Nc           
         s   t    g d}t||| }||fg||fgd  ||fg }	t fddt|	||D | _d|v r@tj||dddnt | _	||krQtj
||ddnt | _d	|v rftj
||ddd| _d S t | _d S )
N)r:   r   r   r:   r	   c              	      s(   g | ]\\}}}}t ||| |qS r   )r   )rA   r   r   r>   r   r   r   r   r   rD     s    z2Kandinsky3ResNetBlock.__init__.<locals>.<listcomp>Tr   r:   )r>   F)r   r   maxr   rR   rM   resnet_blocksr   r   shortcut_up_samplerI   shortcut_projectionshortcut_down_sample)
r#   r4   rc   r   r   r`   r   kernel_sizeshidden_channelr   r&   r   r   r   y  s.   
 
zKandinsky3ResNetBlock.__init__c                 C   sD   |}| j D ]}|||}q| |}| |}| |}|| }|S r(   )r   r   r   r   )r#   r)   r   outresnet_blockr   r   r   r*     s   



zKandinsky3ResNetBlock.forwardr+   r   r   r&   r   r   x  s    r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	rG   r1   c                    s"   t    t||||dd| _d S )NFdim_headrn   out_bias)r   r   r   r   )r#   num_channelsr   r   r&   r   r   r     s   
z#Kandinsky3AttentionPooling.__init__Nc                 C   s4   |j |jd}| |jddd||}||d S )Nrv   r:   T)ry   keepdim)rz   rv   r   meansqueeze)r#   r)   r   r   r   r   r   r*     s   z"Kandinsky3AttentionPooling.forward)r1   r(   r+   r   r   r&   r   rG     s    
rG   c                       s(   e Zd Zd	 fdd	Zd
ddZ  ZS )r   Nr0   r1   r.   c              
      sz   t    t|||| _t||p|||dd| _|| }t|||| _ttj	||dddt
 tj	||ddd| _d S )NFr   r:   )r>   r   )r   r   r   in_normr   r   out_normr   r   rI   r\   feed_forward)r#   r   r   r   r   r   r_   r   r&   r   r   r     s    

z!Kandinsky3AttentionBlock.__init__c           	      C   s   |j dd  \}}| ||}||j d d|| ddd}|d ur&|n|}|d ur3|j|jd}| |||}|dddd|j d d||}|| }| ||}| 	|}|| }|S )Nr   r@   r	   r:   r   )
r   r   reshapepermuterz   rv   r   r{   r   r   )	r#   r)   r   r   r   r   heightwidthr   r   r   r   r*     s   "&
z Kandinsky3AttentionBlock.forward)Nr0   r1   r.   r   r+   r   r   r&   r   r     s    r   )"dataclassesr   r   r   configuration_utilsr   r   utilsr   r   r   r
   attention_processorr   r   
embeddingsr   r   modeling_utilsr   
get_loggerr   loggerr   Moduler   r-   rX   rV   r   r   r   rG   r   r   r   r   r   <module>   s,   
 GD,