o
    piQ                     @   sP  d dl mZ d dlmZmZmZ d dlZd dlZd dlmZ ddl	m
Z
mZ ddlmZmZ dd	lmZmZmZ dd
lmZmZ ddlmZ eeZeG dd deZG dd dejZG dd dee
ZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd dejZ$G dd dejZ%dS )     )	dataclass)DictTupleUnionN)nn   )ConfigMixinregister_to_config)
BaseOutputlogging   )	AttentionAttentionProcessorAttnProcessor)TimestepEmbedding	Timesteps)
ModelMixinc                   @   s   e Zd ZU dZejed< dS )Kandinsky3UNetOutputNsample)__name__
__module____qualname__r   torchTensor__annotations__ r   r   d/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/unets/unet_kandinsky3.pyr       s   
 r   c                       $   e Zd Z fddZdd Z  ZS )Kandinsky3EncoderProjc                    s,   t    tj||dd| _t|| _d S )NF)bias)super__init__r   Linearprojection_linear	LayerNormprojection_norm)selfencoder_hid_dimcross_attention_dim	__class__r   r   r!   &   s   
zKandinsky3EncoderProj.__init__c                 C   s   |  |}| |}|S N)r#   r%   )r&   xr   r   r   forward+   s   

zKandinsky3EncoderProj.forwardr   r   r   r!   r-   __classcell__r   r   r)   r   r   %   s    r   c                       s   e Zd Ze								d!ded	ed
ededeeee f dee deeee f def fddZede	e
ef fddZdeee	e
ef f fddZdd Zd"ddZd#dd Z  ZS )$Kandinsky3UNet          @   r   i  i   r2   i      in_channelstime_embedding_dimgroupsattention_head_dimlayers_per_blockblock_out_channelsr(   r'   c	                    s  t    d}	d}
d}d}|}|d d }t|ddd| _t||| _t| || _tj	||ddd	| _
t| | _|gt| }tt|d d
 |dd  } fdd|D }t||g }|||g}tt|}g }t|| _tg | _tt|g|R  D ]1\}\\}}}}}|| jd k}||| jd kr|nd | jt||||||||	|
|| qtg | _ttt|g|R  D ]#\}\\}}}}}|dk}| jt|| |||||||	|
|| qt||| _t | _tj	||ddd	| _d S )Nr1   r   )FTTTr   F   )flip_sin_to_cosdownscale_freq_shiftr   kernel_sizepaddingc                    s   g | ]}|r nd qS r+   r   ).0is_existr(   r   r   
<listcomp>Y   s    z+Kandinsky3UNet.__init__.<locals>.<listcomp>) r    r!   r   	time_projr   time_embeddingKandinsky3AttentionPoolingadd_time_conditionr   Conv2dconv_inr   encoder_hid_projlistziplenmapreversed
num_levels
ModuleListdown_blocks	enumerateappendKandinsky3DownSampleBlock	up_blocksKandinsky3UpSampleBlockpop	GroupNormconv_norm_outSiLUconv_act_outconv_out)r&   r7   r8   r9   r:   r;   r<   r(   r'   expansion_ratiocompression_ratioadd_cross_attentionadd_self_attentionout_channelsinit_channelshidden_dimsin_out_dims	text_dims
num_blockslayer_paramsrev_layer_paramscat_dimslevelin_dimout_dimres_block_numtext_dimself_attentiondown_sample	up_sampler)   rF   r   r!   2   s   




zKandinsky3UNet.__init__returnc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                    sF   t |dr|j||  d< | D ]\}} |  d| || q|S Nset_processorz
.processor.)hasattr	processornamed_children)rx   ry   rz   sub_namechildfn_recursive_add_processorsr   r   r      s
   
zCKandinsky3UNet.attn_processors.<locals>.fn_recursive_add_processors)strr   r   Moduler   r   r   )r&   rz   rx   ry   r   r   r   attn_processors   s
   &	zKandinsky3UNet.attn_processorsr   c                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.rx   ry   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S r{   )r~   
isinstancedictr|   r\   r   )rx   ry   r   r   r   fn_recursive_attn_processorr   r   r      s   

zFKandinsky3UNet.set_attn_processor.<locals>.fn_recursive_attn_processorN)rQ   r   keysr   r   
ValueErrorr   r   r   r   r   )r&   r   countrx   ry   r   r   r   set_attn_processor   s   
z!Kandinsky3UNet.set_attn_processorc                 C   s   |  t  dS )ze
        Disables custom attention processors and sets the default attention implementation.
        N)r   r   )r&   r   r   r   set_default_attn_processor   s   z)Kandinsky3UNet.set_default_attn_processorFc                 C   s   t |dr
||_d S d S )Ngradient_checkpointing)r~   r   )r&   ry   valuer   r   r   _set_gradient_checkpointing   s   

z*Kandinsky3UNet._set_gradient_checkpointingNTc                 C   s  |d urd| |j d }|d}t|s.t|tr tjntj}tj	|g||j
d}nt|jdkr=|d   |j
}||jd }| | |j}| |}| |}|d urc| |||}g }	| |}t| jD ]\}
}|||||}|
| jd kr|	| qot| jD ]\}
}|
dkrtj||	 gdd}|||||}q| |}| |}| |}|s|fS t|dS )Nr=   g     )dtypedevicer   dim)r   )tor   	unsqueezer   	is_tensorr   floatfloat32int32tensorr   rQ   shapeexpandrH   rI   rN   rK   rM   rW   rV   rT   rX   rZ   catr\   r^   r`   ra   r   )r&   r   timestepencoder_hidden_statesencoder_attention_maskreturn_dictr   time_embed_input
time_embedhidden_statesro   ru   rv   r   r   r   r-      s>   









zKandinsky3UNet.forward)r1   r2   r3   r4   r   r5   r6   r6   )F)NNT)r   r   r   r	   intr   r   r!   propertyr   r   r   r   r   r   r   r-   r/   r   r   r)   r   r0   1   sB    	]"
r0   c                       8   e Zd Z								d fdd		Zdd
dZ  ZS )r[   Nr   r3   r4   r1   r   Tc              
      s@  t    d |r
dnd d d ggd gd g|d   }|| |fg||fg|d   ||fg }g }g }g }|| _|| _|rJ|t||d |||	 n|t  t||D ]5\\}}}|t	|||||
| |d ury|t||||||	 n|t  |t	|||||
 qVt
|| _t
|| _t
|| _d S )NTr1   r=   r   r    r!   rt   context_dimrX   Kandinsky3AttentionBlockr   IdentityrP   Kandinsky3ResNetBlockrU   
attentions
resnets_inresnets_out)r&   r7   cat_dimrf   time_embed_dimr   rk   r9   head_dimrb   rc   rv   rt   up_resolutionshidden_channelsr   r   r   
in_channelout_channelup_resolutionr)   r   r   r!      sF   
*z Kandinsky3UpSampleBlock.__init__c           	      C   sr   t | jdd  | j| jD ]\}}}|||}| jd ur$||||||}|||}q| jr7| jd |||d}|S )Nr=   r   
image_mask)rP   r   r   r   r   rt   	r&   r,   r   contextcontext_maskr   	attention	resnet_in
resnet_outr   r   r   r-   9  s   $

zKandinsky3UpSampleBlock.forwardNr   r3   r4   r1   r   TTNNNr.   r   r   r)   r   r[      s    :r[   c                       r   )rY   Nr   r3   r4   r1   r   Tc              
      s2  t    g }g }g }|| _|| _|r |t||d ||| n|t  d gd g|d  d d |
r6dnd d gg }||fg||fg|d   }t||D ]5\\}}}|t	|||||	 |d urq|t|||||| n|t  |t	|||||	| qOt
|| _t
|| _t
|| _d S )Nr1   r=   Fr   )r&   r7   rf   r   r   rk   r9   r   rb   rc   ru   rt   r   r   r   r   r   r   r   r   r)   r   r   r!   F  s@   
*z"Kandinsky3DownSampleBlock.__init__c           	      C   sr   | j r| jd |||d}t| jdd  | j| jD ]\}}}|||}| jd ur1||||||}|||}q|S )Nr   r   r=   )rt   r   rP   r   r   r   r   r   r   r   r-   }  s   $

z!Kandinsky3DownSampleBlock.forwardr   r   r.   r   r   r)   r   rY   E  s    7rY   c                       r   )Kandinsky3ConditionalGroupNormc                    sb   t    tj||dd| _tt t|d| | _| jd j	j
  | jd jj
  d S )NF)affiner   r=   )r    r!   r   r]   norm
Sequentialr_   r"   context_mlpweightdatazero_r   )r&   r9   normalized_shaper   r)   r   r   r!     s
   
z'Kandinsky3ConditionalGroupNorm.__init__c                 C   s\   |  |}tt|jdd  D ]}|d}q|jddd\}}| ||d  | }|S )Nr   rC   r=   r   g      ?)r   rangerQ   r   r   chunkr   )r&   r,   r   _scaleshiftr   r   r   r-     s   
z&Kandinsky3ConditionalGroupNorm.forwardr.   r   r   r)   r   r     s    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	Kandinsky3Blockr   r3   Nc                    s   t    t|||| _t | _|d ur"|r"tj||ddd| _nt	 | _t
|dk}tj||||d| _|d urI|sItj||ddd| _d S t	 | _d S )Nr   rA   strider=   r@   )r    r!   r   
group_normr   r_   
activationConvTranspose2drv   r   r   rL   
projectionru   )r&   r7   rf   r   rA   norm_groupsr   rB   r)   r   r   r!     s   


zKandinsky3Block.__init__c                 C   s8   |  ||}| |}| |}| |}| |}|S r+   )r   r   rv   r   ru   )r&   r,   r   r   r   r   r-     s   



zKandinsky3Block.forward)r   r3   Nr.   r   r   r)   r   r     s    r   c                       s2   e Zd Zddddg f fdd	Zdd Z  ZS )	r   r3   r   r1   Nc           
         s   t    g d}t||| }||fg||fgd  ||fg }	t fddt|	||D | _d|v r@tj||dddnt | _	||krQtj
||ddnt | _d	|v rftj
||ddd| _d S t | _d S )
N)r=   r   r   r=   r   c              	      s(   g | ]\\}}}}t ||| |qS r   )r   )rD   r   r   rA   r   r   r   r   r   rG     s    z2Kandinsky3ResNetBlock.__init__.<locals>.<listcomp>Tr   r=   )rA   F)r    r!   maxr   rU   rP   resnet_blocksr   r   shortcut_up_samplerL   shortcut_projectionshortcut_down_sample)
r&   r7   rf   r   r   rc   r   kernel_sizeshidden_channelr   r)   r   r   r!     s.   
 
zKandinsky3ResNetBlock.__init__c                 C   sD   |}| j D ]}|||}q| |}| |}| |}|| }|S r+   )r   r   r   r   )r&   r,   r   outresnet_blockr   r   r   r-     s   



zKandinsky3ResNetBlock.forwardr.   r   r   r)   r   r     s    r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	rJ   r4   c                    s"   t    t||||dd| _d S )NFdim_headrq   out_bias)r    r!   r   r   )r&   num_channelsr   r   r)   r   r   r!     s   
z#Kandinsky3AttentionPooling.__init__Nc                 C   s4   |j |jd}| |jddd||}||d S )Nr   r=   T)r   keepdim)r   r   r   meansqueeze)r&   r,   r   r   r   r   r   r-     s   z"Kandinsky3AttentionPooling.forward)r4   r+   r.   r   r   r)   r   rJ     s    
rJ   c                       s(   e Zd Zd	 fdd	Zd
ddZ  ZS )r   Nr3   r4   r1   c              
      sz   t    t|||| _t||p|||dd| _|| }t|||| _ttj	||dddt
 tj	||ddd| _d S )NFr   r=   )rA   r   )r    r!   r   in_normr   r   out_normr   r   rL   r_   feed_forward)r&   r   r   r   r   r   rb   r   r)   r   r   r!     s    

z!Kandinsky3AttentionBlock.__init__c           	      C   s   |j dd  \}}| ||}||j d d|| ddd}|d ur&|n|}|d ur3|j|jd}| |||}|dddd|j d d||}|| }| ||}| 	|}|| }|S )Nr   rC   r   r=   r   )
r   r   reshapepermuter   r   r   r   r   r   )	r&   r,   r   r   r   r   heightwidthr   r   r   r   r-     s   "&
z Kandinsky3AttentionBlock.forward)Nr3   r4   r1   r   r.   r   r   r)   r   r     s    r   )&dataclassesr   typingr   r   r   r   torch.utils.checkpointr   configuration_utilsr   r	   utilsr
   r   attention_processorr   r   r   
embeddingsr   r   modeling_utilsr   
get_loggerr   loggerr   r   r   r0   r[   rY   r   r   r   rJ   r   r   r   r   r   <module>   s.   
 NGD,