o
    pin                     @   s  d dl Z d dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
mZ ddlmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeG dd deZG dd deeeZ dS )    N)	dataclass)OptionalTupleUnion   )ConfigMixinregister_to_config)FromOriginalModelMixin)
BaseOutput   )	Attention)
ModelMixinc                       s(   e Zd Z fddZ fddZ  ZS )SDCascadeLayerNormc                    s   t  j|i | d S N)super__init__)selfargskwargs	__class__ h/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/unets/unet_stable_cascade.pyr       s   zSDCascadeLayerNorm.__init__c                    s,   | dddd}t |}| ddddS )Nr   r   r      )permuter   forward)r   xr   r   r   r   #   s   zSDCascadeLayerNorm.forward__name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r   c                       s(   e Zd Zg f fdd	Zdd Z  ZS )SDCascadeTimestepBlockc              	      sN   t    t||d | _|| _|D ]}t| d| t||d  qd S )Nr   mapper_)r   r   nnLinearmappercondssetattr)r   c
c_timestepr'   cnamer   r   r   r   *   s   
 zSDCascadeTimestepBlock.__init__c           	      C   s   |j t| jd dd}| |d d d d d d d f j ddd\}}t| jD ],\}}t| d| ||d  d d d d d d f j ddd\}}|| || }}q)|d|  | S )Nr   dimr   r   r#   )chunklenr'   r&   	enumerategetattr)	r   r   tabir)   acbcr   r   r   r   2   s   0>zSDCascadeTimestepBlock.forwardr   r   r   r   r   r"   )   s    r"   c                       s(   e Zd Zd	 fdd	Zd
ddZ  ZS )SDCascadeResBlockr   r           c              
      sz   t    tj||||d |d| _t|ddd| _tt|| |d t	 t
|d t|t|d || _d S )Nr   )kernel_sizepaddinggroupsFư>elementwise_affineeps   )r   r   r$   Conv2d	depthwiser   norm
Sequentialr%   GELUGlobalResponseNormDropoutchannelwise)r   r)   c_skipr:   dropoutr   r   r   r   <   s   


zSDCascadeResBlock.__init__Nc                 C   sX   |}|  | |}|d urtj||gdd}| |dddddddd}|| S )Nr   r,   r   r   r   )rD   rC   torchcatrI   r   )r   r   x_skipx_resr   r   r   r   H   s   "zSDCascadeResBlock.forward)r   r   r9   r   r   r   r   r   r   r8   ;   s    r8   c                       s$   e Zd Z fddZdd Z  ZS )rG   c                    s>   t    ttddd|| _ttddd|| _d S )Nr   )r   r   r$   	ParameterrL   zerosgammabeta)r   r-   r   r   r   r   S   s   
zGlobalResponseNorm.__init__c                 C   s@   t j|dddd}||jdddd  }| j||  | j | S )Nr   )r   r   T)pr-   keepdim)r-   rU   r=   )rL   rD   meanrR   rS   )r   r   agg_normstand_div_normr   r   r   r   X   s   zGlobalResponseNorm.forwardr   r   r   r   r   rG   R   s    rG   c                       s&   e Zd Zd fdd	Zdd Z  ZS )SDCascadeAttnBlockTr9   c                    sV   t    || _t|ddd| _t|||| |dd| _tt	 t
||| _d S )NFr=   r>   T)	query_dimheadsdim_headrK   bias)r   r   	self_attnr   rD   r   	attentionr$   rE   SiLUr%   	kv_mapper)r   r)   c_condnheadr_   rK   r   r   r   r   _   s
   
zSDCascadeAttnBlock.__init__c                 C   sb   |  |}| |}| jr&|j\}}}}tj|||ddd|gdd}|| j||d }|S )NrV   r   r   r,   )encoder_hidden_states)	rb   rD   r_   shaperL   rM   view	transposer`   )r   r   kvnorm_x
batch_sizechannel_r   r   r   r   g   s   

$zSDCascadeAttnBlock.forward)Tr9   r   r   r   r   r   rZ   ^   s    rZ   c                       s&   e Zd Zd fdd	Zdd Z  ZS )UpDownBlock2dTc                    s|   t    |dvrt| d|r tj|dkrdnddddnt }tj||d	d
}t|dkr6||gn||g| _d S )N)updown not supportedro   r   g      ?bilinearT)scale_factormodealign_cornersr   r:   )	r   r   
ValueErrorr$   UpsampleIdentityrB   
ModuleListblocks)r   in_channelsout_channelsrt   enabledinterpolationmappingr   r   r   r   r   s   
$zUpDownBlock2d.__init__c                 C   s   | j D ]}||}q|S r   )r{   )r   r   blockr   r   r   r   ~   s   

zUpDownBlock2d.forward)Tr   r   r   r   r   rn   q   s    rn   c                   @   s   e Zd ZU dZejed< dS )StableCascadeUNetOutputNsample)r   r   r    r   rL   Tensor__annotations__r   r   r   r   r      s   
 r   c                ,       sP  e Zd ZdZe											
	
												d8dedededededee dee dee dee deee  deee  deee  dee dee d ee d!ee d"e	e
ee
 f d#e	eee f d$ee d%eee  f( fd&d'Zd9d)d*Zd+d, Zd:d.d/Zd;d0d1Zd2d3 Zd4d5 Z							d<d6d7Z  ZS )=StableCascadeUNetT   @   r      r   r       r         r   r   r   r   r8   r"   rZ   r   N   rA   r   皙?r   scacrpr|   r}   timestep_ratio_embedding_dim
patch_sizeconditioning_dimblock_out_channelsnum_attention_headsdown_num_layers_per_blockup_num_layers_per_blockdown_blocks_repeat_mappersup_blocks_repeat_mappersblock_types_per_layerclip_text_in_channelsclip_image_in_channelseffnet_in_channelspixel_mapper_in_channelsrK   r_   timestep_conditioning_typeswitch_levelc           #         s  t    t|t|krtdt| t|t|	kr'tdt| t|t|
kr8tdt| t|t|krItdt| t|t|krZtdt| t|trf|ft| }t|trr|ft| }|durttj	||d d d	d
t
 tj	|d d |d d	d
t|d ddd| _|durttj	||d d d	d
t
 tj	|d d |d d	d
t|d ddd| _t| | | _|durt| | _|durt| | | _tj ddd| _tt|tj	||d  |d d	d
t|d ddd| _d fdd	}t | _t | _t | _tt|D ]}|dkrp| jtt||d	  ddd|dur^t||d	  || d||d	  dntj	||d	  || ddd n| jt  t }t|| D ]!}|| D ]}|||| || || || d}|| qq| j| |
durt }t|
| d	 D ]}|tj	|| || d	d
 q| j| q1t | _t | _t | _ t!tt|D ]}|dkr*| jtt|| ddd|durt|| ||d	  d||d	  dntj"|| ||d	  ddd n| jt  t }t|	ddd | D ]C} t#|| D ]9\}!}|t|d	 k rh| |!  krbdkrhn n|| nd}"|||| || |"|| || d}|| qIqA| j| |durt }t|ddd | d	 D ]}|tj	|| || d	d
 q| j | qtt|d dddtj	|d ||d  d	d
t$|| _%d| _&dS )a  

        Parameters:
            in_channels (`int`, defaults to 16):
                Number of channels in the input sample.
            out_channels (`int`, defaults to 16):
                Number of channels in the output sample.
            timestep_ratio_embedding_dim (`int`, defaults to 64):
                Dimension of the projected time embedding.
            patch_size (`int`, defaults to 1):
                Patch size to use for pixel unshuffling layer
            conditioning_dim (`int`, defaults to 2048):
                Dimension of the image and text conditional embedding.
            block_out_channels (Tuple[int], defaults to (2048, 2048)):
                Tuple of output channels for each block.
            num_attention_heads (Tuple[int], defaults to (32, 32)):
                Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have
                attention.
            down_num_layers_per_block (Tuple[int], defaults to [8, 24]):
                Number of layers in each down block.
            up_num_layers_per_block (Tuple[int], defaults to [24, 8]):
                Number of layers in each up block.
            down_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
                Number of 1x1 Convolutional layers to repeat in each down block.
            up_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]):
                Number of 1x1 Convolutional layers to repeat in each up block.
            block_types_per_layer (Tuple[Tuple[str]], optional,
                defaults to (
                    ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), ("SDCascadeResBlock",
                    "SDCascadeTimestepBlock", "SDCascadeAttnBlock")
                ): Block types used in each layer of the up/down blocks.
            clip_text_in_channels (`int`, *optional*, defaults to `None`):
                Number of input channels for CLIP based text conditioning.
            clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280):
                Number of input channels for pooled CLIP text embeddings.
            clip_image_in_channels (`int`, *optional*):
                Number of input channels for CLIP based image conditioning.
            clip_seq (`int`, *optional*, defaults to 4):
            effnet_in_channels (`int`, *optional*, defaults to `None`):
                Number of input channels for effnet conditioning.
            pixel_mapper_in_channels (`int`, defaults to `None`):
                Number of input channels for pixel mapper conditioning.
            kernel_size (`int`, *optional*, defaults to 3):
                Kernel size to use in the block convolutional layers.
            dropout (Tuple[float], *optional*, defaults to (0.1, 0.1)):
                Dropout to use per block.
            self_attn (Union[bool, Tuple[bool]]):
                Tuple of booleans that determine whether to use self attention in a block or not.
            timestep_conditioning_type (Tuple[str], defaults to ("sca", "crp")):
                Timestep conditioning type.
            switch_level (Optional[Tuple[bool]], *optional*, defaults to `None`):
                Tuple that indicates whether upsampling or downsampling should be applied in a block
        zaNumber of elements in `down_num_layers_per_block` must match the length of `block_out_channels`: z_Number of elements in `up_num_layers_per_block` must match the length of `block_out_channels`: zbNumber of elements in `down_blocks_repeat_mappers` must match the length of `block_out_channels`: z`Number of elements in `up_blocks_repeat_mappers` must match the length of `block_out_channels`: z]Number of elements in `block_types_per_layer` must match the length of `block_out_channels`: Nr   rA   r   rv   Fr=   r>   r   Tc                    sX   | dkrt |||dS | dkrt| |||dS | dkr$t|dS td|  d)	Nr8   )r:   rK   rZ   )r_   rK   r"   )r'   zBlock type rq   )r8   rZ   r"   rw   )
block_typer|   rd   rJ   rK   r_   r   r:   r   r   r   r   	get_block   s   z-StableCascadeUNet.__init__.<locals>.get_blockrp   )rt   r~   )r:   stride)rK   r_   ro   rV   )rJ   rK   r_   )r   r   T)'r   r   r/   rw   
isinstancefloatboolr$   rE   rB   rF   r   effnet_mapperpixels_mapperr%   clip_txt_pooled_mapperclip_txt_mapperclip_img_mapper	LayerNorm	clip_normPixelUnshuffle	embeddingrz   down_blocksdown_downscalersdown_repeat_mappersrangeappendrn   ry   	up_blocksup_upscalersup_repeat_mappersreversedConvTranspose2dr0   PixelShuffleclfgradient_checkpointing)#r   r|   r}   r   r   r   r   r   r   r   r   r   r   r   clip_text_pooled_in_channelsr   clip_seqr   r   r:   rK   r_   r   r   r   r5   
down_blockrm   r   r   block_repeat_mappersup_blockjkrJ   r   r   r   r      s  
W






	
"




:
"
zStableCascadeUNet.__init__Fc                 C   s
   || _ d S r   )r   )r   valuer   r   r   _set_gradient_checkpointing  s   
z-StableCascadeUNet._set_gradient_checkpointingc              
   C   s  t |tjtjfrtjj|j |jd urtj	|jd tjj
| jjdd t| dr7tjj
| jjddnd  t| drHtjj
| jjddnd  t| drgtjj
| jd jdd tjj
| jd jdd t| drtjj
| jd jdd tjj
| jd jdd tjj| jd	 jd tj	| jd	 jd | j| j D ]1}|D ],}t |tr|jd
 j jtd	t| jjd  9  _qt |trtj	|jjd qqd S )Nr   g{Gz?)stdr   r   r   r   r   r   rV   )r   r$   rB   r%   rL   initxavier_uniform_weightr^   	constant_normal_r   hasattrr   r   r   r   r   r   r   r   r8   rI   datanpsqrtsumconfigr{   r"   r&   )r   mlevel_blockr   r   r   r   _init_weights  s4   
  


.
zStableCascadeUNet._init_weights'  c                 C   s   || }| j jd }t||d  }tj||jd | 	 }|d d d f |d d d f  }tj
| | gdd}| j jd dkrQtjj|ddd}|j|jdS )	Nr   r   )devicer,   )r   r   constant)rt   )dtype)r   r   mathlogrL   aranger   r   mulexprM   sincosr$   
functionalpadtor   )r   timestep_ratiomax_positionsrhalf_dimembr   r   r   get_timestep_ratio_embedding  s     z.StableCascadeUNet.get_timestep_ratio_embeddingc                 C   s   t |jdkr|d}| ||d|d| jj d}|d urX|d urX| |}t |jdkr9|d}| 	||d|d| jj d}t
j|||gdd}n|}| |S )Nr   r   r   rV   r,   )r/   rf   	unsqueezer   rg   sizer   r   r   r   rL   rM   r   )r   clip_txt_pooledclip_txtclip_imgclip_txt_poolclipr   r   r   get_clip_embeddings  s   





z%StableCascadeUNet.get_clip_embeddingsc              	   C   s  g }t | j| j| j}| jr| jrdd }|D ]n\}}}	||}tt|	d D ]V}
|D ]E}t|t	r@t
jjj|||dd}q,t|trSt
jjj||||dd}q,t|trft
jjj||||dd}q,t
jjj||dd}q,|
t|	k r~|	|
 |}q(|d| q|S |D ]O\}}}	||}tt|	d D ]7}
|D ]&}t|t	r||}qt|tr|||}qt|tr|||}q||}q|
t|	k r|	|
 |}q|d| q|S )Nc                        fdd}|S )Nc                         |  S r   r   inputsmoduler   r   custom_forward     zUStableCascadeUNet._down_encode.<locals>.create_custom_forward.<locals>.custom_forwardr   r   r  r   r   r   create_custom_forward     z=StableCascadeUNet._down_encode.<locals>.create_custom_forwardr   Fuse_reentrantr   )zipr   r   r   trainingr   r   r/   r   r8   rL   utils
checkpointrZ   r"   insert)r   r   r_embedr   level_outputsblock_groupr  r   
downscalerrepmapr5   r   r   r   r   _down_encode  sP   







zStableCascadeUNet._down_encodec              	   C   s  |d }t | j| j| j}| jr| jrdd }t|D ]\}\}}	}
tt|
d D ]}t|D ]\}}t	|t
r|dkrE|dkrE|| nd }|d ury|d|dks_|d|dkry|j}tjjj| |jdd  ddd	}||}tjjj||||d
d}q0t	|trtjjj||||d
d}q0t	|trtjjj||||d
d}q0tjjj|||d
d}q0|t|
k r|
| |}q*|	|}q|S t|D ]\}\}}	}
tt|
d D ]}t|D ]o\}}t	|t
r:|dkr|dkr|| nd }|d ur4|d|dks|d|dkr4|j}tjjj| |jdd  ddd	}||}|||}qt	|trF|||}qt	|trR|||}q||}q|t|
k rd|
| |}q|	|}q|S )Nr   c                    r   )Nc                     r   r   r   r   r   r   r   r    r  zSStableCascadeUNet._up_decode.<locals>.create_custom_forward.<locals>.custom_forwardr   r  r   r   r   r    r  z;StableCascadeUNet._up_decode.<locals>.create_custom_forwardr   rV   rr   T)rt   ru   Fr  )r  r   r   r   r	  r   r0   r   r/   r   r8   r   r   rL   r$   r   interpolater   rf   r   r
  r  rZ   r"   )r   r  r  r   r   r  r  r5   r   upscalerr  r   r   r   skip	orig_typer   r   r   
_up_decode  sl   
0



6


zStableCascadeUNet._up_decodec              	   C   s<  |d u r| |dddd}| |}| jjD ]%}|dkr!|}n	|dkr(|	}nd }|p0t|}tj|| |gdd}q| j|||d}| 	|}t
| d	rh|d urh|| tjj||jd
d  ddd }t
| dr|tjj| ||jd
d  ddd }| |||}| |||}| |}|
s|fS t|dS )Nr   r   r   r   r   r   r,   )r   r   r   r   r  rr   T)r   rt   ru   r   )r   )	new_zerosr   r   r   r   rL   
zeros_likerM   r   r   r   r   r$   r   r  rf   r   r  r  r   r   )r   r   r   clip_text_pooled	clip_textr   effnetpixelsr   r   return_dicttimestep_ratio_embedr)   condt_condr   r   r  r   r   r   r   6  s6   




zStableCascadeUNet.forward)r   r   r   r   r   r   r   r   r   r   r   r   Nr   NrA   NNr   r   Tr   N)F)r   )NN)NNNNNNT)r   r   r     _supports_gradient_checkpointingr   intr   r   strr   r   r   r   r   r   r   r   r  r  r   r!   r   r   r   r   r      s    	




 
z

1Dr   )!r   dataclassesr   typingr   r   r   numpyr   rL   torch.nnr$   configuration_utilsr   r   loadersr	   r
  r
   attention_processorr   modeling_utilsr   r   r   Moduler"   r8   rG   rZ   rn   r   r   r   r   r   r   <module>   s(   
