o
    ۷i"j                     @   s  d dl Z d dlmZ d dlZd dlZd dlmZ ddlm	Z	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ G d
d dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZeG dd deZG dd dee	eZdS )    N)	dataclass   )ConfigMixinregister_to_config)FromOriginalModelMixin)
BaseOutput   )	Attention)
ModelMixinc                       s(   e Zd Z fddZ fddZ  ZS )SDCascadeLayerNormc                    s   t  j|i | d S N)super__init__)selfargskwargs	__class__ `/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/unets/unet_stable_cascade.pyr      s   zSDCascadeLayerNorm.__init__c                    s,   | dddd}t |}| ddddS )Nr   r   r      )permuter   forward)r   xr   r   r   r   "   s   zSDCascadeLayerNorm.forward__name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r   c                       s(   e Zd Zg f fdd	Zdd Z  ZS )SDCascadeTimestepBlockc              	      sN   t    t||d | _|| _|D ]}t| d| t||d  qd S )Nr   mapper_)r   r   nnLinearmappercondssetattr)r   c
c_timestepr$   cnamer   r   r   r   )   s   
 zSDCascadeTimestepBlock.__init__c           	      C   s   |j t| jd dd}| |d d d d d d d f j ddd\}}t| jD ],\}}t| d| ||d  d d d d d d f j ddd\}}|| || }}q)|d|  | S )Nr   dimr   r   r    )chunklenr$   r#   	enumerategetattr)	r   r   tabir&   acbcr   r   r   r   1   s   0>zSDCascadeTimestepBlock.forwardr   r   r   r   r   r   (   s    r   c                       s(   e Zd Zd	 fdd	Zd
ddZ  ZS )SDCascadeResBlockr   r           c              
      sz   t    tj||||d |d| _t|ddd| _tt|| |d t	 t
|d t|t|d || _d S )Nr   )kernel_sizepaddinggroupsFư>elementwise_affineeps   )r   r   r!   Conv2d	depthwiser   norm
Sequentialr"   GELUGlobalResponseNormDropoutchannelwise)r   r&   c_skipr7   dropoutr   r   r   r   ;   s   


zSDCascadeResBlock.__init__Nc                 C   sX   |}|  | |}|d urtj||gdd}| |dddddddd}|| S )Nr   r)   r   r   r   )rA   r@   torchcatrF   r   )r   r   x_skipx_resr   r   r   r   G   s   "zSDCascadeResBlock.forward)r   r   r6   r   r   r   r   r   r   r5   :   s    r5   c                       s$   e Zd Z fddZdd Z  ZS )rD   c                    s>   t    ttddd|| _ttddd|| _d S )Nr   )r   r   r!   	ParameterrI   zerosgammabeta)r   r*   r   r   r   r   R   s   
zGlobalResponseNorm.__init__c                 C   s@   t j|dddd}||jdddd  }| j||  | j | S )Nr   )r   r   T)pr*   keepdim)r*   rR   r:   )rI   rA   meanrO   rP   )r   r   agg_normstand_div_normr   r   r   r   W   s   zGlobalResponseNorm.forwardr   r   r   r   r   rD   Q   s    rD   c                       s&   e Zd Zd fdd	Zdd Z  ZS )SDCascadeAttnBlockTr6   c                    sV   t    || _t|ddd| _t|||| |dd| _tt	 t
||| _d S )NFr:   r;   T)	query_dimheadsdim_headrH   bias)r   r   	self_attnr   rA   r	   	attentionr!   rB   SiLUr"   	kv_mapper)r   r&   c_condnheadr\   rH   r   r   r   r   ^   s
   
zSDCascadeAttnBlock.__init__c                 C   sb   |  |}| |}| jr&|j\}}}}tj|||ddd|gdd}|| j||d }|S )NrS   r   r   r)   )encoder_hidden_states)	r_   rA   r\   shaperI   rJ   view	transposer]   )r   r   kvnorm_x
batch_sizechannel_r   r   r   r   f   s   

$zSDCascadeAttnBlock.forward)Tr6   r   r   r   r   r   rW   ]   s    rW   c                       s&   e Zd Zd fdd	Zdd Z  ZS )UpDownBlock2dTc                    s|   t    |dvrt| d|r tj|dkrdnddddnt }tj||d	d
}t|dkr6||gn||g| _d S )N)updown not supportedrl   r   g      ?bilinearT)scale_factormodealign_cornersr   r7   )	r   r   
ValueErrorr!   UpsampleIdentityr?   
ModuleListblocks)r   in_channelsout_channelsrq   enabledinterpolationmappingr   r   r   r   q   s   
$zUpDownBlock2d.__init__c                 C   s   | j D ]}||}q|S r   )rx   )r   r   blockr   r   r   r   }   s   

zUpDownBlock2d.forward)Tr   r   r   r   r   rk   p   s    rk   c                   @   s   e Zd ZU dZejed< dS )StableCascadeUNetOutputNsample)r   r   r   r   rI   Tensor__annotations__r   r   r   r   r      s   
 r   c                +       sR  e Zd ZdZe											
	
												d6dedededededeedf deedf deedf deedf dee dB dee dB deee  dedB d edB d!edB d"edB d#eee B d$e	ee	 B d%eedf d&ee	 dB f( fd'd(Z
d)d* Zd7d,d-Zd8d.d/Zd0d1 Zd2d3 Z							d9d4d5Z  ZS ):StableCascadeUNetT   @   r      r   r       r         r   r   r   r   r5   r   rW   r   N   r>   r   皙?r   scacrpry   rz   timestep_ratio_embedding_dim
patch_sizeconditioning_dimblock_out_channels.num_attention_headsdown_num_layers_per_blockup_num_layers_per_blockdown_blocks_repeat_mappersup_blocks_repeat_mappersblock_types_per_layerclip_text_in_channelsclip_image_in_channelseffnet_in_channelspixel_mapper_in_channelsrH   r\   timestep_conditioning_typeswitch_levelc           #         s  t    t|t|krtdt| t|t|	kr'tdt| t|t|
kr8tdt| t|t|krItdt| t|t|krZtdt| t|trf|ft| }t|trr|ft| }|durttj	||d d d	d
t
 tj	|d d |d d	d
t|d ddd| _|durttj	||d d d	d
t
 tj	|d d |d d	d
t|d ddd| _t| | | _|durt| | _|durt| | | _tj ddd| _tt|tj	||d  |d d	d
t|d ddd| _d fdd	}t | _t | _t | _tt|D ]}|dkrp| jtt||d	  ddd|dur^t||d	  || d||d	  dntj	||d	  || ddd n| jt  t }t|| D ]!}|| D ]}|||| || || || d}|| qq| j| |
durt }t|
| d	 D ]}|tj	|| || d	d
 q| j| q1t | _t | _t | _ t!tt|D ]}|dkr*| jtt|| ddd|durt|| ||d	  d||d	  dntj"|| ||d	  ddd n| jt  t }t|	ddd | D ]C} t#|| D ]9\}!}|t|d	 k rh| |!  krbdkrhn n|| nd}"|||| || |"|| || d}|| qIqA| j| |durt }t|ddd | d	 D ]}|tj	|| || d	d
 q| j | qtt|d dddtj	|d ||d  d	d
t$|| _%d| _&dS )a  

        Parameters:
            in_channels (`int`, defaults to 16):
                Number of channels in the input sample.
            out_channels (`int`, defaults to 16):
                Number of channels in the output sample.
            timestep_ratio_embedding_dim (`int`, defaults to 64):
                Dimension of the projected time embedding.
            patch_size (`int`, defaults to 1):
                Patch size to use for pixel unshuffling layer
            conditioning_dim (`int`, defaults to 2048):
                Dimension of the image and text conditional embedding.
            block_out_channels (tuple[int], defaults to (2048, 2048)):
                tuple of output channels for each block.
            num_attention_heads (tuple[int], defaults to (32, 32)):
                Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have
                attention.
            down_num_layers_per_block (tuple[int], defaults to [8, 24]):
                Number of layers in each down block.
            up_num_layers_per_block (tuple[int], defaults to [24, 8]):
                Number of layers in each up block.
            down_blocks_repeat_mappers (tuple[int], optional, defaults to [1, 1]):
                Number of 1x1 Convolutional layers to repeat in each down block.
            up_blocks_repeat_mappers (tuple[int], optional, defaults to [1, 1]):
                Number of 1x1 Convolutional layers to repeat in each up block.
            block_types_per_layer (tuple[tuple[str]], optional,
                defaults to (
                    ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), ("SDCascadeResBlock",
                    "SDCascadeTimestepBlock", "SDCascadeAttnBlock")
                ): Block types used in each layer of the up/down blocks.
            clip_text_in_channels (`int`, *optional*, defaults to `None`):
                Number of input channels for CLIP based text conditioning.
            clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280):
                Number of input channels for pooled CLIP text embeddings.
            clip_image_in_channels (`int`, *optional*):
                Number of input channels for CLIP based image conditioning.
            clip_seq (`int`, *optional*, defaults to 4):
            effnet_in_channels (`int`, *optional*, defaults to `None`):
                Number of input channels for effnet conditioning.
            pixel_mapper_in_channels (`int`, defaults to `None`):
                Number of input channels for pixel mapper conditioning.
            kernel_size (`int`, *optional*, defaults to 3):
                Kernel size to use in the block convolutional layers.
            dropout (tuple[float], *optional*, defaults to (0.1, 0.1)):
                Dropout to use per block.
            self_attn (bool | tuple[bool]):
                tuple of booleans that determine whether to use self attention in a block or not.
            timestep_conditioning_type (tuple[str], defaults to ("sca", "crp")):
                Timestep conditioning type.
            switch_level (tuple[bool] | None, *optional*, defaults to `None`):
                tuple that indicates whether upsampling or downsampling should be applied in a block
        zaNumber of elements in `down_num_layers_per_block` must match the length of `block_out_channels`: z_Number of elements in `up_num_layers_per_block` must match the length of `block_out_channels`: zbNumber of elements in `down_blocks_repeat_mappers` must match the length of `block_out_channels`: z`Number of elements in `up_blocks_repeat_mappers` must match the length of `block_out_channels`: z]Number of elements in `block_types_per_layer` must match the length of `block_out_channels`: Nr   r>   r   rs   Fr:   r;   r   Tc                    sX   | dkrt |||dS | dkrt| |||dS | dkr$t|dS td|  d)	Nr5   )r7   rH   rW   )r\   rH   r   )r$   zBlock type rn   )r5   rW   r   rt   )
block_typery   ra   rG   rH   r\   r   r7   r   r   r   r   	get_block  s   z-StableCascadeUNet.__init__.<locals>.get_blockrm   )rq   r{   )r7   stride)rH   r\   rl   rS   )rG   rH   r\   )r   r   T)'r   r   r,   rt   
isinstancefloatboolr!   rB   r?   rC   r   effnet_mapperpixels_mapperr"   clip_txt_pooled_mapperclip_txt_mapperclip_img_mapper	LayerNorm	clip_normPixelUnshuffle	embeddingrw   down_blocksdown_downscalersdown_repeat_mappersrangeappendrk   rv   	up_blocksup_upscalersup_repeat_mappersreversedConvTranspose2dr-   PixelShuffleclfgradient_checkpointing)#r   ry   rz   r   r   r   r   r   r   r   r   r   r   r   clip_text_pooled_in_channelsr   clip_seqr   r   r7   rH   r\   r   r   r   r2   
down_blockrj   r   r~   block_repeat_mappersup_blockjkrG   r   r   r   r      s  
W






	
"




:
"
zStableCascadeUNet.__init__c              
   C   s  t |tjtjfrtjj|j |jd urtj	|jd tjj
| jjdd t| dr7tjj
| jjddnd  t| drHtjj
| jjddnd  t| drgtjj
| jd jdd tjj
| jd jdd t| drtjj
| jd jdd tjj
| jd jdd tjj| jd	 jd tj	| jd	 jd | j| j D ]1}|D ],}t |tr|jd
 j jtd	t| jjd  9  _qt |trtj	|jjd qqd S )Nr   g{Gz?)stdr   r   r   r   r   r   rS   )r   r!   r?   r"   rI   initxavier_uniform_weightr[   	constant_normal_r   hasattrr   r   r   r   r   r   r   r   r5   rF   datanpsqrtsumconfigrx   r   r#   )r   mlevel_blockr~   r   r   r   _init_weights  s4   
  


.
zStableCascadeUNet._init_weights'  c                 C   s   || }| j jd }t||d  }tj||jd | 	 }|d d d f |d d d f  }tj
| | gdd}| j jd dkrQtjj|ddd}|j|jdS )	Nr   r   )devicer)   )r   r   constant)rq   )dtype)r   r   mathlogrI   aranger   r   mulexprJ   sincosr!   
functionalpadtor   )r   timestep_ratiomax_positionsrhalf_dimembr   r   r   get_timestep_ratio_embedding  s     z.StableCascadeUNet.get_timestep_ratio_embeddingc                 C   s   t |jdkr|d}| ||d|d| jj d}|d urX|d urX| |}t |jdkr9|d}| 	||d|d| jj d}t
j|||gdd}n|}| |S )Nr   r   r   rS   r)   )r,   rc   	unsqueezer   rd   sizer   r   r   r   rI   rJ   r   )r   clip_txt_pooledclip_txtclip_imgclip_txt_poolclipr   r   r   get_clip_embeddings  s   





z%StableCascadeUNet.get_clip_embeddingsc                 C   s  g }t | j| j| j}t rm| jrm|D ]V\}}}||}tt|d D ]>}	|D ]-}
t	|
t
r7| |
|}q)t	|
trD| |
||}q)t	|
trQ| |
||}q)| |
}q)|	t|k rc||	 |}q%|d| q|S |D ]O\}}}||}tt|d D ]7}	|D ]&}
t	|
t
r|
|}qt	|
tr|
||}qt	|
tr|
||}q|
|}q|	t|k r||	 |}q|d| qo|S )Nr   r   )zipr   r   r   rI   is_grad_enabledr   r   r,   r   r5   _gradient_checkpointing_funcrW   r   insert)r   r   r_embedr   level_outputsblock_groupr   
downscalerrepmapr2   r~   r   r   r   _down_encode  sF   







zStableCascadeUNet._down_encodec              	   C   s  |d }t | j| j| j}t r| jrt|D ]\}\}}}	tt	|	d D ]}
t|D ]q\}}t
|tr~|dkrB|dkrB|| nd }|d urv|d|dks\|d|dkrv|j}tjjj| |jdd  ddd}||}| |||}q-t
|tr| |||}q-t
|tr| |||}q-| ||}q-|
t	|	k r|	|
 |}q'||}q|S t|D ]\}\}}}	tt	|	d D ]}
t|D ]n\}}t
|tr|dkr|dkr|| nd }|d ur|d|dks|d|dkr|j}tjjj| |jdd  ddd}||}|||}qt
|tr*|||}qt
|tr6|||}q||}q|
t	|	k rH|	|
 |}q||}q|S )Nr   r   rS   ro   T)rq   rr   )r   r   r   r   rI   r   r   r-   r   r,   r   r5   r   r   r!   r   interpolater   rc   r   r   rW   r   )r   r   r   r   r   r   r2   r   upscalerr   r   r   r~   skip	orig_typer   r   r   
_up_decode  s^   
0



4


zStableCascadeUNet._up_decodec              	   C   s<  |d u r| |dddd}| |}| jjD ]%}|dkr!|}n	|dkr(|	}nd }|p0t|}tj|| |gdd}q| j|||d}| 	|}t
| d	rh|d urh|| tjj||jd
d  ddd }t
| dr|tjj| ||jd
d  ddd }| |||}| |||}| |}|
s|fS t|dS )Nr   r   r   r   r   r   r)   )r   r   r   r   r   ro   T)r   rq   rr   r   )r   )	new_zerosr   r   r   r   rI   
zeros_likerJ   r   r   r   r   r!   r   r  rc   r   r   r  r   r   )r   r   r   clip_text_pooled	clip_textr   effnetpixelsr   r   return_dicttimestep_ratio_embedr&   condt_condr   r   r   r   r   r   r     s6   




zStableCascadeUNet.forward)r   r   r   r   r   r   r   r   r   r   r   r   Nr   Nr>   NNr   r   Tr   N)r   )NN)NNNNNNT)r   r   r    _supports_gradient_checkpointingr   inttuplestrr   r   r   r   r   r   r   r  r   r   r   r   r   r   r      s    


	








 z

&7r   )r   dataclassesr   numpyr   rI   torch.nnr!   configuration_utilsr   r   loadersr   utilsr   attention_processorr	   modeling_utilsr
   r   r   Moduler   r5   rD   rW   rk   r   r   r   r   r   r   <module>   s&   
