o
    piQS                    @   s  d dl mZ d dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ d
dlmZmZmZmZmZmZmZ d
dl m!Z! d
dl"m#Z#m$Z$ d
dl%m&Z& d
dl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. d
dl/m0Z0 e1e2Z3eG dd deZ4G dd dej5Z6G dd dej5Z7G dd dej5Z8			
	
				dDde9de9de9d e9d!e9d"ee9 d#ee
e9e	e9 f  d$ee9 d%ee9 d&e:d'ee: d(ee: fd)d*Z;			
	
			dEd+e9d,e9d!ee9 d"ee9 d#e9d$ee9 d%ee9 d'e:d(e:fd-d.Z<d/e9d0e9d1ee9 fd2d3Z=G d4d5 d5e&eZ>G d6d7 d7e&eZ?G d8d9 d9ej5Z@G d:d; d;ej5ZAG d<d= d=ej5ZBdFd>d?ZCd@dA ZDdBdC ZEdS )G    )	dataclass)gcd)AnyDictListOptionalTupleUnionN)Tensornn   )ConfigMixinregister_to_config)
BaseOutputis_torch_versionlogging)apply_freeu   )ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttentionProcessorAttnAddedKVProcessorAttnProcessorFusedAttnProcessor2_0)ControlNetConditioningEmbedding)TimestepEmbedding	Timesteps)
ModelMixin)CrossAttnDownBlock2DCrossAttnUpBlock2DDownsample2DResnetBlock2DTransformer2DModelUNetMidBlock2DCrossAttn
Upsample2D)UNet2DConditionModelc                   @   s   e Zd ZU dZdZeed< dS )ControlNetXSOutputa=  
    The output of [`UNetControlNetXSModel`].

    Args:
        sample (`Tensor` of shape `(batch_size, num_channels, height, width)`):
            The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base
            model output, but is already the final output.
    Nsample)__name__
__module____qualname____doc__r(   r
   __annotations__ r.   r.   \/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/controlnet_xs.pyr'   4   s   
 	r'   c                       sN   e Zd ZdZ		d
dejdejdejdeej deej f
 fdd	Z  Z	S )DownBlockControlNetXSAdapterz}Components that together with corresponding components from the base model will form a
    `ControlNetXSCrossAttnDownBlock2D`Nresnetsbase_to_ctrlctrl_to_base
attentionsdownsamplerc                    s,   t    || _|| _|| _|| _|| _d S N)super__init__r1   r2   r3   r4   downsamplers)selfr1   r2   r3   r4   r5   	__class__r.   r/   r8   F   s   

z%DownBlockControlNetXSAdapter.__init__)NN)
r)   r*   r+   r,   r   
ModuleListr   Conv2dr8   __classcell__r.   r.   r;   r/   r0   B   s    r0   c                       s2   e Zd ZdZdedejdejf fddZ  ZS )MidBlockControlNetXSAdapterz|Components that together with corresponding components from the base model will form a
    `ControlNetXSCrossAttnMidBlock2D`midblockr2   r3   c                    s    t    || _|| _|| _d S r6   )r7   r8   rA   r2   r3   )r:   rA   r2   r3   r;   r.   r/   r8   Z   s   

z$MidBlockControlNetXSAdapter.__init__)	r)   r*   r+   r,   r$   r   r=   r8   r?   r.   r.   r;   r/   r@   V   s    &r@   c                       s(   e Zd ZdZdejf fddZ  ZS )UpBlockControlNetXSAdapterzwComponents that together with corresponding components from the base model will form a `ControlNetXSCrossAttnUpBlock2D`r3   c                    s   t    || _d S r6   )r7   r8   r3   )r:   r3   r;   r.   r/   r8   d   s   

z#UpBlockControlNetXSAdapter.__init__)r)   r*   r+   r,   r   r=   r8   r?   r.   r.   r;   r/   rB   a   s    rB       T   Fbase_in_channelsbase_out_channelsctrl_in_channelsctrl_out_channelstemb_channelsmax_norm_num_groupstransformer_layers_per_blocknum_attention_headscross_attention_dimadd_downsampleupcast_attentionuse_linear_projectionc                 C   sZ  d}g }g }g }g }t |tr|g| }t|D ]T}|dkr | n|} |dkr(|n|}|t| |  |t||  ||t||  |dt||ddd |rd|t||| ||| |	||t||dd |t|| q|
r|t|| t|| d|dd	}|t|| nd }t	t
|t
|t
|d
}|rt
||_|d ur||_|S )Nr   r   
max_factorh㈵>in_channelsout_channelsrI   groups
groups_outepsrU   
num_layersrM   rP   rO   norm_num_groupsTopuse_convrV   name)r1   r2   r3   )
isinstanceintrangeappendmake_zero_convr"   find_largest_factorr#   r!   r0   r   r=   r4   r9   )rE   rF   rG   rH   rI   rJ   has_crossattnrK   rL   rM   rN   rO   rP   r[   r1   r4   r3   r2   ir9   down_block_componentsr.   r.   r/   get_down_block_adapteri   sh   



rj   base_channelsctrl_channelsc	                 C   sP   t | | }	t|||  ||tt|||  |||||d	}
t || }t|	|
|dS )N	rK   rU   rV   rI   resnet_groupsrM   rL   rP   rO   )r2   rA   r3   )re   r$   rf   r   r@   )rk   rl   rI   rJ   rK   rL   rM   rO   rP   r2   rA   r3   r.   r.   r/   get_mid_block_adapter   s   

ro   rV   prev_output_channelctrl_skip_channelsc                 C   sJ   g }d}t |D ]}|dkr|n| }|t|| | qtt|dS )N   r   )r3   )rc   rd   re   rB   r   r=   )rV   rp   rq   r3   r[   rh   resnet_in_channelsr.   r.   r/   get_up_block_adapter   s   rt   c                        s  e Zd ZdZe										
						d(dededee dede	de
eee f dee dee dedee dee de
eee f de	dede	f fdd Ze	!	!	!					d)d"ed#ee deee  deee  de	dedededee fd$d%Zd&d' Z  ZS )*ControlNetXSAdaptera  
    A `ControlNetXSAdapter` model. To use it, pass it into a `UNetControlNetXSModel` (together with a
    `UNet2DConditionModel` base model).

    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
    methods implemented for all models (such as downloading or saving).

    Like `UNetControlNetXSModel`, `ControlNetXSAdapter` is compatible with StableDiffusion and StableDiffusion-XL. It's
    default parameters are compatible with StableDiffusion.

    Parameters:
        conditioning_channels (`int`, defaults to 3):
            Number of channels of conditioning input (e.g. an image)
        conditioning_channel_order (`str`, defaults to `"rgb"`):
            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
        conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
            The tuple of output channels for each block in the `controlnet_cond_embedding` layer.
        time_embedding_mix (`float`, defaults to 1.0):
            If 0, then only the control adapters's time embedding is used. If 1, then only the base unet's time
            embedding is used. Otherwise, both are combined.
        learn_time_embedding (`bool`, defaults to `False`):
            Whether a time embedding should be learned. If yes, `UNetControlNetXSModel` will combine the time
            embeddings of the base model and the control adapter. If no, `UNetControlNetXSModel` will use the base
            model's time embedding.
        num_attention_heads (`list[int]`, defaults to `[4]`):
            The number of attention heads.
        block_out_channels (`list[int]`, defaults to `[4, 8, 16, 16]`):
            The tuple of output channels for each block.
        base_block_out_channels (`list[int]`, defaults to `[320, 640, 1280, 1280]`):
            The tuple of output channels for each block in the base unet.
        cross_attention_dim (`int`, defaults to 1024):
            The dimension of the cross attention features.
        down_block_types (`list[str]`, defaults to `["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"]`):
            The tuple of downsample blocks to use.
        sample_size (`int`, defaults to 96):
            Height and width of input/output sample.
        transformer_layers_per_block (`Union[int, Tuple[int]]`, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
        upcast_attention (`bool`, defaults to `True`):
            Whether the attention computation should always be upcasted.
        max_norm_num_groups (`int`, defaults to 32):
            Maximum number of groups in group normal. The actual number will be the largest divisor of the respective
            channels, that is <= max_norm_num_groups.
    rr   rgb   rC   `            ?F   r|      rx   rx   i@  i     r   rD   r   r   r   DownBlock2Dry   r   TrC   conditioning_channelsconditioning_channel_order#conditioning_embedding_out_channelstime_embedding_mixlearn_time_embeddingrL   block_out_channelsbase_block_out_channelsrM   down_block_typessample_sizerK   rO   rJ   rP   c                    s  t    |d }|d d }|dvrtd| t|t|
kr-td| d|
 dt|ttfs;|gt|
 }t|	ttfsI|	gt|
 }	t|ttfsW|gt|
 }t|t|
krjtd| d|
 dt|d ||d	| _|r}t	||| _
nd | _
tg | _tg | _tjd|d d
dd| _t|d |d | _|d }|d }t|
D ]8\}}|}|| }|}|| }d|v }|t|
d k}| jt||||||||| || |	| | ||d qt|d |d ||d |d |	d ||d| _|d g t|D ]\}}|t|d k rd
nd} |g|  qtt|}|d }tt|
D ]}|}|| } fddtd
D }| jt|||d q2d S )Nr   r|   )rv   bgrz&unknown `conditioning_channel_order`: zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: .zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: conditioning_embedding_channelsr   r   rr   r   kernel_sizepadding	CrossAttn)rE   rF   rG   rH   rI   rJ   rg   rK   rL   rM   rN   rO   rP   )rk   rl   rI   rK   rL   rM   rO   rP   r   c                       g | ]}   qS r.   pop.0_rq   r.   r/   
<listcomp>      z0ControlNetXSAdapter.__init__.<locals>.<listcomp>)rV   rp   rq   )r7   r8   
ValueErrorlenra   listtupler   controlnet_cond_embeddingr   time_embeddingr   r=   down_blocksup_connectionsr>   conv_inre   control_to_base_for_conv_in	enumeraterd   rj   ro   	mid_blockextendreversedrc   rt   )r:   r   r   r   r   r   rL   r   r   rM   r   r   rK   rO   rJ   rP   time_embedding_input_dimtime_embedding_dimrF   rH   rh   down_block_typerE   rG   rg   is_final_blockrV   number_of_subblocks reversed_base_block_out_channelsprev_base_output_channelctrl_skip_channels_r;   r   r/   r8   $  s   

zControlNetXSAdapter.__init__Nunet
size_ratioc
                    s   |du}
 du}|
|A st d|p fdd|jjD }|du r%|jj}| |||	|||||jj|jj|jj|jj|jj|jj|jj	|jj
d}||j |S )a8  
        Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].

        Parameters:
            unet (`UNet2DConditionModel`):
                The UNet model we want to control. The dimensions of the ControlNetXSAdapter will be adapted to it.
            size_ratio (float, *optional*, defaults to `None`):
                When given, block_out_channels is set to a fraction of the base model's block_out_channels. Either this
                or `block_out_channels` must be given.
            block_out_channels (`List[int]`, *optional*, defaults to `None`):
                Down blocks output channels in control model. Either this or `size_ratio` must be given.
            num_attention_heads (`List[int]`, *optional*, defaults to `None`):
                The dimension of the attention heads. The naming seems a bit confusing and it is, see
                https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
            learn_time_embedding (`bool`, defaults to `False`):
                Whether the `ControlNetXSAdapter` should learn a time embedding.
            time_embedding_mix (`float`, defaults to 1.0):
                If 0, then only the control adapter's time embedding is used. If 1, then only the base unet's time
                embedding is used. Otherwise, both are combined.
            conditioning_channels (`int`, defaults to 3):
                Number of channels of conditioning input (e.g. an image)
            conditioning_channel_order (`str`, defaults to `"rgb"`):
                The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
            conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`):
                The tuple of output channel for each block in the `controlnet_cond_embedding` layer.
        NzePass exactly one of `block_out_channels` (for absolute sizing) or `size_ratio` (for relative sizing).c                    s   g | ]}t |  qS r.   )rb   )r   br   r.   r/   r     s    z1ControlNetXSAdapter.from_unet.<locals>.<listcomp>)r   r   r   r   r   rL   r   r   rM   r   r   rK   rO   rJ   rP   )r   configr   attention_head_dimrM   r   r   rK   rO   r\   rP   todtype)clsr   r   r   rL   r   r   r   r   r   
fixed_sizerelative_sizemodelr.   r   r/   	from_unet  s8   )zControlNetXSAdapter.from_unetc                 O   s   t d)NzA ControlNetXSAdapter cannot be run by itself. Use it together with a UNet2DConditionModel to instantiate a UNetControlNetXSModel.)r   )r:   argskwargsr.   r.   r/   forward  s   zControlNetXSAdapter.forward)rr   rv   rw   r{   Fr|   r}   r   rD   r   ry   r   TrC   T)NNNFr{   rr   rv   rw   )r)   r*   r+   r,   r   rb   strr   floatboolr	   r   r8   classmethodr&   r   r   r   r?   r.   r.   r;   r/   ru      s    .	
 	

	
Lru   c                /       sz  e Zd ZdZdZe									
														dWdee dee	 dee	 dee dee de
eee f de
eee f de
eee f dee	 dee dededee d ee d!ed"ed#ee d$e	d%ed&ee d'e
eee f d(ef, fd)d*Ze					dXd+ed,ee d-ee d&eee  d!ee d.ee fd/d0ZdYd2d3ZdZd4d5Zed1ee	ef fd6d7Zd8e
eee	ef f fd9d:Zd;d< Zd=ed>ed?ed@efdAdBZdCdD ZdEdF ZdGdH Z									d[dIedJe
e jeef dKe jdLee j dMee dNee j dOee j dPee j dQeee	e!f  dReee	e jf  dSedTed1e
e"ef fdUdVZ#  Z$S )\UNetControlNetXSModela9  
    A UNet fused with a ControlNet-XS adapter model

    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
    methods implemented for all models (such as downloading or saving).

    `UNetControlNetXSModel` is compatible with StableDiffusion and StableDiffusion-XL. It's default parameters are
    compatible with StableDiffusion.

    It's parameters are either passed to the underlying `UNet2DConditionModel` or used exactly like in
    `ControlNetXSAdapter` . See their documentation for details.
    Try   r   	UpBlock2Dr    r    r    r   rC   rD   r   r~   Nr{   rr   rw   rv   Fr}   r|   r   r   up_block_typesr   r\   rM   rK   rL   addition_embed_typeaddition_time_embed_dimrO   rP   time_cond_proj_dim%projection_class_embeddings_input_dimr   ctrl_conditioning_channels(ctrl_conditioning_embedding_out_channelsctrl_conditioning_channel_orderctrl_learn_time_embeddingctrl_block_out_channelsctrl_num_attention_headsctrl_max_norm_num_groupsc           .         s  t    |dk s|dkrtd|dk r|std|	d ur'|	dkr'tdt|ttfs5|gt| }t|ttfsC|gt| }t|ttfsQ|gt| }t|ttfs_|gt| }|}d| _tj	d|d ddd	| _
t|d ||d
| _tj	d|d ddd	| _t|d |d | _|d }|d d }t|d ddd| _t|||d| _|rt||d| _nd | _|	d u rd | _d | _nt|
ddd| _t||| _g }|d }|d }t|D ];\}}|}|| }|} || }d|v }!|t|d k}"|t||| |||||!|| || || || |" ||d qt|d |d ||||d |d |d |d ||d| _g }#tt|}$tt|}%tt|}&|d g t|D ]\}}'|t|d k rddnd}( |'g|(  qUtt|})|)d }'t|D ]K\}}*|'}+|)| }'|)t|d t|d  }, fddtdD }-d|*v }!|t|d k}"|#t |,|'|+|-|||!|$| |%| |&| |" |||d q~t!|| _"t!|#| _#tj$|d |d| _%t& | _'tj	|d dddd	| _(d S )Nr   r   z1`time_embedding_mix` needs to be between 0 and 1.zKTo use `time_embedding_mix` < 1, `ctrl_learn_time_embedding` must be `True`	text_timezAs `UNetControlNetXSModel` currently only supports StableDiffusion and StableDiffusion-XL, `addition_embed_type` must be `None` or `'text_time'`.r|   rr   r   r   T)flip_sin_to_cosdownscale_freq_shift)cond_proj_dim)rU   time_embed_dimr   rE   rF   rG   rH   rI   r\   r   rg   rK   base_num_attention_headsr   rM   rN   rO   rP   r   rk   rl   rI   r\   r   rK   r   r   rM   rO   rP   r   c                    r   r.   r   r   r   r.   r/   r     r   z2UNetControlNetXSModel.__init__.<locals>.<listcomp>)rU   rV   rp   rq   rI   resolution_idxrg   rK   rL   rM   add_upsamplerO   r\   rP   )num_channels
num_groups))r7   r8   r   ra   r   r   r   rU   r   r>   base_conv_inr   r   ctrl_conv_inre   r   r   base_time_projr   base_time_embeddingctrl_time_embeddingbase_add_time_projbase_add_embeddingr   rd    ControlNetXSCrossAttnDownBlock2DControlNetXSCrossAttnMidBlock2Dr   r   r   minrc   ControlNetXSCrossAttnUpBlock2Dr=   r   	up_blocks	GroupNormbase_conv_norm_outSiLUbase_conv_actbase_conv_out).r:   r   r   r   r   r\   rM   rK   rL   r   r   rO   rP   r   r   r   r   r   r   r   r   r   r   r   time_embed_input_dimr   r   rF   rH   rh   r   rE   rG   rg   r   r    rev_transformer_layers_per_blockrev_num_attention_headsrev_cross_attention_dimrV   r   reversed_block_out_channelsup_block_typerp   rU   r   r;   r   r/   r8     s   
!


zUNetControlNetXSModel.__init__r   
controlnetr   ctrl_optional_kwargsc                    s  |du rt j|||fi |}ntdd ||||fD r!tdg dfdd|j D |jjd< g d	  fd
d|j D  |jj d< | i  }g d}|D ]}	t	|d|	 
t	||	  q\ddg}
|
D ]}	t||	rt	||	durt	|d|	 
t	||	  qt|j
|j  |j
|j  |jdur|j
|j  |j
|j  tdd t|j|jD |_t|j|j|_tdd t|j|jD |_||j |S )a  
        Instantiate a [`UNetControlNetXSModel`] from a [`UNet2DConditionModel`] and an optional [`ControlNetXSAdapter`]
        .

        Parameters:
            unet (`UNet2DConditionModel`):
                The UNet model we want to control.
            controlnet (`ControlNetXSAdapter`):
                The ConntrolNet-XS adapter with which the UNet will be fused. If none is given, a new ConntrolNet-XS
                adapter will be created.
            size_ratio (float, *optional*, defaults to `None`):
                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
            ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
                where this parameter is called `block_out_channels`.
            time_embedding_mix (`float`, *optional*, defaults to None):
                Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
            ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
                Passed to the `init` of the new controlent if no controlent was given.
        Nc                 s   s    | ]}|d uV  qd S r6   r.   )r   or.   r.   r/   	<genexpr>  s    
z2UNetControlNetXSModel.from_unet.<locals>.<genexpr>zWhen a controlnet is passed, none of these parameters should be passed: size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs.)r   r   r   r   r\   rM   rK   r   r   rO   rP   r   r   c                    s   i | ]\}}| v r||qS r.   r.   r   kv)params_for_unetr.   r/   
<dictcomp>	  s    z3UNetControlNetXSModel.from_unet.<locals>.<dictcomp>rL   )r   r   r   r   r   rL   rJ   c                    s"   i | ]\}}| v rd | |qS )ctrl_r.   r   )params_for_controlnetr.   r/   r    s   " r   )r   r   conv_norm_outconv_outbase_add_time_projadd_embeddingc                 s        | ]\}}t ||V  qd S r6   )r   from_modulesr   r   cr.   r.   r/   r   7  
    

c                 s   r
  r6   )r   r  r  r.   r.   r/   r   <  r  )ru   r   anyr   r   itemsr   r   from_configgetattrload_state_dict
state_dicthasattrr   r   r   r   r   r   r   r=   zipr   r   r  r   r   r   r   r   )r   r   r   r   r   r   r   r   modules_from_unetmoptional_modules_from_unetr.   )r  r  r/   r     sV   
	 




zUNetControlNetXSModel.from_unetreturnc                    s      D ]}d|_qg d} fdd|D }|D ]}|  D ]}d|_qq jD ]}|  q) j   jD ]}|  q8dS )Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
        tuning.T)r   r   r   r   r   r   r   r   c                    s$   g | ]}t  |d urt  |qS r6   )r  )r   partr:   r.   r/   r   X  s   $ z<UNetControlNetXSModel.freeze_unet_params.<locals>.<listcomp>FN)
parametersrequires_gradr   freeze_base_paramsr   r   )r:   param
base_partsr  dur.   r  r/   freeze_unet_paramsF  s   





z(UNetControlNetXSModel.freeze_unet_paramsc                 C   s   t |dr
||_d S d S )Ngradient_checkpointing)r  r&  )r:   modulevaluer.   r.   r/   _set_gradient_checkpointingc  s   

z1UNetControlNetXSModel._set_gradient_checkpointingc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        r`   r'  
processorsc                    sH   t |dr| ||  d< | D ]\}} |  d| || q|S )Nget_processor
.processorr   )r  r+  named_children)r`   r'  r*  sub_namechildfn_recursive_add_processorsr.   r/   r1  r  s
   
zJUNetControlNetXSModel.attn_processors.<locals>.fn_recursive_add_processors)r   torchr   Moduler   r   r-  )r:   r*  r`   r'  r.   r0  r/   attn_processorsg  s
   	&	z%UNetControlNetXSModel.attn_processors	processorc                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r`   r'  c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorr,  r   )r  ra   dictr6  r   r-  )r`   r'  r5  r.  r/  fn_recursive_attn_processorr.   r/   r9    s   

zMUNetControlNetXSModel.set_attn_processor.<locals>.fn_recursive_attn_processorN)r   r4  keysra   r7  r   r   r2  r   r3  r-  )r:   r5  countr`   r'  r.   r8  r/   set_attn_processor  s   
z(UNetControlNetXSModel.set_attn_processorc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S r6   )r<   r   r   procr.   r.   r/   r         zCUNetControlNetXSModel.set_default_attn_processor.<locals>.<genexpr>c                 s   r=  r6   )r<   r   r>  r.   r.   r/   r     r@  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr4  valuesr   r   r   nextiterr<  )r:   r5  r.   r.   r/   set_default_attn_processor  s   z0UNetControlNetXSModel.set_default_attn_processors1s2b1b2c                 C   sH   t | jD ]\}}t|d| t|d| t|d| t|d| qdS )a>  Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        rF  rG  rH  rI  N)r   r   setattr)r:   rF  rG  rH  rI  rh   upsample_blockr.   r.   r/   enable_freeu  s   z"UNetControlNetXSModel.enable_freeuc                 C   sP   h d}t | jD ]\}}|D ]}t||st||ddur$t||d qq	dS )zDisables the FreeU mechanism.>   rH  rI  rF  rG  N)r   r   r  r  rJ  )r:   
freeu_keysrh   rK  r   r.   r.   r/   disable_freeu  s   z#UNetControlNetXSModel.disable_freeuc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr4  r  r   r<   r)   r   modulesra   r   fuse_projectionsr<  r   )r:   r   attn_processorr'  r.   r.   r/   fuse_qkv_projections  s   
z*UNetControlNetXSModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)rQ  r<  r  r.   r.   r/   unfuse_qkv_projections  s   

z,UNetControlNetXSModel.unfuse_qkv_projectionsr(   timestepencoder_hidden_statescontrolnet_condconditioning_scaleclass_labelstimestep_condattention_maskcross_attention_kwargsadded_cond_kwargsreturn_dictapply_controlc           '      C   s|  | j jdkrtj|dgd}|dur!d||j d }|d}|}t|sO|jj	dk}t
|tr<|r8tjntj}n|rAtjntj}tj|g||jd}nt|jdkr^|d |j}||jd }| |}|j|jd	}| j jr|r| ||}| ||}| j jd
 }|| |d|   }n| |}d}| j jdu rnX| j jdkrd|
vrt| j d|
d}d|
vrt| j d|
d}| | }||jd df}tj ||gdd}||j}| !|}n
td| j j d|dur|| n|}|}| }}g g }}| "|}| #|}| $|}|dur)||7 }|r5|| %||  }|&| |&| | j'D ]} | ||||||	||d\}}}!}"|(|! |(|" qB| j)||||||	||d\}}| j*D ]1}#t|#j+}$||$ d }%||$ d }&|d|$  }|d|$  }|#||%|&||||	||d	}qr| ,|}| -|}| .|}|s|fS t/|dS )ay	  
        The [`ControlNetXSModel`] forward method.

        Args:
            sample (`Tensor`):
                The noisy input tensor.
            timestep (`Union[torch.Tensor, float, int]`):
                The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states.
            controlnet_cond (`Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                How much the control model affects the base model outputs.
            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
                embeddings.
            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
            added_cond_kwargs (`dict`):
                Additional conditions for the Stable Diffusion XL UNet.
            return_dict (`bool`, defaults to `True`):
                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
            apply_control (`bool`, defaults to `True`):
                If `False`, the input is run only through the base model.

        Returns:
            [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
                If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.
        r   r   )dimsNg     mps)r   devicer   )r   g333333?r   text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   dimzgControlNet-XS currently only supports StableDiffusion and StableDiffusion-XL, so addition_embed_type = z is currently not supported.)hidden_states_basehidden_states_ctrltembrX  rZ  r^  r]  ra  )	hidden_statesres_hidden_states_tuple_baseres_hidden_states_tuple_ctrlrk  rX  rZ  r^  r]  ra  )r(   )0r   r   r2  flipr   r   	unsqueeze	is_tensorrd  typera   r   float32float64int32int64tensorr   shapeexpandr   r   r   r   r   r   r   r<   getr   flattenreshapeconcatr   r   r   r   r   rd   r   r   r   r   r1   r   r   r   r'   )'r:   r(   rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  	timestepsis_mpsr   t_emb	ctrl_temb	base_tembinterpolation_paramrk  aug_embre  rf  time_embeds
add_embedscembh_ctrlh_basehs_basehs_ctrlguided_hintdownresidual_hbresidual_hcup	n_resnetsskips_hbskips_hcr.   r.   r/   r     s   7


























zUNetControlNetXSModel.forward)ry   r   r   r   rC   rD   r   r~   NNTTNNr{   rr   rw   rv   Fr}   r|   rC   )NNNNNr  N)F)	Nr{   NNNNNTT)%r)   r*   r+   r,    _supports_gradient_checkpointingr   r   rb   r   r   r	   r   r   r8   r   r&   ru   r   r   r   r%  r)  propertyr   r4  r<  rE  rL  rN  rU  rV  r
   r2  r   r'   r   r?   r.   r.   r;   r/   r      s   
 ?

v
#		

r   c                       s.  e Zd Z										d+dededed	ed
edededeeeee f  dee dee dee dedee dee f fddZe	de
defddZd,ddZ							d-ded ed!ee d"ee d#ee d$ee d%eeeef  d&ee d'edeeeeed(f eed(f f fd)d*Z  ZS ).r   rC   Tr   rD   FrE   rF   rG   rH   rI   r\   r   rK   r   r   rM   rN   rO   rP   c                    s  t    g }g }g }g }g }g }d}t|	tr|	g| }	t|D ]r}|dkr)|n|}|dkr1|n|}|t|| |t||||d |t|| ||t|| |dt||ddd |r|t	|
||
 ||	| ||||d |t	||| ||	| |||t||dd |t|| q!|r|t|| t
|d|d	d
| _t
|| d|d	d
| _|t|| nd | _d | _t|| _t|| _|rt|nd g| | _|rt|nd g| | _t|| _t|| _d| _d S )Nr   r   rU   rV   rI   rW   rQ   rS   rT   rZ   Tr]   r^   F)r7   r8   ra   rb   rc   rd   re   r"   rf   r#   r!   base_downsamplersctrl_downsamplersr   r=   base_resnetsctrl_resnetsbase_attentionsctrl_attentionsr2   r3   r&  )r:   rE   rF   rG   rH   rI   r\   r   rg   rK   r   r   rM   rN   rO   rP   r  r  r  r  r3   r2   r[   rh   r;   r.   r/   r8     s   





z)ControlNetXSCrossAttnDownBlock2D.__init__base_downblockctrl_downblockc                 C   s  dd }|j d j}|j d j}|j d j| }|j d j}|j d jj}|j d jj}	|j d jj}
t|dr]d}t|j	d j
}||j}||j}||j}||j}|j	d j}nd}d }d }d }d }d }d }|jd u}| ||||||	|
||||||||d}|j|j   |j|j   |r|j|j	  |j|j	  |r|j|jd   |j|j  |j|j  |j|j  |S )Nc                 S      | j d jd jS Nr   r4   transformer_blocksattn2blockr.   r.   r/   get_first_cross_attentionB     zPControlNetXSCrossAttnDownBlock2D.from_modules.<locals>.get_first_cross_attentionr   r4   TFr   )r1   rU   rV   time_emb_projin_featuresnorm1r   r  r   r4   r  headsrM   rO   rP   r9   r  r  r  r  r  r  r  r  r2   r3   )r   r  r  r  rE   rF   rG   rH   rI   r   ctrl_num_groupsrg   rK   r   r   rM   rO   rP   rN   r   r.   r.   r/   r  ?  sj   





z-ControlNetXSCrossAttnDownBlock2D.from_modulesr  Nc                 C   n   |   D ]}d|_q| jg}t| jtjr|| j | jdur&|| j |D ]}|  D ]}d|_q.q(dS r  TNF)	r  r  r  ra   r  r   r=   rd   r  r:   r!  r"  r  r.   r.   r/   r        
z3ControlNetXSCrossAttnDownBlock2D.freeze_base_paramsr{   ri  rk  rX  rj  rZ  r]  r^  encoder_attention_maskra  .c
              	   C   s.  |d ur| dd d urtd |}
|}d}d}tt| j| j}tt| j| j}ddd}t||| j	| j
D ]\\}}\}}}}|	rPtj|||
gdd}| jrq| jrqtdd	r_d
dini }tjjj|||
|fi |}
n||
|}
|d ur||
||||ddd }
|	r| jr| jrtdd	rd
dini }tjjj||||fi |}n|||}|d ur||||||ddd }|	r|
|||  }
||
f }||f }q9| jd ur| j	d }| j
d }|	rtj|||
gdd}| |
}
|	r| |}|	r|
|||  }
||
f }||f }|
|||fS )NscaleSPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r.   c                        fdd}|S )Nc                        d ur | diS  |  S Nr`  r.   inputsr'  r`  r.   r/   custom_forward     z_ControlNetXSCrossAttnDownBlock2D.forward.<locals>.create_custom_forward.<locals>.custom_forwardr.   r'  r`  r  r.   r  r/   create_custom_forward     zGControlNetXSCrossAttnDownBlock2D.forward.<locals>.create_custom_forwardr   rg  >=1.11.0use_reentrantFrX  r^  r]  r  r`  r   r   r6   )rz  loggerwarningr   r  r  r  r  r  r2   r3   r2  cattrainingr&  r   utils
checkpointr  r  )r:   ri  rk  rX  rj  rZ  r]  r^  r  ra  r  r  base_output_statesctrl_output_statesbase_blocksctrl_blocksr  b_resb_attnc_resc_attnb2cc2bckpt_kwargsr.   r.   r/   r     s   

	










z(ControlNetXSCrossAttnDownBlock2D.forward)
rC   rC   Tr   r   r   rD   TFTr  )NNr{   NNNT)r)   r*   r+   rb   r   r	   r   r   r8   r   r   r0   r  r   r
   r   r   r   r   r   r?   r.   r.   r;   r/   r     s    
v
A	
r   c                       s   e Zd Z									d'deded	ee d
edededee dee dee dedee f fddZedede	fddZ
d(ddZ						d)dedededee d ee d!eeeef  d"ee d#ee d$edeeef fd%d&Z  ZS )*r   NrC   r   rD   FTrk   rl   rI   r\   r   rK   r   r   rM   rO   rP   c                    sv   t    t||| _t|||||	|||
d| _t||| ||tt||| ||	|||
d	| _t||| _	d| _
d S )N)rK   rU   rI   rn   rM   rL   rP   rO   rm   F)r7   r8   re   r2   r$   base_midblockrf   r   ctrl_midblockr3   r&  )r:   rk   rl   rI   r\   r   rK   r   r   rM   rO   rP   r;   r.   r/   r8     s6   

z(ControlNetXSCrossAttnMidBlock2D.__init__r  r  c                 C   s   |j }|j}|j}dd }|j}|j}t|jd j}|jd j	j
}	|jd jj}
|jd jj}||j}||j}||j}||j}|jd j}| |||	|
|||||||d}|j |  |j|  |j|  |j|  |S )Nc                 S   r  r  r  )rA   r.   r.   r/   r  D  r  zOControlNetXSCrossAttnMidBlock2D.from_modules.<locals>.get_first_cross_attentionr   r   )r2   r3   rA   rV   rU   r   r4   r  r1   r  r  r  r   r  rM   rO   rP   r  r  r  r  )r   r  r  r2   r3   r  rk   rl   rK   rI   r   r  r   r   rM   rO   rP   r   r.   r.   r/   r  9  sB   



z,ControlNetXSCrossAttnMidBlock2D.from_modulesr  c                 C   s.   |   D ]}d|_q| j  D ]}d|_qdS )r  TFN)r  r  r  )r:   r!  r.   r.   r/   r   j  s
   z2ControlNetXSCrossAttnMidBlock2D.freeze_base_paramsr{   ri  rk  rX  rj  rZ  r^  r]  r  ra  c
                 C   s   |d ur| dd d urtd |}
|}|||||d}|	r+tj|| |
gdd}| j|
fi |}
|	rH| j|fi |}|
| ||  }
|
|fS )Nr  r  )rk  rX  r]  r^  r  r   rg  )	rz  r  r  r2  r  r2   r  r  r3   )r:   ri  rk  rX  rj  rZ  r^  r]  r  ra  r  r  
joint_argsr.   r.   r/   r   u  s$   
z'ControlNetXSCrossAttnMidBlock2D.forward)	NrC   rC   r   r   r   rD   FTr  )Nr{   NNNT)r)   r*   r+   rb   r   r   r8   r   r$   r@   r  r   r
   r   r   r   r   r   r   r?   r.   r.   r;   r/   r     s    	
4
0	

r   c                       s  e Zd Z									d,deded	ed
ee dededee dedededededee f fddZede	de
fddZd-ddZ							d.dedeed f d!eed f d"ed#ee d$ee d%eeeef  d&ee d'ee d(ee d)edefd*d+Z  ZS )/r   rC   NTr   rD   FrU   rV   rp   rq   rI   r\   r   rK   rL   rM   r   rO   rP   c                    s"  t    g }g }g }d}|| _|
| _t|	tr|	g| }	t|D ]@}||d kr+|n|}|dkr3|n|}|t|| | |t	|| |||d |ra|t
|
||
 ||	| ||||d q!t|| _|rot|nd g| | _t|| _|rt|d|d| _nd | _d| _|| _d S )	Nrr   r   r   r  rZ   T)r_   rV   F)r7   r8   has_cross_attentionrL   ra   rb   rc   rd   re   r"   r#   r   r=   r1   r4   r3   r%   
upsamplersr&  r   )r:   rU   rV   rp   rq   rI   r\   r   rg   rK   rL   rM   r   rO   rP   r1   r4   r3   r[   rh   res_skip_channelsrs   r;   r.   r/   r8     sV   


	
z'ControlNetXSCrossAttnUpBlock2D.__init__base_upblockctrl_upblockc                 C   sP  |j }dd }|jd j}|jd j| }|jd j| }dd |D }|jd jj}	|jd jj}
|j}t	|drZd}t
|jd j}||j}||j}||j}|jd j}nd	}d }d }d }d }d }|jd u}| |||||	|
||||||||d
}|j|j  |r|j|j  |r|j|jd   |j |  |S )Nc                 S   r  r  r  r  r.   r.   r/   r    r  zNControlNetXSCrossAttnUpBlock2D.from_modules.<locals>.get_first_cross_attentionr   r   c                 S   s   g | ]}|j qS r.   )rU   )r   r  r.   r.   r/   r     s    z?ControlNetXSCrossAttnUpBlock2D.from_modules.<locals>.<listcomp>r4   TF)rU   rV   rp   rq   rI   r\   r   rg   rK   rL   rM   r   rO   rP   )r3   r1   rV   rU   r  r  r  r   r   r  r   r4   r  r  rM   rO   rP   r  r  r  )r   r  r  ctrl_to_base_skip_connectionsr  rV   rU   prev_output_channelsctrl_skip_channelssrI   r   r   rg   rK   rL   rM   rO   rP   r   r   r.   r.   r/   r    s\   




z+ControlNetXSCrossAttnUpBlock2D.from_modulesr  c                 C   r  r  )	r  r  r1   ra   r4   r   r=   rd   r  r  r.   r.   r/   r     r  z1ControlNetXSCrossAttnUpBlock2D.freeze_base_paramsr{   rl  rm  .rn  rk  rX  rZ  r^  r]  upsample_sizer  ra  c              	      sR  |d ur| dd d urtd tdd o(tdd o(tdd o(tdd  ddd} fd	d
}tjjjt|t|D ]W\}}}}}|rU|||| 7 }|||\}}t	j
||gdd}jrjrtddrtddini }t	jjj||||fi |}n|||}|d ur||||||
ddd }qDjd ur||	}|S )Nr  r  rF  rG  rH  rI  c                    r  )Nc                     r  r  r.   r  r  r.   r/   r  H  r  z]ControlNetXSCrossAttnUpBlock2D.forward.<locals>.create_custom_forward.<locals>.custom_forwardr.   r  r.   r  r/   r  G  r  zEControlNetXSCrossAttnUpBlock2D.forward.<locals>.create_custom_forwardc              	      s,    rt j| |jjjjdS | |fS )N)rF  rG  rH  rI  )r   r   rF  rG  rH  rI  )rl  
res_h_baseis_freeu_enabledr:   r.   r/   maybe_apply_freeu_to_subblockP  s   
zMControlNetXSCrossAttnUpBlock2D.forward.<locals>.maybe_apply_freeu_to_subblockr   rg  r  r  r  Fr  r   r6   )rz  r  r  r  r  r1   r4   r3   r   r2  r  r  r&  r   r  r  r  )r:   rl  rm  rn  rk  rX  rZ  r^  r]  r  r  ra  r  r  resnetattnr  r  
res_h_ctrlr  r.   r  r/   r   .  s`   




	

	z&ControlNetXSCrossAttnUpBlock2D.forward)	rC   NTr   r   rD   TFTr  )Nr{   NNNNT)r)   r*   r+   rb   r   r   r   r8   r   r    rB   r  r   r
   r   r   r   r   r   r   r?   r.   r.   r;   r/   r     s    
G
:

	
r   c                 C   s   t tj| |dddS )Nr   r   )r   )zero_moduler   r>   )rU   rV   r.   r.   r/   re     s   re   c                 C   s   |   D ]}tj| q| S r6   )r  r   initzeros_)r'  pr.   r.   r/   r    s   r  c                 C   sD   |}|| kr| S |dkr | | }|dkr|S |d8 }|dksd S d S )Nr   r   r.   )numberrR   factorresidualr.   r.   r/   rf     s   rf   )rC   Tr   r   rD   TFT)NrC   r   r   rD   FTr6   )Fdataclassesr   mathr   typingr   r   r   r   r   r	   r2  torch.utils.checkpointr
   r   configuration_utilsr   r   r  r   r   r   utils.torch_utilsr   attention_processorr   r   r   r   r   r   r   r   r   
embeddingsr   r   modeling_utilsr   unets.unet_2d_blocksr   r    r!   r"   r#   r$   r%   unets.unet_2d_conditionr&   
get_loggerr)   r  r'   r3  r0   r@   rB   rb   r   rj   ro   rt   ru   r   r   r   r   re   r  rf   r.   r.   r.   r/   <module>   s    $	$	
	

^	
#
       M  >  
n