o
    GiA                    @   s  d dl mZ d dlmZ d dlmZ d dlZd dlmZmZ ddl	m
Z
mZ ddlmZmZ dd	lmZ d
dlmZ d
dlmZmZmZmZmZmZ d
dlmZmZ d
dlmZ d
dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d
dl'm(Z( ddl)m*Z* e+e,Z-eG dd deZ.G dd dej/Z0G dd dej/Z1G dd dej/Z2								dFde3d e3d!e3d"e3d#e3d$e3dB d%e3e4e3 B dB d&e3dB d'e3dB d(e5d)e5dB d*e5dB fd+d,Z6							dGd-e3d.e3d#e3dB d$e3dB d%e3d&e3dB d'e3dB d)e5d*e5fd/d0Z7d1e3d2e3d3e8e3 fd4d5Z9G d6d7 d7eee
Z:G d8d9 d9eee
Z;G d:d; d;ej/Z<G d<d= d=ej/Z=G d>d? d?ej/Z>dHd@dAZ?dBdC Z@dDdE ZAdS )I    )	dataclass)gcd)AnyN)Tensornn   )ConfigMixinregister_to_config)
BaseOutputlogging)apply_freeu   )AttentionMixin)ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttnAddedKVProcessorAttnProcessorFusedAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)CrossAttnDownBlock2DCrossAttnUpBlock2DDownsample2DResnetBlock2DTransformer2DModelUNetMidBlock2DCrossAttn
Upsample2D)UNet2DConditionModel   )ControlNetConditioningEmbeddingc                   @   s   e Zd ZU dZdZeed< dS )ControlNetXSOutputa=  
    The output of [`UNetControlNetXSModel`].

    Args:
        sample (`Tensor` of shape `(batch_size, num_channels, height, width)`):
            The output of the `UNetControlNetXSModel`. Unlike `ControlNetOutput` this is NOT to be added to the base
            model output, but is already the final output.
    Nsample)__name__
__module____qualname____doc__r#   r   __annotations__ r)   r)   ^/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/controlnets/controlnet_xs.pyr"   3   s   
 	r"   c                       sN   e Zd ZdZ		d
dejdejdejdejdB dejdB f
 fdd	Z  ZS )DownBlockControlNetXSAdapterz}Components that together with corresponding components from the base model will form a
    `ControlNetXSCrossAttnDownBlock2D`Nresnetsbase_to_ctrlctrl_to_base
attentionsdownsamplerc                    s,   t    || _|| _|| _|| _|| _d S N)super__init__r,   r-   r.   r/   downsamplers)selfr,   r-   r.   r/   r0   	__class__r)   r*   r3   E   s   

z%DownBlockControlNetXSAdapter.__init__)NN)	r$   r%   r&   r'   r   
ModuleListConv2dr3   __classcell__r)   r)   r6   r*   r+   A   s    r+   c                       s2   e Zd ZdZdedejdejf fddZ  ZS )MidBlockControlNetXSAdapterz|Components that together with corresponding components from the base model will form a
    `ControlNetXSCrossAttnMidBlock2D`midblockr-   r.   c                    s    t    || _|| _|| _d S r1   )r2   r3   r<   r-   r.   )r5   r<   r-   r.   r6   r)   r*   r3   Y   s   

z$MidBlockControlNetXSAdapter.__init__)	r$   r%   r&   r'   r   r   r8   r3   r:   r)   r)   r6   r*   r;   U   s    &r;   c                       s(   e Zd ZdZdejf fddZ  ZS )UpBlockControlNetXSAdapterzwComponents that together with corresponding components from the base model will form a `ControlNetXSCrossAttnUpBlock2D`r.   c                    s   t    || _d S r1   )r2   r3   r.   )r5   r.   r6   r)   r*   r3   c   s   

z#UpBlockControlNetXSAdapter.__init__)r$   r%   r&   r'   r   r8   r3   r:   r)   r)   r6   r*   r=   `   s    r=       T   Fbase_in_channelsbase_out_channelsctrl_in_channelsctrl_out_channelstemb_channelsmax_norm_num_groupstransformer_layers_per_blocknum_attention_headscross_attention_dimadd_downsampleupcast_attentionuse_linear_projectionc                 C   sZ  d}g }g }g }g }t |tr|g| }t|D ]T}|dkr | n|} |dkr(|n|}|t| |  |t||  ||t||  |dt||ddd |rd|t||| ||| |	||t||dd |t|| q|
r|t|| t|| d|dd	}|t|| nd }t	t
|t
|t
|d
}|rt
||_|d ur||_|S )Nr   r   
max_factorh㈵>in_channelsout_channelsrD   groups
groups_outepsrP   
num_layersrH   rK   rJ   norm_num_groupsTopuse_convrQ   name)r,   r-   r.   )
isinstanceintrangeappendmake_zero_convr   find_largest_factorr   r   r+   r   r8   r/   r4   )r@   rA   rB   rC   rD   rE   has_crossattnrF   rG   rH   rI   rJ   rK   rV   r,   r/   r.   r-   ir4   down_block_componentsr)   r)   r*   get_down_block_adapterh   sh   



re   base_channelsctrl_channelsc	                 C   sP   t | | }	t|||  ||tt|||  |||||d	}
t || }t|	|
|dS )N	rF   rP   rQ   rD   resnet_groupsrH   rG   rK   rJ   )r-   r<   r.   )r`   r   ra   r   r;   )rf   rg   rD   rE   rF   rG   rH   rJ   rK   r-   r<   r.   r)   r)   r*   get_mid_block_adapter   s   

rj   rQ   prev_output_channelctrl_skip_channelsc                 C   sJ   g }d}t |D ]}|dkr|n| }|t|| | qtt|dS )Nr   r   )r.   )r^   r_   r`   r=   r   r8   )rQ   rk   rl   r.   rV   rc   resnet_in_channelsr)   r)   r*   get_up_block_adapter   s   rn   c                        s
  e Zd ZdZe										
						d(dededee dede	deee B dee dee dedee dedB deee B de	dede	f fd d!Z
e								d)d"ed#edB dee dB dee dB de	dedededee fd$d%Zd&d' Z  ZS )*ControlNetXSAdaptera  
    A `ControlNetXSAdapter` model. To use it, pass it into a `UNetControlNetXSModel` (together with a
    `UNet2DConditionModel` base model).

    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
    methods implemented for all models (such as downloading or saving).

    Like `UNetControlNetXSModel`, `ControlNetXSAdapter` is compatible with StableDiffusion and StableDiffusion-XL. It's
    default parameters are compatible with StableDiffusion.

    Parameters:
        conditioning_channels (`int`, defaults to 3):
            Number of channels of conditioning input (e.g. an image)
        conditioning_channel_order (`str`, defaults to `"rgb"`):
            The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
        conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
            The tuple of output channels for each block in the `controlnet_cond_embedding` layer.
        time_embedding_mix (`float`, defaults to 1.0):
            If 0, then only the control adapters's time embedding is used. If 1, then only the base unet's time
            embedding is used. Otherwise, both are combined.
        learn_time_embedding (`bool`, defaults to `False`):
            Whether a time embedding should be learned. If yes, `UNetControlNetXSModel` will combine the time
            embeddings of the base model and the control adapter. If no, `UNetControlNetXSModel` will use the base
            model's time embedding.
        num_attention_heads (`list[int]`, defaults to `[4]`):
            The number of attention heads.
        block_out_channels (`list[int]`, defaults to `[4, 8, 16, 16]`):
            The tuple of output channels for each block.
        base_block_out_channels (`list[int]`, defaults to `[320, 640, 1280, 1280]`):
            The tuple of output channels for each block in the base unet.
        cross_attention_dim (`int`, defaults to 1024):
            The dimension of the cross attention features.
        down_block_types (`list[str]`, defaults to `["CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"]`):
            The tuple of downsample blocks to use.
        sample_size (`int`, defaults to 96):
            Height and width of input/output sample.
        transformer_layers_per_block (`int | tuple[int]`, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
        upcast_attention (`bool`, defaults to `True`):
            Whether the attention computation should always be upcasted.
        max_norm_num_groups (`int`, defaults to 32):
            Maximum number of groups in group normal. The actual number will be the largest divisor of the respective
            channels, that is <= max_norm_num_groups.
    r   rgb   r>   `            ?F   rv      rr   rr   i@  i     rz   r?   r   r   r   DownBlock2Drs   r    Tr>   conditioning_channelsconditioning_channel_order#conditioning_embedding_out_channelstime_embedding_mixlearn_time_embeddingrG   block_out_channelsbase_block_out_channelsrH   down_block_typessample_sizeNrF   rJ   rE   rK   c                    s  t    |d }|d d }|dvrtd| t|t|
kr-td| d|
 dt|ttfs;|gt|
 }t|	ttfsI|	gt|
 }	t|ttfsW|gt|
 }t|t|
krjtd| d|
 dt|d ||d	| _|r}t	||| _
nd | _
tg | _tg | _tjd|d d
dd| _t|d |d | _|d }|d }t|
D ]8\}}|}|| }|}|| }d|v }|t|
d k}| jt||||||||| || |	| | ||d qt|d |d ||d |d |	d ||d| _|d g t|D ]\}}|t|d k rd
nd} |g|  qtt|}|d }tt|
D ]}|}|| } fddtd
D }| jt|||d q2d S )Nr   rv   )rp   bgrz&unknown `conditioning_channel_order`: zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: .zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: conditioning_embedding_channelsr   r}   r   r    kernel_sizepadding	CrossAttn)r@   rA   rB   rC   rD   rE   rb   rF   rG   rH   rI   rJ   rK   )rf   rg   rD   rF   rG   rH   rJ   rK   r   c                       g | ]}   qS r)   pop.0_rl   r)   r*   
<listcomp>      z0ControlNetXSAdapter.__init__.<locals>.<listcomp>)rQ   rk   rl   )r2   r3   
ValueErrorlenr\   listtupler!   controlnet_cond_embeddingr   time_embeddingr   r8   down_blocksup_connectionsr9   conv_inr`   control_to_base_for_conv_in	enumerater_   re   rj   	mid_blockextendreversedr^   rn   )r5   r}   r~   r   r   r   rG   r   r   rH   r   r   rF   rJ   rE   rK   time_embedding_input_dimtime_embedding_dimrA   rC   rc   down_block_typer@   rB   rb   is_final_blockrQ   number_of_subblocks reversed_base_block_out_channelsprev_base_output_channelctrl_skip_channels_r6   r   r*   r3   #  s   

zControlNetXSAdapter.__init__unet
size_ratioc
                    s   |du}
 du}|
|A st d|p fdd|jjD }|du r%|jj}| |||	|||||jj|jj|jj|jj|jj|jj|jj	|jj
d}||j |S )a8  
        Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].

        Parameters:
            unet (`UNet2DConditionModel`):
                The UNet model we want to control. The dimensions of the ControlNetXSAdapter will be adapted to it.
            size_ratio (float, *optional*, defaults to `None`):
                When given, block_out_channels is set to a fraction of the base model's block_out_channels. Either this
                or `block_out_channels` must be given.
            block_out_channels (`list[int]`, *optional*, defaults to `None`):
                Down blocks output channels in control model. Either this or `size_ratio` must be given.
            num_attention_heads (`list[int]`, *optional*, defaults to `None`):
                The dimension of the attention heads. The naming seems a bit confusing and it is, see
                https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 for why.
            learn_time_embedding (`bool`, defaults to `False`):
                Whether the `ControlNetXSAdapter` should learn a time embedding.
            time_embedding_mix (`float`, defaults to 1.0):
                If 0, then only the control adapter's time embedding is used. If 1, then only the base unet's time
                embedding is used. Otherwise, both are combined.
            conditioning_channels (`int`, defaults to 3):
                Number of channels of conditioning input (e.g. an image)
            conditioning_channel_order (`str`, defaults to `"rgb"`):
                The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
            conditioning_embedding_out_channels (`tuple[int]`, defaults to `(16, 32, 96, 256)`):
                The tuple of output channel for each block in the `controlnet_cond_embedding` layer.
        NzePass exactly one of `block_out_channels` (for absolute sizing) or `size_ratio` (for relative sizing).c                    s   g | ]}t |  qS r)   )r]   )r   br   r)   r*   r     s    z1ControlNetXSAdapter.from_unet.<locals>.<listcomp>)r}   r~   r   r   r   rG   r   r   rH   r   r   rF   rJ   rE   rK   )r   configr   attention_head_dimrH   r   r   rF   rJ   rW   rK   todtype)clsr   r   r   rG   r   r   r}   r~   r   
fixed_sizerelative_sizemodelr)   r   r*   	from_unet  s8   )zControlNetXSAdapter.from_unetc                 O   s   t d)NzA ControlNetXSAdapter cannot be run by itself. Use it together with a UNet2DConditionModel to instantiate a UNetControlNetXSModel.)r   )r5   argskwargsr)   r)   r*   forward  s   zControlNetXSAdapter.forward)r   rp   rq   ru   Frv   rw   ry   r?   r{   rs   r    Tr>   T)NNNFru   r   rp   rq   )r$   r%   r&   r'   r	   r]   strr   floatboolr3   classmethodr   r   r   r   r:   r)   r)   r6   r*   ro      s    .
	

 	

	
Lro   c                .       s"  e Zd ZdZdZe									
														dPdedB dee dee dee dedB deee B deee B deee B dedB dedB de	de	dedB d edB d!e
d"ed#ee d$ed%e	d&ee d'eee B d(ef, fd)d*Ze					dQd+ed,edB d-e
dB d&ee
 dB d!e
dB d.edB fd/d0ZdRd2d3Zd4d5 Zd6e
d7e
d8e
d9e
fd:d;Zd<d= Zd>d? Zd@dA Z									dSdBedCeje
B eB dDejdEejdB dFe
dB dGejdB dHejdB dIejdB dJeeef dB dKeeejf dB dLe	dMe	d1eeB fdNdOZ  ZS )TUNetControlNetXSModela9  
    A UNet fused with a ControlNet-XS adapter model

    This model inherits from [`ModelMixin`] and [`ConfigMixin`]. Check the superclass documentation for it's generic
    methods implemented for all models (such as downloading or saving).

    `UNetControlNetXSModel` is compatible with StableDiffusion and StableDiffusion-XL. It's default parameters are
    compatible with StableDiffusion.

    It's parameters are either passed to the underlying `UNet2DConditionModel` or used exactly like in
    `ControlNetXSAdapter` . See their documentation for details.
    Trs   r{   	UpBlock2Dr   r   r   ry   r>   r?   r    rx   Nru   r   rq   rp   Frw   rv   r   r   up_block_typesr   rW   rH   rF   rG   addition_embed_typeaddition_time_embed_dimrJ   rK   time_cond_proj_dim%projection_class_embeddings_input_dimr   ctrl_conditioning_channels(ctrl_conditioning_embedding_out_channelsctrl_conditioning_channel_orderctrl_learn_time_embeddingctrl_block_out_channelsctrl_num_attention_headsctrl_max_norm_num_groupsc           .         s  t    |dk s|dkrtd|dk r|std|	d ur'|	dkr'tdt|ttfs5|gt| }t|ttfsC|gt| }t|ttfsQ|gt| }t|ttfs_|gt| }|}d| _tj	d|d ddd	| _
t|d ||d
| _tj	d|d ddd	| _t|d |d | _|d }|d d }t|d ddd| _t|||d| _|rt||d| _nd | _|	d u rd | _d | _nt|
ddd| _t||| _g }|d }|d }t|D ];\}}|}|| }|} || }d|v }!|t|d k}"|t||| |||||!|| || || || |" ||d qt|d |d ||||d |d |d |d ||d| _g }#tt|}$tt|}%tt|}&|d g t|D ]\}}'|t|d k rddnd}( |'g|(  qUtt|})|)d }'t|D ]K\}}*|'}+|)| }'|)t|d t|d  }, fddtdD }-d|*v }!|t|d k}"|#t |,|'|+|-|||!|$| |%| |&| |" |||d q~t!|| _"t!|#| _#tj$|d |d| _%t& | _'tj	|d dddd	| _(d S )Nr   r    z1`time_embedding_mix` needs to be between 0 and 1.zKTo use `time_embedding_mix` < 1, `ctrl_learn_time_embedding` must be `True`	text_timezAs `UNetControlNetXSModel` currently only supports StableDiffusion and StableDiffusion-XL, `addition_embed_type` must be `None` or `'text_time'`.rv   r   r   r   T)flip_sin_to_cosdownscale_freq_shift)cond_proj_dim)rP   time_embed_dimr   r@   rA   rB   rC   rD   rW   r   rb   rF   base_num_attention_headsr   rH   rI   rJ   rK   r   rf   rg   rD   rW   r   rF   r   r   rH   rJ   rK   r   c                    r   r)   r   r   r   r)   r*   r     r   z2UNetControlNetXSModel.__init__.<locals>.<listcomp>)rP   rQ   rk   rl   rD   resolution_idxrb   rF   rG   rH   add_upsamplerJ   rW   rK   )num_channels
num_groups))r2   r3   r   r\   r   r   r   rP   r   r9   base_conv_inr!   r   ctrl_conv_inr`   r   r   base_time_projr   base_time_embeddingctrl_time_embeddingbase_add_time_projbase_add_embeddingr   r_    ControlNetXSCrossAttnDownBlock2DControlNetXSCrossAttnMidBlock2Dr   r   r   minr^   ControlNetXSCrossAttnUpBlock2Dr8   r   	up_blocks	GroupNormbase_conv_norm_outSiLUbase_conv_actbase_conv_out).r5   r   r   r   r   rW   rH   rF   rG   r   r   rJ   rK   r   r   r   r   r   r   r   r   r   r   r   time_embed_input_dimr   r   rA   rC   rc   r   r@   rB   rb   r   r    rev_transformer_layers_per_blockrev_num_attention_headsrev_cross_attention_dimrQ   r   reversed_block_out_channelsup_block_typerk   rP   r   r6   r   r*   r3     s   
!


zUNetControlNetXSModel.__init__r   
controlnetr   ctrl_optional_kwargsc                    s  |du rt j|||fi |}ntdd ||||fD r!tdg dfdd|j D |jjd< g d	  fd
d|j D  |jj d< | i  }g d}|D ]}	t	|d|	 
t	||	  q\ddg}
|
D ]}	t||	rt	||	durt	|d|	 
t	||	  qt|j
|j  |j
|j  |jdur|j
|j  |j
|j  tdd t|j|jD |_t|j|j|_tdd t|j|jD |_||j |S )a  
        Instantiate a [`UNetControlNetXSModel`] from a [`UNet2DConditionModel`] and an optional [`ControlNetXSAdapter`]
        .

        Parameters:
            unet (`UNet2DConditionModel`):
                The UNet model we want to control.
            controlnet (`ControlNetXSAdapter`):
                The ControlNet-XS adapter with which the UNet will be fused. If none is given, a new ControlNet-XS
                adapter will be created.
            size_ratio (float, *optional*, defaults to `None`):
                Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
            ctrl_block_out_channels (`list[int]`, *optional*, defaults to `None`):
                Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
                where this parameter is called `block_out_channels`.
            time_embedding_mix (`float`, *optional*, defaults to None):
                Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
            ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
                Passed to the `init` of the new controlnet if no controlnet was given.
        Nc                 s   s    | ]}|d uV  qd S r1   r)   )r   or)   r)   r*   	<genexpr>  s    
z2UNetControlNetXSModel.from_unet.<locals>.<genexpr>zWhen a controlnet is passed, none of these parameters should be passed: size_ratio, ctrl_block_out_channels, time_embedding_mix, ctrl_optional_kwargs.)r   r   r   r   rW   rH   rF   r   r   rJ   rK   r   r   c                    s   i | ]\}}| v r||qS r)   r)   r   kv)params_for_unetr)   r*   
<dictcomp>  s    z3UNetControlNetXSModel.from_unet.<locals>.<dictcomp>rG   )r}   r   r~   r   r   rG   rE   c                    s"   i | ]\}}| v rd | |qS )ctrl_r)   r   )params_for_controlnetr)   r*   r     s   " r   )r   r   conv_norm_outconv_outbase_add_time_projadd_embeddingc                 s        | ]\}}t ||V  qd S r1   )r   from_modulesr   r   cr)   r)   r*   r   6  
    

c                 s   r  r1   )r   r  r  r)   r)   r*   r   ;  r  )ro   r   anyr   r   itemsr   r   from_configgetattrload_state_dict
state_dicthasattrr   r   r   r   r   r   r   r8   zipr   r   r  r   r   r   r   r   )r   r   r   r   r   r   r   r   modules_from_unetmoptional_modules_from_unetr)   )r   r   r*   r     sV   
	 




zUNetControlNetXSModel.from_unetreturnc                    s      D ]}d|_qg d} fdd|D }|D ]}|  D ]}d|_qq jD ]}|  q) j   jD ]}|  q8dS )Freeze the weights of the parts belonging to the base UNet2DConditionModel, and leave everything else unfrozen for fine
        tuning.T)r   r   r   r   r   r   r   r   c                    s$   g | ]}t  |d urt  |qS r1   )r  )r   partr5   r)   r*   r   W  s   $ z<UNetControlNetXSModel.freeze_unet_params.<locals>.<listcomp>FN)
parametersrequires_gradr   freeze_base_paramsr   r   )r5   param
base_partsr  dur)   r  r*   freeze_unet_paramsE  s   





z(UNetControlNetXSModel.freeze_unet_paramsc                 C   sj   t dd | j D rt }nt dd | j D r t }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s       | ]}|j tv V  qd S r1   )r7   r   r   procr)   r)   r*   r   g      zCUNetControlNetXSModel.set_default_attn_processor.<locals>.<genexpr>c                 s   r   r1   )r7   r   r!  r)   r)   r*   r   i  r#  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allattn_processorsvaluesr   r   r   nextiterset_attn_processor)r5   	processorr)   r)   r*   set_default_attn_processorc  s   z0UNetControlNetXSModel.set_default_attn_processors1s2b1b2c                 C   sH   t | jD ]\}}t|d| t|d| t|d| t|d| qdS )aF  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

        The suffixes after the scaling factors represent the stage blocks where they are being applied.

        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
        are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

        Args:
            s1 (`float`):
                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            s2 (`float`):
                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
                mitigate the "oversmoothing effect" in the enhanced denoising process.
            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
        r,  r-  r.  r/  N)r   r   setattr)r5   r,  r-  r.  r/  rc   upsample_blockr)   r)   r*   enable_freeus  s   z"UNetControlNetXSModel.enable_freeuc                 C   sP   h d}t | jD ]\}}|D ]}t||st||ddur$t||d qq	dS )zDisables the FreeU mechanism.>   r.  r/  r,  r-  N)r   r   r  r  r0  )r5   
freeu_keysrc   r1  r   r)   r)   r*   disable_freeu  s   z#UNetControlNetXSModel.disable_freeuc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        > [!WARNING] > This API is 🧪 experimental.
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr%  r
  r   r7   r$   r   modulesr\   r   fuse_projectionsr)  r   )r5   r   attn_processormoduler)   r)   r*   fuse_qkv_projections  s   
z*UNetControlNetXSModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )un   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is 🧪 experimental.

        N)r7  r)  r  r)   r)   r*   unfuse_qkv_projections  s   
z,UNetControlNetXSModel.unfuse_qkv_projectionsr#   timestepencoder_hidden_statescontrolnet_condconditioning_scaleclass_labelstimestep_condattention_maskcross_attention_kwargsadded_cond_kwargsreturn_dictapply_controlc           (      C   s  | j jdkrtj|dgd}|dur!d||j d }|d}|}t|sY|jj	dk}|jj	dk}t
|trD|s=|r@tjntj}n
|sH|rKtjntj}tj|g||jd}nt|jd	krh|d |j}||jd	 }| |}|j|jd
}| j jr|r| ||}| ||}| j jd }|| |d|   }n| |}d}| j jdu rnX| j jdkrd|
vrt| j d|
d}d|
vrt| j d|
d}| | }||jd	 df}tj ||gdd}||j}| !|}n
td| j j d|dur|| n|}|}| }}g g }}| "|} | #|}| $|}| dur3|| 7 }|r?|| %||  }|&| |&| | j'D ]}!|!||||||	||d\}}}"}#|(|" |(|# qL| j)||||||	||d\}}| j*D ]1}$t|$j+}%||% d }&||% d }'|d|%  }|d|%  }|$||&|'||||	||d	}q|| ,|}| -|}| .|}|s|fS t/|dS )a	  
        The [`ControlNetXSModel`] forward method.

        Args:
            sample (`Tensor`):
                The noisy input tensor.
            timestep (`torch.Tensor | float | int`):
                The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states.
            controlnet_cond (`Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                How much the control model affects the base model outputs.
            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
                embeddings.
            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
            added_cond_kwargs (`dict`):
                Additional conditions for the Stable Diffusion XL UNet.
            return_dict (`bool`, defaults to `True`):
                Whether or not to return a [`~models.controlnets.controlnet.ControlNetOutput`] instead of a plain
                tuple.
            apply_control (`bool`, defaults to `True`):
                If `False`, the input is run only through the base model.

        Returns:
            [`~models.controlnetxs.ControlNetXSOutput`] **or** `tuple`:
                If `return_dict` is `True`, a [`~models.controlnetxs.ControlNetXSOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.
        r   r    )dimsNg     mpsnpu)r   devicer   )r   g333333?r   text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   dimzgControlNet-XS currently only supports StableDiffusion and StableDiffusion-XL, so addition_embed_type = z is currently not supported.)hidden_states_basehidden_states_ctrltembr?  rA  rE  rD  rH  )	hidden_statesres_hidden_states_tuple_baseres_hidden_states_tuple_ctrlrS  r?  rA  rE  rD  rH  )r#   )0r   r   torchflipr   r   	unsqueeze	is_tensorrL  typer\   r   float32float64int32int64tensorr   shapeexpandr   r   r   r   r   r   r   r7   getr   flattenreshapeconcatr   r   r   r   r   r_   r   r   r   r   r,   r   r   r   r"   )(r5   r#   r>  r?  r@  rA  rB  rC  rD  rE  rF  rG  rH  	timestepsis_mpsis_npur   t_emb	ctrl_temb	base_tembinterpolation_paramrS  aug_embrM  rN  time_embeds
add_embedscembh_ctrlh_basehs_basehs_ctrlguided_hintdownresidual_hbresidual_hcup	n_resnetsskips_hbskips_hcr)   r)   r*   r     s   8


























zUNetControlNetXSModel.forward)rs   r{   r   ry   r>   r?   r    rx   NNTTNNru   r   rq   rp   Frw   rv   r>   )NNNNNr  N)	Nru   NNNNNTT)r$   r%   r&   r'    _supports_gradient_checkpointingr	   r]   r   r   r   r   r3   r   r   ro   r   dictr   r  r+  r2  r4  r<  r=  r   rW  r   r"   r   r:   r)   r)   r6   r*   r     s
   




 ?

v		
r   c                       s*  e Zd Z										d+dededed	ed
edededeee B dB dedB dedB dedB dededB dedB f fddZedede	fddZ
d,ddZ							d-ded ed!edB d"edB d#edB d$edB d%eeef dB d&edB d'edeeeeed(f eed(f f fd)d*Z  ZS ).r   r>   Tr    r?   Fr@   rA   rB   rC   rD   rW   r   rF   Nr   r   rH   rI   rJ   rK   c                    s  t    g }g }g }g }g }g }d}t|	tr|	g| }	t|D ]r}|dkr)|n|}|dkr1|n|}|t|| |t||||d |t|| ||t|| |dt||ddd |r|t	|
||
 ||	| ||||d |t	||| ||	| |||t||dd |t|| q!|r|t|| t
|d|d	d
| _t
|| d|d	d
| _|t|| nd | _d | _t|| _t|| _|rt|nd g| | _|rt|nd g| | _t|| _t|| _d| _d S )Nr   r   rP   rQ   rD   rR   rL   rN   rO   rU   TrX   rY   F)r2   r3   r\   r]   r^   r_   r`   r   ra   r   r   base_downsamplersctrl_downsamplersr   r8   base_resnetsctrl_resnetsbase_attentionsctrl_attentionsr-   r.   gradient_checkpointing)r5   r@   rA   rB   rC   rD   rW   r   rb   rF   r   r   rH   rI   rJ   rK   r  r  r  r  r.   r-   rV   rc   r6   r)   r*   r3     s   





z)ControlNetXSCrossAttnDownBlock2D.__init__base_downblockctrl_downblockc                 C   s  dd }|j d j}|j d j}|j d j| }|j d j}|j d jj}|j d jj}	|j d jj}
t|dr]d}t|j	d j
}||j}||j}||j}||j}|j	d j}nd}d }d }d }d }d }d }|jd u}| ||||||	|
||||||||d}|j|j   |j|j   |r|j|j	  |j|j	  |r|j|jd   |j|j  |j|j  |j|j  |S )Nc                 S      | j d jd jS Nr   r/   transformer_blocksattn2blockr)   r)   r*   get_first_cross_attention     zPControlNetXSCrossAttnDownBlock2D.from_modules.<locals>.get_first_cross_attentionr   r/   TFr   )r,   rP   rQ   time_emb_projin_featuresnorm1r   r  r   r/   r  headsrH   rJ   rK   r4   r  r  r  r  r  r  r  r  r-   r.   )r   r  r  r  r@   rA   rB   rC   rD   r   ctrl_num_groupsrb   rF   r   r   rH   rJ   rK   rI   r   r)   r)   r*   r    sj   





z-ControlNetXSCrossAttnDownBlock2D.from_modulesr  c                 C   n   |   D ]}d|_q| jg}t| jtjr|| j | jdur&|| j |D ]}|  D ]}d|_q.q(dS r  TNF)	r  r  r  r\   r  r   r8   r_   r  r5   r  r  r  r)   r)   r*   r  :     
z3ControlNetXSCrossAttnDownBlock2D.freeze_base_paramsru   rQ  rS  r?  rR  rA  rD  rE  encoder_attention_maskrH  .c
              	   C   s  |d ur| dd d urtd |}
|}d}d}tt| j| j}tt| j| j}t||| j	| j
D ]u\\}}\}}}}|	rKtj|||
gdd}t rZ| jrZ| ||
|}
n||
|}
|d uro||
||||ddd }
|	rt r| jr| |||}n|||}|d ur||||||ddd }|	r|
|||  }
||
f }||f }q4| jd ur| j	d	 }| j
d	 }|	rtj|||
gdd}| |
}
|	r| |}|	r|
|||  }
||
f }||f }|
|||fS )
NscaleSPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r)   r    rO  Fr?  rE  rD  r  rG  r   r   )rc  loggerwarningr   r  r  r  r  r  r-   r.   rW  catis_grad_enabledr  _gradient_checkpointing_funcr  r  )r5   rQ  rS  r?  rR  rA  rD  rE  r  rH  rs  rr  base_output_statesctrl_output_statesbase_blocksctrl_blocksb_resb_attnc_resc_attnb2cc2br)   r)   r*   r   K  sx   












z(ControlNetXSCrossAttnDownBlock2D.forward)
r>   r>   Tr    r    r    r?   TFTr~  )NNru   NNNT)r$   r%   r&   r]   r   r   r3   r   r   r+   r  r  r   r   r  r   r   r   r:   r)   r)   r6   r*   r     s    
v
A	
r   c                       s   e Zd Z									d'deded	edB d
ededededB dedB dedB dededB f fddZededefddZ	d(ddZ
						d)dededededB d edB d!eeef dB d"edB d#edB d$edeeef fd%d&Z  ZS )*r   Nr>   r    r?   FTrf   rg   rD   rW   r   rF   r   r   rH   rJ   rK   c                    sv   t    t||| _t|||||	|||
d| _t||| ||tt||| ||	|||
d	| _t||| _	d| _
d S )N)rF   rP   rD   ri   rH   rG   rK   rJ   rh   F)r2   r3   r`   r-   r   base_midblockra   r   ctrl_midblockr.   r  )r5   rf   rg   rD   rW   r   rF   r   r   rH   rJ   rK   r6   r)   r*   r3     s6   

z(ControlNetXSCrossAttnMidBlock2D.__init__r  r  c                 C   s   |j }|j}|j}dd }|j}|j}t|jd j}|jd j	j
}	|jd jj}
|jd jj}||j}||j}||j}||j}|jd j}| |||	|
|||||||d}|j |  |j|  |j|  |j|  |S )Nc                 S   r  r  r  )r<   r)   r)   r*   r    r  zOControlNetXSCrossAttnMidBlock2D.from_modules.<locals>.get_first_cross_attentionr   r   )r-   r.   r<   rQ   rP   r   r/   r  r,   r  r  r  r   r  rH   rJ   rK   r  r  r  r  )r   r  r  r-   r.   r  rf   rg   rF   rD   r   r  r   r   rH   rJ   rK   r   r)   r)   r*   r    sB   



z,ControlNetXSCrossAttnMidBlock2D.from_modulesr  c                 C   s.   |   D ]}d|_q| j  D ]}d|_qdS )r  TFN)r  r  r  )r5   r  r)   r)   r*   r    s
   z2ControlNetXSCrossAttnMidBlock2D.freeze_base_paramsru   rQ  rS  r?  rR  rA  rE  rD  r  rH  c
                 C   s   |d ur| dd d urtd |}
|}|||||d}|	r+tj|| |
gdd}| j|
fi |}
|	rH| j|fi |}|
| ||  }
|
|fS )Nr  r  )rS  r?  rD  rE  r  r    rO  )	rc  r  r  rW  r  r-   r  r  r.   )r5   rQ  rS  r?  rR  rA  rE  rD  r  rH  rs  rr  
joint_argsr)   r)   r*   r     s$   
z'ControlNetXSCrossAttnMidBlock2D.forward)	Nr>   r>   r    r    r    r?   FTr~  )Nru   NNNT)r$   r%   r&   r]   r   r3   r   r   r;   r  r  r   r   r  r   r   r   r   r:   r)   r)   r6   r*   r     s    	
4
0	

r   c                       s  e Zd Z									d,deded	ed
ee dedededB dededededededB f fddZedede	fddZ
d-ddZ							d.dedeed f d!eed f d"ed#edB d$edB d%eeef dB d&edB d'edB d(edB d)edefd*d+Z  ZS )/r   r>   NTr    r?   FrP   rQ   rk   rl   rD   rW   r   rF   rG   rH   r   rJ   rK   c                    s"  t    g }g }g }d}|| _|
| _t|	tr|	g| }	t|D ]@}||d kr+|n|}|dkr3|n|}|t|| | |t	|| |||d |ra|t
|
||
 ||	| ||||d q!t|| _|rot|nd g| | _t|| _|rt|d|d| _nd | _d| _|| _d S )	Nr   r    r   r  rU   T)rZ   rQ   F)r2   r3   has_cross_attentionrG   r\   r]   r^   r_   r`   r   r   r   r8   r,   r/   r.   r   
upsamplersr  r   )r5   rP   rQ   rk   rl   rD   rW   r   rb   rF   rG   rH   r   rJ   rK   r,   r/   r.   rV   rc   res_skip_channelsrm   r6   r)   r*   r3   ?  sV   


	
z'ControlNetXSCrossAttnUpBlock2D.__init__base_upblockctrl_upblockc                 C   sP  |j }dd }|jd j}|jd j| }|jd j| }dd |D }|jd jj}	|jd jj}
|j}t	|drZd}t
|jd j}||j}||j}||j}|jd j}nd	}d }d }d }d }d }|jd u}| |||||	|
||||||||d
}|j|j  |r|j|j  |r|j|jd   |j |  |S )Nc                 S   r  r  r  r  r)   r)   r*   r    r  zNControlNetXSCrossAttnUpBlock2D.from_modules.<locals>.get_first_cross_attentionr   r   c                 S   s   g | ]}|j qS r)   )rP   )r   r  r)   r)   r*   r     s    z?ControlNetXSCrossAttnUpBlock2D.from_modules.<locals>.<listcomp>r/   TF)rP   rQ   rk   rl   rD   rW   r   rb   rF   rG   rH   r   rJ   rK   )r.   r,   rQ   rP   r  r  r  r   r   r  r   r/   r  r  rH   rJ   rK   r  r  r  )r   r  r  ctrl_to_base_skip_connectionsr  rQ   rP   prev_output_channelsctrl_skip_channelssrD   r   r   rb   rF   rG   rH   rJ   rK   r   r   r)   r)   r*   r    s\   




z+ControlNetXSCrossAttnUpBlock2D.from_modulesr  c                 C   r  r  )	r  r  r,   r\   r/   r   r8   r_   r  r  r)   r)   r*   r    r  z1ControlNetXSCrossAttnUpBlock2D.freeze_base_paramsru   rT  rU  .rV  rS  r?  rA  rE  rD  upsample_sizer  rH  c              	      s$  |d ur| dd d urtd tdd o(tdd o(tdd o(tdd   fdd}tjjjt|t|D ]E\}}}}}|rP|||| 7 }|||\}}t	j
||gd	d
}t	 rojro|||}n|||}|d ur||||||
ddd }q?jd ur||	}|S )Nr  r  r,  r-  r.  r/  c              	      s,    rt j| |jjjjdS | |fS )N)r,  r-  r.  r/  )r   r   r,  r-  r.  r/  )rT  
res_h_baseis_freeu_enabledr5   r)   r*   maybe_apply_freeu_to_subblock  s   
zMControlNetXSCrossAttnUpBlock2D.forward.<locals>.maybe_apply_freeu_to_subblockr    rO  Fr  r   )rc  r  r  r  r  r,   r/   r.   r   rW  r  r  r  r  r  )r5   rT  rU  rV  rS  r?  rA  rE  rD  r  r  rH  r  resnetattnr  r  
res_h_ctrlr)   r  r*   r     sP   





	z&ControlNetXSCrossAttnUpBlock2D.forward)	r>   NTr    r    r?   TFTr~  )Nru   NNNNT)r$   r%   r&   r]   r   r   r3   r   r   r=   r  r  r   r   r   r  r   r   r   r:   r)   r)   r6   r*   r   >  s    
G
:

	
r   c                 C   s   t tj| |dddS )Nr    r   )r   )zero_moduler   r9   )rP   rQ   r)   r)   r*   r`     s   r`   c                 C   s   |   D ]}tj| q| S r1   )r  r   initzeros_)r;  pr)   r)   r*   r     s   r  c                 C   sD   |}|| kr| S |dkr | | }|dkr|S |d8 }|dksd S d S )Nr   r    r)   )numberrM   factorresidualr)   r)   r*   ra   &  s   ra   )r>   Tr    r    r?   TFT)Nr>   r    r    r?   FTr1   )Bdataclassesr   mathr   typingr   rW  r   r   configuration_utilsr   r	   utilsr
   r   utils.torch_utilsr   	attentionr   attention_processorr   r   r   r   r   r   
embeddingsr   r   modeling_utilsr   unets.unet_2d_blocksr   r   r   r   r   r   r   unets.unet_2d_conditionr   r   r!   
get_loggerr$   r  r"   Moduler+   r;   r=   r]   r   r   re   rj   r   rn   ro   r   r   r   r   r`   r  ra   r)   r)   r)   r*   <module>   s    $	
	

^	
#
         )  
_