o
    ۷ibL                     @   s
  d dl mZ d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! e"e#Z$eG dd de Z%G dd deeeeeZ&G dd deZ'dS )    )	dataclass)AnyN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scalelogging   )AttentionMixinJointTransformerBlock)	AttentionFusedJointAttnProcessor2_0)"CombinedTimestepTextProjEmbeddings
PatchEmbed)Transformer2DModelOutput)
ModelMixin)SD3SingleTransformerBlock   )
BaseOutputzero_modulec                   @   s   e Zd ZU eej ed< dS )SD3ControlNetOutputcontrolnet_block_samplesN)__name__
__module____qualname__tupletorchTensor__annotations__ r!   r!   a/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/controlnets/controlnet_sd3.pyr   %   s   
 r   c                $       sF  e Zd ZdZdZe										
								d>dededededededededededededeedf dedB dedB d e	d!e	f" fd"d#Z
d?d$edB d%ed&dfd'd(Zd)d* Zd+d, Zd-d. Ze	d@d1d2Zed3	4					dAd5ejd6ejd7ed8ejd9ejd:ejd3eeef dB d;e	d&ejeB fd<d=Z  ZS )BSD3ControlNetModela]	  
    ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

    Parameters:
        sample_size (`int`, defaults to `128`):
            The width/height of the latents. This is fixed during training since it is used to learn a number of
            position embeddings.
        patch_size (`int`, defaults to `2`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `16`):
            The number of latent channels in the input.
        num_layers (`int`, defaults to `18`):
            The number of layers of transformer blocks to use.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        num_attention_heads (`int`, defaults to `18`):
            The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, defaults to `4096`):
            The embedding dimension to use for joint text-image attention.
        caption_projection_dim (`int`, defaults to `1152`):
            The embedding dimension of caption embeddings.
        pooled_projection_dim (`int`, defaults to `2048`):
            The embedding dimension of pooled text projections.
        out_channels (`int`, defaults to `16`):
            The number of latent channels in the output.
        pos_embed_max_size (`int`, defaults to `96`):
            The maximum latent height/width of positional embeddings.
        extra_conditioning_channels (`int`, defaults to `0`):
            The number of extra channels to use for conditioning for patch embedding.
        dual_attention_layers (`tuple[int, ...]`, defaults to `()`):
            The number of dual-stream transformer blocks to use.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
        pos_embed_type (`str`, defaults to `"sincos"`):
            The type of positional embedding to use. Choose between `"sincos"` and `None`.
        use_pos_embed (`bool`, defaults to `True`):
            Whether to use positional embeddings.
        force_zeros_for_pooled_projection (`bool`, defaults to `True`):
            Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
            config value of the ControlNet model.
    T   r         @           `   r   r!   Nsincossample_size
patch_sizein_channels
num_layersattention_head_dimnum_attention_headsjoint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizeextra_conditioning_channelsdual_attention_layers.qk_normpos_embed_typeuse_pos_embed!force_zeros_for_pooled_projectionc              	      s8  t    |}|
d ur|
n|_  _|r%t||||j||d_nd _tj|	d_|d urOt	||_
t fddt|D _nd _
t fddt|D _tg _ttjD ]}t	jj}t|}j| qpt||||| jd d}t|_d_d S )N)heightwidthr.   r/   	embed_dimr7   r;   )embedding_dimr5   c              
      s.   g | ]}t j d |v rdnd dqS )FT)dimr2   r1   context_pre_onlyr:   use_dual_attention)r   	inner_dim).0ir1   r9   r2   r:   selfr!   r"   
<listcomp>   s    	z/SD3ControlNetModel.__init__.<locals>.<listcomp>c                    s   g | ]
}t j d qS ))rB   r2   r1   )r   rE   )rF   _)r1   r2   rI   r!   r"   rJ      s    )r>   r?   r.   r/   r@   r;   F)super__init__r6   rE   r   	pos_embedr   time_text_embednnLinearcontext_embedder
ModuleListrangetransformer_blockscontrolnet_blockslenr   appendpos_embed_inputgradient_checkpointing)rI   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   default_out_channelsrK   controlnet_blockrY   	__class__rH   r"   rM   W   s\   



	

zSD3ControlNetModel.__init__
chunk_sizerB   returnc                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r   z-Make sure to set `dim` to either 0 or 1, not r   moduler_   rB   c                    s6   t | dr| j||d |  D ]} ||| qd S )Nset_chunk_feed_forward)r_   rB   )hasattrrb   children)ra   r_   rB   childfn_recursive_feed_forwardr!   r"   rg      s
   
zMSD3ControlNetModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)
ValueErrorr   rP   Moduleintrd   )rI   r_   rB   ra   r!   rf   r"   enable_forward_chunking   s   z*SD3ControlNetModel.enable_forward_chunkingc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        > [!WARNING] > This API is 🧪 experimental.
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsattn_processorsitemsstrr^   r   rh   modules
isinstancer   fuse_projectionsset_attn_processorr   )rI   rK   attn_processorra   r!   r!   r"   fuse_qkv_projections   s   
z'SD3ControlNetModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )un   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is 🧪 experimental.

        N)rn   ru   )rI   r!   r!   r"   unfuse_qkv_projections   s   
z)SD3ControlNetModel.unfuse_qkv_projectionsc                 C   sB   t |jj|jj|jj|jj|j|jjd}|j|j	 dd |S )N)r>   r?   r.   r/   r@   r7   Tstrict)
r   configr-   r.   r/   rE   r7   load_state_dictrN   
state_dict)rI   transformerrN   r!   r!   r"   _get_pos_embed_from_transformer   s   z2SD3ControlNetModel._get_pos_embed_from_transformer   r   c                 C   s   |j }|p|j|d< ||d< | |}|rA|j|j  |j|j  |j|j  |jj|j dd t	|j
|_
|S )Nr0   r8   Fry   )r{   r0   from_configrN   r|   r}   rO   rR   rU   r   rY   )clsr~   r0   num_extra_conditioning_channelsload_weights_from_transformerr{   
controlnetr!   r!   r"   from_transformer   s   
z#SD3ControlNetModel.from_transformerjoint_attention_kwargs      ?hidden_statescontrolnet_condconditioning_scaleencoder_hidden_statespooled_projectionstimestepreturn_dictc	                    s  | j dur|jdkrtd| j du r|jdkrtd| jdur)|du r)td| jdu r6|dur6td| j dur@|  |}| ||}	| jdurP| |}|| | }d}
| jD ]:}t r}| j	r}| jduru| 
||||	\}}n| 
|||	}n| jdur||||	d	\}}n|||	}|
|f }
q\d}t|
| jD ]\}}||}||f }q fd
d|D }|s|fS t|dS )a  
        The [`SD3Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            controlnet_cond (`torch.Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                The scale factor for ControlNet outputs.
            encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        N   z/hidden_states must be 4D when pos_embed is usedr   z3hidden_states must be 3D when pos_embed is not usedzDencoder_hidden_states must be provided when context_embedder is usedzNencoder_hidden_states should not be provided when context_embedder is not usedr!   )r   r   tembc                    s   g | ]}|  qS r!   r!   )rF   sampler   r!   r"   rJ   q  s    z.SD3ControlNetModel.forward.<locals>.<listcomp>)r   )rN   ndimrh   rR   rO   rY   rU   r   is_grad_enabledrZ   _gradient_checkpointing_funcziprV   r   )rI   r   r   r   r   r   r   r   r   r   block_res_samplesblockcontrolnet_block_res_samplesblock_res_sampler\   r!   r   r"   forward  sN   (









zSD3ControlNetModel.forward)r$   r   r%   r&   r'   r&   r(   r)   r*   r%   r+   r   r!   Nr,   TT)Nr   )r   r   T)r   NNNNT)r   r   r   __doc__ _supports_gradient_checkpointingr   rj   r   rq   boolrM   rk   rw   rx   r   classmethodr   r	   r   r   float
LongTensordictr   r   r   __classcell__r!   r!   r]   r"   r#   *   s    *	

Z	
r#   c                       sz   e Zd ZdZ fddZ				ddejdeej dee	 d	ejd
ejdej
deeef dB dedeeB fddZ  ZS )SD3MultiControlNetModela  
    `SD3ControlNetModel` wrapper class for Multi-SD3ControlNet

    This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
    compatible with `SD3ControlNetModel`.

    Args:
        controlnets (`list[SD3ControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `SD3ControlNetModel` as a list.
    c                    s   t    t|| _d S )N)rL   rM   rP   rS   nets)rI   controlnetsr]   r!   r"   rM     s   
z SD3MultiControlNetModel.__init__NTr   r   r   r   r   r   r   r   r`   c	                 C   sr   t t||| jD ]-\}	\}
}}||||||
|||d}|	dkr#|}q	dd t|d |d D }t|f}q	|S )N)r   r   r   r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r!   r!   )rF   control_block_sampleblock_sampler!   r!   r"   rJ     s    z3SD3MultiControlNetModel.forward.<locals>.<listcomp>)	enumerater   r   r   )rI   r   r   r   r   r   r   r   r   rG   imagescaler   block_samplescontrol_block_samplesr!   r!   r"   r     s$    zSD3MultiControlNetModel.forward)NNNT)r   r   r   r   rM   r   r   listtensorr   r   r   rq   r   r   r   r   r   r   r!   r!   r]   r"   r   y  s4    
	
r   )(dataclassesr   typingr   r   torch.nnrP   configuration_utilsr   r   loadersr   r   utilsr	   r
   	attentionr   r   attention_processorr   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   transformers.transformer_sd3r   r   r   r   
get_loggerr   loggerr   r#   r   r!   r!   r!   r"   <module>   s*   
  Q