o
    pi$F                     @   s  d dl mZ d dlmZmZmZmZmZmZ d dl	Z	d dl
mZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' e(e)Z*eG dd de#Z+G dd deeeeZ,G dd deZ-dS )    )	dataclass)AnyDictListOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)JointTransformerBlock)	AttentionAttentionProcessorFusedJointAttnProcessor2_0)Transformer2DModelOutput)
ModelMixin)USE_PEFT_BACKENDis_torch_versionloggingscale_lora_layersunscale_lora_layers   )
BaseOutputzero_module)"CombinedTimestepTextProjEmbeddings
PatchEmbedc                   @   s   e Zd ZU eej ed< dS )SD3ControlNetOutputcontrolnet_block_samplesN)__name__
__module____qualname__r   torchTensor__annotations__ r&   r&   ]/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/controlnet_sd3.pyr   $   s   
 r   c                       sB  e Zd ZdZe												
d9dededededededededededef fddZd:dee deddfddZe	de
eef fdd Zd!eee
eef f fd"d#Zd$d% Zd&d' Zd;d)d*Zed<d,d-Z	.					d=d/ejd0ejd1ed2ejd3ejd4ejd5ee
eef  d6edeejef fd7d8Z  ZS )>SD3ControlNetModelT   r	         @           `   sample_size
patch_sizein_channels
num_layersattention_head_dimnum_attention_headsjoint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizec                    s   t    |}|
d ur|
n|_ | _t||||j|d_tj|	d_t	||_
t fddt|D _tg _ttjD ]}t	jj}t|}j| qMt||||jd d}t|_d_d S )N)heightwidthr2   r3   	embed_dimr;   )embedding_dimr9   c                    s"   g | ]}t j jjd dqS )F)dimr6   r5   context_pre_only)r   	inner_dimconfigr5   ).0ir6   selfr&   r'   
<listcomp>P   s    z/SD3ControlNetModel.__init__.<locals>.<listcomp>)r<   r=   r2   r3   r>   pos_embed_typeF)super__init__r:   rB   r   	pos_embedr   time_text_embednnLinearcontext_embedder
ModuleListrangetransformer_blockscontrolnet_blockslenr   appendpos_embed_inputgradient_checkpointing)rG   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   default_out_channels_controlnet_blockrW   	__class__rF   r'   rK   ,   sH   



zSD3ControlNetModel.__init__Nr   
chunk_sizer@   returnc                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r   z-Make sure to set `dim` to either 0 or 1, not r   moduler^   r@   c                    s6   t | dr| j||d |  D ]} ||| qd S )Nset_chunk_feed_forward)r^   r@   )hasattrra   children)r`   r^   r@   childfn_recursive_feed_forwardr&   r'   rf      s
   
zMSD3ControlNetModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)
ValueErrorr#   rN   Moduleintrc   )rG   r^   r@   r`   r&   re   r'   enable_forward_chunkingn   s   z*SD3ControlNetModel.enable_forward_chunkingc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namer`   
processorsc                    sH   t |dr| ||  d< | D ]\}} |  d| || q|S )Nget_processor
.processor.)rb   rm   named_children)rk   r`   rl   sub_namerd   fn_recursive_add_processorsr&   r'   rs      s
   
zGSD3ControlNetModel.attn_processors.<locals>.fn_recursive_add_processors)strr#   rN   rh   r   r   rp   )rG   rl   rk   r`   r&   rr   r'   attn_processors   s
   	&	z"SD3ControlNetModel.attn_processors	processorc                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.rk   r`   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorrn   ro   )rb   
isinstancedictrw   poprp   )rk   r`   rv   rq   rd   fn_recursive_attn_processorr&   r'   r|      s   

zJSD3ControlNetModel.set_attn_processor.<locals>.fn_recursive_attn_processorN)rU   ru   keysrx   ry   rg   rt   r#   rN   rh   rp   )rG   rv   countrk   r`   r&   r{   r'   set_attn_processor   s   
z%SD3ControlNetModel.set_attn_processorc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u1  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsru   itemsrt   r]   r    rg   modulesrx   r   fuse_projectionsr   r   )rG   rZ   attn_processorr`   r&   r&   r'   fuse_qkv_projections   s   
z'SD3ControlNetModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )u   Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        N)r   r   )rG   r&   r&   r'   unfuse_qkv_projections   s   

z)SD3ControlNetModel.unfuse_qkv_projectionsFc                 C   s   t |dr
||_d S d S )NrX   )rb   rX   )rG   r`   valuer&   r&   r'   _set_gradient_checkpointing   s   

z.SD3ControlNetModel._set_gradient_checkpointing   c                 C   s   |j }|p|j|d< | di |}|r?|j|j  |j|j  |j|j  |jj|j dd t|j	|_	|S )Nr4   F)strictr&   )
rC   r4   rL   load_state_dict
state_dictrM   rP   rS   r   rW   )clstransformerr4   load_weights_from_transformerrC   
controlnetr&   r&   r'   from_transformer   s   z#SD3ControlNetModel.from_transformer      ?hidden_statescontrolnet_condconditioning_scaleencoder_hidden_statespooled_projectionstimestepjoint_attention_kwargsreturn_dictc	                    s\  |dur|  }|dd}	nd}	trt| |	 n|dur*|dddur*td | |}| ||}
| 	|}|| 
| }d}| jD ]7}| jro| jroddd}tdd	r\d
dini }tjjj|||||
fi |}n	||||
d\}}||f }qFd}t|| jD ]\}}||}||f }q fdd|D }trt| |	 |s|fS t|dS )a  
        The [`SD3Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            controlnet_cond (`torch.Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                The scale factor for ControlNet outputs.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        Nscaler   z\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.r&   c                    s    fdd}|S )Nc                     s   d ur | diS  |  S )Nr   r&   )inputsr`   r   r&   r'   custom_forwardF  s   zQSD3ControlNetModel.forward.<locals>.create_custom_forward.<locals>.custom_forwardr&   )r`   r   r   r&   r   r'   create_custom_forwardE  s   z9SD3ControlNetModel.forward.<locals>.create_custom_forwardz>=z1.11.0use_reentrantF)r   r   tembc                    s   g | ]}|  qS r&   r&   )rD   sampler   r&   r'   rH   d  s    z.SD3ControlNetModel.forward.<locals>.<listcomp>)r   N)copyrz   r   r   getloggerwarningrL   rM   rP   rW   rS   trainingrX   r   r#   utils
checkpointziprT   r   r   )rG   r   r   r   r   r   r   r   r   
lora_scaler   block_res_samplesblockr   ckpt_kwargscontrolnet_block_res_samplesblock_res_sampler[   r&   r   r'   forward  sR   '



		


zSD3ControlNetModel.forward)r)   r	   r*   r+   r,   r+   r-   r.   r/   r*   r0   )Nr   )F)r   T)r   NNNNT)r    r!   r"    _supports_gradient_checkpointingr   ri   rK   r   rj   propertyr   rt   r   ru   r   r   r   r   r   classmethodr   r#   FloatTensorr$   float
LongTensorr   boolr   r   __classcell__r&   r&   r\   r'   r(   )   s    	
A#
	
r(   c                       s~   e Zd ZdZ fddZ				ddejdeej dee	 d	ejd
ejdej
deeeef  dedeeef fddZ  ZS )SD3MultiControlNetModela  
    `SD3ControlNetModel` wrapper class for Multi-SD3ControlNet

    This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
    compatible with `SD3ControlNetModel`.

    Args:
        controlnets (`List[SD3ControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `SD3ControlNetModel` as a list.
    c                    s   t    t|| _d S r   )rJ   rK   rN   rQ   nets)rG   controlnetsr\   r&   r'   rK   }  s   
z SD3MultiControlNetModel.__init__NTr   r   r   r   r   r   r   r   r_   c	                 C   sr   t t||| jD ]-\}	\}
}}||||||
|||d}|	dkr#|}q	dd t|d |d D }t|f}q	|S )N)r   r   r   r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r&   r&   )rD   control_block_sampleblock_sampler&   r&   r'   rH     s    z3SD3MultiControlNetModel.forward.<locals>.<listcomp>)	enumerater   r   tuple)rG   r   r   r   r   r   r   r   r   rE   imager   r   block_samplescontrol_block_samplesr&   r&   r'   r     s$    zSD3MultiControlNetModel.forward)NNNT)r    r!   r"   __doc__rK   r#   r   r   tensorr   r   r   r   rt   r   r   r   r   r   r   r   r&   r&   r\   r'   r   p  s4    
	

r   ).dataclassesr   typingr   r   r   r   r   r   r#   torch.nnrN   configuration_utilsr
   r   loadersr   r   models.attentionr   models.attention_processorr   r   r   models.modeling_outputsr   models.modeling_utilsr   r   r   r   r   r   r   r   r   r   
embeddingsr   r   
get_loggerr    r   r   r(   r   r&   r&   r&   r'   <module>   s(    
  I