o
    piB                     @   s   d dl mZ d dlmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZ ddlmZ ddlmZ eeZeG dd deZG dd dee	ZG dd deZ dS )    )	dataclass)DictOptionalUnionN)nn   )ConfigMixinregister_to_config)logging   )AttentionProcessor)
BaseOutputTuplezero_module)-HunyuanCombinedTimestepTextSizeStyleEmbedding
PatchEmbedPixArtAlphaTextProjection)
ModelMixin)HunyuanDiTBlockc                   @   s   e Zd ZU eej ed< dS )HunyuanControlNetOutputcontrolnet_block_samplesN)__name__
__module____qualname__r   torchTensor__annotations__ r   r   a/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/models/controlnet_hunyuan.pyr   $   s   
 r   c                       s   e Zd Ze												
		
			d,dedededee dee dededededededededef fddZ	e
deeef fd d!Zd"eeeeef f fd#d$Ze	d-d%d&Z	'								d.d(ejd)efd*d+Z  ZS )/HunyuanDiT2DControlNetModel      X   Ngelu-approximate      (         @      M      Tconditioning_channelsnum_attention_headsattention_head_dimin_channels
patch_sizeactivation_fntransformer_num_layers	mlp_ratiocross_attention_dimcross_attention_dim_t5pooled_projection_dimtext_lentext_len_t5"use_style_cond_and_image_meta_sizec                    s   t    |_|| _t||d dd_ttj	|| tj
d_t|||||d d_t|||||d_tg _t fddt|	d	 d
 D _tt||_ttjD ]}t||}t|}j| qhd S )N   	silu_fp32)in_featureshidden_sizeout_featuresact_fn)dtype)heightwidthr/   	embed_dimr0   pos_embed_type)r6   seq_lenr4   r9   c                    s2   g | ]}t jjj tj d ddqS )TF)dimr-   r1   ff_inner_dimr4   qk_normskip)r   	inner_dimconfigr-   int).0layerr1   r4   r3   selfr   r   
<listcomp>c   s    
z8HunyuanDiT2DControlNetModel.__init__.<locals>.<listcomp>r   r   )super__init__	num_headsrJ   r   text_embedderr   	Parameterr   randnfloat32text_embedding_paddingr   	pos_embedr   time_extra_emb
ModuleListcontrolnet_blocksrangeblocksr   Linearinput_blocklenappend)rP   r,   r-   r.   r/   r0   r1   sample_sizer=   r2   r3   r4   r5   r6   r7   r8   r9   _controlnet_block	__class__rO   r   rS   *   sN   

		
z$HunyuanDiT2DControlNetModel.__init__returnc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                    sL   t |dr|jdd||  d< | D ]\}} |  d| || q|S )Nget_processorT)return_deprecated_lora
.processor.)hasattrrm   named_children)rj   rk   rl   sub_namechildfn_recursive_add_processorsr   r   rv      s
   
zPHunyuanDiT2DControlNetModel.attn_processors.<locals>.fn_recursive_add_processors)strr   r   Moduler   r   rr   )rP   rl   rj   rk   r   ru   r   attn_processorsv   s
   &	z+HunyuanDiT2DControlNetModel.attn_processors	processorc                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a2  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers. If `processor` is a dict, the key needs to define the path to the
                corresponding cross attention processor. This is strongly recommended when setting trainable attention
                processors.
        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.rj   rk   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorro   rp   )rq   
isinstancedictr{   poprr   )rj   rk   rz   rs   rt   fn_recursive_attn_processorr   r   r      s   

zSHunyuanDiT2DControlNetModel.set_attn_processor.<locals>.fn_recursive_attn_processorN)rb   ry   keysr|   r}   
ValueErrorrw   r   r   rx   rr   )rP   rz   countrj   rk   r   r   r   set_attn_processor   s   
z.HunyuanDiT2DControlNetModel.set_attn_processorc                 C   s   |j }|j}|j}|j}|j}	|j}
|j}|j}|j}|j	}|j
}|j}|j}|}|p-|j}| ||||||	|
|||||||d}|rU|j| dd}td|d   |S )N)r,   r2   r1   r.   r4   r5   r=   r/   r3   r-   r0   rd   r7   r8   F)strictz0controlnet load from Hunyuan-DiT. missing_keys: r   )rK   r1   r.   r4   r5   r=   r/   r3   r-   r0   rd   r7   r8   r2   load_state_dict
state_dictloggerwarning)clstransformerr,   r2   load_weights_from_transformerrK   r1   r.   r4   r5   r=   r/   r3   r-   r0   rd   r7   r8   
controlnetkeyr   r   r   from_transformer   sF   
z,HunyuanDiT2DControlNetModel.from_transformer      ?controlnet_condconditioning_scalec                    s>  |j dd \}}| |}|| | | }| j|||	|
|jd}|j \}}}| |d|j d }|||d}tj||gdd}tj||gdd}|	d
 }t||| j}d}t| jD ]\}}|||||d	}||f }qdd}t|| jD ]\}}||}||f }q~ fd
d|D }|s|fS t|dS )  
        The [`HunyuanDiT2DControlNetModel`] forward method.

        Args:
        hidden_states (`torch.Tensor` of shape `(batch size, dim, height, width)`):
            The input tensor.
        timestep ( `torch.LongTensor`, *optional*):
            Used to indicate denoising step.
        controlnet_cond ( `torch.Tensor` ):
            The conditioning input to ControlNet.
        conditioning_scale ( `float` ):
            Indicate the conditioning scale.
        encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
            Conditional embeddings for cross attention layer. This is the output of `BertModel`.
        text_embedding_mask: torch.Tensor
            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
            of `BertModel`.
        encoder_hidden_states_t5 ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
            Conditional embeddings for cross attention layer. This is the output of T5 Text Encoder.
        text_embedding_mask_t5: torch.Tensor
            An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. This is the output
            of T5 Text Encoder.
        image_meta_size (torch.Tensor):
            Conditional embedding indicate the image sizes
        style: torch.Tensor:
            Conditional embedding indicate the style
        image_rotary_emb (`torch.Tensor`):
            The image rotary embeddings to apply on query and key tensors during attention calculation.
        return_dict: bool
            Whether to return a dictionary.
        N)hidden_dtyper   )rF   r   r   )tembencoder_hidden_statesimage_rotary_embc                    s   g | ]}|  qS r   r   )rM   sampler   r   r   rQ   0  s    z7HunyuanDiT2DControlNetModel.forward.<locals>.<listcomp>)r   )shaperZ   ra   r[   r@   rU   viewr   cat	unsqueezeboolwhererY   	enumerater_   zipr]   r   )rP   hidden_statestimestepr   r   r   text_embedding_maskencoder_hidden_states_t5text_embedding_mask_t5image_meta_sizestyler   return_dictrA   rB   r   
batch_sizesequence_lengthre   block_res_samplesrN   blockcontrolnet_block_res_samplesblock_res_samplerf   r   r   r   forward   s@   /

z#HunyuanDiT2DControlNetModel.forward)r    r!   r"   NNr#   r$   r%   r&   r'   r(   r)   r(   r*   r+   T)r    NT	r   NNNNNNNT)r   r   r   r	   rL   r   rw   floatr   rS   propertyr   r   ry   r   r   classmethodr   r   r   r   __classcell__r   r   rg   r   r   )   s    
K .r   c                       sH   e Zd ZdZ fddZ									ddejdefd	d
Z  Z	S ) HunyuanDiT2DMultiControlNetModela  
    `HunyuanDiT2DMultiControlNetModel` wrapper class for Multi-HunyuanDiT2DControlNetModel

    This module is a wrapper for multiple instances of the `HunyuanDiT2DControlNetModel`. The `forward()` API is
    designed to be compatible with `HunyuanDiT2DControlNetModel`.

    Args:
        controlnets (`List[HunyuanDiT2DControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `HunyuanDiT2DControlNetModel` as a list.
    c                    s   t    t|| _d S )N)rR   rS   r   r\   nets)rP   controlnetsrg   r   r   rS   E  s   
z)HunyuanDiT2DMultiControlNetModel.__init__r   NTr   r   c                 C   sv   t t||| jD ]/\}\}}}||||||||||	|
||d}|dkr'|}q	dd t|d |d D }|f}q	|S )r   )r   r   r   r   r   r   r   r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r   r   )rM   control_block_sampleblock_sampler   r   r   rQ     s    z<HunyuanDiT2DMultiControlNetModel.forward.<locals>.<listcomp>)r   r   r   )rP   r   r   r   r   r   r   r   r   r   r   r   r   iimagescaler   block_samplescontrol_block_samplesr   r   r   r   I  s,    .z(HunyuanDiT2DMultiControlNetModel.forwardr   )
r   r   r   __doc__rS   r   r   r   r   r   r   r   rg   r   r   8  s"    	r   )!dataclassesr   typingr   r   r   r   r   configuration_utilsr   r	   utilsr
   attention_processorr   r   r   r   r   
embeddingsr   r   r   modeling_utilsr   #transformers.hunyuan_transformer_2dr   
get_loggerr   r   r   r   r   r   r   r   r   <module>   s$   
  