o
    Gi                  	   @   s  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ d dl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ ddlmZ d	d
lmZmZ d	dlmZ d	dlmZ d	dlmZ d	dlmZm Z m!Z!m"Z" d	dl#m$Z$ d	dl%m&Z& d	dl'm(Z(m)Z)m*Z* e+e,Z-G dd dZ.G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3eG dd dej/Z4G dd dej/Z5G d d! d!ej/Z6G d"d# d#ej/Z7eG d$d% d%ej/Z8eG d&d' d'ej/Z9G d(d) d)e&eeee
eZ:dS )*    N)Any)FromOriginalModelMixin   )ConfigMixinregister_to_config)PeftAdapterMixin)apply_lora_scalelogging)maybe_allow_in_graph   )AttentionMixinFeedForward)dispatch_attention_fn)	Attention)
CacheMixin)"CombinedTimestepTextProjEmbeddingsTimestepEmbedding	Timestepsget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousAdaLayerNormZeroAdaLayerNormZeroSinglec                   @   s\   e Zd ZdZdZdd Z			ddedejdejdB dejdB dejdB d	ejfd
dZ	dS )HunyuanImageAttnProcessorNc                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzYHunyuanImageAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.)hasattrFImportError)self r    j/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_hunyuanimage.py__init__2   s
   
z"HunyuanImageAttnProcessor.__init__attnhidden_statesencoder_hidden_statesattention_maskimage_rotary_embreturnc              
   C   s  |j d u r|d urtj||gdd}||}||}||}|d|jdf}|d|jdf}|d|jdf}|jd urF||}|j	d urP|	|}|d urddl
m}	 |j d u r|d urtj|	|d d d |jd  f |dd|d d |jd  d f gdd}tj|	|d d d |jd  f |dd|d d |jd  d f gdd}n|	||dd}|	||dd}|j d ur#|d ur#| |}
||}||}|
d|jdf}
|d|jdf}|d|jdf}|jd ur||
}
|jd ur||}tj||
gdd}tj||gdd}tj||gdd}t||||dd| j| jd	}|dd
}||j}|d ur|d d d |jd  f |d d |jd  d f }}t|dd d urv|jd |}|jd |}t|dd d ur||}||fS )N   dimr   )apply_rotary_emb)sequence_dim        F)	attn_mask	dropout_p	is_causalbackendparallel_configr   to_outr   
to_add_out)
add_q_projtorchcatto_qto_kto_v	unflattenheadsnorm_qnorm_k
embeddingsr-   shape
add_k_proj
add_v_projnorm_added_qnorm_added_kr   _attention_backend_parallel_configflattentodtypegetattrr5   r6   )r   r#   r$   r%   r&   r'   querykeyvaluer-   encoder_queryencoder_keyencoder_valuer    r    r!   __call__8   s   






	$








z"HunyuanImageAttnProcessor.__call__)NNN)
__name__
__module____qualname__rG   rH   r"   r   r8   TensorrS   r    r    r    r!   r   .   s(    
r   c                	       s`   e Zd Z			ddeeeeeeef f dededdf fd	d
ZdejdejfddZ  Z	S )HunyuanImagePatchEmbed   rZ   r      
patch_sizein_chans	embed_dimr(   Nc                    sj   t    || _t|dkrtj||||d| _d S t|dkr,tj||||d| _d S tdt| )Nr   )kernel_sizestrider   1patch_size must be a tuple of length 2 or 3, got )	superr"   r\   lennnConv2dprojConv3d
ValueError)r   r\   r]   r^   	__class__r    r!   r"      s   
zHunyuanImagePatchEmbed.__init__r$   c                 C   s    |  |}|ddd}|S )Nr   r)   )rf   rI   	transpose)r   r$   r    r    r!   forward   s   
zHunyuanImagePatchEmbed.forward)rY   r   r[   )
rT   rU   rV   tupleintr"   r8   rW   rl   __classcell__r    r    ri   r!   rX      s    rX   c                       s@   e Zd Zdededef fddZdejdejfdd	Z  ZS )
HunyuanImageByT5TextProjectionin_featureshidden_sizeout_featuresc                    sN   t    t|| _t||| _t||| _t||| _t	 | _
d S N)rb   r"   rd   	LayerNormnormLinearlinear_1linear_2linear_3GELUact_fn)r   rq   rr   rs   ri   r    r!   r"      s   
z'HunyuanImageByT5TextProjection.__init__r%   r(   c                 C   s@   |  |}| |}| |}| |}| |}| |}|S rt   )rv   rx   r|   ry   rz   )r   r%   r$   r    r    r!   rl      s   





z&HunyuanImageByT5TextProjection.forward)	rT   rU   rV   rn   r"   r8   rW   rl   ro   r    r    ri   r!   rp      s    rp   c                	       s\   e Zd Zd
dededB ddf fddZdejdeejejejejejf fdd	Z  Z	S )HunyuanImageAdaNormNrq   rs   r(   c                    s2   t    |p
d| }t||| _t | _d S )Nr   )rb   r"   rd   rw   linearSiLUnonlinearity)r   rq   rs   ri   r    r!   r"      s   
zHunyuanImageAdaNorm.__init__tembc                 C   s@   |  | |}|jddd\}}|d|d}}||fS )Nr   r)   r*   )r~   r   chunk	unsqueeze)r   r   gate_msagate_mlpr    r    r!   rl      s   zHunyuanImageAdaNorm.forwardrt   )
rT   rU   rV   rn   r"   r8   rW   rm   rl   ro   r    r    ri   r!   r}      s     r}   c                       sj   e Zd Z		ddededef fddZ		ddejd	ejdB d
ejdB deejejf fddZ	  Z
S ))HunyuanImageCombinedTimeGuidanceEmbeddingFembedding_dimguidance_embedsuse_meanflowc                    s|   t    tdddd| _td|d| _|| _d | _d | _|r.tdddd| _td|d| _d | _	|r<td|d| _	d S d S )N   Tr   )num_channelsflip_sin_to_cosdownscale_freq_shift)in_channelstime_embed_dim)
rb   r"   r   	time_projr   timestep_embedderr   time_proj_rtimestep_embedder_rguidance_embedder)r   r   r   r   ri   r    r!   r"      s   
z2HunyuanImageCombinedTimeGuidanceEmbedding.__init__Ntimestep
timestep_rguidancer(   c                 C   s   |  |}| |j|jd}|d ur(| |}| |j|jd}|| d }| jd urB|  |}| |j|jd}	||	 }
|
S |}
|
S )N)rK   r   )r   r   rJ   rK   r   r   r   )r   r   r   r   timesteps_projtimesteps_embtimesteps_proj_rtimesteps_emb_rguidance_projguidance_embconditioningr    r    r!   rl      s   



z1HunyuanImageCombinedTimeGuidanceEmbedding.forward)FFNN)rT   rU   rV   rn   boolr"   r8   rW   rm   rl   ro   r    r    ri   r!   r      s*    r   c                       sh   e Zd Z			ddededededed	d
f fddZ	
ddej	dej	dej	d
B d	ej	fddZ
  ZS )'HunyuanImageIndividualTokenRefinerBlock      @r/   Tnum_attention_headsattention_head_dimmlp_width_ratiomlp_drop_rateattention_biasr(   Nc                    sp   t    || }tj|ddd| _t|d |||d| _tj|ddd| _t||d|d| _	t
|d| | _d S )NTư>elementwise_affineeps)	query_dimcross_attention_dimr>   dim_headbiaszlinear-silu)multactivation_fndropoutr   )rb   r"   rd   ru   norm1r   r#   norm2r   ffr}   norm_out)r   r   r   r   r   r   rr   ri   r    r!   r"     s   
z0HunyuanImageIndividualTokenRefinerBlock.__init__r$   r   r&   c           	      C   sT   |  |}| j|d |d}| |\}}|||  }| | |}|||  }|S )N)r$   r%   r&   )r   r#   r   r   r   )	r   r$   r   r&   norm_hidden_statesattn_outputr   r   	ff_outputr    r    r!   rl   &  s   
z/HunyuanImageIndividualTokenRefinerBlock.forwardr   r/   Trt   )rT   rU   rV   rn   strfloatr   r"   r8   rW   rl   ro   r    r    ri   r!   r   
  s6    r   c                       sj   e Zd Z			ddededededed	ed
df fddZ	ddejdejdejdB d
dfddZ	  Z
S )"HunyuanImageIndividualTokenRefinerr   r/   Tr   r   
num_layersr   r   r   r(   Nc                    s4   t    t fddt|D | _d S )Nc              	      s   g | ]}t  d qS ))r   r   r   r   r   )r   .0_r   r   r   r   r   r    r!   
<listcomp>J  s    z?HunyuanImageIndividualTokenRefiner.__init__.<locals>.<listcomp>)rb   r"   rd   
ModuleListrangerefiner_blocks)r   r   r   r   r   r   r   ri   r   r!   r"   >  s   
	
z+HunyuanImageIndividualTokenRefiner.__init__r$   r   r&   c           
      C   s   d }|d ur>|j d }|j d }||j}||dd|dd|d}|dd}||@  }d|d d d d d d df< | jD ]}	|	|||}qA|S )Nr   r)   r   r   T)rB   rJ   deviceviewrepeatrk   r   r   )
r   r$   r   r&   self_attn_mask
batch_sizeseq_lenself_attn_mask_1self_attn_mask_2blockr    r    r!   rl   V  s   


z*HunyuanImageIndividualTokenRefiner.forwardr   rt   )rT   rU   rV   rn   r   r   r"   r8   rW   rl   ro   r    r    ri   r!   r   =  s:    r   c                       sp   e Zd Z			ddededededed	ed
eddf fddZ	ddejdej	dej	dB dejfddZ
  ZS )HunyuanImageTokenRefinerr   r/   Tr   r   r   r   	mlp_ratior   r   r(   Nc           	         sL   t    || }t||d| _tj||dd| _t||||||d| _d S )N)r   pooled_projection_dimT)r   )r   r   r   r   r   r   )	rb   r"   r   time_text_embedrd   rw   proj_inr   token_refiner)	r   r   r   r   r   r   r   r   rr   ri   r    r!   r"   n  s   

z!HunyuanImageTokenRefiner.__init__r$   r   r&   c                 C   sx   |d u r|j dd}n|j}| d}|| jdd|jdd }||}| ||}| |}| |||}|S )Nr)   r*   r,   )	meanrK   r   r   sumrJ   r   r   r   )r   r$   r   r&   pooled_hidden_statesoriginal_dtype
mask_floatr   r    r    r!   rl     s   

z HunyuanImageTokenRefiner.forwardr   rt   )rT   rU   rV   rn   r   r   r"   r8   rW   
LongTensorrl   ro   r    r    ri   r!   r   m  s>    	r   c                	       sV   e Zd Zddeee B deee B deddf fddZd	ej	dej	fd
dZ
  ZS )HunyuanImageRotaryPosEmbed      p@r\   rope_dimthetar(   Nc                    s   t    t|ttfrt|dvrtd| t|ttfr&t|dvr-td| t|t|ks?td| d| || _|| _|| _	d S )Nr   r   z9patch_size must be a tuple or list of length 2 or 3, got z7rope_dim must be a tuple or list of length 2 or 3, got z7patch_size and rope_dim must have the same length, got z and )
rb   r"   
isinstancerm   listrc   rh   r\   r   r   )r   r\   r   r   ri   r    r!   r"     s   

z#HunyuanImageRotaryPosEmbed.__init__r$   c                 C   sT  |j dkr|j\}}}}}| j\}}}|| || || g}	n"|j dkr9|j\}}}}| j\}}|| || g}	ntd|j g }
tt|	D ]}tjd|	| |jtj	d}|

| qItj|
ddi}tj|dd}g }tt|	D ]}t| j| || d	| jd
d}|
| qutjdd |D dd}tjdd |D dd}||fS )N      -hidden_states must be a 4D or 5D tensor, got r   )r   rK   indexingijr*   r,   T)use_realc                 S      g | ]}|d  qS )r   r    r   fr    r    r!   r         z6HunyuanImageRotaryPosEmbed.forward.<locals>.<listcomp>r)   c                 S   r   )r)   r    r   r    r    r!   r     r   )ndimrB   r\   rh   r   rc   r8   aranger   float32appendmeshgridstackr   r   reshaper   r9   )r   r$   r   frameheightwidthpatch_size_framepatch_size_heightpatch_size_width
rope_sizes
axes_gridsigridfreqsfreq	freqs_cos	freqs_sinr    r    r!   rl     s,   


"z"HunyuanImageRotaryPosEmbed.forward)r   )rT   rU   rV   rm   r   rn   r   r"   r8   rW   rl   ro   r    r    ri   r!   r     s    0r   c                       s~   e Zd Z		ddededededdf
 fd	d
Z		ddejdejdejdejdB de	ejejf dB dejfddZ
  ZS )"HunyuanImageSingleTransformerBlockr   rms_normr   r   r   qk_normr(   Nc                    s~   t    || }t|| }t|d |||dt |ddd
| _t|dd| _t	||| _
tjdd| _t	|| || _d S )NTr   )
r   r   r   r>   out_dimr   	processorr  r   pre_only
layer_norm	norm_typetanh)approximate)rb   r"   rn   r   r   r#   r   rv   rd   rw   proj_mlpr{   act_mlpproj_out)r   r   r   r   r  rr   mlp_dimri   r    r!   r"     s&   
z+HunyuanImageSingleTransformerBlock.__init__r$   r%   r   r&   r'   c                 O   s  |j d }tj||gdd}|}	| j||d\}
}| | |
}|
d d d | d d f |
d d | d d d f }
}| j|
|||d\}}tj||gdd}tj||gdd}|d| | }||	 }|d d d | d d f |d d | d d d f }}||fS )Nr)   r*   embr$   r%   r&   r'   r   )	rB   r8   r9   rv   r  r  r#   r   r  )r   r$   r%   r   r&   r'   argskwargstext_seq_lengthresidualr   gatemlp_hidden_statesnorm_encoder_hidden_statesr   context_attn_outputr    r    r!   rl     s,   


z*HunyuanImageSingleTransformerBlock.forward)r   r  r   rT   rU   rV   rn   r   r   r"   r8   rW   rm   rl   ro   r    r    ri   r!   r    s:    #	r  c                       s   e Zd Z	ddededededdf
 fdd	Z		dd
ejdejdejdejdB de	ejejf dB de	ejejf fddZ
  ZS )HunyuanImageTransformerBlockr  r   r   r   r  r(   Nc                    s   t    || }t|dd| _t|dd| _t|d ||||ddt |dd| _tj	|ddd| _
t||dd	| _tj	|ddd| _t||dd	| _d S )
Nr  r	  FTr   )r   r   added_kv_proj_dimr   r>   r  context_pre_onlyr   r  r  r   r   zgelu-approximate)r   r   )rb   r"   r   r   norm1_contextr   r   r#   rd   ru   r   r   r   norm2_context
ff_context)r   r   r   r   r  rr   ri   r    r!   r"     s*   
z%HunyuanImageTransformerBlock.__init__r$   r%   r   r&   r'   c                 O   s  | j ||d\}}	}
}}| j||d\}}}}}| j||||d\}}|||	d  }|||d  }| |}| |}|d|d d d f   |
d d d f  }|d|d d d f   |d d d f  }| |}| |}||d|  }||d|  }||fS )Nr  r  r)   )r   r   r#   r   r   r!  r   r"  )r   r$   r%   r   r&   r'   r  r  r   r   	shift_mlp	scale_mlpr   r  
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpr   r  r   context_ff_outputr    r    r!   rl   =  s*   


((

z$HunyuanImageTransformerBlock.forward)r  r   r  r    r    ri   r!   r    s8    '	r  c                $       sD  e Zd ZdZdZg dZg dZddgZe						
											d5de	de	de	de	de	de	de	de
dee	e	f deded e	d!e	dB d"e
d#ee	d$f d%ed&df" fd'd(Zed)						d6d*ejd+ejd,ejd-ejd.ejdB d/ejdB d0ejdB d1ejdB d)eeef dB d2ed&ejeeejf B fd3d4Z  ZS )7HunyuanImageTransformer2DModela^	  
    The Transformer model used in [HunyuanImage-2.1](https://github.com/Tencent-Hunyuan/HunyuanImage-2.1).

    Args:
        in_channels (`int`, defaults to `16`):
            The number of channels in the input.
        out_channels (`int`, defaults to `16`):
            The number of channels in the output.
        num_attention_heads (`int`, defaults to `24`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        num_layers (`int`, defaults to `20`):
            The number of layers of dual-stream blocks to use.
        num_single_layers (`int`, defaults to `40`):
            The number of layers of single-stream blocks to use.
        num_refiner_layers (`int`, defaults to `2`):
            The number of layers of refiner blocks to use.
        mlp_ratio (`float`, defaults to `4.0`):
            The ratio of the hidden layer size to the input size in the feedforward network.
        patch_size (`int`, defaults to `2`):
            The size of the spatial patches to use in the patch embedding layer.
        patch_size_t (`int`, defaults to `1`):
            The size of the tmeporal patches to use in the patch embedding layer.
        qk_norm (`str`, defaults to `rms_norm`):
            The normalization to use for the query and key projections in the attention layers.
        guidance_embeds (`bool`, defaults to `True`):
            Whether to use guidance embeddings in the model.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        pooled_projection_dim (`int`, defaults to `768`):
            The dimension of the pooled projection of the text embeddings.
        rope_theta (`float`, defaults to `256.0`):
            The value of theta to use in the RoPE layer.
        rope_axes_dim (`tuple[int]`, defaults to `(16, 56, 56)`):
            The dimensions of the axes to use in the RoPE layer.
        image_condition_type (`str`, *optional*, defaults to `None`):
            The type of image conditioning to use. If `None`, no image conditioning is used. If `latent_concat`, the
            image is concatenated to the latent stream. If `token_replace`, the image is used to replace first-frame
            tokens in the latent stream and apply conditioning.
    T)
x_embeddercontext_embedderrv   )r  r  rX   r   r  r  @            (   r   r   r)   r)   r  F   Nr   r-  r-  r   out_channelsr   r   r   num_single_layersnum_refiner_layersr   r\   r  r   text_embed_dimtext_embed_2_dim
rope_thetarope_axes_dim.r   r(   c                    s  t    t|	ttfrt|	dv std|	   }|p |}t|	||| _t	| |d| _
|d ur=t|d|| _nd | _t|||| _t|	||| _t fddt|D | _t fddt|D | _t||dd	d
| _t|t|	| | _d| _d S )Nr   ra   )r   i   c                       g | ]
}t  d qS )r   r  )r  r   r   r   r   r  r    r!   r         z;HunyuanImageTransformer2DModel.__init__.<locals>.<listcomp>c                    r<  r=  )r  r   r>  r    r!   r     r?  Fr   r   )rb   r"   r   rm   r   rc   rh   rX   r+  r   r,  rp   context_embedder_2r   time_guidance_embedr   roperd   r   r   transformer_blockssingle_transformer_blocksr   r   rw   mathprodr  gradient_checkpointing)r   r   r5  r   r   r   r6  r7  r   r\   r  r   r8  r9  r:  r;  r   	inner_dimri   r>  r!   r"     s6   



z'HunyuanImageTransformer2DModel.__init__attention_kwargsr$   r   r%   encoder_attention_maskr   encoder_hidden_states_2encoder_attention_mask_2r   return_dictc           "   	   C   s
  |j dkr|j\}}}}||f}n|j dkr$|j\}}}}}|||f}ntd|j tdd t|| jjD }| |}| }| j	|||d}| 
|}| |||}| jd ur|d ur| |}| }g }g }t||||D ]6\}}}}|tj|| || ||  ||  gdd |tj|| || ||  ||  gdd qtt|}t|}tjjj||jd	 dfd
d}|d	d}t r| jr| jD ]}| j||||||d\}}q| jD ]}| j||||||d\}}qn#| jD ]}||||||d\}}q| jD ]}||||||d\}}q| ||}| |}| jj}|gt| |g t| jj }|j| }t|}d|d	 g}t |D ]} |!| d	 |d |  g qO|j"| }||gdd t|| jjD  }!|j|! }|
s|fS t#|dS )Nr   r   r   c                 s   s    | ]	\}}|| V  qd S rt   r    )r   dpr    r    r!   	<genexpr>  s    z9HunyuanImageTransformer2DModel.forward.<locals>.<genexpr>)r   r   r   r*   r)   T)rO   r   )r&   r'   c                 S   s   g | ]\}}|| qS r    r    )r   
post_patchpatchr    r    r!   r   r  s    z:HunyuanImageTransformer2DModel.forward.<locals>.<listcomp>)sample)$r   rB   rh   rm   zipconfigr\   rB  r   rA  r+  r,  r@  r   r8   r9   r   rd   
functionalpadr   is_grad_enabledrG  rC  _gradient_checkpointing_funcrD  r   r  r5  r   r   rc   r   extendpermuter   )"r   r$   r   r%   rJ  r   rK  rL  r   rI  rM  r   channelsr   r   sizesr   post_patch_sizesr'   r   new_encoder_hidden_statesnew_encoder_attention_masktext	text_masktext_2text_mask_2r&   r   r5  reshape_dimsr   permute_patternr   
final_dimsr    r    r!   rl     s   









	

	
 



z&HunyuanImageTransformer2DModel.forward)r-  r-  r.  r/  r0  r1  r   r   r2  r  Fr3  Nr   r4  F)NNNNNT)rT   rU   rV   __doc__ _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modules_repeated_blocksr   rn   r   rm   r   r   r"   r   r8   rW   r   dictr   rl   ro   r    r    ri   r!   r*  i  s    *	


F	
r*  );rE  typingr   r8   torch.nnrd   torch.nn.functionalrV  r   diffusers.loadersr   configuration_utilsr   r   loadersr   utilsr   r	   utils.torch_utilsr
   	attentionr   r   attention_dispatchr   attention_processorr   cache_utilsr   rA   r   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerrT   loggerr   ModulerX   rp   r}   r   r   r   r   r   r  r  r*  r    r    r    r!   <module>   sH   
o1201.L
O