o
    ۷i4                  
   @   sj  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ d dl	m
Z
 ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ dZdZG dd dej Z!G dd dZ"G dd dej Z#dej$dej$dej$de%dej$f
ddZ&eG dd dej Z'G d d! d!Z(eG d"d# d#ej Z)G d$d% d%eeeeZ*dS )&    N)Literal)pad_sequence   )ConfigMixinregister_to_config)PeftAdapterMixin)FromOriginalModelMixin)	Attention)RMSNorm)maybe_allow_in_graph   )dispatch_attention_fn)zero_module)
ModelMixin       c                       s4   e Zd Zd
 fdd	ZedddZdd	 Z  ZS )TimestepEmbedderNr   c              	      sL   t    |d u r|}ttj||ddt tj||dd| _|| _d S )NTbias)super__init__nn
SequentialLinearSiLUmlpfrequency_embedding_size)selfout_sizemid_sizer   	__class__ e/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/controlnets/controlnet_z_image.pyr   (   s   

zTimestepEmbedder.__init__'  c              
   C   s   t jjdddY |d }t t| t jd|t j| jd | }| d d d f 	 |d   }t j
t |t |gdd}|d rXt j
|t |d d d d	f gdd}|W  d    S 1 sdw   Y  d S )
NcudaFenabledr   r   )startenddtypedevicedim   )torchampautocastexpmathlogarangefloat32r+   floatcatcossin
zeros_like)tr.   
max_periodhalffreqsargs	embeddingr"   r"   r#   timestep_embedding4   s   $($z#TimestepEmbedder.timestep_embeddingc                 C   s`   |  || j}| jd jj}t| jd dd }|jr ||}n	|d ur)||}| |}|S )Nr   compute_dtype)rC   r   r   weightr*   getattris_floating_pointto)r   r=   t_freqweight_dtyperD   t_embr"   r"   r#   forwardA   s   

zTimestepEmbedder.forward)Nr   )r$   )__name__
__module____qualname__r   staticmethodrC   rL   __classcell__r"   r"   r    r#   r   '   s
    r   c                   @   s`   e Zd ZdZdZdZdd Z			ddedej	dej	dB dej	dB d	ej	dB d
ej	fddZ
dS )ZSingleStreamAttnProcessorz
    Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the
    original Z-ImageAttention module.
    Nc                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzlZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher.)hasattrFImportError)r   r"   r"   r#   r   W   s
   
z#ZSingleStreamAttnProcessor.__init__attnhidden_statesencoder_hidden_statesattention_mask	freqs_cisreturnc              
   C   sZ  | |}||}||}|d|jdf}|d|jdf}|d|jdf}|jd ur4||}|jd ur>||}dtjdtjdtjfdd}	|d urZ|	||}|	||}|j	}
|
|
|
|
}}|d ur}|jdkr}|d d d d d d f }t||||dd	| j| jd
}|dd}|
|
}|jd |}t|jdkr|jd |}|S )Nr,   x_inr[   r\   c                 S   s   t jjddd2 t |  jg | jd d ddR  }|d}t || 	d}|
| W  d    S 1 s=w   Y  d S )Nr%   Fr&   r,   r   r   )r0   r1   r2   view_as_complexr8   reshapeshape	unsqueezeview_as_realflattentype_as)r]   r[   xx_outr"   r"   r#   apply_rotary_embt   s   ,
$z=ZSingleStreamAttnProcessor.__call__.<locals>.apply_rotary_embr           F)	attn_mask	dropout_p	is_causalbackendparallel_configr   r   r/   )to_qto_kto_v	unflattenheadsnorm_qnorm_kr0   Tensorr*   rH   ndimr   _attention_backend_parallel_configrc   to_outlen)r   rW   rX   rY   rZ   r[   querykeyvaluerg   r*   outputr"   r"   r#   __call__]   sD   









z#ZSingleStreamAttnProcessor.__call__)NNN)rM   rN   rO   __doc__rw   rx   r   r	   r0   ru   r   r"   r"   r"   r#   rR   N   s*    
rR   c                       s6   e Zd Zdedef fddZdd Zdd Z  ZS )	FeedForwardr.   
hidden_dimc                    sD   t    tj||dd| _tj||dd| _tj||dd| _d S )NFr   )r   r   r   r   w1w2w3)r   r.   r   r    r"   r#   r      s   
zFeedForward.__init__c                 C   s   t || S N)rU   silu)r   x1x3r"   r"   r#   _forward_silu_gating   s   z FeedForward._forward_silu_gatingc                 C   s   |  | | || |S r   )r   r   r   r   )r   re   r"   r"   r#   rL      s   zFeedForward.forward)rM   rN   rO   intr   r   rL   rQ   r"   r"   r    r#   r      s    r   value_noisyvalue_clean
noise_maskseq_lenr\   c              	   C   s<   | d}t|dk|  dd|d| dd|dS )Nr,   r/   )ra   r0   whereexpand)r   r   r   r   noise_mask_expandedr"   r"   r#   select_per_token   s   
r   c                       s   e Zd Z	ddedededededef fdd	Z	
	
	
	
ddejdejdejdejd
B dejd
B dejd
B dejd
B fddZ	  Z
S )ZImageTransformerBlockTlayer_idr.   n_heads
n_kv_headsnorm_epsqk_normc                    s   t    || _|| | _t|d || ||rdnd dddt d	| _t|t|d d d| _	|| _
t||d| _t||d| _t||d| _t||d| _|| _|rfttjt|td	| d
d| _d S d S )Nrms_normh㈵>F		query_dimcross_attention_dimdim_headrr   r   epsr   out_bias	processorr      r.   r   r      Tr   )r   r   r.   head_dimr	   rR   	attentionr   r   feed_forwardr   r
   attention_norm1	ffn_norm1attention_norm2	ffn_norm2
modulationr   r   r   minADALN_EMBED_DIMadaLN_modulation)r   r   r.   r   r   r   r   r   r    r"   r#   r      s0   



&zZImageTransformerBlock.__init__Nre   ri   r[   adaln_inputr   adaln_noisyadaln_cleanc              	   C   s  | j r|jd }|d urm| |}	| |}
|	jddd\}}}}|
jddd\}}}}| | }}| | }}d| d| }}d| d| }}t||||}t||||}t||||}t||||}n%| |}|djddd\}}}}| | }}d| d| }}| j| || ||d}||| 	|  }||| 
| | ||   }|S | j| |||d}|| 	| }|| 
| | | }|S )Nr/   r   r-         ?r   rZ   r[   )r   r`   r   chunktanhr   ra   r   r   r   r   r   r   )r   re   ri   r[   r   r   r   r   r   	mod_noisy	mod_cleanscale_msa_noisygate_msa_noisyscale_mlp_noisygate_mlp_noisyscale_msa_cleangate_msa_cleanscale_mlp_cleangate_mlp_clean	scale_msa	scale_mlpgate_msagate_mlpmodattn_outr"   r"   r#   rL      s:   




"	zZImageTransformerBlock.forward)T)NNNNrM   rN   rO   r   r8   boolr   r0   ru   rL   rQ   r"   r"   r    r#   r      sD    .r   c                   @   sf   e Zd Z			ddedee dee fddZedd	ee d
ee defddZde	j
fddZdS )RopeEmbedder      p@   8   r   @      r   theta	axes_dims	axes_lensc                 C   s4   || _ || _|| _t|t|ksJ dd | _d S )Nz1axes_dims and axes_lens must have the same length)r   r   r   rz   r[   )r   r   r   r   r"   r"   r#   r   #  s
   
zRopeEmbedder.__init__r.   r)   c           
      C   s   t dO g }tt| |D ];\}\}}d|t jd|dt jdd|   }t j||jt jd}t || }t t 	||
t j}	||	 q|W  d    S 1 sWw   Y  d S )Ncpur   r   r   r*   r+   )r+   r*   )r0   r+   	enumeratezipr6   float64outerr8   polar	ones_likerH   	complex64append)
r.   r)   r   r[   ider@   timestepfreqs_cis_ir"   r"   r#   precompute_freqs_cis/  s   "$z!RopeEmbedder.precompute_freqs_cisidsc                    s   |j dksJ |jd t| jksJ |j | jd u r3| j| j| j| jd| _ fdd| jD | _n| jd j krF fdd| jD | _g }t	t| jD ]}|d d |f }|
| j| |  qOtj|ddS )	Nr   r,   )r   c                       g | ]}|  qS r"   rH   .0r[   r+   r"   r#   
<listcomp>C      z)RopeEmbedder.__call__.<locals>.<listcomp>r   c                    r   r"   r   r   r   r"   r#   r   G  r   r-   )rv   r`   rz   r   r+   r[   r   r   r   ranger   r0   r9   )r   r   resultr   indexr"   r   r#   r   <  s   
zRopeEmbedder.__call__N)r   r   r   )r   )rM   rN   rO   r8   listr   r   rP   r   r0   ru   r   r"   r"   r"   r#   r   "  s    
"r   c                       sl   e Zd Z		ddedededededef fd	d
Z	ddejdejdejdejdejdB f
ddZ	  Z
S )ZImageControlTransformerBlockTr   r   r.   r   r   r   r   c	           	         s  t    || _|| | _t|d || ||rdnd dddt d	| _t|t|d d d| _	|| _
t||d| _t||d| _t||d| _t||d| _|| _|rdttjt|td	| d
d| _|| _|dkrvtt| j| j| _tt| j| j| _d S )Nr   r   Fr   r   r   r   r   r   Tr   r   )r   r   r.   r   r	   rR   r   r   r   r   r   r
   r   r   r   r   r   r   r   r   r   r   r   block_idr   before_proj
after_proj)	r   r   r.   r   r   r   r   r   r   r    r"   r#   r   R  s6   


"z&ZImageControlTransformerBlock.__init__Ncre   ri   r[   r   c              	   C   s@  | j dkr| || }g }ntt|}|d}| jro|d us$J | |dj	ddd\}}}	}
|
 |

 }}
d| d|	 }}	| j| || ||d}||| |  }||
| | | ||	   }n| j| |||d}|| | }|| | | | }| |}|||g7 }t|}|S )	Nr   r,   r/   r   r   r-   r   r   )r   r   r   r0   unbindpopr   r   ra   r   r   r   r   r   r   r   r   r   stack)r   r   re   ri   r[   r   all_cr   r   r   r   r   c_skipr"   r"   r#   rL     s,   
	
"$

z%ZImageControlTransformerBlock.forward)Tr   r   r   r"   r"   r    r#   r   P  s8    
6r   c                       s.  e Zd ZdZe												d-d	ee d
ee ded dB f fddZe	dd Z
ed.ddZdejdedefddZ	d/dejdededejdedB f
ddZdeej d eej dedefd!d"Zdeej dedefd#d$Z	%		&d0d'eej d(eej d)eej d*efd+d,Z  ZS )1ZImageControlNetModelTNr   r/      r      r   control_layers_placescontrol_refiner_layers_placesadd_control_noise_refiner)control_layerscontrol_noise_refinerc                    sT  t    || _|| _|| _|| _d| jv sJ t fdd| jD | _i }t	t
||D ]\}\}}tj|| | | j  dd}||| d| < q3t|| _| jdkrad | _n,| jdkrzt fd	dt|D | _nt fd
dt|D | _d | _d | _d | _d | _d | _d | _d | _d | _d | _d S )Nr   c                    s"   g | ]}t | |d qS ))r   r   )r   r   r.   r   r   r   r   r"   r#   r     s    z2ZImageControlNetModel.__init__.<locals>.<listcomp>Tr   -r  r  c                    s(   g | ]}t d |  d|dqS )  T)r   r   r  r   r   r  r"   r#   r     s    c                    s&   g | ]}t d |  ddqS )r	  T)r   )r   r
  r  r"   r#   r     s    
)r   r   r  control_in_dimr  r  r   
ModuleListr  r   r   r   
ModuleDictcontrol_all_x_embedderr  r   t_scale
t_embedderall_x_embeddercap_embedderrope_embeddernoise_refinercontext_refinerx_pad_tokencap_pad_token)r   r  r  r  r  all_patch_sizeall_f_patch_sizer.   n_refiner_layersr   r   r   r   r  	patch_idx
patch_sizef_patch_size
x_embedderr    r  r#   r     sL   




zZImageControlNetModel.__init__c                 C   sL   |j |_ |j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|S r   )	r  r  r  r  r  r  r  r  r  )cls
controlnettransformerr"   r"   r#   from_transformer  s   z&ZImageControlNetModel.from_transformerc                    sJ   |d u rdd | D } fddt || D }tj|dd}tj|ddS )	Nc                 s   s    | ]}d V  qdS r   Nr"   r   _r"   r"   r#   	<genexpr>  s    z?ZImageControlNetModel.create_coordinate_grid.<locals>.<genexpr>c                    s(   g | ]\}}t j||| t j d qS )r   )r0   r6   int32)r   x0spanr   r"   r#   r     s   ( z@ZImageControlNetModel.create_coordinate_grid.<locals>.<listcomp>ij)indexingr,   r-   )r   r0   meshgridr   )sizer(   r+   axesgridsr"   r   r#   create_coordinate_grid  s
   z,ZImageControlNetModel.create_coordinate_gridimager  r  c              	   C   s   |||}}}|  \}}}	}
|| |	| |
| }}}||||||||}|ddddddd|| | || | | }|||	|
f|||ffS )zIPatchify a single image tensor: (C, F, H, W) -> (num_patches, patch_dim).r/   r      r   r      r   )r-  viewpermuter_   )r   r1  r  r  pHpWpFCrU   HWF_tokensH_tokensW_tokensr"   r"   r#   _patchify_image  s   2z%ZImageControlNetModel._patchify_imagefeatpos_grid_size	pos_startr+   noise_mask_valc                 C   s   t |}| t }|| }| j|||ddd}	|dkr]| jdd|ddd|d}
tj|	|
gdd}tj||dd	 |dgdd}ttj|tj|d
tj	|tj|d
g}n|	}|}tj|tj|d
}|d	urs|g| nd	}|||||fS )z>Pad feature to SEQ_MULTI_OF, create position IDs and pad mask.)r-  r(   r+   r   r   )r/   r/   r/   )r   r   r   r/   r-   r,   Nr   )
rz   SEQ_MULTI_OFr0  rc   repeatr0   r9   zerosr   ones)r   r@  rA  rB  r+   rC  ori_lenpad_len	total_lenori_pos_idspad_pos_idspos_idspadded_featpad_maskr   r"   r"   r#   _pad_with_ids'  s*   	
"z#ZImageControlNetModel._pad_with_ids	all_imageall_cap_featsc                 C   s  |d j }g g g g f\}}}}	g g g }
}}t||D ]b\}}| |t|t| t  ddfd|\}}}}}|
| || || | |||\}}\}}}| ||||f|d ddf|\}}}}}|| || || |	| q||
||||	|fS )z5Patchify for basic mode: single image per batch item.r   r/   )r/   r   r   )r+   r   rP  rz   rD  r   r?  )r   rQ  rR  r  r  r+   all_img_outall_img_sizeall_img_pos_idsall_img_pad_maskall_cap_outall_cap_pos_idsall_cap_pad_maskr1  cap_featcap_outcap_pos_idscap_pad_maskcap_lenr%  img_patchesr-  F_tH_tW_timg_outimg_pos_idsimg_pad_maskr"   r"   r#   patchify_and_embedM  s4   
 





z(ZImageControlNetModel.patchify_and_embedc              
   C   s   | }}|}g }t |D ]]\}}	|	 \}
}}}|| || || }}}|	|
||||||}	|	ddddddd|| | || | |
 }	t|	}| t }tj|	|	dd  	|dgdd	}|
| q|S )
Nr/   r   r2  r   r   r3  r   r,   r-   )r   r-  r4  r5  r_   rz   rD  r0   r9   rE  r   )r   rQ  r  r  r6  r7  r8  all_image_outr   r1  r9  rU   r:  r;  r<  r=  r>  image_ori_lenimage_padding_lenimage_padded_featr"   r"   r#   patchifyr  s   2
"zZImageControlNetModel.patchifyr   r/   re   	cap_featscontrol_contextconditioning_scalec           '   	      s   | j d u s-| jd u s-| jd u s-| jd u s-| jd u s-| jd u s-| jd u s-| jd u s-| jd u r1t	d|| j
jv s9J || j
jv sAJ t|}|d j}	|| j  }| |}| ||||\}}}
}}}}dd |D }tdd |D suJ t|}| |||}tj|dd}| j| d|  |}| j|t|< t|j|dd}t|d	d
d}tj|dd}| j| d|  |}||}| j|t|< t|j|dd}t| tj|ddjdd |D dd}t|d	d
d}t|d	d
d}|d d d |jd f }tj||ftj|	d}t|D ]\}}d||d |f< q| jd ur| jdkr2| j}n| jdkr<| j }n	t	d| j d|D ] }t! r^| j"r^| #||||||}qG||||||}qGt$|d d t$|d } fddt| j%D }nd }t! r| j"rt| jD ]\}}| #|||||}|d ur||v r|||  }qn"t| jD ]\}}|||||}|d ur||v r|||  }qdd |D }t|}tj|dd}| |}| j|t|< t|j|dd}t| tj|ddjdd |D dd}t|d	d
d}t|d	d
d}|d d d |jd f }tj||ftj|	d}t|D ]\}}d||d |f< q@t! rg| j"rg| jD ]}| #||||}qZn| jD ]	}||||}qjg }g }t&|D ]7}|| } || }!|'t|| d |  || d |! g |'t|| d |  || d |! g q|dd t(||D }"|"dd |D ksJ t|"}#t|d	d
d}t|d	d
d}tj||#ftj|	d}$t|"D ]\}}d|$|d |f< q| js%t! r| j"r| j D ]}| #|||||}q	n| j D ]
}|||||}qg }%t&|D ]!}|| } || }!|%'t|| d |  || d |! g q+t|%d	d
d}%| jD ] }t! rn| j"rn| #||%||$||}%qW||%||$||}%qWt$|%d d  fddt| j)D }&|&S )NzaRequired modules are `None`, use `from_transformer` to share required modules from `transformer`.r   c                 S      g | ]}t |qS r"   rz   r$  r"   r"   r#   r         z1ZImageControlNetModel.forward.<locals>.<listcomp>c                 s   s    | ]	}|t  d kV  qdS r#  )rD  r$  r"   r"   r#   r&    s    z0ZImageControlNetModel.forward.<locals>.<genexpr>r-   r  Trh   )batch_firstpadding_valuec                 S   ro  r"   rp  r$  r"   r"   r#   r     rq  r/   r   r  r  z.Unsupported `add_control_noise_refiner` type: .r,   c                       i | ]\}}||   qS r"   r"   r   idx	layer_idxrn  hintsr"   r#   
<dictcomp>  s    z1ZImageControlNetModel.forward.<locals>.<dictcomp>c                 S   ro  r"   rp  r$  r"   r"   r#   r     rq  c                 S   ro  r"   rp  r$  r"   r"   r#   r     rq  c                 S   s   g | ]\}}|| qS r"   r"   )r   abr"   r"   r#   r   "  s    c                 S   ro  r"   rp  r$  r"   r"   r#   r   #  rq  c                    ru  r"   r"   rv  ry  r"   r#   r{  J  s    )*r  r  r  r  r  r  r  r  r  
ValueErrorconfigr  r  rz   r+   rf  allmaxrk  r0   r9   r  r   splitr   rd   r`   rF  r   r   r  r  r  is_grad_enabledgradient_checkpointing_gradient_checkpointing_funcr   r  r   r   r   r  )'r   re   r=   rl  rm  rn  r  r  bszr+   x_size	x_pos_idsr\  x_inner_pad_maskcap_inner_pad_maskx_item_seqlensx_max_item_seqlenr   x_freqs_cisx_attn_maskr   r   layerslayernoise_refiner_block_samplesrx  cap_item_seqlenscap_max_item_seqlencap_freqs_ciscap_attn_maskunifiedunified_freqs_cisx_lenr^  unified_item_seqlensunified_max_item_seqlenunified_attn_maskcontrol_context_unifiedcontrolnet_block_samplesr"   ry  r#   rL     s   













,




&

,0


0

zZImageControlNetModel.forward)NNNNr   r   r   r   r   r   r   T)NNr   )r   r   r/   )rM   rN   rO    _supports_gradient_checkpointingr   r   r   r   r   classmethodr"  rP   r0  r0   ru   r?  tupler+   rP  rf  rk  r8   rL   rQ   r"   r"   r    r#   r     s    
R

&
%
"r   )+r4   typingr   r0   torch.nnr   torch.nn.functional
functionalrU   torch.nn.utils.rnnr   configuration_utilsr   r   loadersr   loaders.single_file_modelr   models.attention_processorr	   models.normalizationr
   utils.torch_utilsr   attention_dispatchr   controlnets.controlnetr   modeling_utilsr   r   rD  Moduler   rR   r   ru   r   r   r   r   r   r   r"   r"   r"   r#   <module>   sJ   'Q
d._