o
    ۷i8                  	   @   s  d dl mZ d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZm Z m!Z! e"e#Z$eG dd deZ%G dd deeeeeeZ&G dd deeeeeZ'dS )    )	dataclass)AnyN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)
BaseOutputapply_lora_scale	deprecatelogging   )AttentionMixin)
CacheMixin)zero_module)Transformer2DModelOutput)
ModelMixin)QwenEmbedRopeQwenImageTransformerBlockQwenTimestepProjEmbeddingsRMSNormcompute_text_seq_len_from_maskc                   @   s   e Zd ZU eej ed< dS )QwenImageControlNetOutputcontrolnet_block_samplesN)__name__
__module____qualname__tupletorchTensor__annotations__ r!   r!   g/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/controlnets/controlnet_qwenimage.pyr   .   s   
 r   c                       s  e Zd ZdZe										
d(dedededB dededededeeeef def fddZe					
d)dedededefddZ	e
d								d*dejdejdedejd ejd!ejd"eeeeef  dB d#ee dB deeef dB d$ed%ejeB fd&d'Z  ZS )+QwenImageControlNetModelTr   @      <            r%   8   r+   r   
patch_sizein_channelsout_channelsN
num_layersattention_head_dimnum_attention_headsjoint_attention_dimaxes_dims_ropeextra_condition_channelsc
                    s   t    |p|_  _tdt|dd_tjd_t	|dd_
t|j_t|j_t fddt|D _tg _ttjD ]}
jttjj qUttj||	 j_d	_d S )
Ni'  T)thetaaxes_dim
scale_rope)embedding_dimgư>)epsc                    s   g | ]
}t j d qS ))dimr1   r0   )r   	inner_dim).0_r0   r1   selfr!   r"   
<listcomp>S   s    z5QwenImageControlNetModel.__init__.<locals>.<listcomp>F)super__init__r.   r;   r   list	pos_embedr   time_text_embedr   txt_normnnLinearimg_intxt_in
ModuleListrangetransformer_blockscontrolnet_blockslenappendr   r   controlnet_x_embeddergradient_checkpointing)r?   r,   r-   r.   r/   r0   r1   r2   r3   r4   r=   	__class__r>   r"   rB   8   s(   



z!QwenImageControlNetModel.__init__   c           	      C   s   t |j}||d< ||d< ||d< ||d< | |}|rQ|j|j  |j|j  |j|j  |j|j  |j	j|j	 dd t
|j|_|S )Nr/   r0   r1   r4   F)strict)dictconfigfrom_configrD   load_state_dict
state_dictrE   rI   rJ   rM   r   rQ   )	clstransformerr/   r0   r1   load_weights_from_transformerr4   rX   
controlnetr!   r!   r"   from_transformerg   s   


z)QwenImageControlNetModel.from_transformerjoint_attention_kwargs      ?hidden_statescontrolnet_condconditioning_scaleencoder_hidden_statesencoder_hidden_states_masktimestep
img_shapestxt_seq_lensreturn_dictreturnc              
      s  |durt ddddd | |}|| | }| ||}t||\}}}| j|||jd}||j}| 	|}| 
|}|	durG|	 ni }|duro|jdd \}}tj||ftj|jd	}tj||gd
d}||d< d}| jD ](}t r| jr| |||d|||\}}n|||d|||d\}}||f }qtd}t|| jD ]\}}||}||f }q fdd|D }t|dkrdn|}|
s|S t|dS )a  
        The [`QwenImageControlNetModel`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            controlnet_cond (`torch.Tensor`):
                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
            conditioning_scale (`float`, defaults to `1.0`):
                The scale factor for ControlNet outputs.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`, *optional*):
                Mask for the encoder hidden states. Expected to have 1.0 for valid tokens and 0.0 for padding tokens.
                Used in the attention processor to prevent attending to padding tokens. The mask can have any pattern
                (not just contiguous valid tokens followed by padding) since it's applied element-wise in attention.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            img_shapes (`list[tuple[int, int, int]]`, *optional*):
                Image shapes for RoPE computation.
            txt_seq_lens (`list[int]`, *optional*):
                **Deprecated**. Not needed anymore, we use `encoder_hidden_states` instead to infer text sequence
                length.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.

        Returns:
            If `return_dict` is True, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a `tuple` where
            the first element is the controlnet block samples.
        Nrj   0.39.0zPassing `txt_seq_lens` to `QwenImageControlNetModel.forward()` is deprecated and will be removed in version 0.39.0. The text sequence length is now automatically inferred from `encoder_hidden_states` and `encoder_hidden_states_mask`.Fstandard_warn)max_txt_seq_lendevicer   )dtyperq      )r:   attention_maskr!   )rc   rf   rg   tembimage_rotary_embra   c                    s   g | ]}|  qS r!   r!   )r<   samplere   r!   r"   r@      s    z4QwenImageControlNetModel.forward.<locals>.<listcomp>r   )r   )r   rI   rQ   rE   r   rD   rq   torr   rF   rJ   copyshaper   onesboolcatrM   is_grad_enabledrR   _gradient_checkpointing_funcziprN   rO   r   )r?   rc   rd   re   rf   rg   rh   ri   rj   ra   rk   ru   text_seq_lenr=   rv   block_attention_kwargs
batch_sizeimage_seq_len
image_maskjoint_attention_maskblock_samplesblockr   block_samplecontrolnet_blockr!   rx   r"   forward   sn   1
	





z QwenImageControlNetModel.forward)	r   r$   r%   r&   r'   r(   r)   r*   r   )rU   r'   r(   Tr   )rb   NNNNNNT)r   r   r    _supports_gradient_checkpointingr   intr   rB   classmethodr`   r
   r   r   float
LongTensorrC   rW   strr   r}   FloatTensorr   r   __classcell__r!   r!   rS   r"   r#   3   s    	
.
	
r#   c                       s   e Zd ZdZ fddZ							ddejdeej dee	 d	ej
d
ej
dejdeeeeef  dB dee dB deeef dB dedeeB fddZ  ZS )QwenImageMultiControlNetModela  
    `QwenImageMultiControlNetModel` wrapper class for Multi-QwenImageControlNetModel

    This module is a wrapper for multiple instances of the `QwenImageControlNetModel`. The `forward()` API is designed
    to be compatible with `QwenImageControlNetModel`.

    Args:
        controlnets (`list[QwenImageControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `QwenImageControlNetModel` as a list.
    c                    s   t    t|| _d S )N)rA   rB   rG   rK   nets)r?   controlnetsrS   r!   r"   rB     s   
z&QwenImageMultiControlNetModel.__init__NTrc   rd   re   rf   rg   rh   ri   rj   ra   rk   rl   c                 C   s   |d urt ddddd t| jdkrN| jd }tt||D ],\}\}}|||||||||	|
d	}|dkr9|}q|d urK|d urKd	d
 t||D }q|S td)Nrj   rm   zPassing `txt_seq_lens` to `QwenImageMultiControlNetModel.forward()` is deprecated and will be removed in version 0.39.0. The text sequence length is now automatically inferred from `encoder_hidden_states` and `encoder_hidden_states_mask`.Frn   rs   r   )	rc   rd   re   rf   rg   rh   ri   ra   rk   c                 S   s   g | ]\}}|| qS r!   r!   )r<   control_block_sampler   r!   r!   r"   r@   A  s    z9QwenImageMultiControlNetModel.forward.<locals>.<listcomp>zJQwenImageMultiControlNetModel only supports a single controlnet-union now.)r   rO   r   	enumerater   
ValueError)r?   rc   rd   re   rf   rg   rh   ri   rj   ra   rk   r_   iimagescaler   control_block_samplesr!   r!   r"   r     s<   

z%QwenImageMultiControlNetModel.forward)NNNNNNT)r   r   r   __doc__rB   r   r   rC   tensorr   r   r   r   r   rW   r   r   r}   r   r   r   r!   r!   rS   r"   r     sB    	
	
r   )(dataclassesr   typingr   r   torch.nnrG   configuration_utilsr   r   loadersr   r   utilsr	   r
   r   r   	attentionr   cache_utilsr   controlnets.controlnetr   modeling_outputsr   modeling_utilsr   "transformers.transformer_qwenimager   r   r   r   r   
get_loggerr   loggerr   r#   r   r!   r!   r!   r"   <module>   s*   
	
 Q