o
    ۷i                  
   @   s  d dl Z d dlmZmZ d dlZd dlZd dlmZm	Z	m
Z
mZmZmZ ddlmZmZ ddlmZmZmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ e! rd dl,m-  m.Z/ dZ0ndZ0e"1e2Z3dZ4				d,de5de5de6de6fddZ7	d-dej8d ej9dB d!e:fd"d#Z;				d.d$e5dB d%e:ej<B dB d&e=e5 dB d'e=e6 dB fd(d)Z>G d*d+ d+e)eeeZ?dS )/    N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjectionT5EncoderModelT5TokenizerFast   )PipelineImageInputVaeImageProcessor)FluxIPAdapterMixinFluxLoraLoaderMixinFromSingleFileMixinTextualInversionLoaderMixin)AutoencoderKL)FluxControlNetModelFluxMultiControlNetModel)FluxTransformer2DModel)FlowMatchEulerDiscreteScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor   )DiffusionPipeline   )FluxPipelineOutputTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers.utils import load_image
        >>> from diffusers import FluxControlNetPipeline
        >>> from diffusers import FluxControlNetModel

        >>> base_model = "black-forest-labs/FLUX.1-dev"
        >>> controlnet_model = "InstantX/FLUX.1-dev-controlnet-canny"
        >>> controlnet = FluxControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.bfloat16)
        >>> pipe = FluxControlNetPipeline.from_pretrained(
        ...     base_model, controlnet=controlnet, torch_dtype=torch.bfloat16
        ... )
        >>> pipe.to("cuda")
        >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
        >>> prompt = "A girl in city, 25 years old, cool, futuristic"
        >>> image = pipe(
        ...     prompt,
        ...     control_image=control_image,
        ...     control_guidance_start=0.2,
        ...     control_guidance_end=0.8,
        ...     controlnet_conditioning_scale=1.0,
        ...     num_inference_steps=28,
        ...     guidance_scale=3.5,
        ... ).images[0]
        >>> image.save("flux.png")
        ```
            ?ffffff?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S N )image_seq_lenr%   r&   r'   r(   mbmur*   r*   g/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/flux/pipeline_flux_controlnet.pycalculate_shiftZ   s   r0   sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr1   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr5   r1   moder7   AttributeError)r2   r3   r4   r*   r*   r/   retrieve_latentsh   s   

r;   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr>   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r>   r=   r?   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r?   r=   r=   r*   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r>   len)	schedulerr<   r=   r>   r?   kwargsaccepts_timestepsaccept_sigmasr*   r*   r/   retrieve_timestepsv   s2   rM   c                C       s  e Zd ZdZdZddgZg dZ		d]deded	e	d
e
dedededeee B ee B eB dedef fddZ					d^deee B dededejdB dejdB f
ddZ		d_deee B dedejdB fddZ							d`deee B deee B dB dejdB dedejdB dejdB dededB fd d!Zd"d# Zd$d% Z 								dad&d'Z!e"d(d) Z#e"d*d+ Z$e"d,d- Z%	dbd.d/Z&	0	0dcd1d2Z'e(d3d4 Z)e(d5d6 Z*e(d7d8 Z+e(d9d: Z,e- e.e/ddddd;ddd<dd=d>d;ddd;dddddddddddd?d@dddAgdf deee B deee B dB dBeee B dCeee B dB dDedEedB dFedB dGedHee dB dIedJeee B dKeee B dLe0dMeee B dB dNeee B dedB dOej1eej1 B dB dAejdB dejdB dejdB dPe0dB dQeej2 dB dRe0dB dSeej2 dB dTejdB dUejdB dVedB dWe3dXe4ee5f dB dYe6eegdf dB dZee def@d[d\Z7  Z8S )dFluxControlNetPipelinea  
    The Flux pipeline for text-to-image generation.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        transformer ([`FluxTransformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([`T5EncoderModel`]):
            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`T5TokenizerFast`):
            Second Tokenizer of class
            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
    z=text_encoder->text_encoder_2->image_encoder->transformer->vaeimage_encoderfeature_extractor)r7   prompt_embedscontrol_imageNrI   vaetext_encoder	tokenizertext_encoder_2tokenizer_2transformer
controlnetc                    s   t    t|ttfrt|}| j|||||||||	|
d
 t| dd r0dt| j	j
jd  nd| _t| jd d| _t| drJ| jd urJ| jjnd| _d	| _d S )
N)
rS   rT   rV   rU   rW   rX   rI   rY   rO   rP   rS   r   r      )vae_scale_factorrU   M      )super__init__
isinstancelisttupler   register_modulesgetattrrH   rS   configblock_out_channelsr[   r   image_processorr8   rU   model_max_lengthtokenizer_max_lengthdefault_sample_size)selfrI   rS   rT   rU   rV   rW   rX   rY   rO   rP   rG   r*   r/   r_      s(   
(
zFluxControlNetPipeline.__init__r      promptnum_images_per_promptmax_sequence_lengthr=   dtypec              	   C   s0  |p| j }|p
| jj}t|tr|gn|}t|}t| tr%| || j}| j	|d|ddddd}|j
}| j	|dddj
}	|	jd |jd krit||	si| j	|	d d | jd	 df }
td
| d|
  | j||ddd }| jj}|j||d}|j\}}}|d	|d	}||| |d}|S )N
max_lengthTFpt)paddingrr   
truncationreturn_lengthreturn_overflowing_tokensreturn_tensorslongestrt   rx   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  	 tokens: output_hidden_statesr   rq   r=   )_execution_devicerT   rq   r`   strrH   r   maybe_convert_promptrU   rW   	input_idsshapetorchequalbatch_decoderi   loggerwarningrV   torepeatview)rk   rn   ro   rp   r=   rq   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrQ   _seq_lenr*   r*   r/   _get_t5_prompt_embeds   sB   

	 "z,FluxControlNetPipeline._get_t5_prompt_embedsc           
   	   C   s  |p| j }t|tr|gn|}t|}t| tr| || j}| j|d| jddddd}|j}| j|dddj}|j	d |j	d kret
||se| j|d d | jd	 df }td
| j d|  | j||dd}	|	j}	|	j| jj|d}	|	d	|}	|	|| d}	|	S )Nrr   TFrs   )rt   rr   ru   rw   rv   rx   ry   rz   r{   r   z\The following part of your input was truncated because CLIP can only handle sequences up to r|   r}   r   )r   r`   r   rH   r   r   rU   ri   r   r   r   r   r   r   r   rT   r   pooler_outputrq   r   r   )
rk   rn   ro   r=   r   r   r   r   r   rQ   r*   r*   r/   _get_clip_prompt_embeds*  s>   


 "z.FluxControlNetPipeline._get_clip_prompt_embedsprompt_2rQ   pooled_prompt_embeds
lora_scalec	                 C   s8  |p| j }|dur+t| tr+|| _| jdurtrt| j| | jdur+tr+t| j| t|tr3|gn|}|du rX|p<|}t|trE|gn|}| j	|||d}| j
||||d}| jdurjt| trjtrjt| j| | jdur|t| tr|tr|t| j| | jdur| jjn| jj}	t|jd dj||	d}
|||
fS )a  

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        N)rn   r=   ro   )rn   ro   rp   r=   r   r
   r=   rq   )r   r`   r   _lora_scalerT   r   r   rV   r   r   r   r   rq   rX   r   zerosr   r   )rk   rn   r   r=   ro   rQ   r   rp   r   rq   text_idsr*   r*   r/   encode_promptV  s>   
#


z$FluxControlNetPipeline.encode_promptc                 C   sX   t | j j}t|tjs| j|ddj}|j	||d}| |j
}|j|dd}|S )Nrs   )rx   r   r   dim)nextrO   rE   rq   r`   r   TensorrP   pixel_valuesr   image_embedsrepeat_interleave)rk   imager=   ro   rq   r   r*   r*   r/   encode_image  s   z#FluxControlNetPipeline.encode_imagec                 C   s  g }|d u r?t |ts|g}t|| jjjkr'tdt| d| jjj d|D ]}| ||d}||d d d f  q)n+t |tsG|g}t|| jjjkr`tdt| d| jjj d|D ]}|| qbg }|D ]}t	j
|g| dd}|j|d	}|| qn|S )
NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r   zR`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got z image embeds and r   r   r=   )r`   ra   rH   rX   encoder_hid_projnum_ip_adaptersr@   r   appendr   catr   )rk   ip_adapter_imageip_adapter_image_embedsr=   ro   r   single_ip_adapter_imagesingle_image_embedsr*   r*   r/   prepare_ip_adapter_image_embeds  s4   

z6FluxControlNetPipeline.prepare_ip_adapter_image_embedsc              	      s&  | j d  dks| j d  dkr$td j d  d| d| d |d urEt fdd|D sEtd	 j d
 fdd|D  |d urX|d urXtd| d| d|d urk|d urktd| d| d|d u rw|d u rwtd|d urt|tst|tstdt	| |d urt|tst|tstdt	| |d ur|d urtd| d| d|d ur|d urtd| d| d|d ur|d ur|j
|j
krtd|j
 d|j
 d|d ur|	d u rtd|d ur|
d u rtd|d ur|dkrtd| d S d S )Nr   r   z-`height` and `width` have to be divisible by z	 but are z and z(. Dimensions will be resized accordinglyc                 3   s    | ]}| j v V  qd S r)   _callback_tensor_inputs.0krk   r*   r/   	<genexpr>  s    

z6FluxControlNetPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r*   r   r   r   r*   r/   
<listcomp>  s    z7FluxControlNetPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z Cannot forward both `prompt_2`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z)Cannot forward both `negative_prompt_2`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` .zIf `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`.zIf `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`.rm   z8`max_sequence_length` cannot be greater than 512 but is )r[   r   r   allr@   r   r`   r   ra   typer   )rk   rn   r   heightwidthnegative_promptnegative_prompt_2rQ   negative_prompt_embedsr   negative_pooled_prompt_embeds"callback_on_step_end_tensor_inputsrp   r*   r   r/   check_inputs  st   $z#FluxControlNetPipeline.check_inputsc           	      C   s|   t ||d}|d t |d d d f  |d< |d t |d d d f  |d< |j\}}}||| |}|j||dS )Nr
   ).r   ).r   r   )r   r   aranger   reshaper   )	r   r   r   r=   rq   latent_image_idslatent_image_id_heightlatent_image_id_widthlatent_image_id_channelsr*   r*   r/   _prepare_latent_image_ids&  s   ""z0FluxControlNetPipeline._prepare_latent_image_idsc                 C   sR   |  |||d d|d d} | dddddd} | ||d |d  |d } | S )Nr   r      r   r
      )r   permuter   )r7   r   num_channels_latentsr   r   r*   r*   r/   _pack_latents5  s   z$FluxControlNetPipeline._pack_latentsc                 C   s   | j \}}}dt||d   }dt||d   }| ||d |d |d dd} | dddddd} | ||d ||} | S )Nr   r   r   r
   r   r   )r   intr   r   r   )r7   r   r   r[   r   num_patcheschannelsr*   r*   r/   _unpack_latents>  s    z&FluxControlNetPipeline._unpack_latentsc	                 C   s   dt || jd   }dt || jd   }||||f}	|d ur6| ||d |d ||}
|j||d|
fS t|trNt||krNtdt| d| dt|	|||d}| 	|||||}| ||d |d ||}
||
fS )Nr   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r3   r=   rq   )
r   r[   r   r   r`   ra   rH   r@   r   r   )rk   r   r   r   r   rq   r=   r3   r7   r   r   r*   r*   r/   prepare_latentsP  s    z&FluxControlNetPipeline.prepare_latentsFc
                 C   st   t |tjrn	| jj|||d}|jd }
|
dkr|}n|}|j|dd}|j||d}|r8|	s8t|gd }|S )N)r   r   r   r   r   r   r   )	r`   r   r   rg   
preprocessr   r   r   r   )rk   r   r   r   r   ro   r=   rq   do_classifier_free_guidance
guess_modeimage_batch_size	repeat_byr*   r*   r/   prepare_imaget  s   
z$FluxControlNetPipeline.prepare_imagec                 C      | j S r)   )_guidance_scaler   r*   r*   r/   guidance_scale     z%FluxControlNetPipeline.guidance_scalec                 C   r   r)   )_joint_attention_kwargsr   r*   r*   r/   joint_attention_kwargs  r   z-FluxControlNetPipeline.joint_attention_kwargsc                 C   r   r)   )_num_timestepsr   r*   r*   r/   num_timesteps  r   z$FluxControlNetPipeline.num_timestepsc                 C   r   r)   )
_interruptr   r*   r*   r/   	interrupt  r   z FluxControlNetPipeline.interrupt      ?   g      @g        pilTr7   r   r   true_cfg_scaler   r   r<   r?   r   control_guidance_startcontrol_guidance_endrR   control_modecontrolnet_conditioning_scaler3   r   r   negative_ip_adapter_image negative_ip_adapter_image_embedsr   r   output_typereturn_dictr   callback_on_step_endr   c!           J         s	  |p| j | j }|p| j | j }t|ts"t|tr"t||g }n5t|ts4t|tr4t||g }n#t|tsWt|tsWt| jtrJt| jjnd}!|!|g |!|g }}| j|||||||||||| d |
| _	|| _
d| _|dur}t|tr}d}"n|durt|trt|}"n|jd }"| j}#| jj}$| jdur| jddnd}%|dko|du}&| j|||||#|| |%d\}}}'|&r| j|||||#|| |%d\}}}(| jjjd })t| jtrY| j||||"| ||#| jjd	}|jd
d \}}| jjdu rdnd}*| jjdu r1t| j||d}|| jjj | jjj }|jdd \}+},| ||"| |)|+|,}|durXt|ts@t dt!"|j#|#t!j$d}|%dd&|jd d}nt| jtrg }-| jjd jdu rndnd}*t'|D ]W\ }.| j|.|||"| ||#| jjd	}.|.jd
d \}}| jjd jdu rt| j|.|d}.|.| jjj | jjj }.|.jdd \}+},| |.|"| |)|+|,}.|-(|. qt|-}t|trt|t|krt dt|ts|gt| }g }/|D ]"}0|0du rd}0t!"|0&|-d jd j#|#t!j$d}|/(| q|/}| jjjd })| )|"| |)|||j|#||\}}1|	du r=t*+dd| |n|	}	|jd }2t,|2| j-jdd| j-jdd| j-jdd| j-jdd}3t.rjd}4n|#}4t/| j-||4|	|3d\}t0t|| j-j1  d}5t| _2g }6t3tD ]   fddt4||D }7|6(t| jtr|7d n|7 q|dus|dur|du r|du rt*j5||dft*j6d}n|du r|du r|dus|durt*j5||dft*j6d}| jdu ri | _
d}8d}9|dus	|dur| 7|||#|"| }8|dus|dur'| 7|||#|"| }9| j8|d q}:t'D ]c\ };| j9r>q3|8durH|8| j
d!< |;&|jd #|j}<t| jtrd| jjd jj:}=n| jjj:}=|=rtt!j"|
g|#d"nd}>|>dur|>&|jd nd}>t|6  trd#d t4||6  D }?n|}@t|@tr|@d }@|@|6   }?| j||||?|<d$ |>|||'|1| jdd%\}A}B| jjj:rt!j"|
g|#d"nd}>|>dur|>&|jd nd}>| j||<d$ |>|||A|B|'|1| jd|*d&d }C|&r"|9dur|9| j
d!< | j||<d$ |>|||A|B|'|1| jd|*d&d }D|D||C|D   }C|j}E| j-j;|C|;|dd'd }|j|EkrCt!j<j=> rC|#|E}|durpi }F|D ]
}Gt? |G |F|G< qL||  |;|F}H|H@d(|}|H@d)|}|H@d*|} td ks d |5kr d | j-j1 dkr|:A  t.rtBC  q3W d   n	1 sw   Y  |d+kr|}In'| D|||| j}|| jjj | jjj }| jjE|dd'd }I| jFjG|I|d,}I| H  |s|IfS tI|Id-S ).a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                will be used instead
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`list[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            guidance_scale (`float`, *optional*, defaults to 7.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0):
                The percentage of total steps at which the ControlNet starts applying.
            control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0):
                The percentage of total steps at which the ControlNet stops applying.
            control_image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, `list[np.ndarray]`,:
                    `list[list[torch.Tensor]]`, `list[list[np.ndarray]]` or `list[list[PIL.Image.Image]]`):
                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
                as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or
                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
                images must be passed as a list such that each element of the list can be correctly batched for input
                to a single ControlNet.
            controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            control_mode (`int` or `list[int]`,, *optional*, defaults to None):
                The control mode when applying ControlNet-Union.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            negative_ip_adapter_image:
                (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            negative_ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.

        Examples:

        Returns:
            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
            images.
        r   )r   r   rQ   r   r   r   r   rp   FNr   scale)rn   r   rQ   r   r=   ro   rp   r   r   )r   r   r   r   ro   r=   rq   T)r3   r   zB For `FluxControlNet`, `control_mode` should be an `int` or `None`)rq   r{   zFor Multi-ControlNet, `control_mode` must be a list of the same  length as the number of controlnets (control images) specifiedr   base_image_seq_lenr!   max_image_seq_lenr"   r'   r#   r(   r$   cpu)r?   r.   c                    s<   g | ]\}}d t  t |k p d t |k qS )r   r   )floatrH   )r   seir>   r*   r/   r      s    *z3FluxControlNetPipeline.__call__.<locals>.<listcomp>r
   )totalr   r   c                 S   s   g | ]\}}|| qS r*   r*   )r   cr   r*   r*   r/   r   7  s    i  )hidden_statescontrolnet_condcontrolnet_modeconditioning_scaletimestepguidancepooled_projectionsencoder_hidden_statestxt_idsimg_idsr   r   )r   r  r  r  r  controlnet_block_samplescontrolnet_single_block_samplesr  r  r   r   controlnet_blocks_repeat)r   r7   rQ   rR   latent)r   )images)Jrj   r[   r`   ra   rH   rY   r   netsr   r   r   r   r   r   r   rX   rq   r   getr   re   in_channelsr   r   rS   input_hint_blockr;   encodeshift_factorscaling_factorr   r   r@   r   tensorr   longr   expand	enumerater   r   nplinspacer0   rI   XLA_AVAILABLErM   maxorderr   rangezipr   uint8r   progress_barr   guidance_embedsstepbackendsmpsis_availablelocalspopupdatexm	mark_stepr   decoderg   postprocessmaybe_free_model_hooksr    )Jrk   rn   r   r   r   r   r   r   r<   r?   r   r   r   rR   r   r   ro   r3   r7   rQ   r   r   r   r   r   r   r   r   r   r   r   r   rp   multr   r=   rq   r   do_true_cfgr   r   r   r  height_control_imagewidth_control_imagecontrol_imagescontrol_image_control_modescmoder   r+   r.   timestep_devicenum_warmup_stepscontrolnet_keepkeepsr   negative_image_embedsr!  tr  use_guidancer  
cond_scalecontrolnet_cond_scaler	  r
  
noise_predneg_noise_predlatents_dtypecallback_kwargsr   callback_outputsr   r*   r   r/   __call__  sZ   	

	
		
& 


$







6
k
zFluxControlNetPipeline.__call__)NN)Nr   rm   NN)r   N)NNr   NNrm   N)NNNNNNNNr)   )FF)9__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r	   r   r   ra   rb   r   r   r   r_   r   r   r   r=   rq   r   r   FloatTensorr   r   r   r   r   staticmethodr   r   r   r   r   propertyr   r   r   r   no_gradr   EXAMPLE_DOC_STRINGr   	Generatorr   booldictr   r   rE  __classcell__r*   r*   rl   r/   rN      s   	+

4

/
	
S*
M



-
"





	




 !rN   )r!   r"   r#   r$   )Nr1   )NNNN)@rB   typingr   r   numpyr  r   transformersr   r   r   r   r   r	   rg   r   r   loadersr   r   r   r   models.autoencodersr   "models.controlnets.controlnet_fluxr   r   models.transformersr   
schedulersr   utilsr   r   r   r   r   r   utils.torch_utilsr   pipeline_utilsr   pipeline_outputr    torch_xla.core.xla_modelcore	xla_modelr*  r  
get_loggerrF  r   rP  r   r   r0   r   rQ  r   r;   r=   ra   rM   rN   r*   r*   r*   r/   <module>   sp    	 
"




;