import inspect
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

from ...image_processor import VaeImageProcessor
from ...loaders import FluxLoraLoaderMixin
from ...models.autoencoders import AutoencoderKL
from ...models.transformers import FluxTransformer2DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import (
    USE_PEFT_BACKEND,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import FluxPipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import FluxPipeline

        >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")
        >>> prompt = "A cat holding a sign that says hello world"
        >>> # Depending on the variant being used, the pipeline call will slightly vary.
        >>> # Refer to the pipeline documentation for more details.
        >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
        >>> image.save("flux.png")
        ```
"""


def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.16,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu
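

# A worked example of the shift schedule above, assuming the default arguments
# (the numbers are illustrative only): a 1024x1024 image packs into
# (1024 / 16) * (1024 / 16) = 4096 latent tokens, so calculate_shift(4096)
# returns max_shift (1.16), a 256-token sequence returns base_shift (0.5), and
# lengths in between are interpolated linearly via mu = m * image_seq_len + b.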
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
    r"""
    The Flux pipeline for text-to-image generation.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        transformer ([`FluxTransformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([`T5EncoderModel`]):
            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
            the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`T5TokenizerFast`):
            Second Tokenizer of class
            [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
    _optional_components = []
    _callback_tensor_inputs = ["latents", "prompt_embeds"]

    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        text_encoder_2: T5EncoderModel,
        tokenizer_2: T5TokenizerFast,
        transformer: FluxTransformer2DModel,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            text_encoder_2=text_encoder_2,
            tokenizer=tokenizer,
            tokenizer_2=tokenizer_2,
            transformer=transformer,
            scheduler=scheduler,
        )
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
        )
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.tokenizer_max_length = (
            self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
        )
        self.default_sample_size = 64
| jj}t|tr|gn|}t|}| j|d|ddddd}|j}| j|dddj}	|	jd |jd kr]t	
||	s]| j|	d d | jd	 df }
td
| d|
  | j||ddd }| jj}|j||d}|j\}}}|d	|d	}||| |d}|S )N
max_lengthTFpt)paddingr]   
truncationreturn_lengthreturn_overflowing_tokensreturn_tensorslongestr_   rc   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  	 tokens: output_hidden_statesr   r\   r.   )_execution_devicerC   r\   
isinstancestrr9   rF   	input_idsshapetorchequalbatch_decoderT   loggerwarningrE   torepeatview)rV   rY   rZ   r[   r.   r\   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrA   _seq_lenr&   r&   r+   _get_t5_prompt_embeds   s>   
	 "z"FluxPipeline._get_t5_prompt_embedsc           
   	   C   s   |p| j }t|tr|gn|}t|}| j|d| jddddd}|j}| j|dddj}|jd |jd krYt	||sY| j
|d d | jd	 df }td
| j d|  | j||dd}	|	j}	|	j| jj|d}	|	d	|d	}	|	|| d}	|	S )Nr]   TFr^   )r_   r]   r`   rb   ra   rc   rd   re   rf   r   z\The following part of your input was truncated because CLIP can only handle sequences up to rg   rh   rj   )rk   rl   rm   r9   rD   rT   rn   ro   rp   rq   rr   rs   rt   rC   ru   pooler_outputr\   rv   rw   )
rV   rY   rZ   r.   rx   ry   rz   r{   r|   rA   r&   r&   r+   _get_clip_prompt_embeds   s:   

 "z$FluxPipeline._get_clip_prompt_embedsprompt_2pooled_prompt_embeds
lora_scalec	                 C   sd  |p| j }|dur+t| tr+|| _| jdurtrt| j| | jdur+tr+t| j| t|tr3|gn|}|dur>t	|}	n|j
d }	|du rf|pJ|}t|trS|gn|}| j|||d}| j||||d}| jdurxt| trxtrxt| j| | jdurt| trtrt| j| | jdur| jjn| jj}
t|	|j
d dj||
d}||dd}|||fS )a  

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            device (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
        device = device or self._execution_device

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if self.text_encoder is not None and USE_PEFT_BACKEND:
                scale_lora_layers(self.text_encoder, lora_scale)
            if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
                scale_lora_layers(self.text_encoder_2, lora_scale)

        prompt = [prompt] if isinstance(prompt, str) else prompt
        if prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            prompt_2 = prompt_2 or prompt
            prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2

            # We only use the pooled prompt output from the CLIPTextModel
            pooled_prompt_embeds = self._get_clip_prompt_embeds(
                prompt=prompt,
                device=device,
                num_images_per_prompt=num_images_per_prompt,
            )
            prompt_embeds = self._get_t5_prompt_embeds(
                prompt=prompt_2,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
                device=device,
            )

        if self.text_encoder is not None:
            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        if self.text_encoder_2 is not None:
            if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder_2, lora_scale)

        dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
        text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
        text_ids = text_ids.repeat(num_images_per_prompt, 1, 1)

        return prompt_embeds, pooled_prompt_embeds, text_ids

    def check_inputs(
        self,
        prompt,
        prompt_2,
        height,
        width,
        prompt_embeds=None,
        pooled_prompt_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        max_sequence_length=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt_2 is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
        elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
            raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

        if prompt_embeds is not None and pooled_prompt_embeds is None:
            raise ValueError(
                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate"
                " `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
            )

        if max_sequence_length is not None and max_sequence_length > 512:
            raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")

    @staticmethod
    def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
        latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
        latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]

        latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape

        latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
        latent_image_ids = latent_image_ids.reshape(
            batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
        )

        return latent_image_ids.to(device=device, dtype=dtype)
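
    # Shape sketch for the ids above (illustrative numbers): a 1024x1024 request
    # yields a 64x64 packed-latent grid, so `_prepare_latent_image_ids` returns a
    # `(batch_size, 4096, 3)` tensor whose last axis holds (id, row, column)
    # positions, with the first channel left at zero for image tokens; these feed
    # the transformer's rotary position embeddings via `img_ids`.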

    @staticmethod
    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
        latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
        latents = latents.permute(0, 2, 4, 1, 3, 5)
        latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)

        return latents

    @staticmethod
    def _unpack_latents(latents, height, width, vae_scale_factor):
        batch_size, num_patches, channels = latents.shape

        height = height // vae_scale_factor
        width = width // vae_scale_factor

        latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
        latents = latents.permute(0, 3, 1, 4, 2, 5)

        latents = latents.reshape(batch_size, channels // 4, height * 2, width * 2)

        return latents

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        height = 2 * (int(height) // self.vae_scale_factor)
        width = 2 * (int(width) // self.vae_scale_factor)

        shape = (batch_size, num_channels_latents, height, width)

        if latents is not None:
            latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)
            return latents.to(device=device, dtype=dtype), latent_image_ids

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)

        latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype)

        return latents, latent_image_ids
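
    # Packing round trip, worked through for the defaults (illustrative): a
    # 1024x1024 request gives a (batch, 16, 128, 128) latent, `_pack_latents`
    # reshapes it to (batch, 64 * 64, 16 * 4) = (batch, 4096, 64) tokens, and
    # `_unpack_latents` restores (batch, 16, 128, 128) before VAE decoding.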

    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def joint_attention_kwargs(self):
        return self._joint_attention_kwargs

    @property
    def num_timesteps(self):
        return self._num_timesteps

    @property
    def interrupt(self):
        return self._interrupt

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 28,
        timesteps: List[int] = None,
        guidance_scale: float = 7.0,
        num_images_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` will
                be used instead.
            height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 7.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
                `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.

        Examples:

        Returns:
            [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
            images.
        """
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            prompt_2,
            height,
            width,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )

        self._guidance_scale = guidance_scale
        self._joint_attention_kwargs = joint_attention_kwargs
        self._interrupt = False

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # 3. Encode the prompt
        lora_scale = (
            self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
        )
        (
            prompt_embeds,
            pooled_prompt_embeds,
            text_ids,
        ) = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
            lora_scale=lora_scale,
        )

        # 4. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels // 4
        latents, latent_image_ids = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 5. Prepare timesteps
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.base_image_seq_len,
            self.scheduler.config.max_image_seq_len,
            self.scheduler.config.base_shift,
            self.scheduler.config.max_shift,
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            timesteps,
            sigmas,
            mu=mu,
        )
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # 6. Denoising loop
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if self.interrupt:
                    continue

                # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
                timestep = t.expand(latents.shape[0]).to(latents.dtype)

                # handle guidance
                if self.transformer.config.guidance_embeds:
                    guidance = torch.tensor([guidance_scale], device=device)
                    guidance = guidance.expand(latents.shape[0])
                else:
                    guidance = None

                noise_pred = self.transformer(
                    hidden_states=latents,
                    timestep=timestep / 1000,
                    guidance=guidance,
                    pooled_projections=pooled_prompt_embeds,
                    encoder_hidden_states=prompt_embeds,
                    txt_ids=text_ids,
                    img_ids=latent_image_ids,
                    joint_attention_kwargs=self.joint_attention_kwargs,
                    return_dict=False,
                )[0]

                # compute the previous noisy sample x_t -> x_t-1
                latents_dtype = latents.dtype
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

                if latents.dtype != latents_dtype:
                    if torch.backends.mps.is_available():
                        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
                        latents = latents.to(latents_dtype)

                if callback_on_step_end is not None:
                    callback_kwargs = {}
                    for k in callback_on_step_end_tensor_inputs:
                        callback_kwargs[k] = locals()[k]
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                # update the progress bar, if appropriate
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        if output_type == "latent":
            image = latents
        else:
            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
            latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
            image = self.vae.decode(latents, return_dict=False)[0]
            image = self.image_processor.postprocess(image, output_type=output_type)

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return (image,)

        return FluxPipelineOutput(images=image)