o
    }oi                     @   s   d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZ d dl
mZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" G dd dej#Z$G dd de$Z%dS )    N)ListOptionalUnion)Image)	load_file)	save_file)nn)tqdm)FrozenCLIPEmbedderFrozenT5Embedder)Flux)FluxControlNetFluxControlNetConfig)FlowMatchEulerDiscreteScheduler)flux_transformer_converter)FluxModelParams)AutoEncoder)loggingc                %       s&  e Zd ZdZ						dGdedee dee dee dee	 d	e
f fd
dZdHddZdddddejfdeeee f de
deej deej de
deej deej fddZede
de
de
dejdejf
ddZed d! Zed"d# Ze	$	%	&	'dId(e
d)e
d*ed+efd,d-Z	dJd.d/Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zdddd8dd9dddddd:ddej dd;dfdeeee f dee
 dee
 d<e
d=eee
  d>edee
 d?eeej!eej! f  d@eej deej deej dAee de
dejdejdBe"dCe"dDef$dEdFZ#  Z$S )KFluxInferencePipelinea7  
    A pipeline for performing image generation with flux.

    Args:
        params (FluxModelParams, optional):
            Configuration parameters for the model pipeline, including device settings and model configurations.
        flux (Flux, optional):
            A pre-initialized Flux model used for the transformation process.
            If None, a new Flux model is created using the configuration in `params`.
        vae (AutoEncoder, optional):
            A pre-initialized VAE (Variational Autoencoder) model.
            If None, a new VAE model is created using the configuration in `params.vae_config`.
        t5 (FrozenT5Embedder, optional):
            A pre-initialized FrozenT5Embedder model.
            If None, a new T5 model is created using the configuration in `params.t5_params`.
        clip (FrozenCLIPEmbedder, optional):
            A pre-initialized FrozenCLIPEmbedder model.
            If None, a new CLIP model is created using the configuration in `params.clip_params`.
        scheduler_steps (int, optional):
            The number of scheduler steps to use for inference. Default is 1000.

    Attributes:
        device (torch.device): The device (CPU or GPU) where the models will be placed.
        vae (AutoEncoder): The VAE model used for image reconstruction or generation.
        clip_encoder (FrozenCLIPEmbedder): The CLIP encoder for processing image-text inputs.
        t5_encoder (FrozenT5Embedder): The T5 encoder for processing text inputs.
        transformer (Flux): The Flux model used for image-text joint processing.
        vae_scale_factor (float): A scale factor for the VAE, based on the number of channels in the VAE.
        scheduler (FlowMatchEulerDiscreteScheduler): Scheduler used for controlling the flow of inference steps.
        params (FluxModelParams): Configuration parameters used for model setup.

    Methods:
        load_from_pretrained:
            Loads model weights from a checkpoint.
        encoder_prompt:
            Encodes text prompts and retrieves embeddings.
        _prepare_latent_image_ids:
            Prepares latent image ids for the generation process.
        _pack_latents:
            Packs latents into the desired format for input to the model.
        _unpack_latents:
            Unpacks latents from the model into image format.
        _calculate_shift:
            Calculates the shift parameter used for controlling sequence lengths in the model.
        prepare_latents:
            Prepares the latent tensors and latent image ids for generation.
        _generate_rand_latents:
            Generates random latents using a specified generator.
        numpy_to_pil:
            Converts a numpy array or a batch of images to PIL images.
        torch_to_numpy:
            Converts a tensor of images to a numpy array.
        denormalize:
            Denormalizes the image to the range [0, 1].
        __call__:
            Runs the entire image generation process based on the input prompt, including encoding,
            latent preparation, inference, and output generation.

    Example:
        pipeline = FluxInferencePipeline(params)
        images = pipeline(
            prompt=["A beautiful sunset over a mountain range"],
            height=512,
            width=512,
            num_inference_steps=50,
            guidance_scale=7.5
        )
    N  paramsfluxvaet5clipscheduler_stepsc                    s   t    |j| _| j|j_| j|j_|du r"t|j| j n|| _	|du r9t
|jj|jj|jj|jjdn|| _|du rPt|jj|jj|jj|jjdn|| _|du rbt|j| j n|| _dt| j	jj | _t|d| _|| _dS )a  
        Initializes the FluxInferencePipeline with the provided models and configurations.

        Args:
            params (FluxModelParams, optional):
                Configuration parameters for the model pipeline, including device settings and model configurations.
            flux (Flux, optional):
                A pre-initialized Flux model used for the transformation process.
                If None, a new Flux model is created using the configuration in `params`.
            vae (AutoEncoder, optional):
                A pre-initialized VAE (Variational Autoencoder) model.
                If None, a new VAE model is created using the configuration in `params.vae_config`.
            t5 (FrozenT5Embedder, optional):
                A pre-initialized FrozenT5Embedder model.
                If None, a new T5 model is created using the configuration in `params.t5_params`.
            clip (FrozenCLIPEmbedder, optional):
                A pre-initialized FrozenCLIPEmbedder model.
                If None, a new CLIP model is created using the configuration in `params.clip_params`.
            scheduler_steps (int, optional): The number of scheduler steps to use for inference. Default is 1000.
        N)version
max_lengthalways_return_pooleddevice)r   r   load_config_only   )num_train_timesteps)super__init__r   clip_params	t5_paramsr   
vae_configtoevalr   r
   r   r   r   clip_encoderr   r    
t5_encoderr   flux_configtransformerlenr   ch_multvae_scale_factorr   	scheduler)selfr   r   r   r   r   r   	__class__ c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/models/flux/pipeline.pyr$   j   s6   


$$

zFluxInferencePipeline.__init__Tc                 C   s   |r"t || jj}|dur!tj|d}t|| td|  nt	|}| jj
|dd\}}dd |D }t|dkrOtd	|  td
|  dS dS )a  
        Loads the model's weights from a checkpoint. If HF ckpt is provided, it will be converted to NeMo
        format and save it to local folder.

        Args:
            ckpt_path (str):
                Path to the checkpoint file.
            do_convert_from_hf (bool, optional):
                Whether to convert the checkpoint from Hugging Face format before loading. Default is True.
            save_converted_model_to (str, optional):
                Path to save the converted checkpoint if `do_convert_from_hf` is True. Default is None.

        Logs:
            The function logs information about missing or unexpected keys during checkpoint loading.
        N!nemo_flux_transformer.safetensors+saving converted transformer checkpoint to Fstrictc                 S      g | ]	}| d s|qS _extra_stateendswith.0kr5   r5   r6   
<listcomp>       z>FluxInferencePipeline.load_from_pretrained.<locals>.<listcomp>r   zThe following keys are missing during checkpoint loading, please check the ckpt provided or the image quality may be compromised.
 Found unexepected keys: 
 )r   r-   configospathjoinsave_safetensorsr   infoload_safetensorsload_state_dictr.   )r2   	ckpt_pathdo_convert_from_hfsave_converted_model_tockpt	save_pathmissing
unexpectedr5   r5   r6   load_from_pretrained   s$   
z*FluxInferencePipeline.load_from_pretrained      cudapromptnum_images_per_promptprompt_embedspooled_prompt_embedsmax_sequence_lengthr   dtypec                 C   sD  |dur	t |}n|dur|jd }ntd|dkr'| jj|kr'| j| |du r2| j||d}|jd }	|d|d}||| |	dj|d}|dkr[| jj|kr[| j| |du rf| |\}
}|d|d}||| dj|d}|dur|n| jj	}t
||jd d	j||d
}||dd}|dd||fS )ad  
        Encodes a text prompt (or a batch of prompts) into embeddings using both T5 and CLIP models.

        Args:
            prompt (Union[str, List[str]]):
                The text prompt(s) to be encoded. Can be a string or a list of strings.
            num_images_per_prompt (int, optional):
                The number of images to generate per prompt. Default is 1.
            prompt_embeds (torch.FloatTensor, optional):
                Precomputed prompt embeddings, if available. Default is None.
            pooled_prompt_embeds (torch.FloatTensor, optional):
                Precomputed pooled prompt embeddings, if available. Default is None.
            max_sequence_length (int, optional):
                The maximum sequence length for the text model. Default is 512.
            device (torch.device, optional):
                The device (CPU or CUDA) on which the models are placed. Default is 'cuda'.
            dtype (torch.dtype, optional):
                The data type for tensor operations. Default is `torch.float`.

        Returns:
            Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
                - The prompt embeddings.
                - The pooled prompt embeddings.
                - The text IDs for the prompt.

        Raises:
            ValueError: If neither `prompt` nor `prompt_embeds` are provided.
        Nr   0Either prompt or prompt_embeds must be provided.rX   )r]   rV   r^      r   r^   )r.   shape
ValueErrorr+   r   r(   repeatviewr*   r^   torchzeros	transpose)r2   rY   rZ   r[   r\   r]   r   r^   
batch_sizeseq_len_text_idsr5   r5   r6   encoder_prompt   s,   &

z$FluxInferencePipeline.encoder_promptrk   heightwidthc           	      C   s   t |d |d d}|d t |d dddf  |d< |d t |d dddf  |d< |j\}}}|dddf | ddd}|| || |}|j||dS )a  
        Prepares latent image IDs for input into the model. These IDs represent the image grid.

        Args:
            batch_size (int): The number of samples in the batch.
            height (int): The height of the image.
            width (int): The width of the image.
            device (torch.device): The device to place the tensor.
            dtype (torch.dtype): The data type for the tensor.

        Returns:
            torch.FloatTensor: A tensor representing the latent image IDs.
        r!   rb   ).rV   N).r!   rV   rc   )rh   ri   arangerd   rf   reshaper(   )	rk   rp   rq   r   r^   latent_image_idslatent_image_id_heightlatent_image_id_widthlatent_image_id_channelsr5   r5   r6   _prepare_latent_image_ids
  s   &&
z/FluxInferencePipeline._prepare_latent_image_idsc                 C   sR   |  |||d d|d d} | dddddd} | ||d |d  |d } | S )a  
        Packs latents into desired shape, e.g. (B, C, H, W) --> (B, (H//2)*(W//2), C * 4).

        Args:
            latents (torch.Tensor): The latents to be packed.
            batch_size (int): The number of samples in the batch.
            num_channels_latents (int): The number of channels in the latents.
            height (int): The height of the image.
            width (int): The width of the image.

        Returns:
            torch.Tensor: The packed latents.
        r!   r      rV   rb      )rg   permuters   )latentsrk   num_channels_latentsrp   rq   r5   r5   r6   _pack_latents&  s   z#FluxInferencePipeline._pack_latentsc                 C   sh   | j \}}}|| }|| }| ||||d dd} | dddddd} | ||d |d |d } | S )a  
        Unpacks the latents from the model output into an image format suitable for further processing.

        The method reshapes and permutes the latents, adjusting their dimensions according to the
        specified `vae_scale_factor` to match the expected resolution of the image.

        Args:
            latents (torch.Tensor): The latents output from the model, typically in a compact, compressed format.
            height (int): The original height of the image before scaling, used to adjust the latent dimensions.
            width (int): The original width of the image before scaling, used to adjust the latent dimensions.
            vae_scale_factor (int): A scale factor used to adjust the resolution of the image when unpacking.
                This factor istypically the inverse of the VAE downsampling factor.

        Returns:
            torch.Tensor: The unpacked latents reshaped to match the expected dimensions for image reconstruction.
                The output tensor will have shape `(batch_size, channels, height * 2, width * 2)`.

        Notes:
            - This function is intended to convert latents back into a format
            that can be decoded into images by the VAE.
        ry   r!   r   rb   rV   rz   )rd   rg   r{   rs   )r|   rp   rq   r0   rk   num_patcheschannelsr5   r5   r6   _unpack_latents;  s   z%FluxInferencePipeline._unpack_latents            ?(\?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S Nr5   )image_seq_lenr   r   r   r   mbmur5   r5   r6   _calculate_shift^  s   	z&FluxInferencePipeline._calculate_shiftc	                 C   s   dt | | j }dt | | j }||||f}	|dur.| |||||}
|j||d|
fS t|trFt||krFtdt| d| dtj	|	||||d}| 
|||||}| |||||}
|dd	|
fS )
a  
        Prepares and optionally generates image latents for use in the image generation pipeline.

        This method can either use the provided latents (if already available) or generate new random latents
        using a random generator. The generated latents are then packed and prepared for the model to process.

        Args:
            batch_size (int): The number of samples in the batch.
            num_channels_latents (int): The number of channels in the latents (e.g., depth of the latent tensor).
            height (int): The height of the image to be generated (before scaling).
            width (int): The width of the image to be generated (before scaling).
            dtype (torch.dtype): The data type to use for the latents (e.g., `torch.float32`).
            device (torch.device): The device on which the latents will reside (e.g., 'cuda').
            generator (Union[torch.Generator, List[torch.Generator]]): A random number generator or a list of
                generatorsfor generating random latents. If a list is provided, its length must match the batch size.
            latents (Optional[torch.FloatTensor]): An optional pre-existing latent tensor. If provided, it is used
                instead of generating new latents.

        Returns:
            tuple: A tuple containing:
                - latents (torch.Tensor):
                    The prepared latents, with shape `(batch_size, num_channels_latents, height, width)`.
                - latent_image_ids (torch.Tensor):
                    A tensor containing latent image IDs for each batch sample, used for indexing
                    in the model.

        Raises:
            ValueError: If a list of generators is provided but its length does not match the batch size.

        r!   Nrc   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatorr   r^   rk   r   rV   )intr0   rx   r(   
isinstancelistr.   re   r   _generate_rand_latentsr~   rj   )r2   rk   r}   rp   rq   r^   r   r   r|   rd   rt   r5   r5   r6   prepare_latentsl  s$   )
z%FluxInferencePipeline.prepare_latentsc                    sf   t tr(ddd   fddt|D }tj|ddj d}|S tj d	}|S )
zY
        Create random latents using a random generator or a list of generators.
        )rV   rV   Nc                    s"   g | ]}t j|  d qS )r   r   r^   )rh   randn)rA   ir   r^   r   rd   r5   r6   rC     s    z@FluxInferencePipeline._generate_rand_latents.<locals>.<listcomp>r   dimr   r   )r   r   rangerh   catr(   r   )rd   r   r   r^   rk   r|   r5   r   r6   r     s   
z,FluxInferencePipeline._generate_rand_latentsc                 C   s6   | j dkr	| d } | d  d} dd | D }|S )zL
        Convert a numpy image or a batch of images to a PIL image.
        rb   )N.   uint8c                 S   s   g | ]}t |qS r5   )r   	fromarrayrA   imager5   r5   r6   rC     s    z6FluxInferencePipeline.numpy_to_pil.<locals>.<listcomp>)ndimroundastype)images
pil_imagesr5   r5   r6   numpy_to_pil  s
   
z"FluxInferencePipeline.numpy_to_pilc                 C   s    |    dddd }|S )zN
        Convert a torch image or a batch of images to a numpy image.
        r   r!   rb   rV   )floatcpur{   numpy)r   numpy_imagesr5   r5   r6   torch_to_numpy  s   z$FluxInferencePipeline.torch_to_numpyc                 C   s   | d d  ddS )Nr!   r   r   rV   )clamp)r   r5   r5   r6   denormalize  s   z!FluxInferencePipeline.denormalize         @pilFnum_inference_steps	timestepsguidance_scaler   r|   output_typesave_to_diskoffloadoutput_pathc           !         s  |dksJ d|durt |trd}|g}n"|dur%t |tr%t|}n|
dur5t |
tjr5|
jd }ntd| j||
| |||d\}
}}|r[| j	
d | j
d tj  | jjd	 }| |  |||||||	\}	}td
d| |}|	jd }t|| jj| jj| jj| jj}| jj|||d | jj}|dkr|| jkr| j
| t  tt|D ]Y\}}| |	jd j
|	j|	j!d}| jj"rtj#|g|d |	jd }nd}tj$d|	j!d  | j|	|
||d |||d}| j%|||	d }	W d   n	1 sw   Y  q|r| j
d tj  |dkr1|	&ddW  d   S |dkr| '|	&dd||| j(}	|dkrT|| jkrT| j)
| tj$d|	j!d | j)*|	}W d   n	1 snw   Y  |r| j)
d tj  t+|}t,|}t-|}W d   n	1 sw   Y  |rt.d t/j0|dd t|t1t|  ksJ  fdd|D }t2||D ]\} }|3t/j45||  d q|S )a+  
        Generates images based on a given text prompt and various model parameters.
        Optionally saves the images to disk.

        This method orchestrates the process of generating images by embedding the prompt, preparing the latent
        vectors, iterating through timesteps in the diffusion process, and then decoding the latent representation
        back into an image. It supports both the generation of latent representations or final images in a desired
        output format (e.g., PIL image). The images are optionally saved to disk with a unique filename based
        on the prompt.

        Args:
            prompt (Union[str, List[str]]):
                A text prompt or a list of text prompts to guide image generation. Each prompt
                generates one or more images based on the `num_images_per_prompt`.
            height (Optional[int]):
                The height of the output image. Default is 512.
            width (Optional[int]):
                The width of the output image. Default is 512.
            num_inference_steps (int):
                The number of steps for the diffusion process. Default is 28.
            timesteps (Optional[List[int]]):
                A list of specific timesteps for the diffusion process. If not provided,
                they are automatically calculated.
            guidance_scale (float):
                The scale of the guidance signal, typically used to control the strength of prompt conditioning.
            num_images_per_prompt (Optional[int]):
                The number of images to generate per prompt. Default is 1.
            generator (Optional[Union[torch.Generator, List[torch.Generator]]]):
                A random number generator or a list of generators
                for generating latents. If a list is provided, it should match the batch size.
            latents (Optional[torch.FloatTensor]):
                Pre-existing latents to use instead of generating new ones.
            prompt_embeds (Optional[torch.FloatTensor]):
                Optionally pre-computed prompt embeddings to skip the prompt encoding step.
            pooled_prompt_embeds (Optional[torch.FloatTensor]):
                Optionally pre-computed pooled prompt embeddings.
            output_type (Optional[str]):
                The format of the output. Can be "latent" or "pil" (PIL image). Default is "pil".
            max_sequence_length (int):
                The maximum sequence length for tokenizing the prompt. Default is 512.
            device (torch.device):
                The device on which the computation should take place (e.g., 'cuda'). Default is 'cuda'.
            dtype (torch.dtype):
                The data type of the latents and model weights. Default is `torch.float32`.
            save_to_disk (bool):
                Whether or not to save the generated images to disk. Default is True.
            offload (bool):
                Whether or not to offload model components to CPU to free up GPU memory during the process.
                Default is False.

        Returns:
            Union[List[Image.Image], torch.Tensor]:
                The generated images or latents, depending on the `output_type` argument.
                If `output_type` is "pil", a list of PIL images is returned. If "latent", the latents are returned.

        Raises:
            ValueError: If neither a `prompt` nor `prompt_embeds` is provided.

        Notes:
            - The model expects a device of 'cuda'.
              The method will raise an assertion error if a different device is provided.
            - The method handles both prompt-based and pre-embedded prompt input,
              providing flexibility for different usage scenarios.
            - If `save_to_disk` is enabled, images will be saved with a filename derived from the prompt text.
        rX   4Transformer blocks in Mcore must run on cuda devicesNrV   r   r_   rY   r[   r\   rZ   r]   r   r^   r   ry         ?sigmasr   r   rc   r   device_typer^   r   )imgtxtyr   img_idstxt_idsguidancelatentr   Saving to diskTexist_okc                    0   g | ]}t  D ]}|d d d|  qqS N(   rm   r   rA   pidxrZ   r5   r6   rC        0 z2FluxInferencePipeline.__call__.<locals>.<listcomp>.png)6r   strr   r.   rh   FloatTensorrd   re   ro   r+   r(   r*   rX   empty_cacher-   in_channelsr   nplinspacer   r   r1   base_image_seq_lenmax_image_seq_lenr   r   set_timestepsr   r   no_gradr	   	enumerateexpandr^   guidance_embedtensorautocaststeprj   r   r0   r   decoder   r   r   printrG   makedirsr   zipsaverH   rI   )!r2   rY   rp   rq   r   r   r   rZ   r   r|   r[   r\   r   r]   r   r^   r   r   r   rk   rn   r}   rt   r   r   r   r   ttimestepr   predr   	file_namer5   r   r6   __call__  s   V
	


	







$zFluxInferencePipeline.__call__)NNNNNr   TN)r   r   r   r   r   )%__name__
__module____qualname____doc__r   r   r   r   r   r
   r   r$   rU   rh   r   r   r   r   r   r   r^   ro   staticmethodrx   r~   r   r   r   r   r   r   r   float32	Generatorboolr   __classcell__r5   r5   r3   r6   r   $   s    G
<%
B$

"
A




	
r   c                .       s  e Zd ZdZ								d:dee dee dedede	d	e
d
edef fddZ	d;ddZdd ZdejdejfddZdd Zddddddddddddddejddd d!dd!dfd"eeee f d#ee d$ee d%ed&eee  d'ed(ee d)eeejeej f  d*eej d+eej d,eej d-ee d.ed/ejd0ejd1ed2ed3ed4ed5ee j ejf d6eeee f d7ef,d8d9Z!  Z"S )<FluxControlNetInferencePipelinezu
    Flux Contronlnet inference pipeline initializes controlnet component in addition to a normal flux pipeline.
    Nr   r   contorlnet_configr   r   r   r   r   flux_controlnetc	           	         s6   t  |||||| |du rt|| _dS || _dS )zI
        Same as Flux Inference Pipeline with controlnet object.
        N)r#   r$   r   r   )	r2   r   r   r   r   r   r   r   r   r3   r5   r6   r$     s    z(FluxControlNetInferencePipeline.__init__Tc           
      C   s.  |r=t || jj}t || jj}|dur<tj|d}t|| t	d|  tj|d}t|| t	d|  nt
|}t
|}| jj|dd\}}	dd |D }t|d	krlt	d
|  t	d|	  | jj|dd\}}	dd |D }t|d	krt	d|  t	d|	  dS dS )zZ
        Converts both flux base model and flux controlnet ckpt into NeMo format.
        Nr7   r8   z,nemo_flux_controlnet_transformer.safetensorsFr9   c                 S   r;   r<   r>   r@   r5   r5   r6   rC     rD   zHFluxControlNetInferencePipeline.load_from_pretrained.<locals>.<listcomp>r   zThe following keys are missing during flux checkpoint loading, please check the ckpt provided or the image quality may be compromised.
 rE   c                 S   r;   r<   r>   r@   r5   r5   r6   rC     rD   zThe following keys are missing during controlnet checkpoint loading, please check the ckpt provided or the image quality may be compromised.
 )r   r-   rF   r   rG   rH   rI   rJ   r   rK   rL   rM   r.   )
r2   flux_ckpt_pathcontrolnet_ckpt_pathrO   rP   	flux_ckptflux_controlnet_ckptrR   rS   rT   r5   r5   r6   rU     s@   

z4FluxControlNetInferencePipeline.load_from_pretrainedc                 C   s0   t |ts|g}dd |D }tj|dd}|S )z*
        PIL image to numpy array
        c                 S   s"   g | ]}t |t jd  qS )g     o@)r   arrayr   r   r   r5   r5   r6   rC     s   " z@FluxControlNetInferencePipeline.pil_to_numpy.<locals>.<listcomp>r   )axis)r   r   r   stackr2   r   r5   r5   r6   pil_to_numpy  s
   
z,FluxControlNetInferencePipeline.pil_to_numpyr   returnc                 C   s,   |j dkr	|d }t|dddd}|S )z8
        Convert numpy image into torch tensors
        rb   ).Nr   rV   r!   )r   rh   
from_numpyrj   r  r5   r5   r6   numpy_to_pt  s   
z+FluxControlNetInferencePipeline.numpy_to_ptc                    s   t |tjrn'|d j|d j}}	 |ks|	kr$ fdd|D }| |}| |}|jd }
|
dkr:|}n|}|j|dd}|j	||d}|S )zS
        Preprocess image into torch tensor, also duplicate by batch size.
        r   c                    s   g | ]}|j  fd dqS )rb   )resample)resizer   rp   rq   r5   r6   rC     s    zAFluxControlNetInferencePipeline.prepare_image.<locals>.<listcomp>rV   r   rc   )
r   rh   Tensorrp   rq   r  r  rd   repeat_interleaver(   )r2   r   rp   rq   rk   rZ   r   r^   orig_height
orig_widthimage_batch_size	repeat_byr5   r  r6   prepare_image  s   


z-FluxControlNetInferencePipeline.prepare_imagerW   r   r   rV   r   rX   Fg        r   rY   rp   rq   r   r   r   rZ   r   r|   r[   r\   r   r]   r   r^   r   r   control_guidance_startcontrol_guidance_endcontrol_imagecontrolnet_conditioning_scaler   c           +         s  |dksJ d|durt |trd}|g}n"|dur%t |tr%t|}n|
dur5t |
tjr5|
jd }ntd| j||
| |||d\}
}}|r[| j	
d | j
d tj  | jjd	 }| |  |||||||	\}	}td
d| |}|	jd }t|| jj| jj| jj| jj}| jj|||d | jj}| j||||   |tjd}|jdd \}}| jjdu r|dkr| j |kr| j!
| t"  | j!#|j
|d}W d   n1 sw   Y  |jdd \}}| $||  |||%dd}g } t&t|D ]}!| 'd
t(|!t| |k p!|!d t| |k  q	|dkr>|| j kr>| j
| | j
| t"  t)t*|D ]u\}!}"|"+|	jd j
|	j |	j,d}#| jj-rqtj.|g|d+|	jd }$nd}$| |! | }%tj/d|	j,d4 | j|	||
||#d |||$|%d	\}&}'| j|	|
||#d |||$|&|'d	}(| j0|(|"|	d }	W d   n	1 sw   Y  qJ|r| j
d tj  |dkr|	%ddW  d   S |dkr?| 1|	%dd||| j2}	|dkr|| j kr| j!
| tj/d|	j,d | j!3|	})W d   n	1 sw   Y  |r0| j!
d tj  t4|)})t5|)})t6|)})W d   n	1 sJw   Y  |rt7d t8j9|dd t|)t:t|  kslJ  fdd|D }t;||)D ]\}*})|)<t8j=>||* d qz|)S )a  
        Generates images based on a given text prompt and optionally incorporates control images and ControlNet for
        guidance.

        This method generates images by embedding the prompt, preparing the latent vectors, iterating through timesteps
        in the diffusion process, and then decoding the latent representation back into an image. The method supports
        control images through ControlNet, where the `control_image` is used to condition the image generation.
        It also allows you to specify custom guidance scales and other parameters. Generated images can be saved
        to disk if requested.

        Args:
            prompt (Union[str, List[str]]):
                A text prompt or a list of text prompts to guide image generation. Each prompt generates one or more
                images based on the `num_images_per_prompt`.
            height (Optional[int]):
                The height of the output image. Default is 512.
            width (Optional[int]):
                The width of the output image. Default is 512.
            num_inference_steps (int):
                The number of steps for the diffusion process. Default is 28.
            timesteps (Optional[List[int]]):
                A list of specific timesteps for the diffusion process. If not provided, they are automatically
                calculated.
            guidance_scale (float):
                The scale of the guidance signal, typically used to control the strength of prompt conditioning.
            num_images_per_prompt (Optional[int]):
                The number of images to generate per prompt. Default is 1.
            generator (Optional[Union[torch.Generator, List[torch.Generator]]]):
                A random number generator or a list of generators for generating latents. If a list is provided,
                it should match the batch size.
            latents (Optional[torch.FloatTensor]):
                Pre-existing latents to use instead of generating new ones.
            prompt_embeds (Optional[torch.FloatTensor]):
                Optionally pre-computed prompt embeddings to skip the prompt encoding step.
            pooled_prompt_embeds (Optional[torch.FloatTensor]):
                Optionally pre-computed pooled prompt embeddings.
            output_type (Optional[str]):
                The format of the output. Can be "latent" or "pil" (PIL image). Default is "pil".
            max_sequence_length (int):
                The maximum sequence length for tokenizing the prompt. Default is 512.
            device (torch.device):
                The device on which the computation should take place (e.g., 'cuda'). Default is 'cuda'.
            dtype (torch.dtype):
                The data type of the latents and model weights. Default is `torch.float32`.
            save_to_disk (bool):
                Whether or not to save the generated images to disk. Default is True.
            offload (bool):
                Whether or not to offload model components to CPU to free up GPU memory during the process.
                Default is False.
            control_guidance_start (float):
                The start point for control guidance to apply during the diffusion process.
            control_guidance_end (float):
                The end point for control guidance to apply during the diffusion process.
            control_image (Union[Image.Image, torch.FloatTensor]):
                The image used for conditioning the generation process via ControlNet.
            controlnet_conditioning_scale (Union[float, List[float]]):
                Scaling factors to control the impact of the control image in the generation process.
                Can be a single value or a list for multiple images.

        Returns:
            Union[List[Image.Image], torch.Tensor]:
                The generated images or latents, depending on the `output_type` argument.
                If `output_type` is "pil", a list of PIL images is returned. If "latent", the latents are returned.

        Raises:
            ValueError: If neither a `prompt` nor `prompt_embeds` is provided.

        Notes:
            - The model expects a device of 'cuda'.
              The method will raise an assertion error if a different device is provided.
            - The method supports conditional image generation using ControlNet, where a `control_image` can guide the
              generation process.
            - If `save_to_disk` is enabled, images will be saved with a filename derived from the prompt text.
        rX   r   NrV   r   r_   r   r   ry   r   r   )r   rp   rq   rk   rZ   r   r^   ra   r!   rc   r   r   r   )	r   controlnet_condr   r   r   r   r   r   conditioning_scale)	r   r   r   r   r   r   r   controlnet_double_block_samplescontrolnet_single_block_samplesr   r   r   Tr   c                    r   r   r   r   r   r5   r6   rC     r   z<FluxControlNetInferencePipeline.__call__.<locals>.<listcomp>r   )?r   r   r   r.   rh   r   rd   re   ro   r+   r(   r*   rX   r   r-   r   r   r   r   r   r   r1   r   r   r   r   r   r   r  r   r   input_hint_blockr   r   r   encoder~   rj   r   appendr   r	   r   r   r^   r   r   r   r   r   r0   r   r   r   r   r   rG   r   r   r   r   rH   rI   )+r2   rY   rp   rq   r   r   r   rZ   r   r|   r[   r\   r   r]   r   r^   r   r   r  r  r  r  r   rk   rn   r}   rt   r   r   r   height_control_imagewidth_control_imagecontrolnet_keepr   r   r   r   r  r  r  r   r   r   r5   r   r6   r     s  c
	



(





(



4z(FluxControlNetInferencePipeline.__call__)NNNNNNr   Nr   )#r   r   r   r   r   r   r   r   r   r   r
   r   r   r$   rU   r  r   ndarrayrh   r  r  r  r   r   r   r   r   r   r   r   r^   r   r   r   r   r5   r5   r3   r6   r     s    	
(
#
	
r   )&rG   typingr   r   r   r   r   rh   PILr   safetensors.torchr   rL   r   rJ   r   r	   /nemo.collections.diffusion.encoders.conditionerr
   r   ,nemo.collections.diffusion.models.flux.modelr   7nemo.collections.diffusion.models.flux_controlnet.modelr   r   Jnemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discreter   4nemo.collections.diffusion.utils.flux_ckpt_converterr   4nemo.collections.diffusion.utils.flux_pipeline_utilsr   *nemo.collections.diffusion.vae.autoencoderr   
nemo.utilsr   Moduler   r   r5   r5   r5   r6   <module>   s.       r