o
    ۷i(                     @   s   d dl Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ e r5d dlm  mZ d	Znd
ZG dd deZdS )    N   )AutoencoderKLDiTTransformer2DModel)KarrasDiffusionSchedulers)is_torch_xla_available)randn_tensor   )DiffusionPipelineImagePipelineOutputTFc                       s   e Zd ZdZdZ	ddedededee	e
f dB f fdd	Zd
e
ee
 B dee	 fddZe 					ddee	 dedejeej B dB de	de
dB dedeeB fddZ  ZS )DiTPipelinea  
    Pipeline for image generation based on a Transformer backbone instead of a UNet.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        transformer ([`DiTTransformer2DModel`]):
            A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents. Initially published as
            [`Transformer2DModel`](https://huggingface.co/facebook/DiT-XL-2-256/blob/main/transformer/config.json#L2)
            in the config, but the mismatch can be ignored.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        scheduler ([`DDIMScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    ztransformer->vaeNtransformervae	schedulerid2labelc                    s|   t    | j|||d i | _|d ur<| D ]\}}|dD ]}t|| j|  < q!qt	t
| j | _d S d S )N)r   r   r   ,)super__init__register_moduleslabelsitemssplitintlstriprstripdictsorted)selfr   r   r   r   keyvaluelabel	__class__ Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/dit/pipeline_dit.pyr   :   s   
zDiTPipeline.__init__r   returnc                    sN   t |ts	t|}|D ]}| jvrt| d j dq fdd|D S )a0  

        Map label strings from ImageNet to corresponding class ids.

        Parameters:
            label (`str` or `dict` of `str`):
                Label strings to be mapped to class ids.

        Returns:
            `list` of `int`:
                Class ids to be processed by pipeline.
        zK does not exist. Please make sure to select one of the following labels: 
 .c                    s   g | ]} j | qS r"   )r   ).0lr   r"   r#   
<listcomp>c   s    z-DiTPipeline.get_label_ids.<locals>.<listcomp>)
isinstancelistr   
ValueError)r   r   r'   r"   r(   r#   get_label_idsL   s   

zDiTPipeline.get_label_ids      @2   pilTclass_labelsguidance_scale	generatornum_inference_stepsoutput_typereturn_dictc                 C   s   t |}| jjj}| jjj}	t||	||f|| j| jjd}
|dkr)t	|
gd n|
}tj
|| jdd}tj
dg| | jd}|dkrMt	||gdn|}| j| | | jjD ]}|dkru|dt |d  }tj	||gdd	}| j||}|}t|s|jjd
k}|jjdk}t|tr|s|rtjntj}n
|s|rtjntj}tj
|g||jd}nt |jdkr|d |j}||jd }| j|||dj}|dkr|ddd|	f |dd|	df }}tj|t |d dd	\}}||||   }tj	||gdd	}tj	||gdd	}| jjjd |	kr+tj||	dd	\}}n|}| j |||j!}t"r=t#$  q\|dkrM|j%ddd	\}
}n|}
d| j&jj' |
 }
| j&(|
j}|d d )dd}|* +dddd , }|dkr| -|}| .  |s|fS t/|dS )a>	  
        The call function to the pipeline for generation.

        Args:
            class_labels (list[int]):
                list of ImageNet class labels for the images to be generated.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            generator (`torch.Generator`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 250):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        ```py
        >>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
        >>> import torch

        >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        >>> pipe = pipe.to("cuda")

        >>> # pick words from Imagenet class labels
        >>> pipe.labels  # to print all available words

        >>> # pick words that exist in ImageNet
        >>> words = ["white shark", "umbrella"]

        >>> class_ids = pipe.get_label_ids(words)

        >>> generator = torch.manual_seed(33)
        >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)

        >>> image = output.images[0]  # label 'white shark'
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images
        )shaper3   devicedtype   r   )r8   i  r   N)dimmpsnpu)r9   r8   )timestepr1   g      ?r   r0   )images)0lenr   configsample_sizein_channelsr   _execution_devicer9   torchcattensorreshaper   set_timestepsprogress_bar	timestepsscale_model_input	is_tensorr8   typer*   floatfloat32float64int32int64r7   toexpandsampler   out_channelsstepprev_sampleXLA_AVAILABLExm	mark_stepchunkr   scaling_factordecodeclampcpupermutenumpynumpy_to_pilmaybe_free_model_hooksr
   )r   r1   r2   r3   r4   r5   r6   
batch_sizelatent_sizelatent_channelslatentslatent_model_input
class_nullclass_labels_inputthalfrL   is_mpsis_npur9   
noise_predepsrestcond_eps
uncond_epshalf_epsmodel_output_samplesr"   r"   r#   __call__e   sz   <





*



zDiTPipeline.__call__)N)r.   Nr/   r0   T)__name__
__module____qualname____doc__model_cpu_offload_seqr   r   r   r   r   strr   r+   r-   rF   no_gradrP   	Generatorboolr
   tupler{   __classcell__r"   r"   r    r#   r   &   sF    r   )rF   modelsr   r   
schedulersr   utilsr   utils.torch_utilsr   pipeline_utilsr	   r
   torch_xla.core.xla_modelcore	xla_modelr\   r[   r   r"   r"   r"   r#   <module>   s   