o
    pi&                     @   sp   d dl mZmZmZmZmZ d dlZddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZ G d	d
 d
eZdS )    )DictListOptionalTupleUnionN   )AutoencoderKLDiTTransformer2DModel)KarrasDiffusionSchedulers)randn_tensor   )DiffusionPipelineImagePipelineOutputc                       s   e Zd ZdZdZ	ddedededee	e
ef  f fdd	Zd
eeee f dee
 fddZe 					ddee
 dedeeejeej f  de
dee dedeeef fddZ  ZS )DiTPipelinea  
    Pipeline for image generation based on a Transformer backbone instead of a UNet.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        transformer ([`DiTTransformer2DModel`]):
            A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        scheduler ([`DDIMScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    ztransformer->vaeNtransformervae	schedulerid2labelc                    s|   t    | j|||d i | _|d ur<| D ]\}}|dD ]}t|| j|  < q!qt	t
| j | _d S d S )N)r   r   r   ,)super__init__register_moduleslabelsitemssplitintlstriprstripdictsorted)selfr   r   r   r   keyvaluelabel	__class__ b/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/dit/pipeline_dit.pyr   1   s   
zDiTPipeline.__init__r#   returnc                    sN   t |ts	t|}|D ]}| jvrt| d j dq fdd|D S )a0  

        Map label strings from ImageNet to corresponding class ids.

        Parameters:
            label (`str` or `dict` of `str`):
                Label strings to be mapped to class ids.

        Returns:
            `list` of `int`:
                Class ids to be processed by pipeline.
        zK does not exist. Please make sure to select one of the following labels: 
 .c                    s   g | ]} j | qS r&   )r   ).0lr    r&   r'   
<listcomp>Z   s    z-DiTPipeline.get_label_ids.<locals>.<listcomp>)
isinstancelistr   
ValueError)r    r#   r+   r&   r,   r'   get_label_idsC   s   

zDiTPipeline.get_label_ids      @2   pilTclass_labelsguidance_scale	generatornum_inference_stepsoutput_typereturn_dictc                 C   s  t |}| jjj}| jjj}	t||	||f|| j| jjd}
|dkr)t	|
gd n|
}tj
|| jdd}tj
dg| | jd}|dkrMt	||gdn|}| j| | | jjD ]}|dkru|dt |d  }tj	||gdd	}| j||}|}t|s|jjd
k}t|tr|rtjntj}n|rtjntj}tj
|g||jd}nt |jdkr|d |j}||jd }| j|||dj}|dkr|ddd|	f |dd|	df }}tj|t |d dd	\}}||||   }tj	||gdd	}tj	||gdd	}| jjjd |	kr!tj||	dd	\}}n|}| j |||j!}q\|dkr<|j"ddd	\}
}n|}
d| j#jj$ |
 }
| j#%|
j}|d d &dd}|' (dddd ) }|dkrp| *|}| +  |sz|fS t,|dS )a>	  
        The call function to the pipeline for generation.

        Args:
            class_labels (List[int]):
                List of ImageNet class labels for the images to be generated.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            generator (`torch.Generator`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            num_inference_steps (`int`, *optional*, defaults to 250):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        ```py
        >>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler
        >>> import torch

        >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16)
        >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        >>> pipe = pipe.to("cuda")

        >>> # pick words from Imagenet class labels
        >>> pipe.labels  # to print all available words

        >>> # pick words that exist in ImageNet
        >>> words = ["white shark", "umbrella"]

        >>> class_ids = pipe.get_label_ids(words)

        >>> generator = torch.manual_seed(33)
        >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator)

        >>> image = output.images[0]  # label 'white shark'
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images
        )shaper7   devicedtype   r   )r<   i  r   N)dimmps)r=   r<   )timestepr5   g      ?r   r4   )images)-lenr   configsample_sizein_channelsr   _execution_devicer=   torchcattensorreshaper   set_timestepsprogress_bar	timestepsscale_model_input	is_tensorr<   typer.   floatfloat32float64int32int64r;   toexpandsampler   out_channelsstepprev_samplechunkr   scaling_factordecodeclampcpupermutenumpynumpy_to_pilmaybe_free_model_hooksr   )r    r5   r6   r7   r8   r9   r:   
batch_sizelatent_sizelatent_channelslatentslatent_model_input
class_nullclass_labels_inputthalfrO   is_mpsr=   
noise_predepsrestcond_eps
uncond_epshalf_epsmodel_output_samplesr&   r&   r'   __call__\   sr   <





*



zDiTPipeline.__call__)N)r2   Nr3   r4   T)__name__
__module____qualname____doc__model_cpu_offload_seqr	   r   r
   r   r   r   strr   r   r   r1   rI   no_gradrS   	Generatorboolr   r   rz   __classcell__r&   r&   r$   r'   r      sF    "
r   )typingr   r   r   r   r   rI   modelsr   r	   
schedulersr
   utils.torch_utilsr   pipeline_utilsr   r   r   r&   r&   r&   r'   <module>   s   