o
    ۷i,                  
   @   s&  d dl Z d dlmZ d dlZd dlmZmZ ddlmZm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ e r_d dlm  mZ dZ ndZ e!e"Z#dZ$				dde%dB de&ej'B dB de(e% dB de(e) dB fddZ*G dd deZ+dS )    N)Callable)T5EncoderModelT5Tokenizer   )MultiPipelineCallbacksPipelineCallback)VaeImageProcessor)AutoencoderKLCogView3PlusTransformer2DModel)DiffusionPipeline)CogVideoXDDIMSchedulerCogVideoXDPMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )CogView3PipelineOutputTFa  
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import CogView3PlusPipeline

        >>> pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> prompt = "A photo of an astronaut riding a horse on mars"
        >>> image = pipe(prompt).images[0]
        >>> image.save("output.png")
        ```
num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r   r   r   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r   r   r    )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r   len)	schedulerr   r   r   r   kwargsaccepts_timestepsaccept_sigmasr   r   h/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/cogview3/pipeline_cogview3plus.pyretrieve_timesteps;   s2   r'   c                .       s\  e Zd ZdZg ZdZg dZdedede	de
deeB f
 fd	d
Z					dAdeee B dededejdB dejdB f
ddZ								dBdeee B deee B dB dededejdB dejdB dedejdB dejdB fddZdCddZdd  Z		dDd!d"Zed#d$ Zed%d& Zed'd( Zed)d* Ze  e!e"ddddd+dd,dd-dddddd.d/ddd0gdfdeee B dB deee B dB d1edB d2edB d3ed4ee dB d5e#ded6e#d7ej$eej$ B dB d0ej%dB dej%dB dej%dB d8e&eef dB d9e&eef d:ed;ed<e'eegdf e(B e)B dB d=ee ded>e*e&B f*d?d@Z+  Z,S )ECogView3PlusPipelinea  
    Pipeline for text-to-image generation using CogView3Plus.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogView3Plus uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogView3PlusTransformer2DModel`]):
            A text conditioned `CogView3PlusTransformer2DModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    ztext_encoder->transformer->vae)latentsprompt_embedsnegative_prompt_embeds	tokenizertext_encodervaetransformerr"   c                    sX   t    | j|||||d t| dd r dt| jjjd  nd| _t	| jd| _
d S )N)r,   r-   r.   r/   r"   r.      r      )vae_scale_factor)super__init__register_modulesgetattrr!   r.   configblock_out_channelsr2   r   image_processor)selfr,   r-   r.   r/   r"   r    r   r&   r4      s   

(zCogView3PlusPipeline.__init__Nr      promptnum_images_per_promptmax_sequence_lengthr   dtypec                 C   s  |p| j }|p
| jj}t|tr|gn|}t|}| j|d|dddd}|j}| j|dddj}	|	jd |jd kr[t	
||	s[| j|	d d |d df }
td	| d
|
  | ||d }|j||d}|j\}}}|d|d}||| |d}|S )N
max_lengthTpt)paddingrA   
truncationadd_special_tokensreturn_tensorslongest)rC   rF   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )r@   r   )_execution_devicer-   r@   
isinstancestrr!   r,   	input_idsshapetorchequalbatch_decodeloggerwarningtorepeatview)r:   r=   r>   r?   r   r@   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textr*   _seq_lenr   r   r&   _get_t5_prompt_embeds   s:   
  z*CogView3PlusPipeline._get_t5_prompt_embedsT   negative_promptdo_classifier_free_guidancer*   r+   c
              
   C   s  |p| j }t|tr|gn|}|durt|}
n|jd }
|du r+| j|||||	d}|r7|du r7||j}|r|du rt|trG|
|g n|}|durdt|t|urdtdt| dt| d|
t|kr}t	d| dt| d	| d|
 d
	| j|||||	d}||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                Number of images that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            max_sequence_length (`int`, defaults to `224`):
                Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        Nr   )r=   r>   r?   r   r@   z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
rI   rJ   rK   r!   rM   r]   	new_zerostype	TypeErrorr   )r:   r=   r_   r`   r>   r*   r+   r?   r   r@   rV   r   r   r&   encode_prompt   sN   
(

z"CogView3PlusPipeline.encode_promptc	           
      C   s   ||t || j t || j f}	t|tr(t||kr(tdt| d| d|d u r5t|	|||d}n||}|| jj	 }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatorr   r@   )
intr2   rJ   listr!   r   r   rS   r"   init_noise_sigma)
r:   rV   num_channels_latentsheightwidthr@   r   rf   r)   rM   r   r   r&   prepare_latents(  s    
z$CogView3PlusPipeline.prepare_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netarf   )r   r   r   r"   stepr   r   )r:   rf   rn   accepts_etaextra_step_kwargsaccepts_generatorr   r   r&   prepare_extra_step_kwargs?  s   z.CogView3PlusPipeline.prepare_extra_step_kwargsc                    sj  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d ur|d urt d| d| d|d ur|d urt d| d| d|d ur|d ur|j|jkrt d|j d|j dd S d S d S )Nr1   r   z7`height` and `width` have to be divisible by 8 but are z and ra   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0kr:   r   r&   	<genexpr>^  s    

z4CogView3PlusPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r   ru   rw   rz   r   r&   
<listcomp>b  s    z5CogView3PlusPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z and `negative_prompt_embeds`: z'Cannot forward both `negative_prompt`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )r   allrv   rJ   rK   rh   rc   rM   )r:   r=   rk   rl   r_   "callback_on_step_end_tensor_inputsr*   r+   r   rz   r&   check_inputsQ  sR   
z!CogView3PlusPipeline.check_inputsc                 C      | j S rt   _guidance_scalerz   r   r   r&   guidance_scale     z#CogView3PlusPipeline.guidance_scalec                 C   s
   | j dkS )Nr   r   rz   r   r   r&   r`     s   
z0CogView3PlusPipeline.do_classifier_free_guidancec                 C   r   rt   )_num_timestepsrz   r   r   r&   num_timesteps  r   z"CogView3PlusPipeline.num_timestepsc                 C   r   rt   )
_interruptrz   r   r   r&   	interrupt  r   zCogView3PlusPipeline.interrupt2   g      @g        )r   r   pilr)   rk   rl   r   r   r   rn   rf   original_sizecrops_coords_top_leftoutput_typereturn_dictcallback_on_step_endr~   returnc           *      C   sr  t |ttfr
|j}|p| jjj| j }|p| jjj| j }|p#||f}||f}| ||||||| || _	d| _
|durEt |trEd}n|durSt |trSt|}n|jd }| j}|dk}| j||| j|||||d\}}| jr{tj||gdd}trd}n|}t| j|||\}}t|| _| jjj}| || ||||j||
|}| |
|	}tj|g|jd	}tj|g|jd	}tj|g|jd	}| jrt||g}t||g}t||g}|||| d}|||| d}|||| d}tt||| jj   d}| j!|d
}d}t"|D ]\}} | j#r#q| jr/t|gd n|}!| j$|!| }!| %|!jd }"| j|!||"|||ddd }#|#& }#| jrf|#'d\}$}%|$| j(|%|$   }#t | jt)s| jj*|#| |fi |ddid }n| jj*|#|| |dkr||d  nd|fi |ddi\}}||j}|duri }&|D ]
}'t+ |' |&|'< q|| || |&}(|(,d|}|(,d|}|(,d|}|t|d ks|d |kr|d | jj  dkr|-  trt./  qW d   n	1 sw   Y  |dks | j0j1|| j0jj2 d|
dd })n|})| j3j4|)|d})| 5  |s4|)fS t6|)dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. If not provided, it is set to 1024.
            width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. If not provided it is set to 1024.
            num_inference_steps (`int`, *optional*, defaults to `50`):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`list[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to `5.0`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to `1`):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            original_size (`tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`tuple[int]`, *optional*, defaults to (0, 0)):
                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `224`):
                Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results.

        Examples:

        Returns:
            [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`:
            [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        FNr   r   g      ?)r>   r*   r+   r?   r   )dimcpu)r@   )totalr0   )hidden_statesencoder_hidden_statestimestepr   target_sizecrop_coordsr   r   r)   r*   r+   latent)r   rf   )r   )images)7rJ   r   r   tensor_inputsr/   r7   sample_sizer2   r   r   r   rK   rh   r!   rM   rI   re   r`   rN   catXLA_AVAILABLEr'   r"   r   in_channelsrm   r@   rs   tensorrS   rT   maxorderprogress_bar	enumerater   scale_model_inputexpandfloatchunkr   r   ro   localspopupdatexm	mark_stepr.   decodescaling_factorr9   postprocessmaybe_free_model_hooksr   )*r:   r=   r_   rk   rl   r   r   r   r>   rn   rf   r)   r*   r+   r   r   r   r   r   r~   r?   r   rV   r   r`   timestep_devicelatent_channelsrq   num_warmup_stepsr   old_pred_original_sampleitlatent_model_inputr   
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsry   callback_outputsimager   r   r&   __call__  s   l	






	&	
6
>
zCogView3PlusPipeline.__call__)Nr   r<   NN)NTr   NNr^   NNrt   )NN)-__name__
__module____qualname____doc___optional_componentsmodel_cpu_offload_seqrv   r   r   r	   r
   r   r   r4   rK   rh   rg   rN   r   r@   r]   boolTensorre   rm   rs   r   propertyr   r`   r   r   no_gradr   EXAMPLE_DOC_STRINGr   	GeneratorFloatTensortupler   r   r   r   r   __classcell__r   r   r;   r&   r(   v   s   

-
	


V
3




	

r(   )NNNN),r   typingr   rN   transformersr   r   	callbacksr   r   r9   r   modelsr	   r
   pipelines.pipeline_utilsr   
schedulersr   r   utilsr   r   r   utils.torch_utilsr   pipeline_outputr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rQ   r   rg   rK   r   rh   r   r'   r(   r   r   r   r&   <module>   s@   



;