o
    Gij                     @   s(  d dl Z d dlmZmZ d dlZd dlZd dlmZm	Z	m
Z
 ddlmZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZ ddl m!Z! e rmd dl"m#  m$Z% dZ&ndZ&e'e(Z)e r{d dl*Z*dZ+dd Z,dd Z-dd Z.G dd deeZ/dS )    N)AnyCallable)AutoTokenizerT5EncoderModelUMT5EncoderModel   )MultiPipelineCallbacksPipelineCallback)SkyReelsV2LoraLoaderMixin)AutoencoderKLWanSkyReelsV2Transformer3DModel)UniPCMultistepScheduler)is_ftfy_availableis_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )SkyReelsV2PipelineOutputTFas      Examples:
        ```py
        >>> import torch
        >>> from diffusers import (
        ...     SkyReelsV2Pipeline,
        ...     UniPCMultistepScheduler,
        ...     AutoencoderKLWan,
        ... )
        >>> from diffusers.utils import export_to_video

        >>> # Load the pipeline
        >>> # Available models:
        >>> # - Skywork/SkyReels-V2-T2V-14B-540P-Diffusers
        >>> # - Skywork/SkyReels-V2-T2V-14B-720P-Diffusers
        >>> vae = AutoencoderKLWan.from_pretrained(
        ...     "Skywork/SkyReels-V2-T2V-14B-720P-Diffusers",
        ...     subfolder="vae",
        ...     torch_dtype=torch.float32,
        ... )
        >>> pipe = SkyReelsV2Pipeline.from_pretrained(
        ...     "Skywork/SkyReels-V2-T2V-14B-720P-Diffusers",
        ...     vae=vae,
        ...     torch_dtype=torch.bfloat16,
        ... )
        >>> flow_shift = 8.0  # 8.0 for T2V, 5.0 for I2V
        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=flow_shift)
        >>> pipe = pipe.to("cuda")

        >>> prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."

        >>> output = pipe(
        ...     prompt=prompt,
        ...     num_inference_steps=50,
        ...     height=544,
        ...     width=960,
        ...     guidance_scale=6.0,  # 6.0 for T2V, 5.0 for I2V
        ...     num_frames=97,
        ... ).frames[0]
        >>> export_to_video(output, "video.mp4", fps=24, quality=8)
        ```
c                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptext r    h/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/skyreels_v2/pipeline_skyreels_v2.pybasic_cleanZ   s   
r"   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr   r   r    r    r!   whitespace_clean`   s   r&   c                 C   s   t t| } | S r   )r&   r"   r   r    r    r!   prompt_cleanf   s   r'   c                '       s  e Zd ZdZdZg dZdedeeB de	de
def
 fd	d
Z					dHdeee B dededejdB dejdB f
ddZ								dIdeee B deee B dB dededejdB dejdB dedejdB dejdB fddZ			dJddZ			 	!				dKd"ed#ed$ed%ed&edejdB dejdB d'ejeej B dB d(ejdB d)ejfd*d+Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Z e! e"e#ddd8d9d:d;d<dddddd=dddd(gd>fdeee B deee B d$ed%ed&ed?ed@e$dedB d'ejeej B dB d(ejdB dejdB dejdB dAedB dBedCe%ee&f dB dDe'eegdf e(B e)B dB dEee def$dFdGZ*  Z+S )LSkyReelsV2Pipelinea  
    Pipeline for Text-to-Video (t2v) generation using SkyReels-V2.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        tokenizer ([`T5Tokenizer`]):
            Tokenizer from [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5Tokenizer),
            specifically the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
        text_encoder ([`T5EncoderModel`]):
            [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
            the [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) variant.
        transformer ([`SkyReelsV2Transformer3DModel`]):
            Conditional Transformer to denoise the input latents.
        scheduler ([`UniPCMultistepScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKLWan`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
    ztext_encoder->transformer->vae)latentsprompt_embedsnegative_prompt_embeds	tokenizertext_encodertransformervae	schedulerc                    st   t    | j|||||d t| dd rdt| jj nd| _t| dd r.dt| jj nd| _	t
| j	d| _d S )N)r/   r-   r,   r.   r0   r/   r         )vae_scale_factor)super__init__register_modulesgetattrsumr/   temperal_downsamplevae_scale_factor_temporallenvae_scale_factor_spatialr   video_processor)selfr,   r-   r.   r/   r0   	__class__r    r!   r5      s   
""zSkyReelsV2Pipeline.__init__Nr      promptnum_videos_per_promptmax_sequence_lengthdevicedtypec              	      s  |p| j }|p
| jj}t|tr|gn|}dd |D }t|}| j|d ddddd}|j|j}}	|		dj
dd	 }
| |||	|j}|j||d
}dd t||
D }tj fdd|D dd	}|j\}}}|d|d}||| |d}|S )Nc                 S   s   g | ]}t |qS r    )r'   .0ur    r    r!   
<listcomp>   s    z<SkyReelsV2Pipeline._get_t5_prompt_embeds.<locals>.<listcomp>
max_lengthTpt)paddingrK   
truncationadd_special_tokensreturn_attention_maskreturn_tensorsr   r   )dim)rF   rE   c                 S   s   g | ]
\}}|d | qS r   r    )rH   rI   vr    r    r!   rJ      s    c                    s2   g | ]}t || |d  |dgqS )r   r   )torchcat	new_zerossizerG   rD   r    r!   rJ      s   2 )_execution_devicer-   rF   
isinstancestrr;   r,   	input_idsattention_maskgtr8   longtolast_hidden_stateziprT   stackshaperepeatview)r>   rB   rC   rD   rE   rF   
batch_sizetext_inputstext_input_idsmaskseq_lensr*   _seq_lenr    rX   r!   _get_t5_prompt_embeds   s4   
	z(SkyReelsV2Pipeline._get_t5_prompt_embedsTnegative_promptdo_classifier_free_guidancer*   r+   c
              
   C   s  |p| j }t|tr|gn|}|durt|}
n|jd }
|du r+| j|||||	d}|r|du r|p4d}t|tr?|
|g n|}|dur\t|t|ur\tdt| dt| d|
t|krutd| d	t| d
| d	|
 d	| j|||||	d}||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        Nr   )rB   rC   rD   rE   rF    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	rZ   r[   r\   r;   re   ro   type	TypeError
ValueError)r>   rB   rp   rq   rC   r*   r+   rD   rE   rF   rh   r    r    r!   encode_prompt   sL   
&

z SkyReelsV2Pipeline.encode_promptc                    s<  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d ur^|d ur^t d| d| d|d u rj|d u rjt d|d urt|tst|tst dt| |d urt|tst|tst dt| d S d S d S )N   r   z8`height` and `width` have to be divisible by 16 but are z and rs   c                 3   s    | ]}| j v V  qd S r   _callback_tensor_inputsrH   kr>   r    r!   	<genexpr>#  s    

z2SkyReelsV2Pipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r    ry   r{   r}   r    r!   rJ   '  s    z3SkyReelsV2Pipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z;`negative_prompt` has to be of type `str` or `list` but is )rv   allrz   r[   r\   listrt   )r>   rB   rp   heightwidthr*   r+   "callback_on_step_end_tensor_inputsr    r}   r!   check_inputs  s:   
zSkyReelsV2Pipeline.check_inputsrx     @  Q   rh   num_channels_latentsr   r   
num_frames	generatorr)   returnc
                 C   s   |	d ur|	j ||dS |d | j d }
|||
t|| j t|| j f}t|tr=t||kr=tdt| d| dt||||d}	|	S )N)rE   rF   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   rE   rF   )	ra   r:   intr<   r[   r   r;   rv   r   )r>   rh   r   r   r   r   rF   rE   r   r)   num_latent_framesre   r    r    r!   prepare_latents@  s"   z"SkyReelsV2Pipeline.prepare_latentsc                 C      | j S r   _guidance_scaler}   r    r    r!   guidance_scale`     z!SkyReelsV2Pipeline.guidance_scalec                 C   s
   | j dkS )N      ?r   r}   r    r    r!   rq   d  s   
z.SkyReelsV2Pipeline.do_classifier_free_guidancec                 C   r   r   )_num_timestepsr}   r    r    r!   num_timestepsh  r   z SkyReelsV2Pipeline.num_timestepsc                 C   r   r   )_current_timestepr}   r    r    r!   current_timestepl  r   z#SkyReelsV2Pipeline.current_timestepc                 C   r   r   )
_interruptr}   r    r    r!   	interruptp  r   zSkyReelsV2Pipeline.interruptc                 C   r   r   )_attention_kwargsr}   r    r    r!   attention_kwargst  r   z#SkyReelsV2Pipeline.attention_kwargsi   i  a   2   g      @npi   num_inference_stepsr   output_typereturn_dictr   callback_on_step_endr   c           &      C   s  t |ttfr
|j}| ||||||| || j dkr0td| j d || j | j d }t|d}|| _	|| _
d| _d| _| j}|durPt |trPd}n|dur^t |tr^t|}n|jd }| j||| j|||||d\}}| jj}||}|dur||}| jj||d | jj}| jjj}| || ||||tj||	|
	}
t||| jj  }t|| _ | j!|d	}t"|D ]\}}| j#rq|| _|
|}|$|
jd }| j%d
 | j||||ddd }W d   n1 sw   Y  | jr)| j%d | j||||ddd }W d   n	1 sw   Y  ||||   }| jj&|||
ddd }
|durbi } |D ]
}!t' |! | |!< q>|| ||| }"|"(d|
}
|"(d|}|"(d|}|t|d ks}|d |kr|d | jj dkr|)  t*rt+,  qW d   n	1 sw   Y  d| _|dks|
| j-j}
t.| j-jj/0d| j-jj1ddd|
j2|
j}#dt.| j-jj30d| j-jj1ddd|
j2|
j }$|
|$ |# }
| j-j4|
ddd }%| j5j6|%|d}%n|
}%| 7  |s|%fS t8|%dS )a  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            height (`int`, defaults to `544`):
                The height in pixels of the generated image.
            width (`int`, defaults to `960`):
                The width in pixels of the generated image.
            num_frames (`int`, defaults to `97`):
                The number of frames in the generated video.
            num_inference_steps (`int`, defaults to `50`):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, defaults to `6.0`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            output_type (`str`, *optional*, defaults to `"np"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`SkyReelsV2PipelineOutput`] instead of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, *optional*, defaults to `512`):
                The maximum sequence length for the text encoder.

        Examples:

        Returns:
            [`~SkyReelsV2PipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`SkyReelsV2PipelineOutput`] is returned, otherwise a `tuple` is returned
                where the first element is a list with the generated images and the second element is a list of `bool`s
                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
        r   z(`num_frames - 1` has to be divisible by z!. Rounding to the nearest number.NFr   )rB   rp   rq   rC   r*   r+   rD   rE   )rE   )totalcond)hidden_statestimestepencoder_hidden_statesr   r   uncond)r   r)   r*   r+   latentr   )r   )frames)9r[   r	   r   tensor_inputsr   r:   loggerwarningmaxr   r   r   r   rZ   r\   r   r;   re   rw   rq   r.   rF   ra   r0   set_timesteps	timestepsconfigin_channelsr   rT   float32orderr   progress_bar	enumerater   expandcache_contextsteplocalspopupdateXLA_AVAILABLExm	mark_stepr/   tensorlatents_meanrg   z_dimrE   latents_stddecoder=   postprocess_videomaybe_free_model_hooksr   )&r>   rB   rp   r   r   r   r   r   rC   r   r)   r*   r+   r   r   r   r   r   rD   rE   rh   transformer_dtyper   r   num_warmup_stepsr   itlatent_model_inputr   
noise_prednoise_uncondcallback_kwargsr|   callback_outputsr   r   videor    r    r!   __call__x  s   V









	
61
&
zSkyReelsV2Pipeline.__call__)Nr   rA   NN)NTr   NNrA   NN)NNN)rx   r   r   r   NNNN),__name__
__module____qualname____doc__model_cpu_offload_seqrz   r   r   r   r   r   r   r5   r\   r   r   rT   rE   rF   ro   boolTensorrw   r   	Generatorr   propertyr   rq   r   r   r   r   no_gradr   EXAMPLE_DOC_STRINGfloatdictr   r   r	   r   r   __classcell__r    r    r?   r!   r(   k   s@   

-
	

W
-	

 







	
r(   )0r   typingr   r   regexr$   rT   transformersr   r   r   	callbacksr   r	   loadersr
   modelsr   r   
schedulersr   utilsr   r   r   r   utils.torch_utilsr   r=   r   pipeline_utilsr   pipeline_outputr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   r   r   r   r"   r&   r'   r(   r    r    r    r!   <module>   s4   
,