o
    ٷip                     @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% e&e'Z(defddZ)				d'de*de*de+de+fddZ,				d(de*dB de-ej.B dB d e/e* dB d!e/e+ dB d"e0ej1e*f f
d#d$Z2G d%d& d&ej3eZ4dS ))    N)Iterable)VaeImageProcessor)AutoencoderKL)FlowMatchEulerDiscreteScheduler)randn_tensor)nn)CLIPTextModelWithProjectionCLIPTokenizerT5EncoderModelT5Tokenizer)AutoWeightsLoader)DiffusionOutputOmniDiffusionConfig)CFGParallelMixin)get_local_device)DiffusersPipelineLoader)SD3Transformer2DModel)OmniDiffusionRequest)!download_weights_from_hf_specific	od_configc                    s   | j dkr	dd S | j}tj|r|}nt|d dg}tj|d}t|}t	|}d|v r;dt
|d d  nd	}W d    n1 sGw   Y  t|d
 dtjf fdd}|S )Nlatentc                 S   s   | S N )xr   r   _/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/diffusion/models/sd3/pipeline_sd3.py<lambda>%   s    z1get_sd3_image_post_process_func.<locals>.<lambda>*zvae/config.jsonblock_out_channels         vae_scale_factorimagesc                    s
     | S r   )postprocess)r#   image_processorr   r   post_process_func2   s   
z:get_sd3_image_post_process_func.<locals>.post_process_func)output_typemodelospathexistsr   joinopenjsonloadlenr   torchTensor)r   
model_name
model_pathvae_config_pathf
vae_configr"   r'   r   r%   r   get_sd3_image_post_process_func!   s    


"
r9               ?ffffff?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S r   r   )image_seq_lenr>   r?   r@   rA   mbmur   r   r   calculate_shift:   s   rF   num_inference_stepsdevice	timestepssigmasreturnc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesrI   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)rI   rH   rJ   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)rJ   rH   rH   r   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__rI   r1   )	schedulerrG   rH   rI   rJ   kwargsaccepts_timestepsaccept_sigmasr   r   r   retrieve_timestepsG   s2   rX   c                )       s  e Zd Zdddedef fddZ						dBdd	Z		
		dCdeee B dede	j
dB defddZ		
		dDdeee B dedede	j
dB fddZ			
dEdeee B deee B deee B de	jdB dedefddZ	dFde	jfddZdd Zed d! Zed"d# Zed$d% Zed&d' Z	(dGd)e	jd*e	jde	jd+e	jd,e	jd-e	jd.ed/ed0ede	jfd1d2Z									3		
							dHd4edeee B deee B deee B d5eee B d6eee B d7eee B d8edB d9edB d:ed;ee dB ded<e	jee	j B dB d)e	jdB de	jdB d,e	jdB d+e	jdB d-e	jdB dedef(d=d>Zd?eeee	jf  dee fd@dAZ   Z!S )IStableDiffusion3Pipeline )prefixr   r[   c                   sZ  t    || _tj|jdd dddg| _t | _|j}t	j
|}tj|d|d| _tj|d|d| _tj|d|d| _tj|d	|d| _tj|d
|d| _tj|d|d| _tj|d|d| _t|d| _tj|d|d| j| _t| dd rdt| jj j!d  nd| _"t#| j"d| _$t%| dr| jd ur| jj&nd| _'d| _(d| _)| jj*| _*d S )Ntransformerztransformer.T)model_or_path	subfolderrevisionr[   fall_back_to_ptrT   )r^   local_files_only	tokenizertokenizer_2tokenizer_3text_encodertext_encoder_2text_encoder_3)r   vaer   r   r    r!   M      )+super__init__r   r   ComponentSourcer)   weights_sourcesr   rH   r*   r+   r,   r   from_pretrainedrT   r	   rb   rc   r   rd   r   re   rf   r
   rg   r   r\   r   torh   getattrr1   configr   r"   r   r&   hasattrmodel_max_lengthtokenizer_max_lengthdefault_sample_size
patch_sizer(   )selfr   r[   r)   ra   rS   r   r   rl      sZ   

(z!StableDiffusion3Pipeline.__init__Nc                 C   sd  || j | j  dks|| j | j  dkr<td| j | j  d| d| d||| j | j    d||| j | j    d|d urO|	d urOtd| d	|	 d
|d urb|	d urbtd| d	|	 d
|d uru|	d urutd| d	|	 d
|d u r|	d u rtd|d urt|tst|tstdt| |d urt|tst|tstdt| |d urt|tst|tstdt| |d ur|
d urtd| d|
 d
|d ur|
d urtd| d|
 d
|d ur|
d urtd| d|
 d
|	d ur|
d ur|	j|
jkrtd|	j d|
j d|d ur.|dkr0td| d S d S )Nr   z-`height` and `width` have to be divisible by z	 but are z and z. You can use height z and width .zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z Cannot forward both `prompt_2`: z Cannot forward both `prompt_3`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is z4`prompt_3` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z)Cannot forward both `negative_prompt_2`: z)Cannot forward both `negative_prompt_3`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` i   z8`max_sequence_length` cannot be greater than 512 but is )r"   rw   rL   
isinstancestrlisttypeshape)rx   promptprompt_2prompt_3heightwidthnegative_promptnegative_prompt_2negative_prompt_3prompt_embedsnegative_prompt_embedsmax_sequence_lengthr   r   r   check_inputs   s   
z%StableDiffusion3Pipeline.check_inputsr   r   r   num_images_per_promptdtypeclip_model_indexc                 C   s\  |p| j j}| j| jg}| j | jg}|| }|| }t|tr"|gn|}t|}	||d| jddd}
|
j	}||dddj	}|j
d |j
d krit||si||d d | jd df }td	| j d
|  ||| jdd}|d }|jd }|j| j j| jd}|j
\}}}|d|d}||	| |d}|d|}||	| d}||fS )N
max_lengthTpt)paddingr   
truncationreturn_tensorslongestr   r   r   z\The following part of your input was truncated because CLIP can only handle sequences up to 	 tokens: )output_hidden_statesr   r   rH   )re   r   rb   rc   rf   r{   r|   r1   ru   	input_idsr   r2   equalbatch_decodeloggerwarningrp   rH   hidden_statesrepeatview)rx   r   r   r   r   clip_tokenizersclip_text_encodersrb   re   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textr   pooled_prompt_embeds_seq_lenr   r   r   _get_clip_prompt_embeds  sF     
z0StableDiffusion3Pipeline._get_clip_prompt_embedsr:   r   c                 C   s:  |p| j j}t|tr|gn|}t|}| j d u r'tj||| jjf| j	|dS | j
|d|dddd| j	}|j}| j
|dddj}|jd |jd krnt||sn| j
|d d | jd	 df }	td
| d|	  |  || j	d }
| j j}|
j|| j	d}
|
j\}}}|
d	|d	}
|
|| |d}
|
S )NrH   r   r   Tr   )r   r   r   add_special_tokensr   r   r   r   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  r   r   r   )rg   r   r{   r|   r1   r2   zerosr\   joint_attention_dimrH   rd   rp   r   r   r   r   ru   r   r   r   r   )rx   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _get_t5_prompt_embeds=  sP   

 "z.StableDiffusion3Pipeline._get_t5_prompt_embedsr   r   r   c                 C   s   t |tr|gn|}d}|du ru|p|}t |tr|gn|}|p!|}t |tr*|gn|}| j||dd\}}	| j||dd\}
}tj||
gdd}| j|||d}tjj|d|j	d |j	d  f}tj||gdd}tj|	|gdd}||fS )	a  

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            prompt_3 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
                used in all text-encoders
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
        Nr   )r   r   r   r   r   )dim)r   r   r   r   )
r{   r|   r   r2   catr   r   
functionalpadr   )rx   r   r   r   r   r   r   r   prompt_embedpooled_prompt_embedprompt_2_embedpooled_prompt_2_embedclip_prompt_embedst5_prompt_embedr   r   r   encode_promptp  s:   

z&StableDiffusion3Pipeline.encode_promptrK   c	           
      C   sz   |d ur|j ||dS ||t|| j t|| j f}	t|tr3t||kr3tdt| d| dt|	|||d}|S )Nr   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatorrH   r   )rp   intr"   r{   r}   r1   rL   r   )
rx   r   num_channels_latentsr   r   r   rH   r   latentsr   r   r   r   prepare_latents  s   z(StableDiffusion3Pipeline.prepare_latentsc              	   C   s   i }| j jdd r.t|| j jdd| j jdd| j jdd| j jdd	}||d
< t| j |fd|i|\}}||fS )Nuse_dynamic_shiftingbase_image_seq_lenr:   max_image_seq_lenr;   r@   r<   rA   g(\?rE   rJ   )rT   rr   getrF   rX   )rx   rG   rJ   rB   scheduler_kwargsrE   rI   r   r   r   prepare_timesteps  s&   
z*StableDiffusion3Pipeline.prepare_timestepsc                 C      | j S r   )_guidance_scalerx   r   r   r   guidance_scale     z'StableDiffusion3Pipeline.guidance_scalec                 C   r   r   )_num_timestepsr   r   r   r   num_timesteps  r   z&StableDiffusion3Pipeline.num_timestepsc                 C   r   r   )_current_timestepr   r   r   r   current_timestep  r   z)StableDiffusion3Pipeline.current_timestepc                 C   r   r   )
_interruptr   r   r   r   	interrupt  r   z"StableDiffusion3Pipeline.interruptFr   rI   r   r   negative_pooled_prompt_embedsdo_true_cfgr   cfg_normalizec
                 C   s   | j d t|D ]@\}
}| jrq
|| _||jd j|j|j	d}||||dd}|r7||||dd}nd}| 
|||||	}| ||||}q
|S )a  
        Diffusion loop with optional classifier-free guidance.

        Args:
            latents: Noise latents to denoise
            timesteps: Diffusion timesteps
            prompt_embeds: Positive prompt embeddings
            pooled_prompt_embeds: Pooled positive prompt embeddings
            negative_prompt_embeds: Negative prompt embeddings
            negative_pooled_prompt_embeds: Pooled negative prompt embeddings
            do_true_cfg: Whether to apply CFG
            guidance_scale: CFG scale factor
            cfg_normalize: Whether to normalize CFG output (default: False)

        Returns:
            Denoised latents
        r   r   F)r   timestepencoder_hidden_statespooled_projectionsreturn_dictN)rT   set_begin_index	enumerater   r   expandr   rp   rH   r   predict_noise_maybe_with_cfgscheduler_step_maybe_with_cfg)rx   r   rI   r   r   r   r   r   r   r   r   tr   positive_kwargsnegative_kwargs
noise_predr   r   r   diffuse  s:   	z StableDiffusion3Pipeline.diffuse   reqr   r   r   r   r   rG   rJ   r   c                 C   s  dd |j D p	|}dd |j D p|}|jjp| j| j }|jjp'| j| j }	|jjp-|}|jjp3|}|jjp9|
}
|jj	p?|}|jj
dkrJ|jj
n|}| j|||||	||||||d |jj| _d | _d| _|d urst|trsd}n|d urt|trt|}n|jd }| j|||||d\}}| jdk}|r| j|||||d\}}| jj}| || |||	|j| j||}| |
||jd \}}
t|| _| j|||||r|nd |r|nd || jdd		}d | _| jd
kr|}n|| j j}|| j j!j" | j j!j# }| j j$|ddd }t%|dS )Nc                 S   s(   g | ]}t |tr|n|d pdqS )r   rZ   r{   r|   r   .0pr   r   r   
<listcomp>Q  s   ( z4StableDiffusion3Pipeline.forward.<locals>.<listcomp>c                 S   s(   g | ]}t |trd n|dpd qS )rZ   r   r   r   r   r   r   r   R  s    r   )r   r   r   r   r   r   Fr   )r   r   r   r   r   )	r   rI   r   r   r   r   r   r   r   r   )r   )output)&promptssampling_paramsr   rv   r"   r   rJ   r   rG   r   num_outputs_per_promptr   r   r   r   r   r{   r|   r}   r1   r   r   r\   in_channelsr   r   rH   r   r   r   r(   rp   rh   rr   scaling_factorshift_factordecoder   )rx   r   r   r   r   r   r   r   r   r   rG   rJ   r   r   r   r   r   r   r   r   r   do_cfgr   rI   imager   r   r   forward9  s   










z StableDiffusion3Pipeline.forwardweightsc                 C   s   t | }||S r   )r   load_weights)rx   r   loaderr   r   r   r     s   
z%StableDiffusion3Pipeline.load_weights)NNNNNN)rZ   r   Nr   )rZ   r   r:   N)Nr:   r   r   )F)rZ   rZ   rZ   rZ   rZ   rZ   NNr   Nr   NNNNNNr:   )"__name__
__module____qualname__r   r|   rl   r   r}   r   r2   r   r   r   r3   r   r   r   propertyr   r   r   r   boolfloatr   r   	Generatorr   r   r   tuplerM   r   __classcell__r   r   ry   r   rY      s0   C
Q

1

8



I





	

J





	


 ,rY   )r:   r;   r<   r=   )NNNN)5rN   r/   loggingr*   collections.abcr   r2   diffusers.image_processorr   diffusers.models.autoencodersr   9diffusers.schedulers.scheduling_flow_match_euler_discreter   diffusers.utils.torch_utilsr   r   transformersr   r	   r
   r    vllm.model_executor.models.utilsr   vllm_omni.diffusion.datar   r   ,vllm_omni.diffusion.distributed.cfg_parallelr   %vllm_omni.diffusion.distributed.utilsr   1vllm_omni.diffusion.model_loader.diffusers_loaderr   .vllm_omni.diffusion.models.sd3.sd3_transformerr   vllm_omni.diffusion.requestr   2vllm_omni.model_executor.model_loader.weight_utilsr   	getLoggerr   r   r9   r   r   rF   r|   rH   r}   r  r3   rX   ModulerY   r   r   r   r   <module>   sj    





;