o
    ٷi                  	   @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlZd dlZd dlZd dlmZ d dlm  mZ d dlmZmZ d dlmZmZmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 e9e:Z;eG dd de!Z<G dd deeZ=de+fddZ>de+fddZ?G dd deZ@			d'deAdB d eBejCB dB d!eDeA dB d"e
fd#d$ZEG d%d& d&ejFZGdS )(    N)Iterable)	dataclass)Any)ConfigMixinregister_to_config)PipelineImageInputVaeImageProcessoris_valid_image_imagelist)AutoencoderKL)SchedulerMixin)
BaseOutput)randn_tensor)"Qwen2_5_VLForConditionalGenerationQwen2_5_VLProcessor)AutoWeightsLoader)DiffusionOutputOmniDiffusionConfig)get_local_device)DiffusersPipelineLoader)OmniGen2RotaryPosEmbedOmniGen2Transformer2DModel)OmniDiffusionRequest)OmniTextPrompt)!download_weights_from_hf_specificc                   @   s   e Zd ZU dZejed< dS )%FlowMatchEulerDiscreteSchedulerOutputaJ  
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
            denoising loop.
    prev_sampleN)__name__
__module____qualname____doc__torchFloatTensor__annotations__ r#   r#   i/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.pyr   /   s   
 	r   c                   @   s   e Zd ZdZg ZdZed&dedefddZ	e
d	d
 Ze
dd Zd'defddZd(ddZ				d)dedeejB dee dB dedB fddZdd Z		d*dejdeejB dejdejdB d ed!eeB fd"d#Zd$d% ZdS )+FlowMatchEulerDiscreteSchedulera  
    Euler scheduler.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        dynamic_time_shift (`bool`, defaults to `True`):
            Whether to use dynamic time shifting for the timestep schedule.
         Tnum_train_timestepsdynamic_time_shiftc                 C   s6   t jdd|d t jdd d }|| _d | _d | _d S )Nr   r&   dtype)r    linspacefloat32	timesteps_step_index_begin_index)selfr(   r)   r/   r#   r#   r$   __init__N   s    
z(FlowMatchEulerDiscreteScheduler.__init__c                 C      | j S )zg
        The index counter for current timestep. It will increase 1 after each scheduler step.
        )r0   r2   r#   r#   r$   
step_indexW      z*FlowMatchEulerDiscreteScheduler.step_indexc                 C   r4   )zq
        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
        r1   r5   r#   r#   r$   begin_index^   r7   z+FlowMatchEulerDiscreteScheduler.begin_indexr   r9   c                 C   s
   || _ dS )z
        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.

        Args:
            begin_index (`int`):
                The begin index for the scheduler.
        Nr8   )r2   r9   r#   r#   r$   set_begin_indexe   s   
z/FlowMatchEulerDiscreteScheduler.set_begin_indexNc                 C   s:   |d u r| j }||k }t|dkrdnd}||  S )Nr&   r   )
_timestepsnonzerolenitem)r2   timestepschedule_timestepsindicesposr#   r#   r$   index_for_timestepo   s
   z2FlowMatchEulerDiscreteScheduler.index_for_timestepnum_inference_stepsdevicer/   
num_tokensc                 C   s   |du r0|| _ tjdd|d tjddd }| jjr0|dur0t|d }||||  |  }t|j	tj|d}t
|tjd|jdg}|| _|| _d| _d| _dS )	a  
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
            timesteps (`list[float]`, *optional*):
                Custom timesteps to use. If provided, `num_inference_steps` is ignored.
            num_tokens (`int`, *optional*):
                Number of tokens, used for dynamic time shifting.
        Nr   r&   r*   r,   (   r+   rE   rE   )rD   npr-   r.   configr)   sqrtr    
from_numpytocatonesrE   r/   r;   r0   r1   )r2   rD   rE   r/   rF   mr;   r#   r#   r$   set_timesteps}   s    
z-FlowMatchEulerDiscreteScheduler.set_timestepsc                 C   s@   | j d u rt|tjr|| jj}| || _d S | j	| _d S N)
r9   
isinstancer    TensorrN   r/   rE   rC   r0   r1   )r2   r?   r#   r#   r$   _init_step_index   s
   
z0FlowMatchEulerDiscreteScheduler._init_step_indexmodel_outputr?   sample	generatorreturn_dictreturnc           	      C   s   t |tst |tjst |tjrtd| jdu r| | |tj	}| j
| j }| j
| jd  }||| |  }||j}|  jd7  _|sM|fS t|dS )aT  
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
        process from the learned model outputs (most often the predicted noise).

        Args:
            model_output (`torch.FloatTensor`):
                The direct output from learned diffusion model.
            timestep (`float`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.FloatTensor`):
                A current instance of a sample created by the diffusion process.
            generator (`torch.Generator`, *optional*):
                A random number generator.
            return_dict (`bool`):
                Whether or not to return a
                [`~FlowMatchEulerDiscreteSchedulerOutput`] or tuple.

        Returns:
            [`~FlowMatchEulerDiscreteSchedulerOutput`] or `tuple`:
                If return_dict is `True`,
                [`~FlowMatchEulerDiscreteSchedulerOutput`] is returned,
                otherwise a tuple is returned where the first element is
                the sample tensor.
        zPassing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to `EulerDiscreteScheduler.step()` is not supported. Make sure to pass one of the `scheduler.timesteps` as a timestep.Nr&   )r   )rT   intr    	IntTensor
LongTensor
ValueErrorr6   rV   rN   r.   r;   r+   r0   r   )	r2   rW   r?   rX   rY   rZ   tt_nextr   r#   r#   r$   step   s   "!


z$FlowMatchEulerDiscreteScheduler.stepc                 C   s   | j jS rS   )rK   r(   r5   r#   r#   r$   __len__   s   z'FlowMatchEulerDiscreteScheduler.__len__)r'   T)r   rS   NNNN)NT)r   r   r   r   _compatiblesorderr   r\   boolr3   propertyr6   r9   r:   rC   strr    rE   listfloatrR   rV   r!   	Generatorr   tuplerb   rc   r#   r#   r#   r$   r%   =   sT    





%
>r%   	od_configc                    s   | j }tj|r|}nt|ddg}tj|d}t|}t|}d|v r0dt	|d  nd}W d   n1 s<w   Y  t
|d dd |d	d
dtdtf fdd}|S )z-Pre-processing function for OmniGen2Pipeline.N*vae/config.jsontemporal_downsample      Tvae_scale_factor	do_resizez_dim   requestr[   c                    sh  t | jD ]\}}t|ts|di nd}|dur |ddnd}t|tr,t|d}d|vr4i |d< |durt|trEdd |D }nt|trRtj	|g}n|g}|d }t|tjjr~ j
|d	d
d\}}| jjdu rt|| j_| jjdu r~|| j_g }	|D ]#}
t|
tjrt|
jdkr|
jd ks j|
d	d
d}
|	|
 q|	|d d< || j|< q| S )z*Pre-process requests for OmniGen2Pipeline.multi_modal_dataNimage)promptadditional_informationc                 S   s&   g | ]}t |trtj|n|qS r#   )rT   ri   PILImageopen).0imgr#   r#   r$   
<listcomp>  s   & zKget_omnigen2_pre_process_func.<locals>.pre_process_func.<locals>.<listcomp>r         )
max_pixelsmax_side_lengthr&   preprocessed_images)	enumeratepromptsrT   ri   getr   rj   r~   r   r   get_new_height_widthsampling_paramsheightwidthr    rU   r=   shape
preprocessappend)ry   ir|   rz   	raw_imageimages	first_rawnew_hnew_wr   r{   image_processorlatent_channelsr#   r$   pre_process_func   s@   





z7get_omnigen2_pre_process_func.<locals>.pre_process_func)modelospathexistsr   joinr   jsonloadr=   OmniGen2ImageProcessorr   r   )rn   
model_name
model_pathvae_config_pathf
vae_configru   r   r#   r   r$   get_omnigen2_pre_process_func   s"   

,r   c                    s   | j }tj|r|}nt|d dg}tj|d}t|}t|}d|v r2dt	|d d  nd}W d    n1 s>w   Y  t
|d dd d	tjf fd
d}|S )Nro   rp   block_out_channelsrr   r&   rs   Trt   r   c                    s
     | S rS   )postprocess)r   r   r#   r$   post_process_func;  s   
z9get_omnigen2_post_process_func.<locals>.post_process_func)r   r   r   r   r   r   r   r   r   r=   r   r    rU   )rn   r   r   r   r   r   ru   r   r#   r   r$   get_omnigen2_post_process_func,  s   

"r   c                       s  e Zd ZdZe								dded	ed
edededededef fddZ				dde	j
j
ejB ejB dedB dedB dedB dedB deeef fddZ						d dededB dedB dedB dedB dedeeeeef dB dejfddZ  ZS )!r   aq  
    Image processor for OmniGen2 image resize and crop.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
        vae_scale_factor (`int`, *optional*, defaults to `16`):
            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
        resample (`str`, *optional*, defaults to `lanczos`):
            Resampling filter to use when resizing the image.
        max_pixels (`int`, *optional*, defaults to `1048576`):
            Maximum number of pixels allowed in the image. Images exceeding this limit are downscaled proportionally.
        max_side_length (`int`, *optional*, defaults to `1024`):
            Maximum length of the longer side of the image. Images exceeding this limit are downscaled proportionally.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image to [-1,1].
        do_binarize (`bool`, *optional*, defaults to `False`):
            Whether to binarize the image to 0/1.
        do_convert_grayscale (`bool`, *optional*, defaults to `False`):
            Whether to convert the images to grayscale format.
    Trx   lanczosr   r   Frv   ru   resampler   r   do_normalizedo_binarizedo_convert_grayscalec	           	         s(   t  j||||||d || _|| _d S )N)rv   ru   r   r   r   r   )superr3   r   r   )	r2   rv   ru   r   r   r   r   r   r   	__class__r#   r$   r3   [  s   	
zOmniGen2ImageProcessor.__init__Nr{   r   r   r[   c                 C   s$  |du r t |tjjr|j}nt |tjr|jd }n|jd }|du r@t |tjjr/|j}nt |tjr;|jd }n|jd }|du rG| j}|du rN| j	}d}|dura||kr]|| }n|| }|| }|| d }t
||d}	t||	 | jj | jj t||	 | jj | jj }
}|
|fS )a3  
        Returns the height and width of the image, downscaled to the next integer multiple of `vae_scale_factor`.

        Args:
            image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
                The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
                should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
                tensor, it should have shape `[batch, channels, height, width]`.
            height (`Optional[int]`, *optional*, defaults to `None`):
                The height of the preprocessed image. If `None`, the height of the `image` input will be used.
            width (`Optional[int]`, *optional*, defaults to `None`):
                The width of the preprocessed image. If `None`, the width of the `image` input will be used.

        Returns:
            `Tuple[int, int]`:
                A tuple containing the height and width, both resized to the nearest integer multiple of
                `vae_scale_factor`.
        Nrr   r&            ?      ?)rT   r~   r   r   r    rU   r   r   r   r   minr\   rK   ru   )r2   r{   r   r   r   r   max_side_length_ratio
cur_pixelsmax_pixels_ratioratio
new_height	new_widthr#   r#   r$   r   s  s8   


z+OmniGen2ImageProcessor.get_new_height_widthdefaultresize_modecrops_coordsc                    sp  t jjtjtjf}jjr=t|tjtjfr=|j	dkr=t|tjr'|
d}n|jd dkr6tj|dd}ntj|dd}t|tr^t|d tjr^|d j	dkr^tdt tj|dd}t|trt|d tjr|d j	dkrtdt tj|dd}t|std	d
dd |D  t|ts|g}t|d t jjr dur fdd|D }jjr͈|d ||\fdd|D }jjrۇfdd|D }njjrfdd|D }|}|}nt|d tjr/|d j	dkrtj|ddntj|dd}|}|||\jjr.|}nTt|d tjr|d j	dkrGtj|ddntj|dd}jjr^|j	dkr^|
d}|jd }	|	jjkrl|S |||\jjr|}jj}
|
r| dk rtd|  d|   dt d}
|
r!|}jj"r#|}|S )a  
        Preprocess the image input.

        Args:
            image (`PipelineImageInput`):
                The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of
                supported formats.
            height (`int`, *optional*):
                The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default
                height.
            width (`int`, *optional*):
                The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width.
            resize_mode (`str`, *optional*, defaults to `default`):
                The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit within
                the specified width and height, and it may not maintaining the original aspect ratio. If `fill`, will
                resize the image to fit within the specified width and height, maintaining the aspect ratio, and then
                center the image within the dimensions, filling empty with data from image. If `crop`, will resize the
                image to fit within the specified width and height, maintaining the aspect ratio, and then center the
                image within the dimensions, cropping the excess. Note that resize_mode `fill` and `crop` are only
                supported for PIL image input.
            crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`):
                The crop coordinates for each image in the batch. If `None`, will not crop the image.

        Returns:
            `torch.Tensor`:
                The preprocessed image.
        r   r&   r,   r   )axis   zPassing `image` as a list of 4d np.ndarray is deprecated.Please concatenate the list along the batch dimension and pass it as a single 4d np.ndarrayzPassing `image` as a list of 4d torch.Tensor is deprecated.Please concatenate the list along the batch dimension and pass it as a single 4d torch.Tensorz9Input is in incorrect format. Currently, we only support z, c                 s   s    | ]}t |V  qd S rS   )ri   )r   xr#   r#   r$   	<genexpr>  s    z4OmniGen2ImageProcessor.preprocess.<locals>.<genexpr>Nc                    s   g | ]}|  qS r#   )cropr   r   )r   r#   r$   r         z5OmniGen2ImageProcessor.preprocess.<locals>.<listcomp>c                    s   g | ]}j | d qS ))r   )resizer   )r   r   r2   r   r#   r$   r     s    c                       g | ]}  |qS r#   )convert_to_rgbr   r5   r#   r$   r     r   c                    r   r#   )convert_to_grayscaler   r5   r#   r$   r     r   zPassing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] when passing as pytorch tensor or numpy Array. You passed `image` with value range [,]F)$r~   r   rJ   ndarrayr    rU   rK   r   rT   ndim	unsqueezer   expand_dimsrj   warningswarnFutureWarningconcatenaterO   r	   r_   r   rv   r   do_convert_rgbpil_to_numpynumpy_to_ptstackr   vae_latent_channelsr   r   max	normalizer   binarize)r2   r{   r   r   r   r   r   r   supported_formatschannelr   r#   )r   r   r   r2   r   r$   r     s   %$((

,

,





z!OmniGen2ImageProcessor.preprocess)Trx   r   r   r   TFFrd   )NNNNr   N)r   r   r   r   r   rg   r\   ri   r3   r~   r   rJ   r   r    rU   rm   r   r   r   __classcell__r#   r#   r   r$   r   C  s    	

E	r   rD   rE   r/   kwargsc                 K   s   |dur2dt t| jj v }|std| j d| jd||d| | j}t	|}||fS | j|fd|i| | j}||fS )aa  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` must be `None`.
        **kwargs (`Any`):
            Additional keyword arguments passed to `scheduler.set_timesteps`.

    Returns:
        timesteps (`torch.Tensor`): The timestep schedule from the scheduler.
        num_inference_steps (`int`): The number of inference steps.
    Nr/   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r/   rE   rE   r#   )
setinspect	signaturerR   
parameterskeysr_   r   r/   r=   )	schedulerrD   rE   r/   r   accepts_timestepsr#   r#   r$   retrieve_timesteps@  s   r   c                4       s4  e Zd ZdZdddededdf fdd	Z	dQd
ededededej	dej
dejdB dejdB dejfddZdejdejfddZdeejj ejjB d
ededededej
dej	deejdB  fddZ		dRdeee B dej
dB dedeejejf fd d!Zdefd"d#Z	$		%						dSdeee B d&ed'eee B dB dedej
dB d(ejdB d)ejdB d*ejdB d+ejdB dedeejejejejf fd,d-Zed.d/ Zed0d1 Zed2d3 Zed4d5 Ze 							6		%			7	6	$	8	9	:	;					<	dTd=edeee B dB d'eee B dB d(ejdB d)ejdB d*ej dB d+ej dB dedB d>eejj dB dededB dedB ded?ed@edAedBe!dCe!dDee!e!f dEe"ee#f dB dFee dejeej B dB dejdB dGede$f2dHdIZ%	dQdJdKZ&dLdM Z'dNe(eeejf  de)e fdOdPZ*  Z+S )UOmniGen2Pipelinea  
    Pipeline for text-to-image generation using OmniGen2.

    This pipeline implements a text-to-image generation model that uses:
    - Qwen2.5-VL for text encoding
    - A custom transformer architecture for image generation
    - VAE for image encoding/decoding
    - FlowMatchEulerDiscreteScheduler for noise scheduling

    Args:
        od_config (OmniDiffusionConfig): The OmniDiffusion configuration.
     )prefixrn   r   r[   Nc                   s  t    || _t | _|j}tj|}t	j
|jdddddg| _tj|d|d| _tj|d|d| j| _tj|dd	}i }tj|rt|}t|}W d   n1 s\w   Y  d
ddddddddddddddd}	|	 D ]\}
}|
|v r||
 }t|tr|dv rt|}|||< qwtd"i || _tj|d|d| j| _tj|d|d| _t | dr| jdurdt!| jj"j#d  nd| _$t%| j$d dd | _&d!| _'dS )#aj  
        Initialize the OmniGen2 pipeline.

        Args:
            transformer: The transformer model for image generation.
            vae: The VAE model for image encoding/decoding.
            scheduler: The scheduler for noise scheduling.
            text_encoder: The text encoder model.
            tokenizer: The tokenizer for text processing.
        transformerNztransformer.T)model_or_path	subfolderrevisionr   fall_back_to_ptr   )r   local_files_onlyvaezconfig.json
patch_sizein_channelsout_channelshidden_size
num_layersnum_refiner_layersnum_attention_headsnum_kv_headsmultiple_offfn_dim_multipliernorm_epsaxes_dim_rope	axes_lenstext_feat_dimtimestep_scale)r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  )r  r  mllm	processorrr   r&   rs   rt      r#   )(r   r3   rn   r   rE   r   r   r   r   r   ComponentSourceweights_sourcesr%   from_pretrainedr   r
   rN   r   r   r   r   r   itemsrT   rj   rm   r   r   r   r  r   r  hasattrr=   rK   r   ru   r   r   default_sample_size)r2   rn   r   r   r   transformer_config_pathtransformer_kwargsr   transformer_configparam_mapping
config_key
param_namevaluer   r#   r$   r3   {  sz   


,
zOmniGen2Pipeline.__init__
batch_sizenum_channels_latentsr   r   r+   rE   rY   latentsc	           
      C   sR   t || j }t || j }||||f}	|du r"t|	|||d}|S ||}|S )a  
        Prepare the initial latents for the diffusion process.

        Args:
            batch_size: The number of images to generate.
            num_channels_latents: The number of channels in the latent space.
            height: The height of the generated image.
            width: The width of the generated image.
            dtype: The data type of the latents.
            device: The device to place the latents on.
            generator: The random number generator to use.
            latents: Optional pre-computed latents to use instead of random initialization.

        Returns:
            torch.FloatTensor: The prepared latents tensor.
        N)rY   rE   r+   )r\   ru   r   rN   )
r2   r  r  r   r   r+   rE   rY   r  r   r#   r#   r$   prepare_latents  s   
z OmniGen2Pipeline.prepare_latentsr   c                 C   sj   | j |j| j jdj }| j jjdur|| j jj }| j jjdur+|| j jj }|j| j jd}|S )z
        Encode an image into the VAE latent space.

        Args:
            img: The input image tensor to encode.

        Returns:
            torch.FloatTensor: The encoded latent representation.
        r*   N)	r   encoderN   r+   latent_distrX   rK   shift_factorscaling_factor)r2   r   z0r#   r#   r$   
encode_vae  s   
zOmniGen2Pipeline.encode_vaer   num_images_per_promptr   r   c              	   C   s   |dkr|g}g }t |D ]7\}	}
|
dur6t|
dkr6g }t |
D ]\}}|| |j|dd q!nd}t|D ]}|| q<q|S )a:  
        Prepare input images for processing by encoding them into the VAE latent space.

        Args:
            images: Single image or list of images to process.
            batch_size: The number of images to generate per prompt.
            num_images_per_prompt: The number of images to generate for each prompt.
            device: The device to place the encoded latents on.
            dtype: The data type of the encoded latents.

        Returns:
            List[Optional[torch.FloatTensor]]: List of encoded latent representations for each image.
        r&   Nr   rI   )r   r=   r   r   rN   squeezerange)r2   r   r  r!  r   r   rE   r+   r  r   r   ref_latentsjimg_j_r#   r#   r$   prepare_image  s    zOmniGen2Pipeline.prepare_image   r|   max_sequence_lengthc                 C   s  |p| j }t|tr|gn|}| jj|d|ddd}|j|}| jj|dddj|}|jd |jd krYt	||sY| jj
|dd|d df }td	| d
|  |j|}| j||ddjd }	| jdurt| jj}
n| jdur~| jj}
nd}
|	j|
|d}	|	|fS )aT  
        Get prompt embeddings from the Qwen2 text encoder.

        Args:
            prompt: The prompt or list of prompts to encode.
            device: The device to place the embeddings on. If None, uses the pipeline's device.
            max_sequence_length: Maximum sequence length for tokenization.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - The prompt embeddings tensor
                - The attention mask tensor

        Raises:
            Warning: If the input text is truncated due to sequence length limitations.
        longestTpt)padding
max_length
truncationreturn_tensors)r-  r0  r,   Nr&   zbThe following part of your input was truncated because Qwen2.5-VL can only handle sequences up to z	 tokens: )attention_maskoutput_hidden_statesrH   )rE   rT   ri   r  	tokenizer	input_idsrN   r   r    equalbatch_decodeloggerwarningr1  r  hidden_statesr+   r   )r2   r|   rE   r*  text_inputstext_input_idsuntruncated_idsremoved_textprompt_attention_maskprompt_embedsr+   r#   r#   r$   _get_qwen2_prompt_embeds2  sF   
 "



z)OmniGen2Pipeline._get_qwen2_prompt_embedsc                 C   s,   dddd|dg}| j jj|ddd}|S )NsystemzZYou are a helpful assistant that generates high-quality images based on user instructions.)rolecontentuserF)tokenizeadd_generation_prompt)r  r3  apply_chat_template)r2   r|   r#   r#   r$   _apply_chat_templateu  s   z%OmniGen2Pipeline._apply_chat_templateTr&   do_classifier_free_guidancenegative_promptr?  negative_prompt_embedsr>  negative_prompt_attention_maskc              
      s  |p j }t|tr|gn|} fdd|D }|dur!t|}n|jd }|du r4 j|||
d\}}|j\}}}|d|d}||| |d}||d}||| d}|r|du r|durd|nd}t|trp||g n|} fd	d|D }|durt|t|urt	d
t| dt| dt|tr|g}n|t|krt
d| dt| d| d| d	 j|||
d\}}	|j\}}}|d|d}||| |d}|	|d}	|	|| d}	||||	fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
                Lumina-T2I, this should be "".
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. For Lumina-T2I, it's should be the embeddings of the "" string.
            max_sequence_length (`int`, defaults to `256`):
                Maximum sequence length to use for the prompt.
        c                    r   r#   rH  )r   _promptr5   r#   r$   r     r   z2OmniGen2Pipeline.encode_prompt.<locals>.<listcomp>Nr   )r|   rE   r*  r&   r,   r   c                    r   r#   rM  )r   _negative_promptr5   r#   r$   r     r   z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rE   rT   ri   r=   r   r@  repeatviewtype	TypeErrorr_   )r2   r|   rI  rJ  r!  rE   r?  rK  r>  rL  r*  r  seq_lenr'  r#   r5   r$   encode_prompt  sd   
%




zOmniGen2Pipeline.encode_promptc                 C   r4   rS   )_num_timestepsr5   r#   r#   r$   num_timesteps     zOmniGen2Pipeline.num_timestepsc                 C   r4   rS   )_text_guidance_scaler5   r#   r#   r$   text_guidance_scale  rY  z$OmniGen2Pipeline.text_guidance_scalec                 C   r4   rS   )_image_guidance_scaler5   r#   r#   r$   image_guidance_scale  rY  z%OmniGen2Pipeline.image_guidance_scalec                 C   r4   rS   )
_cfg_ranger5   r#   r#   r$   	cfg_range  rY  zOmniGen2Pipeline.cfg_ranger   r            @r   g        r   Freqinput_imagesmax_input_image_side_length	align_resrD   r[  r]  r_  attention_kwargsr/   verbosec           '      C   s  t |jdkrtdt |jd  |jd }t|tr|n|dp$|}t|tr,d n|d|}|d u r>|d u r>tdt|tsRd|di  }v rR|d}	|jj	p]|p]| j
| j }|jjpi|pi| j
| j }|jjpo|}|jjpu|}|jjr~|jj}|| _|jjd ur|jjn|| _|| _|| _|jjdkr|jjn|
}
|d urt|trd}n|d urt|trt |}n|jd }| j}| j|| jd	k||
||||||d

\}}}}| jj}| j|	||
||||d}|	d u rg }	t |	dkr|r|d d jd | j |d d jd | j }}||} }!n)||} }!|| }"||" d }#t|#d	}#t ||# d d t ||# d d }}t |	dkrJd| _| j!j"j#}$| $||
 |$|||j|||}t%j&| j!j"j'| j!j"j(dd}%| j)||||%|||||||||d}&t*j+|&|!| fdd}&t,|&dS )Nr&   z{OmniGen2 only supports a single prompt per request. Only the first prompt will be used; %d extra prompt(s) will be ignored.r   r|   rJ  z<Prompt or prompt_embeds is required for OmniGen2 generation.r   r}   r   )rJ  r!  rE   r?  rK  r>  rL  r*  )r   r  r!  r   r   rE   r+   r,   r   rx   i'  )theta)r  r$  r?  	freqs_cisrK  r>  rL  rD   r/   rE   r+   rh  	step_funcbilinear)sizemode)output)-r=   r   r7  r8  rT   ri   r   r_   r   r   r  ru   r   rY   rD   guidance_scale_providedguidance_scalerZ  guidance_scale_2r\  r^  _attention_kwargsnum_outputs_per_promptrj   r   rE   rV  r[  r   r+   r(  r   r\   r   rK   r   r  r   get_freqs_cisr  r  
processingFinterpolater   )'r2   rc  r|   rJ  r?  rK  r>  rL  r*  rd  r!  r   r   r   re  rf  rD   r[  r]  r_  rg  r/   rY   r  rh  rl  first_promptr}   r  rE   r+   r$  	ori_width
ori_heightr   r   r   rk  r{   r#   r#   r$   forward  s   








zOmniGen2Pipeline.forwardc              	   C   s  t | j||
|	|jd |jd  d\}	}t|	| _t|	D ]\}}| j||||||d}| jd |t|	   kr@| jd krEn n| jnd}| jd |t|	   kr[| jd kr`n n| j	nd}|dkr|dkr| j||||||d}| j|||||d d}||||   |||   }n|dkr| j|||||d d}||||   }| jj
|||dd	d }|j|d
}|d ur||| j q|j|d
}| jjjd ur|| jjj }| jjjd ur|| jjj }| jj|dd	d }|S )Nri  r,   )rF   )r`   r  r?  rk  r>  ref_image_hidden_statesr   r&   r   F)rZ   r*   )r   r   r   r=   rW  r   predictr_  r[  r]  rb   rN   r   rK   r  r  decode)r2   r  r$  r?  rk  rK  r>  rL  rD   r/   rE   r+   rh  rl  r   r`   
model_predr[  r]  model_pred_refmodel_pred_uncondr{   r#   r#   r$   rw    s   

4	4	


zOmniGen2Pipeline.processingc                 C   sl   | |jd |j}|j\}}	}
}i }dtt| jjj	
 v r'||d< | j|||||fi |}|S )Nr   r~  )expandr   rN   r+   r   r   r   r   r}  r   r   )r2   r`   r  r?  rk  r>  r~  r?   r  r  r   r   optional_kwargsr  r#   r#   r$   r    s   
zOmniGen2Pipeline.predictweightsc                 C   s   t | }||S rS   )r   load_weights)r2   r  loaderr#   r#   r$   r    s   
zOmniGen2Pipeline.load_weightsrS   )Nr)  )	TNr&   NNNNNr)  )NNNNNNr   Nr&   NNr   r   Tr`  ra  r   rb  NNNNFN),r   r   r   r   r   ri   r3   r\   r    r+   rE   rl   r!   r  r   rj   r~   r   r(  rm   rU   r@  rH  rg   rV  rh   rX  r[  r]  r_  no_gradr   r^   rk   dictr   r   r}  rw  r  r   r   r  r   r#   r#   r   r$   r   m  sj   b	

&	
)

C
	

c



	

 8
\,r   )NNN)Hr   r   loggingr   r   collections.abcr   dataclassesr   typingr   numpyrJ   	PIL.Imager~   r    torch.nnnntorch.nn.functional
functionalrx  diffusers.configuration_utilsr   r   diffusers.image_processorr   r   r	   diffusers.models.autoencodersr
   %diffusers.schedulers.scheduling_utilsr   diffusers.utilsr   diffusers.utils.torch_utilsr   transformersr   r    vllm.model_executor.models.utilsr   vllm_omni.diffusion.datar   r   %vllm_omni.diffusion.distributed.utilsr   1vllm_omni.diffusion.model_loader.diffusers_loaderr   8vllm_omni.diffusion.models.omnigen2.omnigen2_transformerr   r   vllm_omni.diffusion.requestr   vllm_omni.inputs.datar   2vllm_omni.model_executor.model_loader.weight_utilsr   	getLoggerr   r7  r   r%   r   r   r   r\   ri   rE   rj   r   Moduler   r#   r#   r#   r$   <module>   sn   
 0
@
  

-