o
    ۷i0                     @   sL  d dl Z d dlZd dlmZmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZmZ d	d
lmZmZ d	dlmZmZmZ ddlmZ e rL	 eeZ	d(dededeee B dejdedeej fddZ 	d)dejdej!dB defddZ"	d*dejdedej!dejd ej#d!efd"d#Z$G d$d% d%eZ%G d&d' d'eZ&dS )+    N)Qwen2Tokenizer
Qwen3Model   )
FrozenDict)ClassifierFreeGuidance)VaeImageProcessor)AutoencoderKL)is_ftfy_availablelogging   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )ZImageModularPipeline   text_encoder	tokenizerpromptdevicemax_sequence_lengthreturnc                 C   s   t |tr|gn|}t|D ]\}}d|dg}|j|dddd}|||< q||d|ddd}|j|}	|j| }
| |	|
dd	jd
 }g }t	t
|D ]}||| |
|   qO|S )Nuser)rolecontentFT)tokenizeadd_generation_promptenable_thinking
max_lengthpt)paddingr    
truncationreturn_tensors)	input_idsattention_maskoutput_hidden_states)
isinstancestr	enumerateapply_chat_templater%   tor&   boolhidden_statesrangelenappend)r   r   r   r   r   iprompt_itemmessagestext_inputstext_input_idsprompt_masksprompt_embedsprompt_embeds_list r;   b/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/z_image/encoders.pyget_qwen_prompt_embeds$   s>   
r=   sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr>   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrrB   r>   moderD   AttributeError)r?   r@   rA   r;   r;   r<   retrieve_latentsS   s   

rH      image_tensorvaedtypelatent_channelsc                    s   t tjstdt dt  tr.t jd kr.tdt  djd  dj||dt  trR fddt	jd D }tj
|dd	}n	t d
}|jj jj }|S )Nz*Expected image_tensor to be a tensor, got .r   z/You have passed a list of generators of length z), but it is not same as number of images r   rL   c              	      s.   g | ]}t ||d    | dqS )r   r@   )rH   encode).0r3   r@   rJ   rK   r;   r<   
<listcomp>s   s     z$encode_vae_image.<locals>.<listcomp>)dimrP   )r)   torchTensor
ValueErrortypelistr1   shaper-   r0   catrH   rQ   configshift_factorscaling_factor)rJ   rK   r@   r   rL   rM   image_latentsr;   rS   r<   encode_vae_image`   s   
ra   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedd Ze				ddedejdB dededB def
ddZe dededefddZdS )ZImageTextEncoderStepz-imager   c                 C      dS )NzMText Encoder step that generate text_embeddings to guide the video generationr;   selfr;   r;   r<   description      z!ZImageTextEncoderStep.descriptionc              	   C   s,   t dtt dtt dttdddddgS )	Nr   r   guiderg      @F)guidance_scaleenabledfrom_configr]   default_creation_method)r   r   r   r   r   re   r;   r;   r<   expected_components   s   z)ZImageTextEncoderStep.expected_componentsc                 C   s   t dt dt dddgS )Nr   negative_promptr   r   )default)r   re   r;   r;   r<   inputs   s   
zZImageTextEncoderStep.inputsc                 C   s,   t dttj dddt dttj dddgS )Nr9   denoiser_input_fieldsz2text embeddings used to guide the image generation)	type_hintkwargs_typerg   negative_prompt_embedsz;negative text embeddings used to guide the image generation)r   rZ   rV   rW   re   r;   r;   r<   intermediate_outputs   s   z*ZImageTextEncoderStep.intermediate_outputsc                 C   sB   | j d urt| j tst| j tstdt| j  d S d S d S )Nz2`prompt` has to be of type `str` or `list` but is )r   r)   r*   rZ   rX   rY   )block_stater;   r;   r<   check_inputs   s   


z"ZImageTextEncoderStep.check_inputsNTr   r   r   prepare_unconditional_embedsrp   r   c           	   
   C   s   |p| j }t|ts|g}t|}t| j| j|||d}d}|ro|p#d}t|tr.||g n|}|durKt|t|urKt	dt| dt| d|t|krdt
d| dt| d	| d| d
	t| j| j|||d}||fS )aC  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            prepare_unconditional_embeds (`bool`):
                whether to use prepare unconditional embeddings or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            max_sequence_length (`int`, defaults to `512`):
                The maximum number of text tokens to be used for the generation process.
        )r   r   r   r   r   N z?`negative_prompt` should be the same type to `prompt`, but got z != rN   z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)_execution_devicer)   rZ   r1   r=   r   r   r*   rY   	TypeErrorrX   )	
componentsr   r   rz   rp   r   
batch_sizer9   rv   r;   r;   r<   encode_prompt   sJ   

z#ZImageTextEncoderStep.encode_promptr~   statec                 C   sX   |  |}| | |j|_| j||j|j|j|j|jd\|_	|_
| || ||fS )N)r~   r   r   rz   rp   r   )get_block_statery   r|   r   r   r   requires_unconditional_embedsrp   r   r9   rv   set_block_state)rf   r~   r   rx   r;   r;   r<   __call__   s    

zZImageTextEncoderStep.__call__)NTNr   )__name__
__module____qualname__
model_namepropertyr*   rg   rZ   r   ro   r   rr   r   rw   staticmethodry   rV   r   r.   intr   no_gradr   r   r   r;   r;   r;   r<   rb      s<    
Arb   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedd ZdededefddZdS )ZImageVaeImageEncoderSteprc   r   c                 C   rd   )NzcVae Image Encoder step that generate condition_latents based on image to guide the image generationr;   re   r;   r;   r<   rg     rh   z%ZImageVaeImageEncoderStep.descriptionc                 C   s"   t dtt dttddiddgS )NrK   image_processorvae_scale_factorrI   rl   rm   )r   r   r   r   re   r;   r;   r<   ro     s   
z-ZImageVaeImageEncoderStep.expected_componentsc                 C   s&   t dtjjddt dt dt dgS )NimageT)rt   requiredheightwidthr@   )r   PILImagere   r;   r;   r<   rr   #  s
   z ZImageVaeImageEncoderStep.inputsc                 C   s   t dtjddgS )Nr`   z@video latent representation with the first frame image condition)rt   rg   )r   rV   rW   re   r;   r;   r<   rw   ,  s   z.ZImageVaeImageEncoderStep.intermediate_outputsc                 C   s^   |j d ur|j | j dks|jd ur+|j| j dkr-td| j d|j  d|j dd S d S )Nr   z-`height` and `width` have to be divisible by z	 but are z and rN   )r   vae_scale_factor_spatialr   rX   )r~   rx   r;   r;   r<   ry   6  s   z&ZImageVaeImageEncoderStep.check_inputsr~   r   c           	      C   s   |  |}| || |j}|j}tj}|jj}|jj	||j
|jdj||d}t||j|j|||jd|_| || ||fS )N)r   r   rO   )rJ   rK   r@   r   rL   rM   )r   ry   r   r|   rV   float32rK   rL   r   
preprocessr   r   r-   ra   r@   num_channels_latentsr`   r   )	rf   r~   r   rx   r   r   rL   	vae_dtyperJ   r;   r;   r<   r   ?  s*   

	z"ZImageVaeImageEncoderStep.__call__N)r   r   r   r   r   r*   rg   rZ   r   ro   r   rr   r   rw   r   ry   r   r   r   r;   r;   r;   r<   r     s    	
r   )r   )Nr>   )rI   )'r   rV   transformersr   r   configuration_utilsr   guidersr   r   r   modelsr   utilsr	   r
   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   
get_loggerr   loggerr*   rZ   r   r   rW   r=   	GeneratorrH   rL   ra   rb   r   r;   r;   r;   r<   <module>   sj   


0

  