o
    Gip                  
   @   s  d dl Z d dlZd dlmZ d dlZd dlZd dlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z m!Z! e rqd dl"m#  m$Z% dZ&ndZ&e'e(Z)dZ*dd Z+d-ddZ,d.ddZ-				d/de.d e.d!e/d"e/fd#d$Z0				d0d%e.dB d&e1ej2B dB d'e3e. dB d(e3e/ dB fd)d*Z4G d+d, d,eeZ5dS )1    N)Any)"Qwen2_5_VLForConditionalGenerationQwen2TokenizerQwen2VLProcessor   )VaeImageProcessor)FromSingleFileMixin)AutoencoderKL)LongCatImageTransformer2DModel)DiffusionPipeline)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )LongCatImagePipelineOutput)SYSTEM_PROMPT_ENSYSTEM_PROMPT_ZHTFu  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LongCatImagePipeline

        >>> pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> prompt = "一个年轻的亚裔女性，身穿黄色针织衫，搭配白色项链。她的双手放在膝盖上，表情恬静。背景是一堵粗糙的砖墙，午后的阳光温暖地洒在她身上，营造出一种宁静而温馨的氛围。镜头采用中距离视角，突出她的神态和服饰的细节。光线柔和地打在她的脸上，强调她的五官和饰品的质感，增加画面的层次感与亲和力。整个画面构图简洁，砖墙的纹理与阳光的光影效果相得益彰，突显出人物的优雅与从容。"
        >>> image = pipe(
        ...     prompt,
        ...     height=768,
        ...     width=1344,
        ...     num_inference_steps=50,
        ...     guidance_scale=4.5,
        ...     generator=torch.Generator("cpu").manual_seed(43),
        ...     enable_cfg_renorm=True,
        ... ).images[0]
        >>> image.save("longcat_image.png")
        ```
c                 C   s    t d}t|| rdS dS )Nz[\u4e00-\u9fff]zhen)recompileboolsearch)promptpattern r   l/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/longcat_image/pipeline_longcat_image.pyget_prompt_languageD   s   
r   c                 C   s   t d}|| }g }tt|D ]\}}d|d  }| ||} |||g q|du r2g d}ddd |D }t d	| d
| }	g }
|	D ],}|D ]
\}}|||}qNt 	||rkt
|rj|
|df qJt
|rv|
|df qJ|
S )ai  
    Implement a regex-based string splitting algorithm that identifies delimiters defined by single or double quote
    pairs. Examples::
        >>> prompt_en = "Please write 'Hello' on the blackboard for me." >>> print(split_quotation(prompt_en)) >>> #
        output: [('Please write ', False), ("'Hello'", True), (' on the blackboard for me.', False)]
    z[a-zA-Z]+'[a-zA-Z]+zlongcat_$##$_longcatr   N))'r    )"r!   )u   ‘u   ’)u   “u   ”|c                 S   s:   g | ]\}}t |d  t ||  d t | qS )z[^z]*?)r   escape).0q1q2r   r   r   
<listcomp>]   s   : z#split_quotation.<locals>.<listcomp>()TF)r   r   findall	enumeratesetreplaceappendjoinsplitmatchlen)r   quote_pairsword_internal_quote_pattern#matches_word_internal_quote_patternmapping_word_internal_quoteiword_srcword_tgtr   partsresultpartr   r   r   split_quotationK   s.   

r=   textr   r   c                 C   s  |dkr2|sJ |s|rt d t|d}| |d< t||d  |d< t||d  |d< |S |d	kr|r:|s<J |rBt d
 t||d}| |d< |d t|d d d f  |d  |d< |d t|d d d f  |d  |d< ||| d}|S td| d)Nr>   zKWarning: The parameters of height and width will be ignored in "text" type.r   ).r   r   ).r   r   ).   imagezDWarning: The parameter of num_token will be ignored in "image" type.zUnknow type z!, only support "text" or "image".)printtorchzerosarangereshapeKeyError)modality_idtypestart	num_tokenheightwidthpos_idsr   r   r   prepare_pos_idsm   s(   **rO               ?ffffff?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S Nr   )image_seq_lenrT   rU   rV   rW   mbmur   r   r   calculate_shift   s   r]   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr`   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r`   r_   ra   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)ra   r_   r_   r   )

ValueErrorr,   inspect	signatureset_timesteps
parameterskeys	__class__r`   r2   )	schedulerr^   r_   r`   ra   kwargsaccepts_timestepsaccept_sigmasr   r   r   retrieve_timesteps   s2   rm   c                (       s  e Zd ZdZdZg ZddgZdedede	de
d	ed
ef fddZdd Zdee fddZ			d@deee B dedB dejdB fddZedd Zedd Zedd Z	dAddZedd  Zed!d" Zed#d$ Zed%d& Zed'd( Z 	dBd)d*Z!e"e#e$ 					+		,						-	.		.	/	.dCdeee B d0eee B d1edB d2edB d3ed4ee% dB d5e%dedB d6ej&eej& B dB dej'dB dej'dB d7ej'dB d8edB d9e(d:e)ee*f dB d;e(dB d<e%dB d=e(dB f$d>d?Z+  Z,S )DLongCatImagePipelinez4
    The pipeline for text-to-image generation.
    ztext_encoder->transformer->vaelatentsprompt_embedsri   vaetext_encoder	tokenizertext_processortransformerc                    sv   t    | j||||||d t| dd r!dt| jjjd  nd| _t	| jd d| _
d| _d| _d	| _d
| _d S )N)rq   rr   rs   ru   ri   rt   rq   r@   r      )vae_scale_factorz<|im_start|>system
As an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>
<|im_start|>user
z!<|im_end|>
<|im_start|>assistant
   i   )super__init__register_modulesgetattrr2   rq   configblock_out_channelsrw   r   image_processorprompt_template_encode_prefixprompt_template_encode_suffixdefault_sample_sizetokenizer_max_length)selfri   rq   rr   rs   rt   ru   rh   r   r   rz      s   
	(	
zLongCatImagePipeline.__init__c                 C   s   t |tr|gn|}g }|D ]3}t|}|dkr!td| d }ntd| d }dd|dgd	g}| jj|d
dd}|| q| j|ddd| j	j
}	| j	jdi |	d| ji}
|
| dd t|	j|
D }| jj|dd
d}|}|S )Nr   u   
用户输入为：u   
改写后的prompt为：z
User Input: z
Rewritten prompt:userr>   )rI   r>   )rolecontentFT)tokenizeadd_generation_promptpt)r>   paddingreturn_tensorsmax_new_tokensc                 S   s    g | ]\}}|t |d  qS rX   )r2   )r$   in_idsout_idsr   r   r   r'     s     z6LongCatImagePipeline.rewire_prompt.<locals>.<listcomp>)skip_special_tokensclean_up_tokenization_spacesr   )
isinstancestrr   r   r   rt   apply_chat_templater.   torr   r_   generater   zip	input_idsbatch_decode)r   r   r_   all_texteach_promptlanguagequestionmessager>   inputsgenerated_idsgenerated_ids_trimmedoutput_textrewrite_promptr   r   r   rewire_prompt   s.   	
z"LongCatImagePipeline.rewire_promptr   c                 C   s<  g }|D ]R}g }t |D ](\}}|r&|D ]}| j|ddd }|| qq| j|ddd }|| qt|| jkrQtd| j dt|  |d | j }|| q| jjd|i| jdddd	}	| j| j	ddd }
| j| j
ddd }t|
}t|}tjd
gt|
 |	jd jd}tjd
gt| |	jd jd}tj|
|	jjd}
tj||	jjd}|	jd}|
d|d}|d|d}|d|d}|d|d}tj||	j|fdd}tj||	j|fdd}|| j}|| j}| j||dd}|jd  }|d d || d d f }|S )NF)add_special_tokensr   zBYour input was truncated because `max_sequence_length` is set to  z input token nums : 
max_lengthTr   )r   r   return_attention_maskr   r   r   dtype)dim)r   attention_maskoutput_hidden_states)r=   rs   extendr2   r   loggerwarningr.   padr   r   rC   tensorr   r   r   size	unsqueezeexpandcatr   r_   rr   hidden_statesdetach)r   r   batch_all_tokensr   
all_tokensclean_prompt_submatchedsub_wordtokenstext_tokens_and_maskprefix_tokenssuffix_tokens
prefix_len
suffix_lenprefix_tokens_masksuffix_tokens_mask
batch_sizeprefix_tokens_batchsuffix_tokens_batchprefix_mask_batchsuffix_mask_batchr   r   text_outputrp   r   r   r   _encode_prompt  sb     z#LongCatImagePipeline._encode_promptNr   num_images_per_promptc                 C   s   t |tr|gn|}t|}|d u r| |}|j\}}}|d|d}||| |d}tddd|jd d| j	}|| j	|fS )Nr   r   r   r>   r?   )rH   rI   rJ   rK   )
r   r   r2   r   shaperepeatviewrO   r   r_   )r   r   r   rp   r   _seq_lentext_idsr   r   r   encode_promptO  s   
z"LongCatImagePipeline.encode_promptc                 C   sR   |  |||d d|d d} | dddddd} | ||d |d  |d } | S )Nr@   r      r   r      )r   permuterF   )ro   r   num_channels_latentsrL   rM   r   r   r   _pack_latentse  s   z"LongCatImagePipeline._pack_latentsc                 C   s   | j \}}}dt||d   }dt||d   }| ||d |d |d dd} | dddddd} | ||d ||} | S )Nr@   r   r   r   r   r   )r   intr   r   rF   )ro   rL   rM   rw   r   num_patcheschannelsr   r   r   _unpack_latentsm  s    z$LongCatImagePipeline._unpack_latentsc                 C   s
   | j dkS )Nr   _guidance_scaler   r   r   r   do_classifier_free_guidance}  s   
z0LongCatImagePipeline.do_classifier_free_guidancec	                 C   s   dt || jd   }dt || jd   }||||f}	tdd| j| jf|d |d d|}
|d ur=|j||d|
fS t|trUt||krUtdt| d| dt	|	||d	}|j|d
}| 
|||||}||
fS )Nr@   r   rA   )rH   rI   rJ   rL   rM   )r_   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatorr_   r   )r   rw   rO   r   r   r   listr2   rb   r   r   )r   r   r   rL   rM   r   r_   r   ro   r   latent_image_idsr   r   r   prepare_latents  s0   
z$LongCatImagePipeline.prepare_latentsc                 C      | j S rX   r   r   r   r   r   guidance_scale     z#LongCatImagePipeline.guidance_scalec                 C   r   rX   )_joint_attention_kwargsr   r   r   r   joint_attention_kwargs  r   z+LongCatImagePipeline.joint_attention_kwargsc                 C   r   rX   )_num_timestepsr   r   r   r   num_timesteps  r   z"LongCatImagePipeline.num_timestepsc                 C   r   rX   )_current_timestepr   r   r   r   current_timestep  r   z%LongCatImagePipeline.current_timestepc                 C   r   rX   )
_interruptr   r   r   r   	interrupt  r   zLongCatImagePipeline.interruptc              	   C   s   || j d  dks|| j d  dkr$td| j d  d| d| d |d ur7|d ur7td| d| d	|d u rC|d u rCtd
|d urZt|tsZt|tsZtdt| |d urm|d urotd| d| d	d S d S )Nr@   r   z-`height` and `width` have to be divisible by z	 but are z and z(. Dimensions will be resized accordinglyzCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: )rw   r   r   rb   r   r   r   rI   )r   r   rL   rM   negative_promptrp   negative_prompt_embedsr   r   r   check_inputs  s*   $z!LongCatImagePipeline.check_inputs2         @pilT        r   rL   rM   r^   ra   r   r   r   output_typereturn_dictr   enable_cfg_renormcfg_renorm_minenable_prompt_rewritec           *      C   sd  |p| j | j }|p| j | j }| j||||||d || _|| _d| _d| _|dur3t|tr3d}n|durAt|t	rAt
|}n|jd }| j}|rZ| ||}td| d |du r`dn|}| j|||d	\}}| jry| j|||d	\}}d
}| || ||||j||	|
\}
}|du rtdd| |n|}|
jd }t|| jjdd| jjdd| jjdd| jjdd}t| j||||d\}}tt
||| jj  d}t
|| _d}| jdu ri | _| j|d}t |D ]\}} | j!rq| | _| "|
jd #|
j}!| j$%d | j$|
|!d ||||ddd }"W d   n	1 s,w   Y  | jr| j$%d | j$|
|!d |||ddd }#W d   n	1 sVw   Y  |#| j&|"|#   }$|rt'j(|"ddd}%t'j(|$ddd}&|%|&d  j)|dd}'|$|' }$n|"}$|
j}(| jj*|$| |
dd d }
|
j|(krt'j+j,- r|
#|(}
|t
|d ks|d |kr|d | jj dkr|.  t/rt01  qW d   n	1 sw   Y  d| _|d!kr|
})n7| 2|
||| j}
|
| j3jj4 | j3jj5 }
|
j| j3jkr|
j#| j3jd"}
| j3j6|
dd d })| j7j8|)|d#})| 9  |s-|)fS t:|)d$S )%aO  
        Function invoked when calling the pipeline for generation.

        Args:
            enable_cfg_renorm: Whether to enable cfg_renorm. Enabling cfg_renorm will improve image quality,
                but it may lead to a decrease in the stability of some image outputs..
            cfg_renorm_min: The minimum value of the cfg_renorm_scale range (0-1).
                cfg_renorm_min = 1.0, renorm has no effect, while cfg_renorm_min=0.0, the renorm range is larger.
            enable_prompt_rewrite: whether to enable prompt rewrite.
        Examples:

        Returns:
            [`~pipelines.LongCatImagePipelineOutput`] or `tuple`: [`~pipelines.LongCatImagePipelineOutput`] if
            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
            generated images.
        )r   rp   r   NFr   r   zRewrite prompt ! )r   rp   r      g      ?base_image_seq_lenrP   max_image_seq_lenrQ   rV   rR   rW   rS   )ra   r\   )totalcondi  )r   timestepguidanceencoder_hidden_statestxt_idsimg_idsr   uncond)r   r  r  r  r	  r   r   T)r   keepdimg:0yE>)minmax)r   latentr   )r   )images);r   rw   r   r   r   r   r   r   r   r   r2   r   _execution_devicer   r   infor   r   r   r   nplinspacer]   ri   r}   getrm   r  orderr   r   progress_barr+   r   r   r   ru   cache_contextr   rC   normclampstepbackendsmpsis_availableupdateXLA_AVAILABLExm	mark_stepr   rq   scaling_factorshift_factordecoder   postprocessmaybe_free_model_hooksr   )*r   r   r   rL   rM   r^   ra   r   r   r   ro   rp   r   r   r   r   r   r   r   r   r_   r   negative_text_idsr   r   rY   r\   r`   num_warmup_stepsr  r  r7   tr  noise_pred_textnoise_pred_uncond
noise_pred	cond_norm
noise_normscalelatents_dtyperA   r   r   r   __call__  s  (	








	
65

zLongCatImagePipeline.__call__)Nr   NrX   )NNN)NNNNr   Nr   r   NNNNr   TNTr   T)-__name__
__module____qualname____doc__model_cpu_offload_seq_optional_components_callback_tensor_inputsr   r	   r   r   r   r
   rz   r   r   r   r   r   rC   Tensorr   staticmethodr   r   propertyr   r   r   r   r   r   r   r   r   EXAMPLE_DOC_STRINGno_gradfloat	GeneratorFloatTensorr   dictr   r1  __classcell__r   r   r   r   rn      s     ?





(








	
rn   rX   )r   r>   r?   NNN)rP   rQ   rR   rS   )NNNN)6rc   r   typingr   numpyr  rC   transformersr   r   r   r   r   loadersr   models.autoencodersr	   models.transformersr
   pipelines.pipeline_utilsr   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_outputr   system_messagesr   r   torch_xla.core.xla_modelcore	xla_modelr   r  
get_loggerr2  r   r<  r   r=   rO   r   r>  r]   r   r_   r   rm   rn   r   r   r   r   <module>   sh   


"



;