import inspect

import torch
import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput
from transformers.utils import logging

from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ...utils import is_torch_xla_available
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False


class LDMTextToImagePipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using latent diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Parameters:
        vqvae ([`VQModel`]):
            Vector-quantized (VQ) model to encode and decode images to and from latent representations.
        bert ([`LDMBertModel`]):
            Text-encoder model based on [`~transformers.BERT`].
        tokenizer ([`~transformers.BertTokenizer`]):
            A `BertTokenizer` to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    zbert->unet->vqvaevqvaebert	tokenizerunet	schedulerc                    s:   t    | j|||||d dt| jjjd  | _d S )N)r   r   r   r   r   r      )super__init__register_moduleslenr   configblock_out_channelsvae_scale_factor)selfr   r   r   r   r   	__class__ r/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.pyr   >   s   
zLDMTextToImagePipeline.__init__N2         ?        pilTpromptheightwidthnum_inference_stepsguidance_scaleeta	generatorlatentsoutput_typereturn_dictreturnc                 K   s  |p	| j jj| j }|p| j jj| j }t|trd}nt|tr&t|}n	tdt	| |d dks;|d dkrFtd| d| d|dkrc| j
d	g| d
dddd}| |j| jd }| j
|d
dddd}| |j| jd }|| j jj|d |d f}t|trt||krtdt| d| d|du rt||| j|jd}n|j|krtd|j d| || j}| j| dtt| jjj v }i }|r||d< | | jjD ]K}|dkr|}|}nt|gd }t||g}| j |||dj}|dkr| d\}}||||   }| jj|||fi |j!}t"r1t#$  qd| j%jj& | }| j%'|j}|d d (dd}|) *dddd+ }|	dkrb| ,|}|
sh|fS t-|dS )a
  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 1.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            eta (`float`, *optional*, defaults to `0.0`):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`] and is ignored in other schedulers.
            generator (`torch.Generator`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple.

        Example:

        ```py
        >>> from diffusers import DiffusionPipeline

        >>> # load model and scheduler
        >>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")

        >>> # run pipeline in inference (sample random noise and denoise)
        >>> prompt = "A painting of a squirrel eating a burger"
        >>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images

        >>> # save images
        >>> for idx, image in enumerate(images):
        ...     image.save(f"squirrel-{idx}.png")
        ```

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # get unconditional embeddings for classifier-free guidance
        if guidance_scale != 1.0:
            uncond_input = self.tokenizer(
                [""] * batch_size, padding="max_length", max_length=77, truncation=True, return_tensors="pt"
            )
            negative_prompt_embeds = self.bert(uncond_input.input_ids.to(self._execution_device))[0]

        # get prompt text embeddings
        text_input = self.tokenizer(prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt")
        prompt_embeds = self.bert(text_input.input_ids.to(self._execution_device))[0]

        # get the initial random noise unless the user supplied it
        latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8)
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(
                latents_shape, generator=generator, device=self._execution_device, dtype=prompt_embeds.dtype
            )
        elif latents.shape != latents_shape:
            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
        latents = latents.to(self._execution_device)

        self.scheduler.set_timesteps(num_inference_steps)

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())

        extra_kwargs = {}
        if accepts_eta:
            extra_kwargs["eta"] = eta

        for t in self.progress_bar(self.scheduler.timesteps):
            if guidance_scale == 1.0:
                # guidance_scale of 1 means no guidance
                latents_input = latents
                context = prompt_embeds
            else:
                # for classifier-free guidance, concatenate the unconditional and text embeddings
                # into a single batch to avoid doing two forward passes
                latents_input = torch.cat([latents] * 2)
                context = torch.cat([negative_prompt_embeds, prompt_embeds])

            # predict the noise residual
            noise_pred = self.unet(latents_input, t, encoder_hidden_states=context).sample
            # perform guidance
            if guidance_scale != 1.0:
                noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample

            if XLA_AVAILABLE:
                xm.mark_step()

        # scale and decode the image latents with the VQ-VAE
        latents = 1 / self.vqvae.config.scaling_factor * latents
        image = self.vqvae.decode(latents).sample

        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)
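
# --- Added usage sketch (not part of the original module) -------------------------------------
# The `latents` argument above lets you reuse one noise tensor across prompts to compare
# generations; its shape must match `latents_shape` computed in `__call__`
# (batch, unet.config.in_channels, height // 8, width // 8). The checkpoint name is taken from
# the docstring example; 32x32 assumes that checkpoint's default 256x256 output resolution.
#
#     import torch
#     from diffusers import DiffusionPipeline
#
#     ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
#     generator = torch.Generator().manual_seed(0)
#     latents = torch.randn((1, ldm.unet.config.in_channels, 32, 32), generator=generator)
#     for prompt in ["A painting of a squirrel eating a burger", "A painting of a fox eating a burger"]:
#         image = ldm(prompt, latents=latents, guidance_scale=6, num_inference_steps=50).images[0]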

################################################################################
# Code for the text transformer model
################################################################################
""" PyTorch LDMBERT model."""


logger = logging.get_logger(__name__)

LDMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "ldm-bert",
]

LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "ldm-bert": "https://huggingface.co/valhalla/ldm-bert/blob/main/config.json",
}


class LDMBertConfig(PretrainedConfig):
    model_type = "ldmbert"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=30522,
        max_position_embeddings=77,
        encoder_layers=32,
        encoder_ffn_dim=5120,
        encoder_attention_heads=8,
        head_dim=64,
        encoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1280,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        pad_token_id=0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.head_dim = head_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(pad_token_id=pad_token_id, **kwargs)


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: int | None = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
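
# --- Added illustration (not in the original file) --------------------------------------------
# `_expand_mask` converts a keep/ignore mask into the additive bias consumed by
# `LDMBertAttention.forward` below: kept positions become 0.0, masked positions become the most
# negative representable value, so softmax assigns them ~zero weight. For example:
#
#     mask = torch.tensor([[1, 1, 0]])          # [bsz=1, src_len=3], last token is padding
#     bias = _expand_mask(mask, torch.float32)  # shape [1, 1, 3, 3]
#     # bias[..., :2] == 0.0 and bias[..., 2] == torch.finfo(torch.float32).min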
   *r   c                       s   e Zd ZdZ			ddededededed	ef fd
dZdej	dedefddZ
					ddej	dej	dB deej	 dB dej	dB dej	dB dedeej	ej	dB eej	 dB f fddZ  ZS )LDMBertAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr*   F	embed_dim	num_headsr   r   
is_decoderbiasc                    s   t    || _|| _|| _|| _|| | _| jd | _|| _t	j
|| j|d| _t	j
|| j|d| _t	j
|| j|d| _t	
| j|| _d S )Ng      )r   )r   r   r   r   r   r   	inner_dimscalingr   nnLineark_projv_projq_projout_proj)r#   r   r   r   r   r   r   r$   r&   r'   r   =  s   
	
zLDMBertAttention.__init__tensorseq_lenr   c                 C   s    | ||| j| jdd S )Nr   r   )viewr   r   	transpose
contiguous)r#   r   r   r   r&   r&   r'   _shapeU  s    zLDMBertAttention._shapeNhidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsr6   c                 C   sX  |du}|  \}}	}
| || j }|r"|dur"|d }|d }nZ|r9| | |d|}| | |d|}nC|durh| | |d|}| | |d|}tj|d |gdd}tj|d |gdd}n| | |d|}| | |d|}| jr||f}|| j	 d| j
f}| ||	|j| }|j| }|j| }| d}t||dd}|  || j	 |	|fkrtd|| j	 |	|f d|   |dur|  |d|	|fkrtd	|d|	|f d|   ||| j	|	|| }||| j	 |	|}tjj|dd}|dur@|  | j	fkr%td
| j	f d|   |dddd||| j	|	| }||| j	 |	|}|rW||| j	|	|}||| j	 |	|}nd}tjj|| j| jd}t||}|  || j	 |	| j
fkrtd|| j	|	| j
f d|   ||| j	|	| j
}|dd}|||	| j}| |}|||fS )z#Input shape: Batch x Time x ChannelNr   r   r   )dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )r   r   r   r   r   r   rX   rY   r   r   r   r   bmmr   rH   r   
functionalsoftmaxr   r   reshaper   r   )r#   r   r   r   r   r   r   is_cross_attentionr   r   _query_states
key_statesvalue_states
proj_shaper   attn_weightsattn_weights_reshaped
attn_probsattn_outputr&   r&   r'   forwardX  s   





"

zLDMBertAttention.forward)r*   FF)NNNNF)rx   ry   rz   r{   r~   r   r   r   rX   r   r   r   r   r   r&   r&   r$   r'   r   :  sN    r   c                       s^   e Zd Zdef fddZ	ddejdejdejded	B d
eejejd	B f f
ddZ	  Z
S )LDMBertEncoderLayerr    c                    s   t    |j| _t| j|j|j|jd| _t	
| j| _|j| _t|j | _|j| _t	| j|j| _t	|j| j| _t	
| j| _d S )N)r   r   r   r   )r   r   r   r   r   r   r   r   	self_attnr   	LayerNormself_attn_layer_normr   r   r   activation_fnr   r   r   fc1fc2final_layer_normr#   r    r$   r&   r'   r     s   
zLDMBertEncoderLayer.__init__Fr   r   r   r   Nr6   c           
      C   s  |}|  |}| j||||d\}}}tjj|| j| jd}|| }|}| |}| | |}tjj|| j	| jd}| 
|}tjj|| j| jd}|| }|jtjkrvt| sdt| rvt|jjd }tj|| |d}|f}	|r|	|f7 }	|	S )a  
        Args:
            hidden_states (`torch.Tensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
            attention_mask (`torch.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.Tensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16 and (
            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
        ):
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class LDMBertPreTrainedModel(PreTrainedModel):
    config_class = LDMBertConfig
    base_model_prefix = "model"
    _supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]

    def _init_weights(self, module):
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
    @property
    def dummy_inputs(self):
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        dummy_inputs = {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
        return dummy_inputs


class LDMBertEncoder(LDMBertPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`LDMBertEncoderLayer`].

    Args:
        config: LDMBertConfig
        embed_tokens (nn.Embedding): output embedding
    """

    def __init__(self, config: LDMBertConfig):
        super().__init__(config)

        self.dropout = config.dropout

        embed_dim = config.d_model
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings

        self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim)
        self.embed_positions = nn.Embedding(config.max_position_embeddings, embed_dim)
        self.layers = nn.ModuleList([LDMBertEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(embed_dim)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        head_mask: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple | BaseModelOutput:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.BaseModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        seq_len = input_shape[1]
        if position_ids is None:
            position_ids = torch.arange(seq_len, dtype=torch.long, device=inputs_embeds.device).expand((1, -1))
        embed_pos = self.embed_positions(position_ids)

        hidden_states = inputs_embeds + embed_pos
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # expand attention_mask from [bsz, seq_len] to [bsz, 1, tgt_seq_len, src_seq_len]
        if attention_mask is not None:
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer,
                    hidden_states,
                    attention_mask,
                    (head_mask[idx] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
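
# --- Added shape check (not in the original file) ---------------------------------------------
# A minimal smoke test for the encoder with deliberately tiny hyperparameters (the real
# ldm-bert config uses d_model=1280, 32 layers, and 77 max positions):
#
#     cfg = LDMBertConfig(vocab_size=100, d_model=32, encoder_layers=2, encoder_attention_heads=4,
#                         head_dim=8, encoder_ffn_dim=64, max_position_embeddings=16)
#     enc = LDMBertEncoder(cfg)
#     out = enc(input_ids=torch.randint(0, 100, (2, 16)))
#     assert out.last_hidden_state.shape == (2, 16, 32)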

class LDMBertModel(LDMBertPreTrainedModel):
    _no_split_modules = []

    def __init__(self, config: LDMBertConfig):
        super().__init__(config)
        self.model = LDMBertEncoder(config)
        self.to_logits = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return outputs