o
    GiZn                     @   s   d dl mZ d dlZd dlm  mZ d dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZ eeZG dd deZG dd deeZdS )    )AnyN)nn   )LegacyConfigMixinregister_to_config)	deprecatelogging   )BasicTransformerBlock)ImagePositionalEmbeddings
PatchEmbedPixArtAlphaTextProjection)Transformer2DModelOutput)LegacyModelMixin)AdaLayerNormSinglec                       s   e Zd Z fddZ  ZS )r   c                    s&   d}t dd| t j|i | d S )NzImporting `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead.r   1.0.0)r   super__init__)selfargskwargsdeprecation_message	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_2d.pyr   !   s   z!Transformer2DModelOutput.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r       s    r   c                5       s  e Zd ZdZdZdgZddgZe							
																			dGdedededB dedB dede	dededB de
dedB dedB dedB dededB de
d e
d!e
d"e
d#ed$e
d%e	d&ed'ed(e	d)e
dB f2 fd*d+Zd,d- Zd.d/ Zd0d1 Z								dHd2ejd3ejdB d4ejdB d5eeejf d6ejdB d7eeef d8ejdB d9ejdB d:e
fd;d<Zd=d> Zd?d@ ZdAdB ZdCdD Z	dIdEdFZ  ZS )JTransformer2DModela+  
    A 2D Transformer model for image-like data.

    Parameters:
        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
            This is fixed during training since it is used to learn a number of position embeddings.
        num_vector_embeds (`int`, *optional*):
            The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
            Includes the class for the masked latent pixel.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
        num_embeds_ada_norm ( `int`, *optional*):
            The number of diffusion steps used during training. Pass if at least one of the norm_layers is
            `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
            added to the hidden states.

            During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
        attention_bias (`bool`, *optional*):
            Configure if the `TransformerBlocks` attention should contain a bias parameter.
    Tr
   latent_image_embeddingnorm   X   N               Fgeglu
layer_normh㈵>defaultnum_attention_headsattention_head_dimin_channelsout_channels
num_layersdropoutnorm_num_groupscross_attention_dimattention_biassample_sizenum_vector_embeds
patch_sizeactivation_fnnum_embeds_ada_normuse_linear_projectiononly_cross_attentiondouble_self_attentionupcast_attention	norm_typenorm_elementwise_affinenorm_epsattention_typecaption_channelsinterpolation_scaleuse_additional_conditionsc                    s  t    |d ur%|dvrtd| d|dv r%|d u r%td| d|d uo,|d u | _|d u| _|d uo:|d u| _| jrM| jrMtd| d| d	| jr^| jr^td
| d| d| jsu| jsu| jsutd| d| d| d|dkr|d urd| j d}tdd|dd d}|| _	|| _
|| _|| _|| _| jj| jj | _|| _|d u r|n|| _d| _|d u r|dkr|
dkrd}nd}|| _| jr| j|d d S | jr| j|d d S | jr| j|d d S d S )N)ada_normada_norm_zeroada_norm_singlezRForward pass is not implemented when `patch_size` is not None and `norm_type` is 'z'.)rE   rF   z0When using a `patch_size` and this `norm_type` (z(), `num_embeds_ada_norm` cannot be None.z"Cannot define both `in_channels`: z and `num_vector_embeds`: zE. Make sure that either `in_channels` or `num_vector_embeds` is None.z(Cannot define both `num_vector_embeds`: z and `patch_size`: zE. Make sure that either `num_vector_embeds` or `num_patches` is None.zHas to define `in_channels`: z, `num_vector_embeds`: z, or patch_size: zQ. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None.r)   z&The configuration file of this model: a   is outdated. `norm_type` is either not set or incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config. Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for the `transformer/config.json` fileznorm_type!=num_embeds_ada_normr   F)standard_warnrE   rG      T)r>   )r   r   NotImplementedError
ValueErroris_input_continuousis_input_vectorizedis_input_patchesr   r   r:   rC   rB   r,   r-   config	inner_dimr.   r/   gradient_checkpointingrD   _init_continuous_input_init_vectorized_inputs_init_patched_inputs)r   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   r   r   r   r   r   G   sj   



zTransformer2DModel.__init__c                    s   t jjjjjddd_jrt jjj	_
nt jjjj	dddd_
t fddtjjD _jrKt jj	j_d S t jjj	jdddd_d S )	Nư>T)
num_groupsnum_channelsepsaffiner%   r   )kernel_sizestridepaddingc                    h   g | ]0}t jjjjjjjjjjjjjjj	jj
jjjj jjjjjjd qS )r1   r3   r8   r9   r4   r;   r<   r=   r>   r?   r@   rA   r
   rP   rO   r,   r-   r1   r3   r8   r9   r4   r;   r<   r=   r?   r@   rA   .0_r>   r   r   r   
<listcomp>   (    z=Transformer2DModel._init_continuous_input.<locals>.<listcomp>)torchr   	GroupNormrO   r2   r.   r"   r:   LinearrP   proj_inConv2d
ModuleListranger0   transformer_blocksr/   proj_outr   r>   r   rc   r   rR      s   
 z)Transformer2DModel._init_continuous_inputc                    s   j jd us
J dj jd usJ dj j_j j_jj _tj jjjjd_t	
 fddtj jD _t	j_t	jj jd _d S )Nz?Transformer2DModel over discrete input must provide sample_sizez=Transformer2DModel over discrete input must provide num_embed)	num_embed	embed_dimheightwidthc                    r]   r^   r_   r`   rc   r   r   rd      re   z>Transformer2DModel._init_vectorized_inputs.<locals>.<listcomp>r%   )rO   r5   r6   rr   rs   num_latent_pixelsr   rP   r!   r   rk   rl   r0   rm   	LayerNormnorm_outrh   outro   r   rc   r   rS      s"   


z*Transformer2DModel._init_vectorized_inputsc                    s  j jd us
J dj j_j j_j j_j jd ur#j jntj jd d}tj jj jj jjj	|d_
t fddtj jD _j jdkr~tjj	dd	d
_tj	dj	 _tj	j jj j j _n1j jdkrtjj	dd	d
_ttdj	j	d  _tj	j jj j j _d _j jdkrtj	jd_d _j d urt!j j	d_d S d S )Nz>Transformer2DModel over patched input must provide sample_size@   r%   )rr   rs   r7   r.   rq   rC   c                    r]   r^   r_   r`   rc   r   r   rd     re   z;Transformer2DModel._init_patched_inputs.<locals>.<listcomp>rG   FrU   )elementwise_affinerX   r	         ?)rD   )in_featureshidden_size)"rO   r5   rr   rs   r7   rC   maxr   r.   rP   	pos_embedr   rk   rl   r0   rm   r>   ru   rv   rh   
proj_out_1r/   
proj_out_2	Parameterrf   randnscale_shift_tablern   adaln_singler   rD   caption_projectionrB   r   )r   r>   rC   r   rc   r   rT      sX   


	
 
z'Transformer2DModel._init_patched_inputshidden_statesencoder_hidden_statestimestepadded_cond_kwargsclass_labelscross_attention_kwargsattention_maskencoder_attention_maskreturn_dictc
                 C   s  |dur| dddurtd |dur)|jdkr)d||j d }|d}|durA|jdkrAd||j d }|d}| jrU|j\}
}}}|}| 	|\}}n)| j
r^| |}n | jr~|jd | j |jd | j }}| ||||\}}}}| jD ]!}t r| jr| ||||||||}q||||||||d	}q| jr| j|||
|||d
}n| j
r| |}n| jr| j||||||d}|	s|fS t|dS )ar
  
        The [`Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
                Input `hidden_states`.
            encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
            timestep ( `torch.LongTensor`, *optional*):
                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
                `AdaLayerZeroNorm`.
            cross_attention_kwargs ( `dict[str, Any]`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            attention_mask ( `torch.Tensor`, *optional*):
                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
                negative values to the attention scores corresponding to "discard" tokens.
            encoder_attention_mask ( `torch.Tensor`, *optional*):
                Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:

                    * Mask `(batch, sequence_length)` True = keep, False = discard.
                    * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.

                If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
                above. This bias will be added to the cross-attention scores.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
            otherwise a `tuple` where the first element is the sample tensor.
        NscalezSPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r	   r%   g     )r   r   r   r   r   r   )r   residual
batch_sizerr   rs   rP   )r   r   r   embedded_timesteprr   rs   )sample)getloggerwarningndimtodtype	unsqueezerL   shape_operate_on_continuous_inputsrM   r!   rN   r7   _operate_on_patched_inputsrm   rf   is_grad_enabledrQ   _gradient_checkpointing_func!_get_output_for_continuous_inputs!_get_output_for_vectorized_inputs_get_output_for_patched_inputsr   )r   r   r   r   r   r   r   r   r   r   r   rb   rr   rs   r   rP   r   blockoutputr   r   r   forwardD  s   2


"
	
zTransformer2DModel.forwardc                 C   s   |j \}}}}| |}| js,| |}|j d }|dddd||| |}||fS |j d }|dddd||| |}| |}||fS )Nr%   r   r	   r   )r   r"   r:   ri   permutereshape)r   r   batchrb   rr   rs   rP   r   r   r   r     s   




z0Transformer2DModel._operate_on_continuous_inputsc                 C   s   |j d }| |}d }| jd ur(| jr|d u rtd| j||||jd\}}| jd ur<| |}||d|j d }||||fS )Nr   zW`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.)r   hidden_dtyper   )r   r~   r   rD   rK   r   r   view)r   r   r   r   r   r   r   r   r   r   r     s   






z-Transformer2DModel._operate_on_patched_inputsc                 C   sh   | j s|||||dddd }| |}n| |}|||||dddd }|| }|S )Nr   r   r%   r	   )r:   r   r   
contiguousrn   )r   r   r   r   rr   rs   rP   r   r   r   r   r     s   
z4Transformer2DModel._get_output_for_continuous_inputsc                 C   s<   |  |}| |}|ddd}tj| dd }|S )Nr   r	   r%   dim)rv   rw   r   Flog_softmaxdoublefloat)r   r   logitsr   r   r   r   r     s
   

z4Transformer2DModel._get_output_for_vectorized_inputsc                 C   sP  | j jdkr?| jd jj|||jd}| t|j	ddd\}}	| 
|d|	d d d f   |d d d f  }| |}n1| j jdkrp| jd  |d d d f  j	ddd\}}	| 
|}|d|	  | }| |}|d}| jd u rt|jd d  }}|jd||| j| j| jfd	}td
|}|jd| j|| j || j fd	}
|
S )NrG   r   )r   r	   r%   r   rz   r   )r   znhwpqc->nchpwq)rO   r>   rm   norm1embr   r   r   siluchunkrv   r   r   rn   squeezer   intr   r   r7   r/   rf   einsum)r   r   r   r   r   rr   rs   conditioningshiftr   r   r   r   r   r     s.   .(



z1Transformer2DModel._get_output_for_patched_inputs)r#   r$   NNr%   r&   r'   NFNNNr(   NFFFFr)   Tr*   r+   NNN)NNNNNNNT)NN)r   r   r   __doc__ _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r   r   boolstrr   rR   rS   rT   rf   Tensor
LongTensordictr   r   r   r   r   r   r   r   r   r   r   r   r    '   s    	
g%(K
	

 
r    )typingr   rf   torch.nn.functionalr   
functionalr   configuration_utilsr   r   utilsr   r   	attentionr
   
embeddingsr   r   r   modeling_outputsr   modeling_utilsr   normalizationr   
get_loggerr   r   r    r   r   r   r   <module>   s   
