o
    Gi<                  	   @   s   d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z!m"Z" e#e$Z%eG dd dej&Z'G dd deeee
e	eZ(dS )    )AnyN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixinSD3Transformer2DLoadersMixin)apply_lora_scalelogging)maybe_allow_in_graph   )AttentionMixinFeedForwardJointTransformerBlock)	AttentionFusedJointAttnProcessor2_0JointAttnProcessor2_0)"CombinedTimestepTextProjEmbeddings
PatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousAdaLayerNormZeroc                       s@   e Zd Zdededef fddZdejdejfdd	Z  ZS )
SD3SingleTransformerBlockdimnum_attention_headsattention_head_dimc              	      sT   t    t|| _t||||dt dd| _tj|ddd| _	t
||dd| _d S )NTư>)	query_dimdim_headheadsout_dimbias	processorepsFelementwise_affiner$   zgelu-approximate)r   dim_outactivation_fn)super__init__r   norm1r   r   attnnn	LayerNormnorm2r   ff)selfr   r   r   	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_sd3.pyr*   (   s   


z"SD3SingleTransformerBlock.__init__hidden_statestembc           
      C   s   | j ||d\}}}}}| j|d d}|d| }|| }| |}|d|d  |d }| |}	|d|	 }	||	 }|S )N)emb)r6   encoder_hidden_states   )r+   r,   	unsqueezer/   r0   )
r1   r6   r7   norm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpattn_output	ff_outputr4   r4   r5   forward>   s   

z!SD3SingleTransformerBlock.forward)	__name__
__module____qualname__intr*   torchTensorrC   __classcell__r4   r4   r2   r5   r   &   s    r   c                       s0  e Zd ZdZdZdgZddgZe						
									d7dededededededededededede	edf de
dB f fdd Zd8d"edB d#ed$dfd%d&Zd'd( Zd)d* Zd+d, Zed-							d9d.ejd/ejd0ejd1ejd2ed-ee
ef dB d3ed4ee dB d$ejeB fd5d6Z  ZS ):SD3Transformer2DModela  
    The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

    Parameters:
        sample_size (`int`, defaults to `128`):
            The width/height of the latents. This is fixed during training since it is used to learn a number of
            position embeddings.
        patch_size (`int`, defaults to `2`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `16`):
            The number of latent channels in the input.
        num_layers (`int`, defaults to `18`):
            The number of layers of transformer blocks to use.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        num_attention_heads (`int`, defaults to `18`):
            The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, defaults to `4096`):
            The embedding dimension to use for joint text-image attention.
        caption_projection_dim (`int`, defaults to `1152`):
            The embedding dimension of caption embeddings.
        pooled_projection_dim (`int`, defaults to `2048`):
            The embedding dimension of pooled text projections.
        out_channels (`int`, defaults to `16`):
            The number of latent channels in the output.
        pos_embed_max_size (`int`, defaults to `96`):
            The maximum latent height/width of positional embeddings.
        dual_attention_layers (`tuple[int, ...]`, defaults to `()`):
            The number of dual-stream transformer blocks to use.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
    Tr   	pos_embednorm   r         @           `   r4   Nsample_size
patch_sizein_channels
num_layersr   r   joint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizedual_attention_layers.qk_normc                    s   t    |
d ur|
n|_  _t||||j|d_tj|	d_t	||_
t fddtD _tjjddd_tj	j|| j dd	_d_d S )
N)heightwidthrW   rX   	embed_dimr^   )embedding_dimr\   c              
      s6   g | ]}t j |d  k|v rdnddqS )r:   TF)r   r   r   context_pre_onlyr`   use_dual_attention)r   	inner_dim).0ir   r_   r   rY   r`   r1   r4   r5   
<listcomp>   s    	
z2SD3Transformer2DModel.__init__.<locals>.<listcomp>Fr   r%   T)r"   )r)   r*   r]   rg   r   rL   r   time_text_embedr-   Linearcontext_embedder
ModuleListrangetransformer_blocksr   norm_outproj_outgradient_checkpointing)r1   rV   rW   rX   rY   r   r   rZ   r[   r\   r]   r^   r_   r`   r2   rj   r5   r*   w   s.   

	
zSD3Transformer2DModel.__init__r   
chunk_sizer   returnc                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r:   z-Make sure to set `dim` to either 0 or 1, not r:   moduleru   r   c                    6   t | dr| j||d |  D ]} ||| qd S Nset_chunk_feed_forward)ru   r   hasattrrz   childrenrw   ru   r   childfn_recursive_feed_forwardr4   r5   r      
   
zPSD3Transformer2DModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)
ValueErrorrH   r-   ModulerG   r}   )r1   ru   r   rw   r4   r   r5   enable_forward_chunking   s   z-SD3Transformer2DModel.enable_forward_chunkingc                    s<   dt jjdtdtf fdd |  D ]} |d d qd S )Nrw   ru   r   c                    rx   ry   r{   r~   r   r4   r5   r      r   zQSD3Transformer2DModel.disable_forward_chunking.<locals>.fn_recursive_feed_forwardr   )rH   r-   r   rG   r}   )r1   rw   r4   r   r5   disable_forward_chunking   s   z.SD3Transformer2DModel.disable_forward_chunkingc                 C   sn   d| _ | j D ]\}}dt|jjv rtdq| j| _ |  D ]}t|t	r.|j
dd q!| t  dS )u  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        > [!WARNING] > This API is 🧪 experimental.
        NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsattn_processorsitemsstrr3   rD   r   modules
isinstancer   fuse_projectionsset_attn_processorr   )r1   _attn_processorrw   r4   r4   r5   fuse_qkv_projections   s   
z*SD3Transformer2DModel.fuse_qkv_projectionsc                 C   s   | j dur| | j  dS dS )un   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is 🧪 experimental.

        N)r   r   )r1   r4   r4   r5   unfuse_qkv_projections   s   
z,SD3Transformer2DModel.unfuse_qkv_projectionsjoint_attention_kwargsr6   r9   pooled_projectionstimestepblock_controlnet_hidden_statesreturn_dictskip_layersc	                 C   s  |j dd \}	}
| |}| ||}| |}|dur5d|v r5|d}| ||\}}|j||d t| jD ]M\}}|durH||v rHdnd}t	
 r_| jr_|s_| |||||\}}n|sk|||||d\}}|dur|jdu rt| jt| }||t||   }q:| ||}| |}| jj}|	| }	|
| }
|j|j d |	|
||| jfd	}t	d
|}|j|j d | j|	| |
| fd	}|s|fS t|dS )a  
        The [`SD3Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
                Embeddings projected from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.
            skip_layers (`list` of `int`, *optional*):
                A list of layer indices to skip during the forward pass.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        Nip_adapter_image_embeds)ip_hidden_statesr7   TF)r6   r9   r7   r   r   )shapeznhwpqc->nchpwq)sample)r   rL   rl   rn   pop
image_projupdate	enumeraterq   rH   is_grad_enabledrt   _gradient_checkpointing_funcre   lenrG   rr   rs   configrW   reshaper]   einsumr   )r1   r6   r9   r   r   r   r   r   r   ra   rb   r7   r   r   ip_tembindex_blockblockis_skipinterval_controlrW   outputr4   r4   r5   rC      sX   )






zSD3Transformer2DModel.forward)rN   r   rO   rP   rQ   rP   rR   rS   rT   rO   rU   r4   N)Nr   )NNNNNTN)rD   rE   rF   __doc__ _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rG   tupler   r*   r   r   r   r   r	   rH   rI   
LongTensorlistdictr   boolr   rC   rJ   r4   r4   r2   r5   rK   O   s    !	
7	
	
rK   ))typingr   rH   torch.nnr-   configuration_utilsr   r   loadersr   r   r   utilsr	   r
   utils.torch_utilsr   	attentionr   r   r   attention_processorr   r   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrD   loggerr   r   rK   r4   r4   r4   r5   <module>   s&   

(