o
    GÆÏiÍ<  ã                	   @   sþ   d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z!m"Z" e #e$¡Z%eG dd„ dej&ƒƒZ'G dd„ deeee
e	eƒZ(dS )é    )ÚAnyNé   )ÚConfigMixinÚregister_to_config)ÚFromOriginalModelMixinÚPeftAdapterMixinÚSD3Transformer2DLoadersMixin)Úapply_lora_scaleÚlogging)Úmaybe_allow_in_graphé   )ÚAttentionMixinÚFeedForwardÚJointTransformerBlock)Ú	AttentionÚFusedJointAttnProcessor2_0ÚJointAttnProcessor2_0)Ú"CombinedTimestepTextProjEmbeddingsÚ
PatchEmbed)ÚTransformer2DModelOutput)Ú
ModelMixin)ÚAdaLayerNormContinuousÚAdaLayerNormZeroc                       s@   e Zd Zdededef‡ fdd„Zdejdejfdd	„Z‡  ZS )
ÚSD3SingleTransformerBlockÚdimÚnum_attention_headsÚattention_head_dimc              	      sT   t ƒ  ¡  t|ƒ| _t||||dtƒ dd| _tj|ddd| _	t
||dd| _d S )NTçíµ ÷Æ°>)Ú	query_dimÚdim_headÚheadsÚout_dimÚbiasÚ	processorÚepsF©Úelementwise_affiner$   zgelu-approximate)r   Údim_outÚactivation_fn)ÚsuperÚ__init__r   Únorm1r   r   ÚattnÚnnÚ	LayerNormÚnorm2r   Úff)Úselfr   r   r   ©Ú	__class__© úa/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_sd3.pyr*   (   s   

ù
z"SD3SingleTransformerBlock.__init__Úhidden_statesÚtembc           
      C   s†   | j ||d\}}}}}| j|d d}| d¡| }|| }|  |¡}|d| d¡  | d¡ }|  |¡}	| d¡|	 }	||	 }|S )N)Úemb)r6   Úencoder_hidden_statesé   )r+   r,   Ú	unsqueezer/   r0   )
r1   r6   r7   Únorm_hidden_statesÚgate_msaÚ	shift_mlpÚ	scale_mlpÚgate_mlpÚattn_outputÚ	ff_outputr4   r4   r5   Úforward>   s   

z!SD3SingleTransformerBlock.forward)	Ú__name__Ú
__module__Ú__qualname__Úintr*   ÚtorchÚTensorrC   Ú__classcell__r4   r4   r2   r5   r   &   s    þýür   c                       s0  e Zd ZdZdZdgZddgZe						
									d7dededededededededededede	edf de
dB f‡ fdd „ƒZd8d"edB d#ed$dfd%d&„Zd'd(„ Zd)d*„ Zd+d,„ Zed-ƒ							d9d.ejd/ejd0ejd1ejd2ed-ee
ef dB d3ed4ee dB d$ejeB fd5d6„ƒZ‡  ZS ):ÚSD3Transformer2DModelað  
    The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

    Parameters:
        sample_size (`int`, defaults to `128`):
            The width/height of the latents. This is fixed during training since it is used to learn a number of
            position embeddings.
        patch_size (`int`, defaults to `2`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `16`):
            The number of latent channels in the input.
        num_layers (`int`, defaults to `18`):
            The number of layers of transformer blocks to use.
        attention_head_dim (`int`, defaults to `64`):
            The number of channels in each head.
        num_attention_heads (`int`, defaults to `18`):
            The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, defaults to `4096`):
            The embedding dimension to use for joint text-image attention.
        caption_projection_dim (`int`, defaults to `1152`):
            The embedding dimension of caption embeddings.
        pooled_projection_dim (`int`, defaults to `2048`):
            The embedding dimension of pooled text projections.
        out_channels (`int`, defaults to `16`):
            The number of latent channels in the output.
        pos_embed_max_size (`int`, defaults to `96`):
            The maximum latent height/width of positional embeddings.
        dual_attention_layers (`tuple[int, ...]`, defaults to `()`):
            The number of dual-stream transformer blocks to use.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
    Tr   Ú	pos_embedÚnormé€   r   é   é   é@   é   é€  é   é`   r4   NÚsample_sizeÚ
patch_sizeÚin_channelsÚ
num_layersr   r   Újoint_attention_dimÚcaption_projection_dimÚpooled_projection_dimÚout_channelsÚpos_embed_max_sizeÚdual_attention_layers.Úqk_normc                    sÂ   t ƒ  ¡  |
d ur|
n|ˆ_ˆˆ  ˆ_t||||ˆj|dˆ_tˆj|	dˆ_t 	||¡ˆ_
t ‡ ‡‡‡‡‡fdd„tˆƒD ƒ¡ˆ_tˆjˆjdddˆ_tj	ˆj|| ˆj dd	ˆ_dˆ_d S )
N)ÚheightÚwidthrW   rX   Ú	embed_dimr^   )Úembedding_dimr\   c              
      s6   g | ]}t ˆjˆˆ |ˆd  kˆ|ˆv rdndd‘qS )r:   TF)r   r   r   Úcontext_pre_onlyr`   Úuse_dual_attention)r   Ú	inner_dim)Ú.0Úi©r   r_   r   rY   r`   r1   r4   r5   Ú
<listcomp>œ   s    	ø
úÿz2SD3Transformer2DModel.__init__.<locals>.<listcomp>Fr   r%   T)r"   )r)   r*   r]   rg   r   rL   r   Útime_text_embedr-   ÚLinearÚcontext_embedderÚ
ModuleListÚrangeÚtransformer_blocksr   Únorm_outÚproj_outÚgradient_checkpointing)r1   rV   rW   rX   rY   r   r   rZ   r[   r\   r]   r^   r_   r`   r2   rj   r5   r*   w   s.   

úÿ	÷ÿ
zSD3Transformer2DModel.__init__r   Ú
chunk_sizer   Úreturnc                    sZ   |dvrt d|› ƒ‚|pd}dtjjdtdtf‡ fdd„‰ |  ¡ D ]}ˆ |||ƒ q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r:   z-Make sure to set `dim` to either 0 or 1, not r:   Úmoduleru   r   c                    ó6   t | dƒr| j||d |  ¡ D ]}ˆ |||ƒ qd S ©NÚset_chunk_feed_forward)ru   r   ©Úhasattrrz   Úchildren©rw   ru   r   Úchild©Úfn_recursive_feed_forwardr4   r5   r   Â   ó
   
ÿzPSD3Transformer2DModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)Ú
ValueErrorrH   r-   ÚModulerG   r}   )r1   ru   r   rw   r4   r€   r5   Úenable_forward_chunking¯   s   ÿz-SD3Transformer2DModel.enable_forward_chunkingc                    s<   dt jjdtdtf‡ fdd„‰ |  ¡ D ]}ˆ |d dƒ qd S )Nrw   ru   r   c                    rx   ry   r{   r~   r€   r4   r5   r   Î   r‚   zQSD3Transformer2DModel.disable_forward_chunking.<locals>.fn_recursive_feed_forwardr   )rH   r-   r„   rG   r}   )r1   rw   r4   r€   r5   Údisable_forward_chunkingÍ   s   ÿz.SD3Transformer2DModel.disable_forward_chunkingc                 C   sn   d| _ | j ¡ D ]\}}dt|jjƒv rtdƒ‚q| j| _ |  ¡ D ]}t|t	ƒr.|j
dd q!|  tƒ ¡ dS )u  
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        > [!WARNING] > This API is ðŸ§ª experimental.
        NÚAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)Úfuse)Úoriginal_attn_processorsÚattn_processorsÚitemsÚstrr3   rD   rƒ   ÚmodulesÚ
isinstancer   Úfuse_projectionsÚset_attn_processorr   )r1   Ú_Úattn_processorrw   r4   r4   r5   Úfuse_qkv_projectionsÙ   s   ÿ
€z*SD3Transformer2DModel.fuse_qkv_projectionsc                 C   s   | j dur|  | j ¡ dS dS )un   Disables the fused QKV projection if enabled.

        > [!WARNING] > This API is ðŸ§ª experimental.

        N)r‰   r   )r1   r4   r4   r5   Úunfuse_qkv_projectionsï   s   
ÿz,SD3Transformer2DModel.unfuse_qkv_projectionsÚjoint_attention_kwargsr6   r9   Úpooled_projectionsÚtimestepÚblock_controlnet_hidden_statesÚreturn_dictÚskip_layersc	                 C   s¢  |j dd… \}	}
|  |¡}|  ||¡}|  |¡}|dur5d|v r5| d¡}|  ||¡\}}|j||d t| jƒD ]M\}}|durH||v rHdnd}t	 
¡ r_| jr_|s_|  |||||¡\}}n|sk|||||d\}}|dur‡|jdu r‡t| jƒt|ƒ }||t|| ƒ  }q:|  ||¡}|  |¡}| jj}|	| }	|
| }
|j|j d |	|
||| jfd	}t	 d
|¡}|j|j d | j|	| |
| fd	}|sÌ|fS t|dS )aÓ  
        The [`SD3Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
                Embeddings projected from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.
            skip_layers (`list` of `int`, *optional*):
                A list of layer indices to skip during the forward pass.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        éþÿÿÿNÚip_adapter_image_embeds)Úip_hidden_statesr7   TF)r6   r9   r7   r•   r   )Úshapeznhwpqc->nchpwq)Úsample)rž   rL   rl   rn   ÚpopÚ
image_projÚupdateÚ	enumeraterq   rH   Úis_grad_enabledrt   Ú_gradient_checkpointing_funcre   ÚlenrG   rr   rs   ÚconfigrW   Úreshaper]   Úeinsumr   )r1   r6   r9   r–   r—   r˜   r•   r™   rš   ra   rb   r7   rœ   r   Úip_tembÚindex_blockÚblockÚis_skipÚinterval_controlrW   Úoutputr4   r4   r5   rC   ø   sX   )



û
ü€
ÿÿ
zSD3Transformer2DModel.forward)rN   r   rO   rP   rQ   rP   rR   rS   rT   rO   rU   r4   N)Nr   )NNNNNTN)rD   rE   rF   Ú__doc__Ú _supports_gradient_checkpointingÚ_no_split_modulesÚ _skip_layerwise_casting_patternsr   rG   ÚtuplerŒ   r*   r…   r†   r“   r”   r	   rH   rI   Ú
LongTensorÚlistÚdictr   Úboolr   rC   rJ   r4   r4   r2   r5   rK   O   sž    !ðþýüûúùø	÷
öõôÿóð7	÷þýüûúùø
	÷
örK   ))Útypingr   rH   Útorch.nnr-   Úconfiguration_utilsr   r   Úloadersr   r   r   Úutilsr	   r
   Úutils.torch_utilsr   Ú	attentionr   r   r   Úattention_processorr   r   r   Ú
embeddingsr   r   Úmodeling_outputsr   Úmodeling_utilsr   Únormalizationr   r   Ú
get_loggerrD   Úloggerr„   r   rK   r4   r4   r4   r5   Ú<module>   s&   

(ÿ