o
    GiT                     @   s:  d dl mZ d dlZd dlm  mZ d dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZm Z  e!e"Z#G dd dej$Z%G dd dej$Z&G dd dej$Z'G dd dZ(G dd dej$Z)G dd deeeeeZ*dS )    )AnyN)nn   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixin)apply_lora_scalelogging   )AttentionMixin)	AttentionSanaLinearAttnProcessor2_0)
PatchEmbedPixArtAlphaTextProjectionTimestepEmbedding	Timesteps)Transformer2DModelOutput)
ModelMixin)AdaLayerNormSingleRMSNormc                       sX   e Zd Z			ddededededB ded	df fd
dZdej	d	ej	fddZ
  ZS )	GLUMBConv   NTin_channelsout_channelsexpand_ratio	norm_typeresidual_connectionreturnc                    s   t    t|| }|| _|| _t | _t||d ddd| _	tj|d |d ddd|d d| _
tj||ddddd| _d | _|dkrRt|d	d
d
d| _d S d S )Nr      r   r   )groupsFbiasrms_normh㈵>T)epselementwise_affiner"   )super__init__intr   r   r   SiLUnonlinearityConv2dconv_inverted
conv_depth
conv_pointnormr   )selfr   r   r   r   r   hidden_channels	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/sana_transformer.pyr(   '   s   

$zGLUMBConv.__init__hidden_statesc                 C   s   | j r|}| |}| |}| |}tj|ddd\}}|| | }| |}| jdkr<| |	dd	dd}| j rC|| }|S )Nr   r   dimr#   )
r   r-   r+   r.   torchchunkr/   r   r0   movedim)r1   r7   residualgater5   r5   r6   forward>   s   




zGLUMBConv.forward)r   NT)__name__
__module____qualname__r)   floatstrboolr(   r;   Tensorr@   __classcell__r5   r5   r3   r6   r   &   s$    r   c                       sN   e Zd Zddededef fddZdejd	ejd
ejdejfddZ	  Z
S )SanaModulatedNormFư>r9   r&   r%   c                    s    t    tj|||d| _d S )Nr&   r%   )r'   r(   r   	LayerNormr0   )r1   r9   r&   r%   r3   r5   r6   r(   V   s   
zSanaModulatedNorm.__init__r7   tembscale_shift_tabler   c                 C   sL   |  |}|d  |d d d f |j jddd\}}|d|  | }|S )Nr   r   r8   )r0   todevicer<   )r1   r7   rM   rN   shiftscaler5   r5   r6   r@   Z   s   
.zSanaModulatedNorm.forward)FrJ   )rA   rB   rC   r)   rF   rD   r(   r;   rG   r@   rH   r5   r5   r3   r6   rI   U   s    rI   c                       s:   e Zd Z fddZd	dejdejdejfddZ  ZS )
&SanaCombinedTimestepGuidanceEmbeddingsc                    sj   t    tdddd| _td|d| _tdddd| _td|d| _t	 | _
tj|d| dd| _d S )N   Tr   )num_channelsflip_sin_to_cosdownscale_freq_shift)r   time_embed_dim   r!   )r'   r(   r   	time_projr   timestep_embedderguidance_condition_projguidance_embedderr   r*   siluLinearlinear)r1   embedding_dimr3   r5   r6   r(   d   s   

z/SanaCombinedTimestepGuidanceEmbeddings.__init__Ntimestepguidancehidden_dtypec           	      C   sT   |  |}| |j|d}| |}| |j|d}|| }| | ||fS )N)dtype)rZ   r[   rO   r\   r]   r`   r^   )	r1   rb   rc   rd   timesteps_projtimesteps_embguidance_projguidance_embconditioningr5   r5   r6   r@   o   s   

z.SanaCombinedTimestepGuidanceEmbeddings.forwardNN)	rA   rB   rC   r(   r;   rG   re   r@   rH   r5   r5   r3   r6   rS   c   s    &rS   c                   @   sL   e Zd ZdZdd Z		ddedejdejdB dejdB d	ejf
d
dZdS )SanaAttnProcessor2_0zs
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
    c                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzTSanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportError)r1   r5   r5   r6   r(      s   
zSanaAttnProcessor2_0.__init__Nattnr7   encoder_hidden_statesattention_maskr   c                 C   sd  |d u r|j n|j \}}}|d ur$||||}|||jd|j d }||}|d u r/|}||}	||}
|jd urC||}|jd urM||	}	|	j d }||j }||d|j|	dd}|	|d|j|	dd}	|
|d|j|	dd}
t
j||	|
|ddd}|	dd|d|j| }||j}|jd |}|jd |}||j }|S )Nr:   r   r           F)	attn_mask	dropout_p	is_causalr   )shapeprepare_attention_maskviewheadsto_qto_kto_vnorm_qnorm_k	transposero   rm   reshaperO   re   to_outrescale_output_factor)r1   rq   r7   rr   rs   
batch_sizesequence_length_querykeyvalue	inner_dimhead_dimr5   r5   r6   __call__   s8   









zSanaAttnProcessor2_0.__call__rk   )	rA   rB   rC   __doc__r(   r   r;   rG   r   r5   r5   r5   r6   rl   z   s     rl   c                       s   e Zd ZdZ											
			d&dedededededB dedB dedB dededededededB ddf fddZ						d'de	j
de	j
dB de	j
dB d e	j
dB d!e	jdB d"ed#ede	j
fd$d%Z  ZS )(SanaTransformerBlockz[
    Transformer block introduced in [Sana](https://huggingface.co/papers/2410.10629).
      F       rt      p   TFrJ         @Nr9   num_attention_headsattention_head_dimdropoutnum_cross_attention_headscross_attention_head_dimcross_attention_dimattention_biasnorm_elementwise_affinenorm_epsattention_out_bias	mlp_ratioqk_normr   c                    s   t    tj|d|
d| _t||||d ur|nd |||d t d	| _|d urFtj||	|
d| _t|||d ur9|nd ||||d|t	 d
| _
t|||d dd| _ttd||d  | _d S )	NFrK   )		query_dimr{   dim_headkv_headsr   r   r"   r   	processorT)
r   r   r   r   r{   r   r   r"   out_biasr   )r   r   rY         ?)r'   r(   r   rL   norm1r   r   attn1norm2rl   attn2r   ff	Parameterr;   randnrN   )r1   r9   r   r   r   r   r   r   r   r   r   r   r   r   r3   r5   r6   r(      s:   
 zSanaTransformerBlock.__init__r7   rs   rr   encoder_attention_maskrb   heightwidthc                 C   s   |j d }| jd  ||dd jddd\}	}
}}}}| |}|d|
  |	 }||j}| |}|||  }| jd urK| j|||d}|| }| 	|}|d|  | }|
d||fdddd}| |}|ddddd}|||  }|S )	Nr   rY   r:   r   r8   )rr   rs   r   r   )rx   rN   r   r<   r   rO   re   r   r   r   	unflattenpermuter   flatten)r1   r7   rs   rr   r   rb   r   r   r   	shift_msa	scale_msagate_msa	shift_mlp	scale_mlpgate_mlpnorm_hidden_statesattn_output	ff_outputr5   r5   r6   r@      s.   






zSanaTransformerBlock.forward)r   r   r   rt   r   r   r   TFrJ   Tr   N)NNNNNN)rA   rB   rC   r   r)   rD   rF   rE   r(   r;   rG   
LongTensorr@   rH   r5   r5   r3   r6   r      s    	
8	r   c                .       sZ  e Zd ZdZdZg dZddgZe									
													d8dededB dededededB dedB dedB dede	de	de
d ed!ed"e
d#e	d$edB d%e
d&e	d'edB d(e	d)df, fd*d+Zed,						d9d-ejd.ejd/ejd0ejdB d1ejdB d2ejdB d,eeef dB d3eej dB d4e
d)eejd5f eB fd6d7Z  ZS ):SanaTransformer2DModelax  
    A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models.

    Args:
        in_channels (`int`, defaults to `32`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `32`):
            The number of channels in the output.
        num_attention_heads (`int`, defaults to `70`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `32`):
            The number of channels in each head.
        num_layers (`int`, defaults to `20`):
            The number of layers of Transformer blocks to use.
        num_cross_attention_heads (`int`, *optional*, defaults to `20`):
            The number of heads to use for cross-attention.
        cross_attention_head_dim (`int`, *optional*, defaults to `112`):
            The number of channels in each head for cross-attention.
        cross_attention_dim (`int`, *optional*, defaults to `2240`):
            The number of channels in the cross-attention output.
        caption_channels (`int`, defaults to `2304`):
            The number of channels in the caption embeddings.
        mlp_ratio (`float`, defaults to `2.5`):
            The expansion ratio to use in the GLUMBConv layer.
        dropout (`float`, defaults to `0.0`):
            The dropout probability.
        attention_bias (`bool`, defaults to `False`):
            Whether to use bias in the attention layer.
        sample_size (`int`, defaults to `32`):
            The base size of the input latent.
        patch_size (`int`, defaults to `1`):
            The size of the patches to use in the patch embedding layer.
        norm_elementwise_affine (`bool`, defaults to `False`):
            Whether to use elementwise affinity in the normalization layer.
        norm_eps (`float`, defaults to `1e-6`):
            The epsilon value for the normalization layer.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for the query and key.
        timestep_scale (`float`, defaults to `1.0`):
            The scale to use for the timesteps.
    T)r   r   rI   patch_embedr0   r   r   r   r   r    	  r   rt   Fr   rJ   N皙?      ?r   r   r   r   
num_layersr   r   r   caption_channelsr   r   r   sample_size
patch_sizer   r   interpolation_scaleguidance_embedsguidance_embeds_scaler   timestep_scaler   c                    s   t    |p|}	 t||||||d urdnd d| _|r't| _nt| _t|	d| _t	ddd| _
t 	
fddt|D | _ttd	d
  | _tddd| _t|| | | _d| _d S )Nsincos)r   r   r   r   	embed_dimr   pos_embed_type)in_featureshidden_sizer$   T)r%   r&   c                    s,   g | ]}t 	
 d qS ))	r   r   r   r   r   r   r   r   r   )r   ).0r   r   r   r   r   r   r   r   r   r   r   r   r   r5   r6   
<listcomp>  s"    z3SanaTransformer2DModel.__init__.<locals>.<listcomp>r   r   FrJ   rK   )r'   r(   r   r   rS   
time_embedr   r   caption_projectionr   caption_normr   
ModuleListrangetransformer_blocksr   r;   r   rN   rI   norm_outr_   proj_outgradient_checkpointing)r1   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r3   r   r6   r(   S  s4   

 
zSanaTransformer2DModel.__init__attention_kwargsr7   rr   rb   rc   r   rs   controlnet_block_samplesreturn_dict.c
                 C   s4  |d ur|j dkrd||j d }|d}|d ur0|j dkr0d||j d }|d}|j\}
}}}| jj}|| || }}| |}|d urY| j|||jd\}}n| j||
|jd\}}| 	|}|
|
d|jd }| |}t r| jrt| jD ]*\}}| ||||||||}|d urd|  k rt|krn q|||d   }qn.t| jD ](\}}||||||||}|d urd|  k rt|krn q|||d   }q| ||| j}| |}||
||| jj| jjd}|dddd	dd
}||
d|| || }|	s|fS t|dS )Nr   r   g     )rc   rd   )r   rd   r:   r      r   r   )sample)ndimrO   re   	unsqueezerx   configr   r   r   r   rz   r   r;   is_grad_enabledr   	enumerater   _gradient_checkpointing_funclenr   rN   r   r   r   r   )r1   r7   rr   rb   rc   r   rs   r   r   r   r   rU   r   r   ppost_patch_heightpost_patch_widthembedded_timestepindex_blockblockoutputr5   r5   r6   r@     sv   





$
$	

zSanaTransformer2DModel.forward)r   r   r   r   r   r   r   r   r   r   rt   Fr   r   FrJ   NFr   Nr   )NNNNNT)rA   rB   rC   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   r)   rD   rF   rE   r(   r	   r;   rG   dictr   tupler   r@   rH   r5   r5   r3   r6   r   $  s    *	
M	
r   )+typingr   r;   torch.nn.functionalr   
functionalro   configuration_utilsr   r   loadersr   r   utilsr	   r
   	attentionr   attention_processorr   r   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrA   loggerModuler   rI   rS   rl   r   r   r5   r5   r5   r6   <module>   s(   
/Bh