o
    ۷i8                     @   s   d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ e	eZG dd dejZG dd deeZdS )    )AnyN   )ConfigMixinregister_to_config)logging   )LuminaFeedForward)	AttentionLuminaAttnProcessor2_0)&LuminaCombinedTimestepCaptionEmbeddingLuminaPatchEmbed)Transformer2DModelOutput)
ModelMixin)LuminaLayerNormContinuousLuminaRMSNormZeroRMSNormc                       s   e Zd ZdZ	ddedededededed	ed
ededdf fddZ	ddej	dej	dej	dej	dej	dej	de
eef dB dej	fddZ  ZS )LuminaNextDiTBlocka  
    A LuminaNextDiTBlock for LuminaNextDiT2DModel.

    Parameters:
        dim (`int`): Embedding dimension of the input features.
        num_attention_heads (`int`): Number of attention heads.
        num_kv_heads (`int`):
            Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
        multiple_of (`int`): The number of multiple of ffn layer.
        ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
        norm_eps (`float`): The eps for norm layer.
        qk_norm (`bool`): normalization for query and key.
        cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
        norm_elementwise_affine (`bool`, *optional*, defaults to True),
    Tdimnum_attention_headsnum_kv_headsmultiple_offfn_dim_multipliernorm_epsqk_normcross_attention_dimnorm_elementwise_affinereturnNc
           
         s   t    || | _tt|g| _t|d || |rdnd ||dddt	 d
| _
t | j
_t|||| |r:dnd ||dddt	 d
| _t|td| d ||d| _t|||	d| _t|||	d	| _t|||	d	| _t|||	d	| _t|||	d	| _d S )
Nlayer_norm_across_headsh㈵>F)
	query_dimr   dim_headr   headskv_headsepsbiasout_bias	processor   r   )r   	inner_dimr   r   )embedding_dimr   r   )r#   elementwise_affine)super__init__head_dimnn	Parametertorchzerosgater	   r
   attn1Identityto_outattn2r   intfeed_forwardr   norm1r   	ffn_norm1norm2	ffn_norm2norm1_context)
selfr   r   r   r   r   r   r   r   r   	__class__ d/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/models/transformers/lumina_nextdit2d.pyr,   5   sV   



zLuminaNextDiTBlock.__init__hidden_statesattention_maskimage_rotary_embencoder_hidden_statesencoder_masktembcross_attention_kwargsc                 C   s   |}|  ||\}	}
}}| jd|	|	|||d|}| |}| jd|	|||dd|}|| j dddd }|| }|d}| jjd |}||
	d | 
|  }| | |d|	d  }||	d | |  }|S )a  
        Perform a forward pass through the LuminaNextDiTBlock.

        Parameters:
            hidden_states (`torch.Tensor`): The input of hidden_states for LuminaNextDiTBlock.
            attention_mask (`torch.Tensor): The input of hidden_states corresponse attention mask.
            image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
            encoder_hidden_states: (`torch.Tensor`): The hidden_states of text prompt are processed by Gemma encoder.
            encoder_mask (`torch.Tensor`): The hidden_states of text prompt attention mask.
            temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
            cross_attention_kwargs (`dict[str, Any]`): kwargs for cross attention.
        )rC   rF   rD   query_rotary_embkey_rotary_embN   r   rA   )r9   r3   r=   r6   r2   tanhviewflattenr5   	unsqueezer;   r8   r:   r<   )r>   rC   rD   rE   rF   rG   rH   rI   residualnorm_hidden_statesgate_msa	scale_mlpgate_mlpself_attn_outputnorm_encoder_hidden_statescross_attn_outputmixed_attn_output
mlp_outputrA   rA   rB   forwardv   s:   


zLuminaNextDiTBlock.forward)T)N)__name__
__module____qualname____doc__r7   floatboolr,   r0   Tensordictstrr   r]   __classcell__rA   rA   r?   rB   r   $   sT    	
I	r   c                        s  e Zd ZdZg dZe											
				d'dededB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB dedB ddf fddZ			d(de
jd e
jd!e
jd"e
jd#e
jd$eeef dee
j eB fd%d&Z  ZS ))LuminaNextDiT2DModelaa  
    LuminaNextDiT: Diffusion model with a Transformer backbone.

    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.

    Parameters:
        sample_size (`int`): The width of the latent images. This is fixed during training since
            it is used to learn a number of position embeddings.
        patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
            The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
        in_channels (`int`, *optional*, defaults to 4):
            The number of input channels for the model. Typically, this matches the number of channels in the input
            images.
        hidden_size (`int`, *optional*, defaults to 4096):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        num_layers (`int`, *optional*, default to 32):
            The number of layers in the model. This defines the depth of the neural network.
        num_attention_heads (`int`, *optional*, defaults to 32):
            The number of attention heads in each attention layer. This parameter specifies how many separate attention
            mechanisms are used.
        num_kv_heads (`int`, *optional*, defaults to 8):
            The number of key-value heads in the attention mechanism, if different from the number of attention heads.
            If None, it defaults to num_attention_heads.
        multiple_of (`int`, *optional*, defaults to 256):
            A factor that the hidden size should be a multiple of. This can help optimize certain hardware
            configurations.
        ffn_dim_multiplier (`float`, *optional*):
            A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
            the model configuration.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            A small value added to the denominator for numerical stability in normalization layers.
        learn_sigma (`bool`, *optional*, defaults to True):
            Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
            predictions.
        qk_norm (`bool`, *optional*, defaults to True):
            Indicates if the queries and keys in the attention mechanism should be normalized.
        cross_attention_dim (`int`, *optional*, defaults to 2048):
            The dimensionality of the text embeddings. This parameter defines the size of the text representations used
            in the model.
        scaling_factor (`float`, *optional*, defaults to 1.0):
            A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
            overall scale of the model's operations.
    )patch_embeddernormffn_norm   r       	      N   r   T         ?sample_size
patch_sizein_channelshidden_size
num_layersr   r   r   r   r   learn_sigmar   r   scaling_factorr   c              
      s   t    || _|| _|| _|r|d n|| _| _| _ | _|| _	t
||dd| _tt| _ttd d| _t fddt|D | _ttddd	d|| | j d
| _ d dksvJ dd S )Nr   T)rt   ru   	embed_dimr$   i   )rv   r   c                    s"   g | ]}t  qS rA   )r   ).0_r   r   rv   r   r   r   r   r   rA   rB   
<listcomp>	  s    z1LuminaNextDiT2DModel.__init__.<locals>.<listcomp>Fgư>)r)   conditioning_embedding_dimr*   r#   r$   out_dimrm   r   z+2d rope needs head dim to be divisible by 4)r+   r,   rs   rt   ru   out_channelsrv   r   r-   ry   r   ri   r.   r/   r0   empty	pad_tokenr   mintime_caption_embed
ModuleListrangelayersr   norm_out)r>   rs   rt   ru   rv   rw   r   r   r   r   r   rx   r   r   ry   r?   r}   rB   r,      s<   



zLuminaNextDiT2DModel.__init__rC   timesteprF   rG   rE   rI   c              
   C   s   |  ||\}}}	}||j}| |||}
| }| jD ]}|||||||
|d}q| ||
}| j }}|	d \}}|d}|| ||  }|ddd|f 	||| || ||| j
}|dddddddddd}|sx|fS t|d	S )
a  
        Forward pass of LuminaNextDiT.

        Parameters:
            hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
            timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
            encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
            encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
        )rH   rI   r   N   rL   r   r   rm   )sample)ri   todevicer   rc   r   r   rt   sizerP   r   permuterQ   r   )r>   rC   r   rF   rG   rE   rI   return_dictmaskimg_sizerH   layerheight_tokenswidth_tokensheightwidth
batch_sizesequence_lengthoutputrA   rA   rB   r]   #  s4   



$
zLuminaNextDiT2DModel.forward)rl   r   rm   rn   ro   ro   Nrp   Nr   TTrq   rr   )NT)r^   r_   r`   ra    _skip_layerwise_casting_patternsr   r7   rb   rc   r,   r0   rd   re   rf   r   tupler   r]   rg   rA   rA   r?   rB   rh      s    -	
G
	rh   ) typingr   r0   torch.nnr.   configuration_utilsr   r   utilsr   	attentionr   attention_processorr	   r
   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerr^   loggerModuler   rh   rA   rA   rA   rB   <module>   s   
 