from typing import Any, Dict, Optional

import torch
import torch.nn as nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...utils import logging
from ..attention import LuminaFeedForward
from ..attention_processor import Attention, LuminaAttnProcessor2_0
from ..embeddings import LuminaCombinedTimestepCaptionEmbedding, LuminaPatchEmbed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import LuminaLayerNormContinuous, LuminaRMSNormZero, RMSNorm


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


class LuminaNextDiTBlock(nn.Module):
    """
    A LuminaNextDiTBlock for LuminaNextDiT2DModel.

    Parameters:
        dim (`int`): Embedding dimension of the input features.
        num_attention_heads (`int`): Number of attention heads.
        num_kv_heads (`int`):
            Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
        multiple_of (`int`): The value the feed-forward hidden dimension is rounded up to a multiple of.
        ffn_dim_multiplier (`float`): The multiplier factor for the feed-forward layer dimension.
        norm_eps (`float`): The eps for norm layer.
        qk_norm (`bool`): Whether to normalize the query and key projections in the attention layers.
        cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
        norm_elementwise_affine (`bool`, *optional*, defaults to True):
            Whether the normalization layers use learnable elementwise affine parameters.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        num_kv_heads: int,
        multiple_of: int,
        ffn_dim_multiplier: float,
        norm_eps: float,
        qk_norm: bool,
        cross_attention_dim: int,
        norm_elementwise_affine: bool = True,
    ) -> None:
        super().__init__()
        self.head_dim = dim // num_attention_heads

        # Learnable per-head gate applied to the cross-attention output.
        self.gate = nn.Parameter(torch.zeros([num_attention_heads]))

        # Self-attention over the image tokens.
        self.attn1 = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=dim // num_attention_heads,
            qk_norm="layer_norm_across_heads" if qk_norm else None,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=LuminaAttnProcessor2_0(),
        )
        self.attn1.to_out = nn.Identity()

        # Cross-attention against the text (caption) hidden states.
        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            dim_head=dim // num_attention_heads,
            qk_norm="layer_norm_across_heads" if qk_norm else None,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=LuminaAttnProcessor2_0(),
        )

        self.feed_forward = LuminaFeedForward(
            dim=dim,
            inner_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
        )

        self.norm1 = LuminaRMSNormZero(
            embedding_dim=dim,
            norm_eps=norm_eps,
            norm_elementwise_affine=norm_elementwise_affine,
        )
        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)

        self.norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)

        self.norm1_context = RMSNorm(cross_attention_dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_mask: torch.Tensor,
        temb: torch.Tensor,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Perform a forward pass through the LuminaNextDiTBlock.

        Parameters:
            hidden_states (`torch.Tensor`): The input hidden_states for the LuminaNextDiTBlock.
            attention_mask (`torch.Tensor`): The attention mask corresponding to hidden_states.
            image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
            encoder_hidden_states (`torch.Tensor`): The text prompt hidden_states produced by the Gemma encoder.
            encoder_mask (`torch.Tensor`): The attention mask for the text prompt hidden_states.
            temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
            cross_attention_kwargs (`Dict[str, Any]`): kwargs for cross attention.
        """
        residual = hidden_states

        # Self-attention over the image tokens.
        norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
        self_attn_output = self.attn1(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_hidden_states,
            attention_mask=attention_mask,
            query_rotary_emb=image_rotary_emb,
            key_rotary_emb=image_rotary_emb,
            **cross_attention_kwargs,
        )

        # Cross-attention against the caption features, scaled by a learnable tanh gate.
        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
        cross_attn_output = self.attn2(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            attention_mask=encoder_mask,
            query_rotary_emb=image_rotary_emb,
            key_rotary_emb=None,
            **cross_attention_kwargs,
        )
        cross_attn_output = cross_attn_output * self.gate.tanh().view(1, 1, -1, 1)
        mixed_attn_output = self_attn_output + cross_attn_output
        mixed_attn_output = mixed_attn_output.flatten(-2)
        # Linear projection of the mixed attention output (reuses attn2's output projection).
        hidden_states = self.attn2.to_out[0](mixed_attn_output)

        hidden_states = residual + gate_msa.unsqueeze(1).tanh() * self.norm2(hidden_states)

        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))

        hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)

        return hidden_states


class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
    """
    LuminaNextDiT: Diffusion model with a Transformer backbone.

    Inherits from ModelMixin and ConfigMixin to stay compatible with diffusers samplers and pipelines such as
    StableDiffusionPipeline.

    Parameters:
        sample_size (`int`): The width of the latent images. This is fixed during training since
            it is used to learn a number of position embeddings.
        patch_size (`int`, *optional*, defaults to 2):
            The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
        in_channels (`int`, *optional*, defaults to 4):
            The number of input channels for the model. Typically, this matches the number of channels in the input
            images.
        hidden_size (`int`, *optional*, defaults to 2304):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        num_layers (`int`, *optional*, defaults to 32):
            The number of layers in the model. This defines the depth of the neural network.
        num_attention_heads (`int`, *optional*, defaults to 32):
            The number of attention heads in each attention layer. This parameter specifies how many separate attention
            mechanisms are used.
        num_kv_heads (`int`, *optional*, defaults to 8):
            The number of key-value heads in the attention mechanism, if different from the number of attention heads.
            If None, it defaults to num_attention_heads.
        multiple_of (`int`, *optional*, defaults to 256):
            A factor that the hidden size should be a multiple of. This can help optimize certain hardware
            configurations.
        ffn_dim_multiplier (`float`, *optional*):
            A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
            the model configuration.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            A small value added to the denominator for numerical stability in normalization layers.
        learn_sigma (`bool`, *optional*, defaults to True):
            Whether the model also predicts a sigma (variance) output alongside the denoised sample, which doubles the
            number of output channels.
        qk_norm (`bool`, *optional*, defaults to True):
            Indicates if the queries and keys in the attention mechanism should be normalized.
        cross_attention_dim (`int`, *optional*, defaults to 2048):
            The dimensionality of the text embeddings. This parameter defines the size of the text representations used
            in the model.
        scaling_factor (`float`, *optional*, defaults to 1.0):
            A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
            overall scale of the model's operations.
    """

    @register_to_config
    def __init__(
        self,
        sample_size: int = 128,
        patch_size: Optional[int] = 2,
        in_channels: Optional[int] = 4,
        hidden_size: Optional[int] = 2304,
        num_layers: Optional[int] = 32,
        num_attention_heads: Optional[int] = 32,
        num_kv_heads: Optional[int] = None,
        multiple_of: Optional[int] = 256,
        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: Optional[float] = 1e-5,
        learn_sigma: Optional[bool] = True,
        qk_norm: Optional[bool] = True,
        cross_attention_dim: Optional[int] = 2048,
        scaling_factor: Optional[float] = 1.0,
    ) -> None:
        super().__init__()
        self.sample_size = sample_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.head_dim = hidden_size // num_attention_heads
        self.scaling_factor = scaling_factor

        self.patch_embedder = LuminaPatchEmbed(
            patch_size=patch_size, in_channels=in_channels, embed_dim=hidden_size, bias=True
        )

        self.pad_token = nn.Parameter(torch.empty(hidden_size))

        self.time_caption_embed = LuminaCombinedTimestepCaptionEmbedding(
            hidden_size=min(hidden_size, 1024), cross_attention_dim=cross_attention_dim
        )

        self.layers = nn.ModuleList(
            [
                LuminaNextDiTBlock(
                    hidden_size,
                    num_attention_heads,
                    num_kv_heads,
                    multiple_of,
                    ffn_dim_multiplier,
                    norm_eps,
                    qk_norm,
                    cross_attention_dim,
                )
                for _ in range(num_layers)
            ]
        )
        self.norm_out = LuminaLayerNormContinuous(
            embedding_dim=hidden_size,
            conditioning_embedding_dim=min(hidden_size, 1024),
            elementwise_affine=False,
            eps=1e-6,
            bias=True,
            out_dim=patch_size * patch_size * self.out_channels,
        )

        assert (hidden_size // num_attention_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        cross_attention_kwargs: Dict[str, Any] = None,
        return_dict=True,
    ) -> torch.Tensor:
        """
        Forward pass of LuminaNextDiT.

        Parameters:
            hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
            timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
            encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
            encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
        """
        hidden_states, mask, img_size, image_rotary_emb = self.patch_embedder(hidden_states, image_rotary_emb)
        image_rotary_emb = image_rotary_emb.to(hidden_states.device)

        temb = self.time_caption_embed(timestep, encoder_hidden_states, encoder_mask)

        encoder_mask = encoder_mask.bool()
        for layer in self.layers:
            hidden_states = layer(
                hidden_states,
                mask,
                image_rotary_emb,
                encoder_hidden_states,
                encoder_mask,
                temb=temb,
                cross_attention_kwargs=cross_attention_kwargs,
            )

        hidden_states = self.norm_out(hidden_states, temb)

        # Unpatchify: fold the token sequence back into a (N, C_out, H, W) image tensor.
        height_tokens = width_tokens = self.patch_size
        height, width = img_size[0]
        batch_size = hidden_states.size(0)
        sequence_length = (height // height_tokens) * (width // width_tokens)
        hidden_states = hidden_states[:, :sequence_length].view(
            batch_size, height // height_tokens, width // width_tokens, height_tokens, width_tokens, self.out_channels
        )
        output = hidden_states.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
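

# Minimal usage sketch (not part of the library source): it only shows how a small
# LuminaNextDiT2DModel could be instantiated and inspected. The configuration values
# below are illustrative assumptions, not required defaults, and a real denoising step
# also needs the precomputed 2D rotary frequencies (`image_rotary_emb`) supplied by the
# Lumina pipeline, which this sketch does not construct. Run it with
# `python -m diffusers.models.transformers.lumina_nextdit2d` so the relative imports resolve.
if __name__ == "__main__":
    model = LuminaNextDiT2DModel(
        sample_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=64,  # head_dim = 64 // 4 = 16, divisible by 4 as the 2D RoPE assert requires
        num_layers=2,
        num_attention_heads=4,
        num_kv_heads=None,
        multiple_of=32,
        cross_attention_dim=32,
        learn_sigma=True,
    )
    num_params = sum(p.numel() for p in model.parameters())
    # learn_sigma=True doubles the output channels (4 -> 8).
    print(f"parameters: {num_params}, out_channels: {model.out_channels}")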