o
    GiS                     @   s6  d dl Z d dlmZ d dlZd dlmZ d dlm  mZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z# e$e%Z&G dd dej'Z(G dd dZ)G dd dej'Z*G dd dej'Z+G dd dee
eeZ,dS )    N)Any   )ConfigMixinregister_to_config)PeftAdapterMixin)FromOriginalModelMixin)apply_lora_scalelogging   )LuminaFeedForward)	Attention)TimestepEmbedding	Timestepsapply_rotary_embget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)LuminaLayerNormContinuousLuminaRMSNormZeroRMSNormc                       sh   e Zd Z				ddedededed	d
f
 fddZdejdejdejd	eejejf fddZ	  Z
S )'Lumina2CombinedTimestepCaptionEmbedding         h㈵>hidden_sizecap_feat_dimfrequency_embedding_sizenorm_epsreturnNc                    sT   t    t|ddd| _t|t|dd| _tt	||dtj
||dd| _d S )NTg        )num_channelsflip_sin_to_cosdownscale_freq_shift   )in_channelstime_embed_dimeps)bias)super__init__r   	time_projr   mintimestep_embeddernn
Sequentialr   Linearcaption_embedder)selfr   r   r   r   	__class__ e/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_lumina2.pyr*   &   s   


z0Lumina2CombinedTimestepCaptionEmbedding.__init__hidden_statestimestepencoder_hidden_statesc                 C   s,   |  ||}| |}| |}||fS N)r+   type_asr-   r1   )r2   r7   r8   r9   timestep_proj
time_embedcaption_embedr5   r5   r6   forward;   s   

z/Lumina2CombinedTimestepCaptionEmbedding.forward)r   r   r   r   )__name__
__module____qualname__intfloatr*   torchTensortupler?   __classcell__r5   r5   r3   r6   r   %   s2    r   c                   @   s\   e Zd ZdZdd Z			ddedejdejdejdB d	ejdB d
edB dejfddZ	dS )Lumina2AttnProcessor2_0z
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
    used in the Lumina2Transformer2DModel model. It applies normalization and RoPE on query and key vectors.
    c                 C   s   t tds	tdd S )Nscaled_dot_product_attentionzPAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportError)r2   r5   r5   r6   r*   J   s   
z Lumina2AttnProcessor2_0.__init__Nattnr7   r9   attention_maskimage_rotary_embbase_sequence_lengthr   c                 C   s  |j \}}}	||}
||}||}|
j d }|j d }||j }|
j}|| }|
|d|j|}
||d||}||d||}|jd urN||
}
|jd urX||}|d urjt	|
|dd}
t	||dd}|

||
|}
}|d urtt|||j }n|j}|j| }|dkr|dddd|ddd}|dddd|ddd}|d ur| |ddd}|
dd}
|dd}|dd}tj|
||||d}|dd|d|j| }||
}|jd |}|jd |}|S )	NF)use_real   r   r
   )	attn_maskscaler   )shapeto_qto_kto_vheadsdtypeviewnorm_qnorm_kr   tomathsqrtlogrV   	unsqueezerepeatflattenbool	transposerL   rJ   reshaper;   to_out)r2   rN   r7   r9   rO   rP   rQ   
batch_sizesequence_length_querykeyvalue	query_dim	inner_dimhead_dimr\   kv_headssoftmax_scalen_repr5   r5   r6   __call__N   sP   	










  

z Lumina2AttnProcessor2_0.__call__)NNN)
r@   rA   rB   __doc__r*   r   rE   rF   rC   rw   r5   r5   r5   r6   rI   D   s*    	rI   c                       sr   e Zd Z	ddededededededed	d
f fddZ	
ddejdejdejdejd
B d	ejf
ddZ	  Z
S )Lumina2TransformerBlockTdimnum_attention_headsnum_kv_headsmultiple_offfn_dim_multiplierr   
modulationr   Nc                    s   t    || | _|| _t|d || d||dddt d
| _t|d| ||d| _|r5t	||dd| _
nt||d	| _
t||d	| _t||d	| _t||d	| _d S )
Nrms_normr   F)
rq   cross_attention_dimdim_headqk_normr[   rt   r'   r(   out_bias	processor   )rz   rr   r}   r~   T)embedding_dimr   norm_elementwise_affiner&   )r)   r*   rs   r   r   rI   rN   r   feed_forwardr   norm1r   	ffn_norm1norm2	ffn_norm2)r2   rz   r{   r|   r}   r~   r   r   r3   r5   r6   r*      s>   



z Lumina2TransformerBlock.__init__r7   rO   rP   tembc                 C   s   | j rC| ||\}}}}| j||||d}	||d | |	  }| | |d|d  }
||d | |
  }|S | |}| j||||d}	|| |	 }| | |}
|| |
 }|S )N)r7   r9   rO   rP   rT   )	r   r   rN   rd   tanhr   r   r   r   )r2   r7   rO   rP   r   norm_hidden_statesgate_msa	scale_mlpgate_mlpattn_output
mlp_outputr5   r5   r6   r?      s.   
zLumina2TransformerBlock.forward)Tr:   )r@   rA   rB   rC   rD   rg   r*   rE   rF   r?   rH   r5   r5   r3   r6   ry      s>    		4ry   c                	       s   e Zd Zddedee dee def fddZdee dee ded	eej fd
dZdejd	ejfddZ	dejdejfddZ
  ZS )Lumina2RotaryPosEmbedi,     r   r
   thetaaxes_dim	axes_lens
patch_sizec                    s6   t    || _|| _|| _|| _| |||| _d S r:   )r)   r*   r   r   r   r   _precompute_freqs_cis	freqs_cis)r2   r   r   r   r   r3   r5   r6   r*      s   
zLumina2RotaryPosEmbed.__init__r   c           
      C   sX   g }t jj rt jnt j}tt||D ]\}\}}t||| j	|d}	|
|	 q|S )N)r   freqs_dtype)rE   backendsmpsis_availablefloat32float64	enumeratezipr   r   append)
r2   r   r   r   r   r   ideembr5   r5   r6   r      s   z+Lumina2RotaryPosEmbed._precompute_freqs_cisidsc              	   C   s   |j }|j jdkr|d}g }tt| jD ]<}| j| |j }|d d d d ||d f dd|jd t	j
}|t	j|d|jd ddd|d qt	j|dd|S )Nr   cpurT   rR   r   )rz   indexrz   )devicetyper`   rangelenr   r   re   rW   rE   int64r   gatherrd   cat)r2   r   r   resultr   freqsr   r5   r5   r6   _get_freqs_cis   s   
6.z$Lumina2RotaryPosEmbed._get_freqs_cisr7   rO   c                    s  |j \}}}}| j}|| || }}	||	  |j}
|j d }|jdd } fdd|D }t|}tj||dtj|
d}t	t
||D ]W\}\}}tj|tj|
d||d |df< |||||df< tj|tj|
dddd|	 }tj|	tj|
ddd|d }|||||df< |||||d	f< qF| |}tj|||j d |
|jd
}tj| |j d |
|jd
}t	t
||D ]"\}\}}||d |f ||d |f< ||||f ||d  f< q||||||	|dd	ddddddd	}||||||fS )NrT   r   c                    s   g | ]}|  qS r5   r5   ).0cap_seq_lenimage_seq_lenr5   r6   
<listcomp>  s    z1Lumina2RotaryPosEmbed.forward.<locals>.<listcomp>r   )r\   r   r   rR   r
   )r   r\   r      )rW   r   r   sumtolistmaxrE   zerosint32r   r   aranger]   re   rf   r   r\   permute)r2   r7   rO   rk   channelsheightwidthppost_patch_heightpost_patch_widthr   encoder_seq_lenl_effective_cap_lenseq_lengthsmax_seq_lenposition_idsr   r   seq_lenrow_idscol_idsr   cap_freqs_cisimg_freqs_cisr5   r   r6   r?     sR   
 
zLumina2RotaryPosEmbed.forward)r   r
   )r@   rA   rB   rC   listr*   rE   rF   r   r   r?   rH   r5   r5   r3   r6   r      s
    ((	r   c                $       s  e Zd ZdZdZdgZddgZe						
												d/dedededed	B dedededededede	d	B de	d e	d!e
eeef d"e
eeef d#ed$d	f" fd%d&Zed'			d0d(ejd)ejd*ejd+ejd'eeef d	B d,ed$ejeB fd-d.Z  ZS )1Lumina2Transformer2DModela  
    Lumina2NextDiT: Diffusion model with a Transformer backbone.

    Parameters:
        sample_size (`int`): The width of the latent images. This is fixed during training since
            it is used to learn a number of position embeddings.
        patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
            The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
        in_channels (`int`, *optional*, defaults to 4):
            The number of input channels for the model. Typically, this matches the number of channels in the input
            images.
        hidden_size (`int`, *optional*, defaults to 4096):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        num_layers (`int`, *optional*, default to 32):
            The number of layers in the model. This defines the depth of the neural network.
        num_attention_heads (`int`, *optional*, defaults to 32):
            The number of attention heads in each attention layer. This parameter specifies how many separate attention
            mechanisms are used.
        num_kv_heads (`int`, *optional*, defaults to 8):
            The number of key-value heads in the attention mechanism, if different from the number of attention heads.
            If None, it defaults to num_attention_heads.
        multiple_of (`int`, *optional*, defaults to 256):
            A factor that the hidden size should be a multiple of. This can help optimize certain hardware
            configurations.
        ffn_dim_multiplier (`float`, *optional*):
            A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
            the model configuration.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            A small value added to the denominator for numerical stability in normalization layers.
        scaling_factor (`float`, *optional*, defaults to 1.0):
            A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
            overall scale of the model's operations.
    Try   
x_embeddernorm   r
      N 	           r   r         ?    r   r   r   r#   sample_sizer   r$   out_channelsr   
num_layersnum_refiner_layersr{   r|   r}   r~   r   scaling_factoraxes_dim_roper   r   r   c                    s   t    |p|| _td|||d| _tj|| | d| _t|d| _	t
 fddt|D | _t
 fddt|D | _t
 fddt|D | _ttd	d
dd|| | j d| _d
| _d S )Ni'  )r   r   r   r   )in_featuresout_features)r   r   r   c                    "   g | ]}t  d dqS Tr   ry   r   rm   r~   r   r}   r   r{   r|   r5   r6   r         
z6Lumina2Transformer2DModel.__init__.<locals>.<listcomp>c                    r   )Fr   r   r   r   r5   r6   r     r   c                    r   r   r   r   r   r5   r6   r     r   r#   Fgư>T)r   conditioning_embedding_dimelementwise_affiner'   r(   out_dim)r)   r*   r   r   rope_embedderr.   r0   r   r   time_caption_embed
ModuleListr   noise_refinercontext_refinerlayersr   r,   norm_outgradient_checkpointing)r2   r   r   r$   r   r   r   r   r{   r|   r}   r~   r   r   r   r   r   r3   r   r6   r*   m  sB   





	z"Lumina2Transformer2DModel.__init__attention_kwargsr7   r8   r9   encoder_attention_maskreturn_dictc              
   C   s  |j \}}}	}
| |||\}}| ||\}}}}}}| |}| jD ]}||||}q$| jD ]	}||d ||}q0t|}tt|dk}|j	||t
jd}|	||| jj}tt||D ]&\}\}}d||d |f< ||d |f ||d |f< || ||||f< q_|}| jD ]"}t
 r| jr| |||r|nd ||}q|||r|nd ||}q| ||}| jj}g }tt||D ],\}\}}||| || |	| |
| ||| jddddddddd qt
j|dd}|s|fS t|d	S )
NrT   )r\   Tr   r   r
   r   r   )sample)rW   r   r   r   r   r   r   r   set	new_zerosrE   rg   configr   r   r   r   is_grad_enabledr   _gradient_checkpointing_funcr   r   r   r]   r   r   rf   stackr   )r2   r7   r8   r9   r   r   r   rk   rm   r   r   r   context_rotary_embnoise_rotary_emb
rotary_embencoder_seq_lengthsr   layerr   use_maskrO   joint_hidden_statesr   r   r   r   outputr5   r5   r6   r?     sZ   
	




z!Lumina2Transformer2DModel.forward)r   r
   r   Nr   r   r
   r   r   r   Nr   r   r   r   r#   )NT)r@   rA   rB   rx    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rC   rD   rG   r*   r   rE   rF   dictstrr   rg   r   r?   rH   r5   r5   r3   r6   r   E  s    #	
\r   )-ra   typingr   rE   torch.nnr.   torch.nn.functional
functionalrL   configuration_utilsr   r   loadersr   loaders.single_file_modelr   utilsr   r	   	attentionr   attention_processorr   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerr@   loggerModuler   rI   ry   r   r   r5   r5   r5   r6   <module>   s*   
TQ\