o
    GiDf                  
   @   s  d dl mZ d dlZd dlZd dlmZ ddlmZm	Z	 ddl
mZmZmZ ddlmZmZmZ ddlmZ ddlmZ d	d
lmZmZ d	dlmZ d	dlmZmZmZmZ d	dl m!Z! d	dl"m#Z# d	dl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* e+e,Z-G dd dej.Z/G dd dej.Z0G dd dej.Z1G dd dej.Z2G dd dej.Z3eG dd dej.Z4eG dd dej.Z5G d d! d!e#eeeeee	Z6dS )"    )AnyN   )ConfigMixinregister_to_config)FluxTransformer2DLoadersMixinFromOriginalModelMixinPeftAdapterMixin)apply_lora_scale	deprecatelogging)is_torch_npu_available)maybe_allow_in_graph   )AttentionMixinFeedForward)
CacheMixin)FluxPosEmbedPixArtAlphaTextProjection	Timestepsget_timestep_embedding)Transformer2DModelOutput)
ModelMixin)CombinedTimestepLabelEmbeddingsFP32LayerNormRMSNorm   )FluxAttentionFluxAttnProcessorc                       s   e Zd ZdZddededB f fddZ				dd	ejd
ejdB dejdB dej	dB dejdB de
ejejejejejf fddZ  ZS )ChromaAdaLayerNormZeroPruned
    Norm layer adaptive layer norm zero (adaLN-Zero).

    Parameters:
        embedding_dim (`int`): The size of each embedding vector.
        num_embeddings (`int`): The size of the embeddings dictionary.
    N
layer_normTembedding_dimnum_embeddingsc                    sp   t    |d urt||| _nd | _|dkr"tj|ddd| _d S |dkr0t|ddd| _d S td| d)	Nr    Fư>elementwise_affineepsfp32_layer_norm)r%   biasUnsupported `norm_type` (@) provided. Supported ones are: 'layer_norm', 'fp32_layer_norm'.)	super__init__r   embnn	LayerNormnormr   
ValueError)selfr!   r"   	norm_typer(   	__class__ d/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_chroma.pyr,   0   s   

z%ChromaAdaLayerNormZeroPruned.__init__xtimestepclass_labelshidden_dtyper-   returnc                 C   sx   | j d ur| j |||d}|ddjddd\}}}}	}
}| |d|d d d f   |d d d f  }|||	|
|fS )N)r;   r   r      dim)r-   flattenchunkr0   )r2   r8   r9   r:   r;   r-   	shift_msa	scale_msagate_msa	shift_mlp	scale_mlpgate_mlpr6   r6   r7   forward@   s
   
".z$ChromaAdaLayerNormZeroPruned.forward)Nr    T)NNNN)__name__
__module____qualname____doc__intr,   torchTensor
LongTensordtypetuplerH   __classcell__r6   r6   r4   r7   r   '   s(    r   c                       sb   e Zd ZdZddef fddZ	ddejd	ejdB d
eejejejejejf fddZ	  Z
S )"ChromaAdaLayerNormZeroSinglePrunedr   r    Tr!   c                    s8   t    |dkrtj|ddd| _d S td| d)Nr    Fr#   r$   r)   r*   )r+   r,   r.   r/   r0   r1   )r2   r!   r3   r(   r4   r6   r7   r,   X   s   

z+ChromaAdaLayerNormZeroSinglePruned.__init__Nr8   r-   r<   c                 C   sR   | ddjddd\}}}| |d|d d d f   |d d d f  }||fS )Nr   r   r   r>   )r@   rA   r0   )r2   r8   r-   rB   rC   rD   r6   r6   r7   rH   b   s   .z*ChromaAdaLayerNormZeroSinglePruned.forward)r    TN)rI   rJ   rK   rL   rM   r,   rN   rO   rR   rH   rS   r6   r6   r4   r7   rT   O   s    rT   c                       sP   e Zd ZdZ				ddedef fddZd	ejd
ejdejfddZ  Z	S )"ChromaAdaLayerNormContinuousPruneda  
    Adaptive normalization layer with a norm layer (layer_norm or rms_norm).

    Args:
        embedding_dim (`int`): Embedding dimension to use during projection.
        conditioning_embedding_dim (`int`): Dimension of the input condition.
        elementwise_affine (`bool`, defaults to `True`):
            Boolean flag to denote if affine transformation should be applied.
        eps (`float`, defaults to 1e-5): Epsilon factor.
        bias (`bias`, defaults to `True`): Boolean flag to denote if bias should be use.
        norm_type (`str`, defaults to `"layer_norm"`):
            Normalization layer to use. Values supported: "layer_norm", "rms_norm".
    Th㈵>r    r!   conditioning_embedding_dimc                    sP   t    |dkrt||||| _d S |dkr!t|||| _d S td| )Nr    rms_normzunknown norm_type )r+   r,   r.   r/   r0   r   r1   )r2   r!   rX   r%   r&   r(   r3   r4   r6   r7   r,   {   s   
z+ChromaAdaLayerNormContinuousPruned.__init__r8   r-   r<   c                 C   sb   t j|dd|jddd\}}| |d| d d d d d f  |d d d d d f  }|S )Nr   r   r>   )rN   rA   r@   torQ   r0   )r2   r8   r-   shiftscaler6   r6   r7   rH      s   $:z*ChromaAdaLayerNormContinuousPruned.forward)TrW   Tr    )
rI   rJ   rK   rL   rM   r,   rN   rO   rH   rS   r6   r6   r4   r7   rV   l   s    $rV   c                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	(ChromaCombinedTimestepTextProjEmbeddingsnum_channelsout_dimc                    sX   t    t|ddd| _t|ddd| _| jdtt|d d| ddddd	 d S )
NTr   )r^   flip_sin_to_cosdownscale_freq_shiftmod_proj  r   )r`   ra   F)
persistent)	r+   r,   r   	time_projguidance_projregister_bufferr   rN   arange)r2   r^   r_   r4   r6   r7   r,      s   

z1ChromaCombinedTimestepTextProjEmbeddings.__init__r9   r<   c           	      C   s   | j jd }|jd }| |j|jd}| tdg| j|j|jd}| j j|j|jd	|dd}tj
||gddd	d|d}tj
||gdd}||jS )Nr   )rQ   )rQ   devicer   r>   )rb   shapere   rZ   rQ   rf   rN   tensorri   repeatcat	unsqueeze)	r2   r9   mod_index_length
batch_sizetimesteps_projrf   rb   timestep_guidance	input_vecr6   r6   r7   rH      s   
 z0ChromaCombinedTimestepTextProjEmbeddings.forward)	rI   rJ   rK   rM   r,   rN   rO   rH   rS   r6   r6   r4   r7   r]      s    r]   c                	       s8   e Zd Zd
dedededef fddZdd	 Z  ZS )ChromaApproximator   in_dimr_   
hidden_dimn_layersc                    sj   t    tj| dd| _t fddt|D | _t fddt|D | _t || _	d S )NTr(   c                    s   g | ]	}t   d dqS )silu)act_fn)r   .0_rx   r6   r7   
<listcomp>   s    z/ChromaApproximator.__init__.<locals>.<listcomp>c                    s   g | ]}t  qS r6   )r.   r   r}   r   r6   r7   r      s    )
r+   r,   r.   Linearin_proj
ModuleListrangelayersnormsout_proj)r2   rw   r_   rx   ry   r4   r   r7   r,      s   
zChromaApproximator.__init__c                 C   s<   |  |}t| j| jD ]\}}|||| }q| |S rU   )r   zipr   r   r   )r2   r8   layerr   r6   r6   r7   rH      s   

zChromaApproximator.forward)rv   )rI   rJ   rK   rM   r,   rH   rS   r6   r6   r4   r7   ru      s     	ru   c                       s   e Zd Z	ddedededef fddZ			dd	ejd
ejdeejejf dB dejdB de	e
ef dB dejfddZ  ZS )ChromaSingleTransformerBlock      @r?   num_attention_headsattention_head_dim	mlp_ratioc              
      s   t    t|| | _t|| _t|| j| _tj	dd| _
t|| j || _t r?ddlm} d}tdd| | }nt }t||||d|d	dd
| _d S )Ntanh)approximater   )FluxAttnProcessor2_0_NPUzDefaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors should be set explicitly using the `set_attn_processor` method.npu_processorz0.34.0Tr#   )	query_dimdim_headheadsr_   r(   	processorr&   pre_only)r+   r,   rM   mlp_hidden_dimrT   r0   r.   r   proj_mlpGELUact_mlpproj_outr   attention_processorr   r
   r   r   attn)r2   r?   r   r   r   r   deprecation_messager   r4   r6   r7   r,      s.   

z%ChromaSingleTransformerBlock.__init__Nhidden_statestembimage_rotary_embattention_maskjoint_attention_kwargsr<   c                 C   s   |}| j ||d\}}| | |}	|pi }|d ur3|d d d d d d f |d d d d d d f  }| jd|||d|}
tj|
|	gdd}|d}|| | }|| }|jtj	krd|
dd}|S )	Nr-   )r   r   r   r   r>   r       r6   )r0   r   r   r   rN   rn   ro   r   rQ   float16clip)r2   r   r   r   r   r   residualnorm_hidden_statesgatemlp_hidden_statesattn_outputr6   r6   r7   rH      s(   0
z$ChromaSingleTransformerBlock.forward)r   NNN)rI   rJ   rK   rM   floatr,   rN   rO   rR   dictstrr   rH   rS   r6   r6   r4   r7   r      s6    )r   c                       s   e Zd Z		ddededededef
 fdd	Z	
	
	
ddejdejdejde	ejejf d
B dejd
B de
eef d
B de	ejejf fddZ  ZS )ChromaTransformerBlockrY   r#   r?   r   r   qk_normr&   c                    s   t    t|| _t|| _t|||||ddt |d	| _tj	|ddd| _
t||dd| _tj	|ddd| _t||dd| _d S )NFT)	r   added_kv_proj_dimr   r   r_   context_pre_onlyr(   r   r&   r#   r$   zgelu-approximate)r?   dim_outactivation_fn)r+   r,   r   norm1norm1_contextr   r   r   r.   r/   norm2r   ffnorm2_context
ff_context)r2   r?   r   r   r   r&   r4   r6   r7   r,     s$   


zChromaTransformerBlock.__init__Nr   encoder_hidden_statesr   r   r   r   r<   c                 C   s  |d d d df |d d dd f }}| j ||d\}	}
}}}| j||d\}}}}}|p0i }|d urM|d d d d d d f |d d d d d d f  }| jd	|	|||d|}t|dkre|\}}nt|dkrp|\}}}|
d| }|| }| |}	|	d|d d d f   |d d d f  }	| |	}|d| }|| }t|dkr|| }|d| }|| }| |}|d|d d d f   |d d d f  }| |}||d|  }|j	t
jkr|dd}||fS )
Nr=   r   )r   r   r   r   r   r   r   r   r   r6   )r   r   r   lenro   r   r   r   r   rQ   rN   r   r   )r2   r   r   r   r   r   r   temb_imgtemb_txtr   rD   rE   rF   rG   norm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattention_outputsr   context_attn_outputip_attn_output	ff_outputcontext_ff_outputr6   r6   r7   rH   3  sL   *	0


(

(
zChromaTransformerBlock.forward)rY   r#   r   )rI   rJ   rK   rM   r   r   r,   rN   rO   rR   r   r   rH   rS   r6   r6   r4   r7   r     s@    #r   c                       s  e Zd ZdZdZddgZddgZddgZe					
								d.de	de	de	d	B de	de	de	de	de	de
e	df de	de	de	f fdd Zed!																		"d/d#ejd$ejd%ejd&ejd'ejd(ejd!eeef d	B d)ed*ed+ejeB fd,d-Z  ZS )0ChromaTransformer2DModela  
    The Transformer model introduced in Flux, modified for Chroma.

    Reference: https://huggingface.co/lodestones/Chroma1-HD

    Args:
        patch_size (`int`, defaults to `1`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `64`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of channels in the output. If not specified, it defaults to `in_channels`.
        num_layers (`int`, defaults to `19`):
            The number of layers of dual stream DiT blocks to use.
        num_single_layers (`int`, defaults to `38`):
            The number of layers of single stream DiT blocks to use.
        attention_head_dim (`int`, defaults to `128`):
            The number of dimensions to use for each attention head.
        num_attention_heads (`int`, defaults to `24`):
            The number of attention heads to use.
        joint_attention_dim (`int`, defaults to `4096`):
            The number of dimensions to use for the joint attention (embedding/channel dimension of
            `encoder_hidden_states`).
        axes_dims_rope (`tuple[int]`, defaults to `(16, 56, 56)`):
            The dimensions to use for the rotary positional embeddings.
    Tr   r   	pos_embedr0   r   @   N   &               8   r      rv   
patch_sizein_channelsout_channels
num_layersnum_single_layersr   r   joint_attention_dimaxes_dims_rope.approximator_num_channelsapproximator_hidden_dimapproximator_layersc                    s  t    |p|_  _td|	d_t|
d d| d|  d d_t|
j||d_	t
|j_t
|j_t
 fd	d
t|D _t
 fdd
t|D _tjjddd_t
jj|| j dd_d_d S )Ni'  )thetaaxes_dim   r      r   )r^   r_   )rw   r_   rx   ry   c                       g | ]
}t j d qS )r?   r   r   )r   	inner_dimr}   r   r   r2   r6   r7   r         z5ChromaTransformer2DModel.__init__.<locals>.<listcomp>c                    r   r   )r   r   r}   r   r6   r7   r     r   Fr#   r$   Trz   )r+   r,   r   r   r   r   r]   time_text_embedru   distilled_guidance_layerr.   r   context_embedder
x_embedderr   r   transformer_blockssingle_transformer_blocksrV   norm_outr   gradient_checkpointing)r2   r   r   r   r   r   r   r   r   r   r   r   r   r4   r   r7   r,     s>   



z!ChromaTransformer2DModel.__init__r   Fr   r   r9   img_idstxt_idsr   return_dictcontrolnet_blocks_repeatr<   c              	   C   s  |  |}||jd }| |}| |}| |}|jdkr*td |d }|jdkr8td |d }t	j
||fdd}| |}|dur_d|v r_|d}| |}|d	|i t| jD ]\}}dt| j }|d
t| j  }|d
|  }|d
|  }t	j
|dd||d
 f |dd||d
 f fdd}t	 r| jr| ||||||\}}n|||||||d\}}|durt| jt| }tt|}|r|||t|   }qd||||   }qdt	j
||gdd}t| jD ]d\}}d| }|dd||d f }t	 r| jr| ||||}n	||||||d}|	dur]t| jt|	 }tt|}|dd|jd ddf |	||   |dd|jd ddf< q|dd|jd ddf }|ddddf }| ||}| |}|
s|fS t|dS )a  
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        rc   r   zrPassing `txt_ids` 3d torch.Tensor is deprecated.Please remove the batch dimension and pass it as a 2d torch Tensorr   zrPassing `img_ids` 3d torch.Tensor is deprecated.Please remove the batch dimension and pass it as a 2d torch Tensorr>   Nip_adapter_image_embedsip_hidden_statesr=   r   )r   r   r   r   r   r   )r   r   r   r   r   .)sample)r   rZ   rQ   r   r   r   ndimloggerwarningrN   rn   r   popencoder_hid_projupdate	enumerater   r   r   is_grad_enabledr   _gradient_checkpointing_funcrM   npceilrk   r   r   r   )r2   r   r   r9   r   r   r   r   controlnet_block_samplescontrolnet_single_block_samplesr   r   rt   pooled_tembidsr   r   r   index_blockblock
img_offset
txt_offsetimg_modulationtext_modulationr   interval_control	start_idxoutputr6   r6   r7   rH     s   
(











	


z ChromaTransformer2DModel.forward)r   r   Nr   r   r   r   r   r   r   r   rv   )
NNNNNNNNTF)rI   rJ   rK   rL    _supports_gradient_checkpointing_no_split_modules_repeated_blocks _skip_layerwise_casting_patternsr   rM   rR   r,   r	   rN   rO   rP   r   r   r   boolr   rH   rS   r6   r6   r4   r7   r   r  s    		

@r   )7typingr   numpyr  rN   torch.nnr.   configuration_utilsr   r   loadersr   r   r   utilsr	   r
   r   utils.import_utilsr   utils.torch_utilsr   	attentionr   r   cache_utilsr   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   transformer_fluxr   r   
get_loggerrI   r   Moduler   rT   rV   r]   ru   r   r   r   r6   r6   r6   r7   <module>   sF   
(, G
^