import inspect
from typing import Any

import torch
import torch.nn as nn
import torch.nn.functional as F

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
from ...utils import apply_lora_scale, logging
from .._modeling_parallel import ContextParallelInput, ContextParallelOutput
from ..attention import AttentionMixin, AttentionModuleMixin
from ..attention_dispatch import dispatch_attention_fn
from ..cache_utils import CacheMixin
from ..embeddings import TimestepEmbedding, Timesteps, apply_rotary_emb, get_1d_rotary_pos_embed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import AdaLayerNormContinuous


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def _get_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
    # Unfused Q/K/V projections for the image stream (and, if present, the text stream).
    query = attn.to_q(hidden_states)
    key = attn.to_k(hidden_states)
    value = attn.to_v(hidden_states)

    encoder_query = encoder_key = encoder_value = None
    if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
        encoder_query = attn.add_q_proj(encoder_hidden_states)
        encoder_key = attn.add_k_proj(encoder_hidden_states)
        encoder_value = attn.add_v_proj(encoder_hidden_states)

    return query, key, value, encoder_query, encoder_key, encoder_value


def _get_fused_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
    # Fused projection: a single matmul whose output is chunked into Q, K, and V.
    query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)

    encoder_query = encoder_key = encoder_value = None
    if encoder_hidden_states is not None and hasattr(attn, "to_added_qkv"):
        encoder_query, encoder_key, encoder_value = attn.to_added_qkv(encoder_hidden_states).chunk(3, dim=-1)

    return query, key, value, encoder_query, encoder_key, encoder_value


def _get_qkv_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
    if attn.fused_projections:
        return _get_fused_projections(attn, hidden_states, encoder_hidden_states)
    return _get_projections(attn, hidden_states, encoder_hidden_states)


class Flux2SwiGLU(nn.Module):
    r"""
    Flux 2 uses a SwiGLU-style activation in the transformer feedforward sub-blocks, but with the linear projection
    layer fused into the first linear layer of the FF sub-block. Thus, this module has no trainable parameters.
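
    Example (a minimal shape sketch, not tied to any checkpoint; the module halves the last dimension, applying the
    SiLU gate to the first half):

        ```python
        >>> import torch
        >>> act = Flux2SwiGLU()
        >>> x = torch.randn(2, 16, 1024)  # last dim packs [gate | value], each of width 512
        >>> act(x).shape
        torch.Size([2, 16, 512])
        ```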
    """

    def __init__(self):
        super().__init__()
        self.gate_fn = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The gate and value halves arrive fused from the preceding linear projection.
        x1, x2 = x.chunk(2, dim=-1)
        return self.gate_fn(x1) * x2


class Flux2FeedForward(nn.Module):
    def __init__(
        self,
        dim: int,
        dim_out: int | None = None,
        mult: float = 4.0,
        inner_dim: int | None = None,
        bias: bool = False,
    ):
        super().__init__()
        if inner_dim is None:
            inner_dim = int(dim * mult)
        dim_out = dim_out if dim_out is not None else dim

        # The input projection is doubled so that the fused SwiGLU gate can be chunked off in the activation.
        self.linear_in = nn.Linear(dim, inner_dim * 2, bias=bias)
        self.act_fn = Flux2SwiGLU()
        self.linear_out = nn.Linear(inner_dim, dim_out, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear_in(x)
        x = self.act_fn(x)
        x = self.linear_out(x)
        return x


class Flux2AttnProcessor:
    _attention_backend = None
    _parallel_config = None

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")

    def __call__(
        self,
        attn: "Flux2Attention",
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        image_rotary_emb: torch.Tensor | None = None,
    ) -> torch.Tensor:
        query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
            attn, hidden_states, encoder_hidden_states
        )

        query = query.unflatten(-1, (attn.heads, -1))
        key = key.unflatten(-1, (attn.heads, -1))
        value = value.unflatten(-1, (attn.heads, -1))

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        if attn.added_kv_proj_dim is not None:
            encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
            encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
            encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))

            encoder_query = attn.norm_added_q(encoder_query)
            encoder_key = attn.norm_added_k(encoder_key)

            # Text tokens are prepended so that the joint sequence is [text, image].
            query = torch.cat([encoder_query, query], dim=1)
            key = torch.cat([encoder_key, key], dim=1)
            value = torch.cat([encoder_value, value], dim=1)

        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)

        hidden_states = dispatch_attention_fn(
            query,
            key,
            value,
            attn_mask=attention_mask,
            backend=self._attention_backend,
            parallel_config=self._parallel_config,
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.to(query.dtype)

        if encoder_hidden_states is not None:
            # Split the joint sequence back into its text and image parts.
            encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
                [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
            )
            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
            hidden_states = attn.to_out[0](hidden_states)
            hidden_states = attn.to_out[1](hidden_states)
            return hidden_states, encoder_hidden_states

        return hidden_states


class Flux2Attention(torch.nn.Module, AttentionModuleMixin):
    _default_processor_cls = Flux2AttnProcessor
    _available_processors = [Flux2AttnProcessor]

    def __init__(
        self,
        query_dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        bias: bool = False,
        added_kv_proj_dim: int | None = None,
        added_proj_bias: bool | None = True,
        out_bias: bool = True,
        eps: float = 1e-5,
        out_dim: int | None = None,
        elementwise_affine: bool = True,
        processor=None,
    ):
        super().__init__()
        self.head_dim = dim_head
        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
        self.query_dim = query_dim
        self.out_dim = out_dim if out_dim is not None else query_dim
        self.heads = out_dim // dim_head if out_dim is not None else heads
        self.use_bias = bias
        self.dropout = dropout
        self.added_kv_proj_dim = added_kv_proj_dim
        self.added_proj_bias = added_proj_bias

        self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
        self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
        self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
        # Per-head QK normalization.
        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
        self.to_out = torch.nn.ModuleList([])
        self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
        self.to_out.append(torch.nn.Dropout(dropout))

        if added_kv_proj_dim is not None:
            self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
            self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
            self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
            self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
            self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
            self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)

        if processor is None:
            processor = self._default_processor_cls()
        self.set_processor(processor)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        image_rotary_emb: torch.Tensor | None = None,
        **kwargs,
    ) -> torch.Tensor:
        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
        unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters]
        if len(unused_kwargs) > 0:
            logger.warning(
                f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
            )
        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
        return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)


class Flux2ParallelSelfAttnProcessor:
    _attention_backend = None
    _parallel_config = None

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your pytorch version.")

    def __call__(
        self,
        attn: "Flux2ParallelSelfAttention",
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        image_rotary_emb: torch.Tensor | None = None,
    ) -> torch.Tensor:
        # A single fused projection produces the attention QKV and the MLP hidden states.
        hidden_states = attn.to_qkv_mlp_proj(hidden_states)
        qkv, mlp_hidden_states = torch.split(
            hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
        )

        query, key, value = qkv.chunk(3, dim=-1)
        query = query.unflatten(-1, (attn.heads, -1))
        key = key.unflatten(-1, (attn.heads, -1))
        value = value.unflatten(-1, (attn.heads, -1))

        query = attn.norm_q(query)
        key = attn.norm_k(key)

        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)

        hidden_states = dispatch_attention_fn(
            query,
            key,
            value,
            attn_mask=attention_mask,
            backend=self._attention_backend,
            parallel_config=self._parallel_config,
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.to(query.dtype)

        # The attention output and the activated MLP branch share one fused output projection.
        mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
        hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
        hidden_states = attn.to_out(hidden_states)

        return hidden_states


class Flux2ParallelSelfAttention(torch.nn.Module, AttentionModuleMixin):
    r"""
    Flux 2 parallel self-attention for the Flux 2 single-stream transformer blocks.

    This implements a parallel transformer block, where the attention QKV projections are fused to the feedforward (FF)
    input projections, and the attention output projections are fused to the FF output projections. See the [ViT-22B
    paper](https://arxiv.org/abs/2302.05442) for a visual depiction of this type of transformer block.
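
    Example (illustrative shapes only; rotary embeddings and attention masks are omitted for brevity):

        ```python
        >>> import torch
        >>> attn = Flux2ParallelSelfAttention(query_dim=256, heads=4, dim_head=64)
        >>> x = torch.randn(1, 16, 256)
        >>> attn(x).shape
        torch.Size([1, 16, 256])
        ```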
    """

    _default_processor_cls = Flux2ParallelSelfAttnProcessor
    _available_processors = [Flux2ParallelSelfAttnProcessor]
    _supports_qkv_fusion = False

    def __init__(
        self,
        query_dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        bias: bool = False,
        out_bias: bool = True,
        eps: float = 1e-5,
        out_dim: int | None = None,
        elementwise_affine: bool = True,
        mlp_ratio: float = 4.0,
        mlp_mult_factor: int = 2,
        processor=None,
    ):
        super().__init__()
        self.head_dim = dim_head
        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
        self.query_dim = query_dim
        self.out_dim = out_dim if out_dim is not None else query_dim
        self.heads = out_dim // dim_head if out_dim is not None else heads
        self.use_bias = bias
        self.dropout = dropout
        self.mlp_ratio = mlp_ratio
        self.mlp_hidden_dim = int(query_dim * self.mlp_ratio)
        self.mlp_mult_factor = mlp_mult_factor

        # Attention QKV and the (doubled, for the SwiGLU gate) MLP input come from one fused projection.
        self.to_qkv_mlp_proj = torch.nn.Linear(
            self.query_dim, self.inner_dim * 3 + self.mlp_hidden_dim * self.mlp_mult_factor, bias=bias
        )
        self.mlp_act_fn = Flux2SwiGLU()

        self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
        self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)

        # Fused attention + MLP output projection.
        self.to_out = torch.nn.Linear(self.inner_dim + self.mlp_hidden_dim, self.out_dim, bias=out_bias)

        if processor is None:
            processor = self._default_processor_cls()
        self.set_processor(processor)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        image_rotary_emb: torch.Tensor | None = None,
        **kwargs,
    ) -> torch.Tensor:
        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
        unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters]
        if len(unused_kwargs) > 0:
            logger.warning(
                f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
            )
        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
        return self.processor(self, hidden_states, attention_mask, image_rotary_emb, **kwargs)


class Flux2SingleTransformerBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        mlp_ratio: float = 4.0,
        eps: float = 1e-6,
        bias: bool = False,
    ):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.attn = Flux2ParallelSelfAttention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            out_dim=dim,
            bias=bias,
            out_bias=bias,
            eps=eps,
            mlp_ratio=mlp_ratio,
            processor=Flux2ParallelSelfAttnProcessor(),
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None = None,
        temb_mod: torch.Tensor | None = None,
        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        joint_attention_kwargs: dict[str, Any] | None = None,
        split_hidden_states: bool = False,
        text_seq_len: int | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # The single-stream blocks process text and image tokens as one joint sequence.
        if encoder_hidden_states is not None:
            text_seq_len = encoder_hidden_states.shape[1]
            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        (mod_shift, mod_scale, mod_gate) = Flux2Modulation.split(temb_mod, 1)[0]

        norm_hidden_states = self.norm(hidden_states)
        norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift

        joint_attention_kwargs = joint_attention_kwargs or {}
        attn_output = self.attn(
            hidden_states=norm_hidden_states,
            image_rotary_emb=image_rotary_emb,
            **joint_attention_kwargs,
        )

        hidden_states = hidden_states + mod_gate * attn_output
        if hidden_states.dtype == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)

        if split_hidden_states:
            encoder_hidden_states = hidden_states[:, :text_seq_len]
            hidden_states = hidden_states[:, text_seq_len:]
            return encoder_hidden_states, hidden_states

        return hidden_states


class Flux2TransformerBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        mlp_ratio: float = 4.0,
        eps: float = 1e-6,
        bias: bool = False,
    ):
        super().__init__()
        self.mlp_hidden_dim = int(dim * mlp_ratio)

        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.norm1_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)

        self.attn = Flux2Attention(
            query_dim=dim,
            added_kv_proj_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            out_dim=dim,
            bias=bias,
            added_proj_bias=bias,
            out_bias=bias,
            eps=eps,
            processor=Flux2AttnProcessor(),
        )

        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.ff = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)

        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.ff_context = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb_mod_img: torch.Tensor,
        temb_mod_txt: torch.Tensor,
        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        joint_attention_kwargs: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        joint_attention_kwargs = joint_attention_kwargs or {}

        # Two modulation parameter sets per stream: one for attention, one for the feedforward.
        (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = Flux2Modulation.split(temb_mod_img, 2)
        (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = Flux2Modulation.split(
            temb_mod_txt, 2
        )

        norm_hidden_states = self.norm1(hidden_states)
        norm_hidden_states = (1 + scale_msa) * norm_hidden_states + shift_msa
        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
        norm_encoder_hidden_states = (1 + c_scale_msa) * norm_encoder_hidden_states + c_shift_msa

        attention_outputs = self.attn(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            image_rotary_emb=image_rotary_emb,
            **joint_attention_kwargs,
        )
        attn_output, context_attn_output = attention_outputs

        # Image stream: gated attention residual, then gated feedforward residual.
        hidden_states = hidden_states + gate_msa * attn_output
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = (1 + scale_mlp) * norm_hidden_states + shift_mlp
        ff_output = self.ff(norm_hidden_states)
        hidden_states = hidden_states + gate_mlp * ff_output

        # Text stream: same pattern with the context modulation parameters.
        encoder_hidden_states = encoder_hidden_states + c_gate_msa * context_attn_output
        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
        norm_encoder_hidden_states = (1 + c_scale_mlp) * norm_encoder_hidden_states + c_shift_mlp
        context_ff_output = self.ff_context(norm_encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output

        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        return encoder_hidden_states, hidden_states


class Flux2PosEmbed(nn.Module):
    def __init__(self, theta: int, axes_dim: list[int]):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        cos_out = []
        sin_out = []
        pos = ids.float()
        is_mps = ids.device.type == "mps"
        is_npu = ids.device.type == "npu"
        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
        # Each positional axis gets its own 1D rotary embedding; the results are concatenated feature-wise.
        for i in range(len(self.axes_dim)):
            cos, sin = get_1d_rotary_pos_embed(
                self.axes_dim[i],
                pos[..., i],
                theta=self.theta,
                repeat_interleave_real=True,
                use_real=True,
                freqs_dtype=freqs_dtype,
            )
            cos_out.append(cos)
            sin_out.append(sin)
        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
        return freqs_cos, freqs_sin


class Flux2TimestepGuidanceEmbeddings(nn.Module):
    def __init__(
        self,
        in_channels: int = 256,
        embedding_dim: int = 6144,
        bias: bool = False,
        guidance_embeds: bool = True,
    ):
        super().__init__()
        self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(
            in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
        )
        if guidance_embeds:
            self.guidance_embedder = TimestepEmbedding(
                in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
            )
        else:
            self.guidance_embedder = None

    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor | None) -> torch.Tensor:
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=timestep.dtype))

        if guidance is not None and self.guidance_embedder is not None:
            guidance_proj = self.time_proj(guidance)
            guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=guidance.dtype))
            return timesteps_emb + guidance_emb

        return timesteps_emb


class Flux2Modulation(nn.Module):
    def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = False):
        super().__init__()
        self.mod_param_sets = mod_param_sets
        # Each modulation parameter set consists of (shift, scale, gate).
        self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
        self.act_fn = nn.SiLU()

    def forward(self, temb: torch.Tensor) -> torch.Tensor:
        mod = self.act_fn(temb)
        mod = self.linear(mod)
        return mod

    @staticmethod
    def split(mod: torch.Tensor, mod_param_sets: int) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
        if mod.ndim == 2:
            mod = mod.unsqueeze(1)
        mod_params = torch.chunk(mod, 3 * mod_param_sets, dim=-1)
        return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(mod_param_sets))


class Flux2Transformer2DModel(
    ModelMixin,
    ConfigMixin,
    PeftAdapterMixin,
    FromOriginalModelMixin,
    FluxTransformer2DLoadersMixin,
    CacheMixin,
    AttentionMixin,
):
    r"""
    The Transformer model introduced in Flux 2.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        patch_size (`int`, defaults to `1`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `128`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of channels in the output. If not specified, it defaults to `in_channels`.
        num_layers (`int`, defaults to `8`):
            The number of layers of dual stream DiT blocks to use.
        num_single_layers (`int`, defaults to `48`):
            The number of layers of single stream DiT blocks to use.
        attention_head_dim (`int`, defaults to `128`):
            The number of dimensions to use for each attention head.
        num_attention_heads (`int`, defaults to `48`):
            The number of attention heads to use.
        joint_attention_dim (`int`, defaults to `15360`):
            The number of dimensions to use for the joint attention (embedding/channel dimension of
            `encoder_hidden_states`).
        timestep_guidance_channels (`int`, defaults to `256`):
            The number of channels used for the sinusoidal timestep (and guidance) projections.
        guidance_embeds (`bool`, defaults to `True`):
            Whether to use guidance embeddings for guidance-distilled variant of the model.
        axes_dims_rope (`tuple[int]`, defaults to `(32, 32, 32, 32)`):
            The dimensions to use for the rotary positional embeddings.
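
    Example (a random-weight smoke test; this tiny configuration is illustrative only and far smaller than any
    released checkpoint):

        ```python
        >>> import torch
        >>> model = Flux2Transformer2DModel(
        ...     in_channels=16,
        ...     num_layers=1,
        ...     num_single_layers=1,
        ...     attention_head_dim=32,
        ...     num_attention_heads=4,
        ...     joint_attention_dim=64,
        ...     axes_dims_rope=(8, 8, 8, 8),
        ... )
        >>> hidden_states = torch.randn(1, 24, 16)  # packed image latents
        >>> encoder_hidden_states = torch.randn(1, 8, 64)  # text embeddings
        >>> img_ids = torch.zeros(24, 4)  # one position id per image token, one column per rotary axis
        >>> txt_ids = torch.zeros(8, 4)
        >>> timestep = torch.tensor([1.0])
        >>> guidance = torch.tensor([4.0])
        >>> out = model(hidden_states, encoder_hidden_states, timestep, img_ids, txt_ids, guidance).sample
        >>> out.shape
        torch.Size([1, 24, 16])
        ```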
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
    _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
    _cp_plan = {
        "": {
            "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
            "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
            "img_ids": ContextParallelInput(split_dim=0, expected_dims=2, split_output=False),
            "txt_ids": ContextParallelInput(split_dim=0, expected_dims=2, split_output=False),
        },
        "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
    }

    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 128,
        out_channels: int | None = None,
        num_layers: int = 8,
        num_single_layers: int = 48,
        attention_head_dim: int = 128,
        num_attention_heads: int = 48,
        joint_attention_dim: int = 15360,
        timestep_guidance_channels: int = 256,
        mlp_ratio: float = 4.0,
        axes_dims_rope: tuple[int, ...] = (32, 32, 32, 32),
        rope_theta: int = 10000,
        eps: float = 1e-6,
        guidance_embeds: bool = True,
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
        self.inner_dim = num_attention_heads * attention_head_dim

        self.pos_embed = Flux2PosEmbed(theta=rope_theta, axes_dim=axes_dims_rope)

        self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
            in_channels=timestep_guidance_channels,
            embedding_dim=self.inner_dim,
            bias=False,
            guidance_embeds=guidance_embeds,
        )

        # Modulation parameters are computed once per forward pass and shared across all blocks of each kind.
        self.double_stream_modulation_img = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
        self.double_stream_modulation_txt = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
        self.single_stream_modulation = Flux2Modulation(self.inner_dim, mod_param_sets=1, bias=False)

        self.x_embedder = nn.Linear(in_channels, self.inner_dim, bias=False)
        self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim, bias=False)

        self.transformer_blocks = nn.ModuleList(
            [
                Flux2TransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    mlp_ratio=mlp_ratio,
                    eps=eps,
                    bias=False,
                )
                for _ in range(num_layers)
            ]
        )

        self.single_transformer_blocks = nn.ModuleList(
            [
                Flux2SingleTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    mlp_ratio=mlp_ratio,
                    eps=eps,
                    bias=False,
                )
                for _ in range(num_single_layers)
            ]
        )

        self.norm_out = AdaLayerNormContinuous(
            self.inner_dim, self.inner_dim, elementwise_affine=False, eps=eps, bias=False
        )
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)

        self.gradient_checkpointing = False

    @apply_lora_scale
    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: dict[str, Any] | None = None,
        return_dict: bool = True,
    ) -> torch.Tensor | Transformer2DModelOutput:
        r"""
        The [`Flux2Transformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            img_ids (`torch.Tensor`):
                The positional ids of the image tokens, used to compute the rotary positional embeddings.
            txt_ids (`torch.Tensor`):
                The positional ids of the text tokens, used to compute the rotary positional embeddings.
            guidance (`torch.Tensor`, *optional*):
                The guidance scale values for the guidance-distilled variant of the model.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        num_txt_tokens = encoder_hidden_states.shape[1]

        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000

        temb = self.time_guidance_embed(timestep, guidance)

        # Shared modulation parameters for the double-stream and single-stream blocks.
        double_stream_mod_img = self.double_stream_modulation_img(temb)
        double_stream_mod_txt = self.double_stream_modulation_txt(temb)
        single_stream_mod = self.single_stream_modulation(temb)

        hidden_states = self.x_embedder(hidden_states)
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        if txt_ids.ndim == 3:
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            img_ids = img_ids[0]

        # Rotary embeddings are concatenated for the joint [text, image] sequence.
        image_rotary_emb = self.pos_embed(img_ids)
        text_rotary_emb = self.pos_embed(txt_ids)
        concat_rotary_emb = (
            torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
            torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
        )

        for index_block, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    encoder_hidden_states,
                    double_stream_mod_img,
                    double_stream_mod_txt,
                    concat_rotary_emb,
                    joint_attention_kwargs,
                )
            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb_mod_img=double_stream_mod_img,
                    temb_mod_txt=double_stream_mod_txt,
                    image_rotary_emb=concat_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

        # The single-stream blocks operate on the concatenated [text, image] sequence.
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        for index_block, block in enumerate(self.single_transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    None,
                    single_stream_mod,
                    concat_rotary_emb,
                    joint_attention_kwargs,
                )
            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=None,
                    temb_mod=single_stream_mod,
                    image_rotary_emb=concat_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

        # Discard the text tokens before the output projection.
        hidden_states = hidden_states[:, num_txt_tokens:, ...]

        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)