o
    Gi                  
   @   s  d dl Z d dlmZ d dlZd dlZd dlmZ d dlm  m	Z
 ddlmZmZ ddlmZmZmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z. e/e0Z1d&d'ddZ2d&d'ddZ3d&d'ddZ4G dd dZ5G dd dejj6Z7G dd dejj6eZ8eG dd dej6Z9eG d d! d!ej6Z:G d"d# d#ej6Z;G d$d% d%e*eeeee!e	Z<dS )(    N)Any   )ConfigMixinregister_to_config)FluxTransformer2DLoadersMixinFromOriginalModelMixinPeftAdapterMixin)apply_lora_scalelogging)maybe_allow_in_graph   )ContextParallelInputContextParallelOutput)AttentionMixinAttentionModuleMixinFeedForward)dispatch_attention_fn)
CacheMixin)*CombinedTimestepGuidanceTextProjEmbeddings"CombinedTimestepTextProjEmbeddingsapply_rotary_embget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousAdaLayerNormZeroAdaLayerNormZeroSingleattnFluxAttentionc           	      C   sj   |  |}| |}| |}d  } }}|d ur-| jd ur-| |}| |}| |}||||||fS N)to_qto_kto_vadded_kv_proj_dim
add_q_proj
add_k_proj
add_v_proj	r   hidden_statesencoder_hidden_statesquerykeyvalueencoder_queryencoder_keyencoder_value r0   b/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/models/transformers/transformer_flux.py_get_projections-   s   





r2   c           	      C   sb   |  |jddd\}}}d } }}|d ur)t| dr)| |jddd\}}}||||||fS )Nr   dimr   to_added_qkv)to_qkvchunkhasattrr6   r'   r0   r0   r1   _get_fused_projections;   s
   r:   c                 C   s   | j r	t| ||S t| ||S r   )fused_projectionsr:   r2   )r   r(   r)   r0   r0   r1   _get_qkv_projectionsE   s   r<   c                   @   sX   e Zd ZdZdZdd Z			ddddejdejdejdB d	ejdB d
ejfddZdS )FluxAttnProcessorNc                 C   s    t tdst| jj dd S )Nscaled_dot_product_attentionz; requires PyTorch 2.0. Please upgrade your pytorch version.)r9   FImportError	__class____name__)selfr0   r0   r1   __init__O   s   
zFluxAttnProcessor.__init__r   r   r(   r)   attention_maskimage_rotary_embreturnc                 C   s  t |||\}}}}	}
}|d|jdf}|d|jdf}|d|jdf}||}||}|jd urv|	d|jdf}	|
d|jdf}
|d|jdf}||	}	||
}
tj	|	|gdd}tj	|
|gdd}tj	||gdd}|d urt
||dd}t
||dd}t||||| j| jd}|dd}||j}|d ur|j|jd |jd |jd  gdd\}}|jd | }|jd |}|| }||fS |S )	Nr3      r4   sequence_dim)	attn_maskbackendparallel_configr   r   r   )r<   	unflattenheadsnorm_qnorm_kr#   norm_added_qnorm_added_ktorchcatr   r   _attention_backend_parallel_configflattentodtypesplit_with_sizesshapeto_out
contiguous
to_add_out)rC   r   r(   r)   rE   rF   r*   r+   r,   r-   r.   r/   r0   r0   r1   __call__S   sN   





zFluxAttnProcessor.__call__NNN)	rB   
__module____qualname__rV   rW   rD   rT   Tensorr`   r0   r0   r0   r1   r=   K   s(    r=   c                       s   e Zd ZdZdZdZ	ddedef fddZ					dd	d
dej	dej	dej	dB dej	dB de
ej	 dB dej	dB dej	fddZ  ZS )FluxIPAdapterAttnProcessorz(Flux Attention processor for IP-Adapter.N         ?hidden_sizecross_attention_dimc                    s   t    ttdst| jj d| _ | _t	|t
tfs#|g}t	|ts/|gt| }t|t|kr;td|| _t fddtt|D | _t fddtt|D | _d S )Nr>   z@ requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.zJ`scale` should be a list of integers with the same length as `num_tokens`.c              	          g | ]}t j d dqS T)biasdevicerZ   nnLinear.0_rj   rn   rZ   ri   r0   r1   
<listcomp>       z7FluxIPAdapterAttnProcessor.__init__.<locals>.<listcomp>c              	      rk   rl   ro   rr   ru   r0   r1   rv      rw   )superrD   r9   r?   r@   rA   rB   ri   rj   
isinstancetuplelistlen
ValueErrorscalerp   
ModuleListrangeto_k_ipto_v_ip)rC   ri   rj   
num_tokensr~   rn   rZ   rA   ru   r1   rD      s0   





z#FluxIPAdapterAttnProcessor.__init__r   r   r(   r)   rE   rF   ip_hidden_statesip_adapter_masksrG   c                 C   sh  |j d }t|||\}	}
}}}}|	d|jdf}	|
d|jdf}
|d|jdf}||	}	||
}
|	}|d ur||d|jdf}|d|jdf}|d|jdf}||}||}tj	||	gdd}	tj	||
gdd}
tj	||gdd}|d urt
|	|dd}	t
|
|dd}
t|	|
||dd| j| jd}|d	d
}||	j}|d ur2|j|j d |j d |j d  gdd\}}|jd |}|jd |}||}t|}t|| j| j| jD ]G\}}}}||}||}||d|j|j}||d|j|j}t|||d dd| j| jd}||d|j|j }||j}||| 7 }q|||fS |S )Nr   r3   rH   r4   rI           F)rK   	dropout_p	is_causalrL   rM   r   r   )r\   r<   rN   rO   rP   rQ   rR   rS   rT   rU   r   r   rV   rW   rX   rY   rZ   r[   r]   r_   
zeros_likezipr~   r   r   viewhead_dimreshape)rC   r   r(   r)   rE   rF   r   r   
batch_sizer*   r+   r,   r-   r.   r/   ip_queryip_attn_outputcurrent_ip_hidden_statesr~   r   r   ip_keyip_valuer0   r0   r1   r`      s   












z#FluxIPAdapterAttnProcessor.__call__)rf   rh   NN)NNNNN)rB   rb   rc   __doc__rV   rW   intrD   rT   rd   r{   r`   __classcell__r0   r0   r   r1   re      s@    '	re   c                       s   e Zd ZeZeegZ													dded	ed
edede	dedB de	dB de	dedede	dB de	de	f fddZ
			ddejdejdB dejdB dejdB dejf
ddZ  ZS ) r      @   r   FNTh㈵>	query_dimrO   dim_headdropoutrm   r#   added_proj_biasout_biasepsout_dimcontext_pre_onlypre_onlyelementwise_affinec                    s  t    || _|
d ur|
n|| | _|| _|| _|| _|
d ur"|
n|| _|| _|| _	|
d ur3|
| n|| _
|| _|| _tjj||	|d| _tjj||	|d| _tjj|| j|d| _tjj|| j|d| _tjj|| j|d| _| j	stjg | _| jtjj| j| j|d | jtj| |d urtjj||	d| _tjj||	d| _tjj|| j|d| _tjj|| j|d| _tjj|| j|d| _tjj| j||d| _|d u r|   }| !| d S )N)r   r   rm   )r   )"rx   rD   r   	inner_dimr   use_biasr   r   r   r   rO   r#   r   rT   rp   RMSNormrP   rQ   rq   r    r!   r"   r   r]   appendDropoutrR   rS   r$   r%   r&   r_   _default_processor_clsset_processor)rC   r   rO   r   r   rm   r#   r   r   r   r   r   r   r   	processorr   r0   r1   rD     s>   
zFluxAttention.__init__r(   r)   rE   rF   rG   c                    s   t t| jjj  ddh fdd| D }t|dkr1t	
d| d| jjj d  fd	d
| D }| j| ||||fi |S )Nr   r   c                    s$   g | ]\}}| vr|vr|qS r0   r0   )rs   krt   attn_parametersquiet_attn_parametersr0   r1   rv   Z  s   $ z)FluxAttention.forward.<locals>.<listcomp>r   zjoint_attention_kwargs z are not expected by z and will be ignored.c                    s   i | ]\}}| v r||qS r0   r0   )rs   r   w)r   r0   r1   
<dictcomp>_  s    z)FluxAttention.forward.<locals>.<dictcomp>)setinspect	signaturer   r`   
parameterskeysitemsr|   loggerwarningrA   rB   )rC   r(   r)   rE   rF   kwargsunused_kwargsr0   r   r1   forwardP  s   zFluxAttention.forward)r   r   r   FNTTr   NNFTNra   )rB   rb   rc   r=   r   re   _available_processorsr   floatboolrD   rT   rd   r   r   r0   r0   r   r1   r     sv    	
9c                       s   e Zd Zddedededef fddZ		dd	ejd
ejdejdeejejf dB de	e
ef dB deejejf fddZ  ZS )FluxSingleTransformerBlock      @r5   num_attention_headsattention_head_dim	mlp_ratioc              
      st   t    t|| | _t|| _t|| j| _tj	dd| _
t|| j || _t||||dt ddd| _d S )Ntanh)approximateTư>)r   r   rO   r   rm   r   r   r   )rx   rD   r   mlp_hidden_dimr   normrp   rq   proj_mlpGELUact_mlpproj_outr   r=   r   )rC   r5   r   r   r   r   r0   r1   rD   e  s    

z#FluxSingleTransformerBlock.__init__Nr(   r)   tembrF   joint_attention_kwargsrG   c                 C   s   |j d }tj||gdd}|}| j||d\}}	| | |}
|p$i }| jd||d|}tj||
gdd}|	d}	|	| | }|| }|j	tj
krU|dd}|d d d |f |d d |d f }}||fS )	NrH   r4   emb)r(   rF   r       r0   )r\   rT   rU   r   r   r   r   	unsqueezer   rZ   float16clip)rC   r(   r)   r   rF   r   text_seq_lenresidualnorm_hidden_statesgatemlp_hidden_statesattn_outputr0   r0   r1   r   y  s(   

*z"FluxSingleTransformerBlock.forward)r   NN)rB   rb   rc   r   r   rD   rT   rd   rz   dictstrr   r   r   r0   r0   r   r1   r   c  s"     r   c                       s   e Zd Z	ddededededef
 fdd	Z	
	
ddejdejdejde	ejejf d
B de
eef d
B de	ejejf fddZ  ZS )FluxTransformerBlockrms_normr   r5   r   r   qk_normr   c                    s   t    t|| _t|| _t|||||ddt |d	| _tj	|ddd| _
t||dd| _tj	|ddd| _t||dd| _d S )NFT)	r   r#   r   rO   r   r   rm   r   r   r   r   r   zgelu-approximate)r5   dim_outactivation_fn)rx   rD   r   norm1norm1_contextr   r=   r   rp   	LayerNormnorm2r   ffnorm2_context
ff_context)rC   r5   r   r   r   r   r   r0   r1   rD     s$   


zFluxTransformerBlock.__init__Nr(   r)   r   rF   r   rG   c                 C   s|  | j ||d\}}}}	}
| j||d\}}}}}|pi }| jd|||d|}t|dkr3|\}}nt|dkr>|\}}}|d| }|| }| |}|d|	d d d f   |d d d f  }| |}|
d| }|| }t|dkr||| }|d| }|| }| |}|d|d d d f   |d d d f  }| |}||d|  }|j	t
jkr|dd}||fS )	Nr   )r(   r)   rF   r   r   rH   r   r   r0   )r   r   r   r|   r   r   r   r   r   rZ   rT   r   r   )rC   r(   r)   r   rF   r   r   gate_msa	shift_mlp	scale_mlpgate_mlpnorm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattention_outputsr   context_attn_outputr   	ff_outputcontext_ff_outputr0   r0   r1   r     sD   


(

(
zFluxTransformerBlock.forward)r   r   r   )rB   rb   rc   r   r   r   rD   rT   rd   rz   r   r   r   r   r0   r0   r   r1   r     s8    r   c                       s@   e Zd Zdedee f fddZdejdejfddZ  Z	S )	FluxPosEmbedthetaaxes_dimc                    s   t    || _|| _d S r   )rx   rD   r   r   )rC   r   r   r   r0   r1   rD     s   

zFluxPosEmbed.__init__idsrG   c              	   C   s   |j d }g }g }| }|jjdk}|jjdk}|s|r tjntj}t|D ]"}	t| j	|	 |d d |	f | j
dd|d\}
}||
 || q'tj|dd|j}tj|dd|j}||fS )Nr3   mpsnpuT)r   repeat_interleave_realuse_realfreqs_dtyper4   )r\   r   rn   typerT   float32float64r   r   r   r   r   rU   rY   )rC   r   n_axescos_outsin_outposis_mpsis_npur  icossin	freqs_cos	freqs_sinr0   r0   r1   r     s*   


zFluxPosEmbed.forward)
rB   rb   rc   r   r{   rD   rT   rd   r   r   r0   r0   r   r1   r     s    r   c                       sN  e Zd ZdZdZddgZddgZddgZeddd	d
eddd	d
eddd	d
eddd	d
de	ddddZ
e												d3dedededB dedededed ed!ed"ed#eeeef f fd$d%Zed&												d4d'ejd(ejd)ejd*ejd+ejd,ejd-ejd&eeef dB d.ed/ed0ejeB fd1d2Z  ZS )5FluxTransformer2DModela(  
    The Transformer model introduced in Flux.

    Reference: https://blackforestlabs.ai/announcing-black-forest-labs/

    Args:
        patch_size (`int`, defaults to `1`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `64`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of channels in the output. If not specified, it defaults to `in_channels`.
        num_layers (`int`, defaults to `19`):
            The number of layers of dual stream DiT blocks to use.
        num_single_layers (`int`, defaults to `38`):
            The number of layers of single stream DiT blocks to use.
        attention_head_dim (`int`, defaults to `128`):
            The number of dimensions to use for each attention head.
        num_attention_heads (`int`, defaults to `24`):
            The number of attention heads to use.
        joint_attention_dim (`int`, defaults to `4096`):
            The number of dimensions to use for the joint attention (embedding/channel dimension of
            `encoder_hidden_states`).
        pooled_projection_dim (`int`, defaults to `768`):
            The number of dimensions to use for the pooled projection.
        guidance_embeds (`bool`, defaults to `False`):
            Whether to use guidance embeddings for guidance-distilled variant of the model.
        axes_dims_rope (`tuple[int]`, defaults to `(16, 56, 56)`):
            The dimensions to use for the rotary positional embeddings.
    Tr   r   	pos_embedr   rH   r   F)	split_dimexpected_dimssplit_outputr   r   )r(   r)   img_idstxt_ids)
gather_dimr  ) r   r   N   &                  8   r$  
patch_sizein_channelsout_channels
num_layersnum_single_layersr   r   joint_attention_dimpooled_projection_dimguidance_embedsaxes_dims_ropec                    s   t    |p|_  _td|d_|
rtnt}|j|	d_t	
|j_t	
|j_t	 fddt|D _t	 fddt|D _tjjddd	_t	j
j|| j d
d_d_d S )Ni'  )r   r   )embedding_dimr+  c                       g | ]
}t j d qS )r5   r   r   )r   r   rr   r   r   rC   r0   r1   rv   c      z3FluxTransformer2DModel.__init__.<locals>.<listcomp>c                    r/  r0  )r   r   rr   r1  r0   r1   rv   n  r2  Fr   r   Tr   )rx   rD   r'  r   r   r  r   r   time_text_embedrp   rq   context_embedder
x_embedderr   r   transformer_blockssingle_transformer_blocksr   norm_outr   gradient_checkpointing)rC   r%  r&  r'  r(  r)  r   r   r*  r+  r,  r-  text_time_guidance_clsr   r1  r1   rD   C  s0   




zFluxTransformer2DModel.__init__r   r(   r)   pooled_projectionstimestepr  r  guidancereturn_dictcontrolnet_blocks_repeatrG   c              	   C   s@  |  |}||jd }|dur||jd }|du r#| ||n| |||}| |}|jdkr=td |d }|jdkrKtd |d }tj	||fdd}| 
|}|durrd|v rr|d}| |}|d	|i t| jD ]L\}}t r| jr| ||||||\}}n||||||d
\}}|	durt| jt|	 }tt|}|r||	|t|	   }qw||	||   }qwt| jD ]@\}}t r| jr| ||||||\}}n||||||d
\}}|
dur	t| jt|
 }tt|}||
||   }q| ||}| |}|s|fS t|dS )as  
        The [`FluxTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            block_controlnet_hidden_states: (`list` of `torch.Tensor`):
                A list of tensors that if specified are added to the residuals of transformer blocks.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        i  Nr   zrPassing `txt_ids` 3d torch.Tensor is deprecated.Please remove the batch dimension and pass it as a 2d torch Tensorr   zrPassing `img_ids` 3d torch.Tensor is deprecated.Please remove the batch dimension and pass it as a 2d torch Tensorr4   ip_adapter_image_embedsr   )r(   r)   r   rF   r   )sample)r5  rY   rZ   r3  r4  ndimr   r   rT   rU   r  popencoder_hid_projupdate	enumerater6  is_grad_enabledr9  _gradient_checkpointing_funcr|   r   npceilr7  r8  r   r   )rC   r(   r)   r;  r<  r  r  r=  r   controlnet_block_samplescontrolnet_single_block_samplesr>  r?  r   r   rF   r@  r   index_blockblockinterval_controloutputr0   r0   r1   r   }  s   
+








	



	

zFluxTransformer2DModel.forward)rH   r   Nr  r  r  r  r   r!  Fr"  )NNNNNNNNNTF)rB   rb   rc   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patterns_repeated_blocksr   r   _cp_planr   r   r   rz   rD   r	   rT   rd   
LongTensorr   r   r   r   r   r   r0   r0   r   r1   r    s    	

	
9	r  r   )r   r   )=r   typingr   numpyrI  rT   torch.nnrp   torch.nn.functional
functionalr?   configuration_utilsr   r   loadersr   r   r   utilsr	   r
   utils.torch_utilsr   _modeling_parallelr   r   	attentionr   r   r   attention_dispatchr   cache_utilsr   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerrB   r   r2   r:   r<   r=   Modulere   r   r   r   r   r  r0   r0   r0   r1   <module>   sP   

C P5T
