o
    }oid                     @   s$  d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z!m"Z" d dl#m$Z% d dl#m&Z' d dlm(Z( d dl)m*Z+ d dl,m-Z-m.Z. d dl/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z> d dl?m@Z@mAZA d dlBmCZC d dlDmEZEmFZF d dlGmHZH d dlImJZJmKZK d dlLmMZM dd ZNeG dd  d eeEjOZPeG d!d" d"ZQeG d#d$ d$ZReG d%d& d&ZSG d'd( d(eZTG d)d* d*ejUeEjOeEjVeCjWZXeEYeXd+G d,d- d-eEjZd.eXf Z[eEj\d/d0d1d2eEj]fd3d4Z^eEj\d5d6d1d2eEj]fd7d8Z_eEj\d9d:d1d2eEj]fd;d<Z`eEj\d=d>d1d2eEj]fd?d@ZaeEj\dAdBd1d2eEj]fdCdDZbeEj\dEdFd1d2eEj]fdGdHZceEj\dIdJd1dKdL ZddS )M    N)nullcontext)	dataclassfield)Path)CallableOptional)parallel_state)ShardedStateDict)replace_prefix_for_sharding)VisionModule)OptimizerConfig)	ModelType)TransformerConfig)openai_gelusharded_state_dict_default)	load_file)	save_file)nn)
functional)FrozenCLIPEmbedderFrozenT5Embedder)AdaLNContinuousFluxSingleTransformerBlock
MMDiTLayer'get_flux_double_transformer_engine_spec'get_flux_single_transformer_engine_spec)EmbedNDMLPEmbedderTimeStepEmbedder)FlowMatchEulerDiscreteScheduler)_import_qkv_import_qkv_biasflux_transformer_converter)AutoEncoderAutoEncoderConfig)fn)ioteardown)MaskedTokenLossReduction)MegatronOptimizerModuleOptimizerModule)loggingc                 C   sH   t | }t|trt|dkr|d }n|}tdgjdd|d< |S )N   r         ?Tnon_blocking	loss_mask)next
isinstancetuplelentorchTensorcuda)dataloader_iterbatch_batch r;   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/models/flux/model.pyflux_data_step>   s   
r=   c                   @   sr  e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< eZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< dZeed"< dZeed#< dZeed$< d%Z eed&< d'Z!eed(< e"Z#eed)< d*Z$e%e& ed+< dZ'eed,< dZ(eed-< d*Z)d.d/ Z*d*S )0
FluxConfigz)
    transformer related Flux Config
       
num_layers   num_joint_layers&   num_single_layers   hidden_size   num_attention_headsactivation_funcTadd_qkv_bias@   in_channelsi   context_dim   model_channels
patch_sizeFguidance_embedi   
vec_in_dimrotary_interleavedapply_rope_fusionư>layernorm_epsilonr   hidden_dropoutattention_dropoutuse_cpu_initializationgradient_accumulation_fusionenable_cuda_graphuse_te_rng_tracker   cuda_graph_warmup_stepsg      @guidance_scaledata_step_fnN	ckpt_pathload_dist_ckptdo_convert_from_hfc                 C   s   t | d}|S )Nconfig)Flux)selfmodelr;   r;   r<   configure_modeln   s   
zFluxConfig.configure_model)+__name__
__module____qualname____doc__r@   int__annotations__rB   rD   rF   rH   r   rI   r   rJ   boolrL   rM   rO   rP   rQ   rR   rS   rT   rV   floatrW   rX   rY   rZ   r[   r\   r^   r_   r=   r`   ra   r   strrb   rc   save_converted_model_tori   r;   r;   r;   r<   r>   I   s@   
 r>   c                   @   sR   e Zd ZU dZedd dZee ed< edd dZ	ee
 ed< dZeed	< d
S )T5Configz
    T5 Config
    c                   C      dS )Nzgoogle/t5-v1_1-xxlr;   r;   r;   r;   r<   <lambda>y       zT5Config.<lambda>default_factoryversionc                   C   ru   )Ni   r;   r;   r;   r;   r<   rv   z   rw   
max_lengthFload_config_onlyN)rj   rk   rl   rm   r   rz   r   rr   ro   r{   rn   r|   rp   r;   r;   r;   r<   rt   s   s
   
 rt   c                   @   s`   e Zd ZU dZedd dZee ed< edd dZ	ee
 ed< edd dZee ed	< d
S )
ClipConfigz
    Clip Config
    c                   C   ru   )Nzopenai/clip-vit-large-patch14r;   r;   r;   r;   r<   rv      rw   zClipConfig.<lambda>rx   rz   c                   C   ru   )NM   r;   r;   r;   r;   r<   rv      rw   r{   c                   C   ru   )NTr;   r;   r;   r;   r<   rv      rw   always_return_pooledN)rj   rk   rl   rm   r   rz   r   rr   ro   r{   rn   r   rp   r;   r;   r;   r<   r}   ~   s
   
 r}   c                   @   sv   e Zd ZU dZeedZeed< edd dZe	ed< ee
dZe
ed< eedZeed< d	Zeed
< dZeed< dS )FluxModelParamsz
    Flux Model Params
    rx   flux_configc                   C   s   t g dg dS )N)r?   r]      r   )ch_multattn_resolutions)r$   r;   r;   r;   r<   rv      s    zFluxModelParams.<lambda>
vae_configclip_params	t5_params  scheduler_stepsr7   deviceN)rj   rk   rl   rm   r   r>   r   ro   r   r$   r}   r   rt   r   r   rn   r   rr   r;   r;   r;   r<   r      s   
 r   c                       s   e Zd ZdZdef fddZdd Z								ddejd	ejd
ejdej	dejdejdejdejdejfddZ
	dddZddededefddZ  ZS ) rf   a  
    NeMo implementation of Flux model, with flux transformer and single flux transformer blocks implemented with
    Megatron Core.

    Args:
        config (FluxConfig): Configuration object containing the necessary parameters for setting up the model,
                              such as the number of channels, hidden size, attention heads, and more.

    Attributes:
        out_channels (int): The number of output channels for the model.
        hidden_size (int): The size of the hidden layers.
        num_attention_heads (int): The number of attention heads for the transformer.
        patch_size (int): The size of the image patches.
        in_channels (int): The number of input channels for the image.
        guidance_embed (bool): A flag to indicate if guidance embedding should be used.
        pos_embed (EmbedND): Position embedding layer for the model.
        img_embed (nn.Linear): Linear layer to embed image input into the hidden space.
        txt_embed (nn.Linear): Linear layer to embed text input into the hidden space.
        timestep_embedding (TimeStepEmbedder): Embedding layer for timesteps, used in generative models.
        vector_embedding (MLPEmbedder): MLP embedding for vector inputs.
        guidance_embedding (nn.Module or nn.Identity): Optional MLP embedding for guidance, or identity if not used.
        double_blocks (nn.ModuleList): A list of transformer blocks for the double block layers.
        single_blocks (nn.ModuleList): A list of transformer blocks for the single block layers.
        norm_out (AdaLNContinuous): Normalization layer for the output.
        proj_out (nn.Linear): Final linear layer for output projection.

    Methods:
        forward: Performs a forward pass through the network, processing images, text, timesteps, and guidance.
        load_from_pretrained: Loads model weights from a pretrained checkpoint, with optional support for distribution
                              and conversion from Hugging Face format.

    re   c                    sf  t     j| _ j| _ j| _ j| _ j| _ j| _t| jdg dd| _	t
 j| j| _t
 j| j| _t j| j| _t j| jd| _ jr_ jrZt j| jdnt
 | _t
 fddt jD | _t
 fddt jD | _t | jd| _t
j| j| j| j | j d	d
| _| j j!d ur| j"| j j!| j j#| j j$| j j%d d S d S )Ni'  )   8   r   )dimthetaaxes_dim)in_dim
hidden_dimc                    s    g | ]}t  t j|d dqS )F)re   
submoduleslayer_numbercontext_pre_only)r   r   r   .0ird   r;   r<   
<listcomp>   s    z!Flux.__init__.<locals>.<listcomp>c                    s   g | ]}t  t j|d qS ))re   r   r   )r   r   r   r   rd   r;   r<   r      s    )re   conditioning_embedding_dimT)bias)rc   rb   rs   )&super__init__rL   out_channelsrF   rH   rP   rQ   r   	pos_embedr   Linear	img_embedrM   	txt_embedr   rO   timestep_embeddingr   rR   vector_embeddingIdentityguidance_embedding
ModuleListrangerB   double_blocksrD   single_blocksr   norm_outproj_outre   ra   load_from_pretrainedrc   rb   rs   )rg   re   	__class__rd   r<   r      sJ   

"
zFlux.__init__c              	   C   s   | j js	t }|S dd l}| j jdkr|jjjj}n| j jdkr'|jjjj}nt	d|jjj
| j j| j j|| j j| j jdd| j j fd}d }t rRtjdd}|jjd||d	}|S )
Nr   e4m3hybridz3E4M3 and HYBRID are the only supported FP8 formats.F)margininterval
fp8_formatamax_compute_algoamax_history_lenoverride_linear_precisionT)with_context_parallel)enabled
fp8_recipe	fp8_group)re   fp8r   transformer_enginecommonrecipeFormatE4M3HYBRID
ValueErrorDelayedScaling
fp8_marginfp8_intervalfp8_amax_compute_algofp8_amax_history_len	fp8_wgradr   model_parallel_is_initializedget_amax_reduction_grouppytorchfp8_autocast)rg   fp8_contextr   r   r   r   r;   r;   r<   get_fp8_context   s0   zFlux.get_fp8_contextNimgtxty	timestepsimg_idstxt_idsguidancecontrolnet_double_block_samplescontrolnet_single_block_samplesc
              	   C   s  |  |}
| |}||jd }| |}|dur(|| | j|d  }|| | }tj	||fdd}| 
|}t| jD ]>\}}|  . ||
|||d\}
}|durqt| jt| }tt|}|
|||   }
W d   n1 s{w   Y  qBtj	||
gdd}
t| jD ]P\}}|  @ ||
||d\}
}|	durt| jt|	 }tt|}t	|
d|jd  |
|jd d |	||   g}
W d   n1 sw   Y  q|
|jd ddf }
| |
|}
| |
}|S )	a$  
        Forward pass through the model, processing image, text, and additional inputs like guidance and timesteps.

        Args:
            img (torch.Tensor):
                The image input tensor.
            txt (torch.Tensor, optional):
                The text input tensor (default is None).
            y (torch.Tensor, optional):
                The vector input for embedding (default is None).
            timesteps (torch.LongTensor, optional):
                The timestep input, typically used in generative models (default is None).
            img_ids (torch.Tensor, optional):
                Image IDs for positional encoding (default is None).
            txt_ids (torch.Tensor, optional):
                Text IDs for positional encoding (default is None).
            guidance (torch.Tensor, optional):
                Guidance input for conditioning (default is None).
            controlnet_double_block_samples (torch.Tensor, optional):
                Optional controlnet samples for double blocks (default is None).
            controlnet_single_block_samples (torch.Tensor, optional):
                Optional controlnet samples for single blocks (default is None).

        Returns:
            torch.Tensor: The final output tensor from the model after processing all inputs.
        r   Nr?   )r   )hidden_statesencoder_hidden_statesrotary_pos_embembr   )r   r   r   .)r   r   todtyper   r   	time_projr   r5   catr   	enumerater   r   r4   rn   npceilr   shaper   r   )rg   r   r   r   r   r   r   r   r   r   r   r   vec_embidsr   id_blockblockinterval_control_outputr;   r;   r<   forward  sb   
&








zFlux.forwardFc                 C   s   |r$ddl m} t| jddd}|j||d}dd |d	  D }n,|rLt|| j}|d urKtj	|d
d tj
|d}	t||	 td|	  nt|}| j|dd\}
}dd |
D }
t|
dkrvtd|
  td|  td|  d S )Nr   )dist_checkpointingmodule.)prefix)
state_dict)sharded_state_dictcheckpoint_dirc                 S   s   i | ]
\}}| d |qS )r   )removeprefix)r   kvr;   r;   r<   
<dictcomp>y      z-Flux.load_from_pretrained.<locals>.<dictcomp>r   T)exist_okz!nemo_flux_transformer.safetensorsz+saving converted transformer checkpoint to F)strictc                 S   s   g | ]	}| d s|qS )_extra_state)endswith)r   r   r;   r;   r<   r     s    z-Flux.load_from_pretrained.<locals>.<listcomp>zThe following keys are missing during checkpoint loading, please check the ckpt provided or the image quality may be compromised.
 zFound unexepected keys: 
 z!Restored flux model weights from )megatron.corer   dictr   loaditemsr"   re   osmakedirspathjoinsave_safetensorsr+   infoload_safetensorsload_state_dictr4   )rg   ra   rc   rs   rb   r   r   loaded_state_dictckpt	save_pathmissing
unexpectedr;   r;   r<   r   n  s4   
zFlux.load_from_pretrained r;   sharded_offsetsmetadatareturnc                 C   s,  i }| d}| j D ]/}|| j}|j}| ||  d}	| | d}
g }||	||}t||	|
 || q
| d}| jD ]/}|| j}|j}| ||  d}	| | d}
g }||	||}t||	|
 || qB|  D ]\}}|| ju s|| j u s|t	|| | d|| qv|S )Nzdouble_blocks..zsingle_blocks.)
r   _get_layer_offsetre   r   r   r
   updater   named_childrenr   )rg   r   r  r  r   layer_prefixlayeroffsetglobal_layer_offsetstate_dict_prefixsharded_prefixsharded_pp_offsetlayer_sharded_state_dictnamemoduler;   r;   r<   r     s8   



zFlux.sharded_state_dict)NNNNNNNN)FNF)r  r;   N)rj   rk   rl   rm   r>   r   r   r5   r6   
LongTensorr   r   r3   r   r	   r   __classcell__r;   r;   r   r<   rf      sD    !7!	

]
 !rf   c                       s2  e Zd ZdZ	d9dedee f fddZdd Zd	d
 Z	dd Z
dd Zdd Zdd Zd9dejfddZd9dejfddZdejfddZdejfddZ			d:d ed!ed"ed#ed$ef
d%d&Zd'd( Zd)d* Zd!ed+ed,ed-ejd.ejf
d/d0Zd1d2 Zd3d4 Ze de!fd5d6Z"e de!fd7d8Z#  Z$S );MegatronFluxModelz~
    Megatron wrapper for flux.

    Args:
        flux_params (FluxModelParams): Parameters to configure the Flux model.
    Nflux_paramsoptimc                    s   || _ |j| _t   d | _d | _| j j| _| j j| _| j j	| _	|p+t
tdddd| _| j|  tj| _| j	d u p@| jd u | _| jd u | _d S )Ng-C6?F)lruse_distributed_optimizerrd   )paramsr   re   r   r   _training_loss_reduction_validation_loss_reductionr   r   r   r)   r   r"  connectr   encoder_or_decoder
model_typetext_precachedimage_precached)rg   r!  r"  r   r;   r<   r     s   



zMegatronFluxModel.__init__c                 C   s   t | ds| j | _| | j |   | | j| j	 | j
 D ] \}}| jjdkr7d|v s4d|v r7d|_d|v rBd|v rBd|_q"d S )Nr  r   contextaddedFr   zself_attention.linear_proj.bias)hasattrre   ri   r  configure_vaer   configure_schedulerconfigure_text_encodersr   r   named_parametersrD   requires_grad)rg   r  paramr;   r;   r<   ri     s   
z!MegatronFluxModel.configure_modelc                 C   s   t | jjd| _d S )N)num_train_timesteps)r   r%  r   	schedulerrg   r;   r;   r<   r1    s   z%MegatronFluxModel.configure_schedulerc                 C   s   t |tjr$|  | _dt| jjj | _	| j
 D ]}d|_qd S t |trGt|  | _dt|j | _	| j
 D ]}d|_q?d S td d | _d| _	d S )Nr]   Fz:Vae not provided, assuming the image input is precached...r   )r2   r   Moduleevalr7   vaer4   r%  r   vae_scale_factor
parametersr4  r$   r#   r+   r  )rg   r;  r5  r;   r;   r<   r0    s   


zMegatronFluxModel.configure_vaec                 C   s   t |tjr
|| _n t |tr"t| jj| jj| jj	t
j d| _ntd d | _t |tjr5|| _d S t |trNt| jj| jjt
j | jjd| _d S td d | _d S )N)rz   r{   r   r   zGCLIP encoder not provided, assuming the text embeddings is precached...)r{   r   r|   zET5 encoder not provided, assuming the text embeddings is precached...)r2   r   r9  clipr}   r   r   rz   r{   r   r5   r7   current_devicer+   r  t5rt   r   r   r|   )rg   r>  r@  r;   r;   r<   r2    s,   






z)MegatronFluxModel.configure_text_encodersc                 C   s   | j |S N)re   r`   )rg   r8   r;   r;   r<   	data_step  s   zMegatronFluxModel.data_stepc                 O   s   | j |i |S rA  )r  )rg   argskwargsr;   r;   r<   r     s   zMegatronFluxModel.forwardr  c                 C   
   |  |S rA  forward_steprg   r9   	batch_idxr;   r;   r<   training_step  s   
zMegatronFluxModel.training_stepc                 C   rE  rA  rF  rH  r;   r;   r<   validation_step   s   
z!MegatronFluxModel.validation_stepc              
   C   s  | j jjr
tj| _n| j jjrtj| _ntj| _| j	r$|d j
dd}n|d j
dd}| j|j| jd}| |\}}}}}}| jrb|d j
dddd}	|d	 j
dd}
|d
 j
dd}n|d }| j||j|jd\}	}
}tj
jj| jtjtjfv | jd | j||	|
|d |||d}W d    n1 sw   Y  | j|ddt|jd | j d t|jd | j d | jddd}|| }tj| | dd}|S )NlatentsTr.   imagesr   prompt_embedsr   r?   pooled_prompt_embedstext_idsr   r   r   r   )r   r   r   r   r   r   r   r]   r,   )r<  mean)	reduction)r"  re   bf16r5   bfloat16autocast_dtypefp16rq   float32r,  r7   r;  encoder   prepare_image_latentr+  	transposeencode_promptr   r   ampautocasthalfr   _unpack_latentsrn   r   r<  Fmse_loss)rg   r9   rL  r   noisepacked_noisy_model_inputlatent_image_idsguidance_vecr   rO  rP  rQ  r   
noise_predtargetlossr;   r;   r<   rG  &  sX   





zMegatronFluxModel.forward_stepr7   c                 C   sV   |  |dd}| |\}}t|jd |jd dj||d}||j|d|fS )Nr   r?   r,   rR  rN  )r@  r\  r>  r5   zerosr   r   )rg   promptr   r   rO  r   rP  rQ  r;   r;   r<   r]  Y  s   $zMegatronFluxModel.encode_prompt        r-   p=
ף?weighting_scheme
batch_size
logit_mean	logit_std
mode_scalec                 C   s   |dkrt j|||fdd}t jj|}|S |dkr;t j|fdd}d| |t tj| d d d |   }|S t j|fdd}|S )a
  
        Compute the density for sampling the timesteps when doing SD3 training.

        Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

        SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
        logit_normalcpu)rS  stdsizer   mode)rw  r   r?   r]   )	r5   normalr   r   sigmoidrandcosmathpi)rg   ro  rp  rq  rr  rs  ur;   r;   r<   %compute_density_for_timestep_samplinga  s   ,z7MegatronFluxModel.compute_density_for_timestep_samplingc                    s  |  |jd |jd |jd |j|j}tj||j|jd}|jd }| d|}|| jj 	 }| jj
| j|jd}| jjj|j|jd}| jj
j|jd  fdd|D }	|j|jd	}||	  }
t|
j|jk r{|
d
}
t|
j|jk snd|
 | |
|  }| j||jd |jd |jd |jd d}| jjrtj|jd f| jj|j|jd}nd }|dd|dd|dd|||fS )Nr   r]   r,   rR  rt  )r   c                    s   g | ]
} |k   qS r;   )nonzeroitem)r   tschduler_timestepsr;   r<   r     r   z:MegatronFluxModel.prepare_image_latent.<locals>.<listcomp>rN  r-   r?   )rp  num_channels_latentsheightwidth)_prepare_latent_image_idsr   r   r   r5   
randn_liker  r7  r6  longr   r   sigmasflattenr4   ndim	unsqueeze_pack_latentsre   rQ   fullr_   r\  )rg   rL  rf  rd  rp  r  indicesr   r  step_indicessigmanoisy_model_inputre  rg  r;   r  r<   r[  {  s\   





z&MegatronFluxModel.prepare_image_latentc                 C   sh   |j \}}}|| }|| }|||||d dd}|dddddd}|||d |d |d }|S )Nr   r]   r   r,   r?      )r   viewpermutereshape)rg   rL  r  r  r<  rp  num_patcheschannelsr;   r;   r<   ra    s   z!MegatronFluxModel._unpack_latentsr  r  r   r   c           
      C   s   t |d |d d}|d t |d d d d f  |d< |d t |d d d d f  |d< |j\}}}	|d d d f |ddd}|||| |	}|j||dS )Nr]   r,   ).r?   ).r]   r?   rR  )r5   rk  aranger   repeatr  r   )
rg   rp  r  r  r   r   rf  latent_image_id_heightlatent_image_id_widthlatent_image_id_channelsr;   r;   r<   r    s   &&
z+MegatronFluxModel._prepare_latent_image_idsc                 C   sR   | |||d d|d d}|dddddd}|||d |d  |d }|S )Nr]   r   r   r?   r,   r  )r  r  r  )rg   rL  rp  r  r  r  r;   r;   r<   r    s   zMegatronFluxModel._pack_latentsc                 C   s   d S rA  r;   )rg   tensorr;   r;   r<   set_input_tensor  s   z"MegatronFluxModel.set_input_tensorc                 C   s   | j st | _ | j S rA  )r&  r(   r8  r;   r;   r<   training_loss_reduction  s   z)MegatronFluxModel.training_loss_reductionc                 C   s   | j s	tdd| _ | j S )NT)rK  )r'  r(   r8  r;   r;   r<   validation_loss_reduction  s   z+MegatronFluxModel.validation_loss_reductionrA  )rm  r-   rn  )%rj   rk   rl   rm   r   r   r*   r   ri   r1  r0  r2  rB  r   r5   r6   rJ  rK  rG  rY  r]  rr   rn   rq   r  r[  ra  r   r   r  r  r  propertyr(   r  r  r  r;   r;   r   r<   r     sf    
3
8
r   hfc                   @   sJ   e Zd ZdZdefddZdedefddZede	fdd	Z
d
d ZdS )HFFluxImporterzB
    Convert a HF ckpt into NeMo dist-ckpt compatible format.
    r  c                 C   s
   t | jS rA  )r   re   r8  r;   r;   r<   init  s   
zHFFluxImporter.initoutput_pathc                 C   sr   ddl m} |jt| dd}|  }| |}| || td|  | || td|  t	|| |S )Nr   FluxTransformer2DModeltransformer	subfolderz.Converted flux transformer to Nemo, saving to z$Converted flux transformer saved to )
	diffusersr  from_pretrainedrr   r  
nemo_setupconvert_stateprint	nemo_saver'   )rg   r  r  sourceri  trainerr;   r;   r<   apply  s   

zHFFluxImporter.applyc                 C   s   ddl m} |jt| dd}|j}tdi ddd|jd|jd	|j|j	 d
|jdt
ddd|jd|jddd|jd|jd|jdddddddddd}t|d d d ddd}|S )Nr   r  r  r  r@   r?   rB   rD   rF   rH   rI   rJ   TrL   rM   rO   rN   rP   rQ   rR   rS   rV   rU   rW   rX   rY   r   r7   )r   r   r   r   r   r   r;   )r  r  r  rr   re   r>   r@   rD   rH   attention_head_dimr   rL   joint_attention_dimrP   guidance_embedspooled_projection_dimr   )rg   r  r  source_configr   r   r;   r;   r<   re     sb   	
zHFFluxImporter.configc                 C   s  i ddddddddd	d
dddddddddddddddddddddd d!d"i d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d;d<d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdP}t j|||tttttttgdQS )RNz(transformer_blocks.*.norm1.linear.weightz/double_blocks.*.adaln.adaLN_modulation.1.weightz&transformer_blocks.*.norm1.linear.biasz-double_blocks.*.adaln.adaLN_modulation.1.biasz0transformer_blocks.*.norm1_context.linear.weightz7double_blocks.*.adaln_context.adaLN_modulation.1.weightz.transformer_blocks.*.norm1_context.linear.biasz5double_blocks.*.adaln_context.adaLN_modulation.1.biasz'transformer_blocks.*.attn.norm_q.weightz1double_blocks.*.self_attention.q_layernorm.weightz'transformer_blocks.*.attn.norm_k.weightz1double_blocks.*.self_attention.k_layernorm.weightz-transformer_blocks.*.attn.norm_added_q.weightz7double_blocks.*.self_attention.added_q_layernorm.weightz-transformer_blocks.*.attn.norm_added_k.weightz7double_blocks.*.self_attention.added_k_layernorm.weightz)transformer_blocks.*.attn.to_out.0.weightz1double_blocks.*.self_attention.linear_proj.weightz'transformer_blocks.*.attn.to_out.0.biasz/double_blocks.*.self_attention.linear_proj.biasz+transformer_blocks.*.attn.to_add_out.weightz7double_blocks.*.self_attention.added_linear_proj.weightz)transformer_blocks.*.attn.to_add_out.biasz5double_blocks.*.self_attention.added_linear_proj.biasz)transformer_blocks.*.ff.net.0.proj.weightz%double_blocks.*.mlp.linear_fc1.weightz'transformer_blocks.*.ff.net.0.proj.biasz#double_blocks.*.mlp.linear_fc1.biasz$transformer_blocks.*.ff.net.2.weightz%double_blocks.*.mlp.linear_fc2.weightz"transformer_blocks.*.ff.net.2.biasz#double_blocks.*.mlp.linear_fc2.biasz1transformer_blocks.*.ff_context.net.0.proj.weightz-double_blocks.*.context_mlp.linear_fc1.weightz/transformer_blocks.*.ff_context.net.0.proj.biasz+double_blocks.*.context_mlp.linear_fc1.biasz,transformer_blocks.*.ff_context.net.2.weightz-double_blocks.*.context_mlp.linear_fc2.weightz*transformer_blocks.*.ff_context.net.2.biasz+double_blocks.*.context_mlp.linear_fc2.biasz.single_transformer_blocks.*.norm.linear.weightz/single_blocks.*.adaln.adaLN_modulation.1.weightz,single_transformer_blocks.*.norm.linear.biasz-single_blocks.*.adaln.adaLN_modulation.1.biasz+single_transformer_blocks.*.proj_mlp.weightz%single_blocks.*.mlp.linear_fc1.weightz)single_transformer_blocks.*.proj_mlp.biasz#single_blocks.*.mlp.linear_fc1.biasz.single_transformer_blocks.*.attn.norm_q.weightz1single_blocks.*.self_attention.q_layernorm.weightz.single_transformer_blocks.*.attn.norm_k.weightz1single_blocks.*.self_attention.k_layernorm.weightz)single_transformer_blocks.*.proj_out.biasz#single_blocks.*.mlp.linear_fc2.biasznorm_out.linear.biasz norm_out.adaLN_modulation.1.biasznorm_out.linear.weightz"norm_out.adaLN_modulation.1.weightzproj_out.biaszproj_out.weightz/time_text_embed.guidance_embedder.linear_1.biasz guidance_embedding.in_layer.biasz1time_text_embed.guidance_embedder.linear_1.weightz"guidance_embedding.in_layer.weightz/time_text_embed.guidance_embedder.linear_2.biasz!guidance_embedding.out_layer.biasz#guidance_embedding.out_layer.weightzimg_embed.biaszimg_embed.weightz.timestep_embedding.time_embedder.in_layer.biasz0timestep_embedding.time_embedder.in_layer.weightz/timestep_embedding.time_embedder.out_layer.biasz1timestep_embedding.time_embedder.out_layer.weightztxt_embed.biasztxt_embed.weightzvector_embedding.in_layer.biasz vector_embedding.in_layer.weightzvector_embedding.out_layer.biasz!vector_embedding.out_layer.weight)z1time_text_embed.guidance_embedder.linear_2.weightzx_embedder.biaszx_embedder.weightz/time_text_embed.timestep_embedder.linear_1.biasz1time_text_embed.timestep_embedder.linear_1.weightz/time_text_embed.timestep_embedder.linear_2.biasz1time_text_embed.timestep_embedder.linear_2.weightzcontext_embedder.biaszcontext_embedder.weightz+time_text_embed.text_embedder.linear_1.biasz-time_text_embed.text_embedder.linear_1.weightz+time_text_embed.text_embedder.linear_2.biasz-time_text_embed.text_embedder.linear_2.weight)mapping
transforms)	r&   apply_transformsimport_double_block_qkvimport_double_block_qkv_biasimport_added_qkvimport_added_qkv_biasimport_single_block_qkvimport_single_block_qkv_biastransform_single_proj_out)rg   r  ri  r  r;   r;   r<   r  0  s   	
 !"#1zHFFluxImporter.convert_stateN)rj   rk   rl   rm   r   r  r   r  r  r>   re   r  r;   r;   r;   r<   r    s    $r  r  )z%transformer_blocks.*.attn.to_q.weightz%transformer_blocks.*.attn.to_k.weightz%transformer_blocks.*.attn.to_v.weightz0double_blocks.*.self_attention.linear_qkv.weight)
source_key
target_keyctxc                 C      | j j}t||||S rA  ri  re   r    r  qr   r   transformer_configr;   r;   r<   r  s     	r  )z#transformer_blocks.*.attn.to_q.biasz#transformer_blocks.*.attn.to_k.biasz#transformer_blocks.*.attn.to_v.biasz.double_blocks.*.self_attention.linear_qkv.biasc                 C   r  rA  ri  re   r!   r  qbkbvbr  r;   r;   r<   r    r  r  )z+transformer_blocks.*.attn.add_q_proj.weightz+transformer_blocks.*.attn.add_k_proj.weightz+transformer_blocks.*.attn.add_v_proj.weightz6double_blocks.*.self_attention.added_linear_qkv.weightc                 C   r  rA  r  r  r;   r;   r<   r    r  r  )z)transformer_blocks.*.attn.add_q_proj.biasz)transformer_blocks.*.attn.add_k_proj.biasz)transformer_blocks.*.attn.add_v_proj.biasz4double_blocks.*.self_attention.added_linear_qkv.biasc                 C   r  rA  r  r  r;   r;   r<   r    r  r  )z,single_transformer_blocks.*.attn.to_q.weightz,single_transformer_blocks.*.attn.to_k.weightz,single_transformer_blocks.*.attn.to_v.weightz0single_blocks.*.self_attention.linear_qkv.weightc                 C   r  rA  r  r  r;   r;   r<   r    r  r  )z*single_transformer_blocks.*.attn.to_q.biasz*single_transformer_blocks.*.attn.to_k.biasz*single_transformer_blocks.*.attn.to_v.biasz.single_blocks.*.self_attention.linear_qkv.biasc                 C   r  rA  r  r  r;   r;   r<   r    r  r  z+single_transformer_blocks.*.proj_out.weight)z%single_blocks.*.mlp.linear_fc2.weightz1single_blocks.*.self_attention.linear_proj.weightc                 C   s@   |   d d dd f  }|   d d d df  }||fS )NrE   )detachclone)proj_weight
linear_fc2linear_projr;   r;   r<   r    s   r  )er}  r   
contextlibr   dataclassesr   r   pathlibr   typingr   r   lightning.pytorchr   Lnumpyr   r5   r   r   (megatron.core.dist_checkpointing.mappingr	   &megatron.core.dist_checkpointing.utilsr
   7megatron.core.models.common.vision_module.vision_moduler   megatron.core.optimizerr   megatron.core.transformer.enumsr   ,megatron.core.transformer.transformer_configr   megatron.core.transformer.utilsr   r   safetensors.torchr   r  r   r  r   torch.nnr   rb  /nemo.collections.diffusion.encoders.conditionerr   r   4nemo.collections.diffusion.models.dit.dit_layer_specr   r   r   r   r   -nemo.collections.diffusion.models.flux.layersr   r   r   Jnemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discreter   4nemo.collections.diffusion.utils.flux_ckpt_converterr    r!   r"   *nemo.collections.diffusion.vae.autoencoderr#   r$   nemo.collections.llmr%   nemo.lightningr&   r'    nemo.lightning.megatron_parallelr(   nemo.lightning.pytorch.optimr)   r*   
nemo.utilsr+   r=   IOMixinr>   rt   r}   r   rf   LightningModuleConnectorMixinFNMixinr   model_importerModelConnectorr  state_transformTransformCTXr  r  r  r  r  r  r  r;   r;   r;   r<   <module>   s   )

    
= 