import importlib
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, Optional, Tuple

import numpy as np
import torch
import torch.nn.functional as F
import wandb
from einops import rearrange
from megatron.core import parallel_state
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.transformer_config import TransformerConfig
from torch import nn
from typing_extensions import override

from nemo.collections.diffusion.models.dit_llama.dit_llama_model import DiTLlamaModel
from nemo.collections.diffusion.sampler.edm.edm_pipeline import EDMPipeline
from nemo.collections.llm.gpt.model.base import GPTModel
from nemo.lightning import io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction, MegatronLossReduction
from nemo.lightning.pytorch.optim import OptimizerModule

from .dit.dit_model import DiTCrossAttentionModel


def dit_forward_step(model, batch) -> torch.Tensor:
    """Forward pass of DiT."""
    return model(**batch)


def dit_data_step(module, dataloader_iter):
    """DiT data batch preparation."""
    batch = next(dataloader_iter)[0]
    batch = get_batch_on_this_cp_rank(batch)
    batch = {k: v.to(device="cuda", non_blocking=True) if torch.is_tensor(v) else v for k, v in batch.items()}

    # Cumulative sequence lengths (prefixed with a zero) for packed-sequence attention.
    cu_seqlens = batch['seq_len_q'].cumsum(dim=0).to(torch.int32)
    zero = torch.zeros(1, dtype=torch.int32, device="cuda")
    cu_seqlens = torch.cat((zero, cu_seqlens))

    cu_seqlens_kv = batch['seq_len_kv'].cumsum(dim=0).to(torch.int32)
    cu_seqlens_kv = torch.cat((zero, cu_seqlens_kv))

    batch['packed_seq_params'] = {
        'self_attention': PackedSeqParams(
            cu_seqlens_q=cu_seqlens,
            cu_seqlens_kv=cu_seqlens,
            qkv_format=module.qkv_format,
        ),
        'cross_attention': PackedSeqParams(
            cu_seqlens_q=cu_seqlens,
            cu_seqlens_kv=cu_seqlens_kv,
            qkv_format=module.qkv_format,
        ),
    }

    return batch


def get_batch_on_this_cp_rank(data: Dict):
    """Split the data for context parallelism."""
    from megatron.core import mpu

    cp_size = mpu.get_context_parallel_world_size()
    cp_rank = mpu.get_context_parallel_rank()

    if cp_size > 1:
        num_valid_tokens_in_ub = None
        if 'loss_mask' in data and data['loss_mask'] is not None:
            num_valid_tokens_in_ub = data['loss_mask'].sum()

        for key, value in data.items():
            if value is not None and key in ['video', 'video_latent', 'noise_latent', 'pos_ids']:
                if len(value.shape) > 5:
                    value = value.squeeze(0)
                if len(value.shape) == 5:
                    B, C, T, H, W = value.shape
                    data[key] = value.view(B, C, T, cp_size, H // cp_size, W)[:, :, :, cp_rank, ...].contiguous()
                else:
                    B, S, D = value.shape
                    data[key] = value.view(B, cp_size, S // cp_size, D)[:, cp_rank, ...].contiguous()

        loss_mask = data['loss_mask']
        data['loss_mask'] = loss_mask.view(loss_mask.shape[0], cp_size, loss_mask.shape[1] // cp_size)[
            :, cp_rank, ...
        ].contiguous()
        data['num_valid_tokens_in_ub'] = num_valid_tokens_in_ub

    return data


@dataclass
class DiTConfig(TransformerConfig, io.IOMixin):
    """
    Config for DiT-S model
    """

    crossattn_emb_size: int = 1024
    add_bias_linear: bool = False
    gated_linear_unit: bool = False

    num_layers: int = 12
    hidden_size: int = 384
    max_img_h: int = 80
    max_img_w: int = 80
    max_frames: int = 34
    patch_spatial: int = 2
    num_attention_heads: int = 6
    layernorm_epsilon = 1e-6
    normalization = "RMSNorm"

    fp16_lm_cross_entropy: bool = True
    parallel_output: bool = True
    share_embeddings_and_output_weights: bool = False

    hidden_dropout: float = 0
    attention_dropout: float = 0

    bf16: bool = True
    params_dtype: torch.dtype = torch.bfloat16

    vae_module: str = "nemo.collections.diffusion.vae.diffusers_vae.AutoencoderKLVAE"
    vae_path: str = None
    sigma_data: float = 0.5

    in_channels: int = 16

    data_step_fn = dit_data_step
    forward_step_fn = dit_forward_step
    replicated_t_embedder = True

    seq_length: int = 2048
    qkv_format: str = 'sbhd'
    attn_mask_type: AttnMaskType = AttnMaskType.no_mask

    vp_stage: Optional[int] = None

    def configure_model(self, tokenizer=None, vp_stage=None) -> DiTCrossAttentionModel:
        """Configure DiT Model from MCore."""
        vp_size = self.virtual_pipeline_model_parallel_size
        if vp_size:
            p_size = self.pipeline_model_parallel_size
            assert (
                self.num_layers // p_size
            ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages."

        if isinstance(self, DiTLlama30BConfig):
            model = DiTLlamaModel
        else:
            model = DiTCrossAttentionModel

        return model(
            self,
            fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
            parallel_output=self.parallel_output,
            pre_process=parallel_state.is_pipeline_first_stage(ignore_virtual=False, vp_stage=vp_stage),
            post_process=parallel_state.is_pipeline_last_stage(ignore_virtual=False, vp_stage=vp_stage),
            max_img_h=self.max_img_h,
            max_img_w=self.max_img_w,
            max_frames=self.max_frames,
            patch_spatial=self.patch_spatial,
            vp_stage=vp_stage,
        )

    def configure_vae(self):
        """Dynamically import video tokenizer."""
        return dynamic_import(self.vae_module)(self.vae_path)


@dataclass
class DiTBConfig(DiTConfig):
    """DiT-B"""

    num_layers: int = 12
    hidden_size: int = 768
    num_attention_heads: int = 12


@dataclass
class DiTLConfig(DiTConfig):
    """DiT-L"""

    num_layers: int = 24
    hidden_size: int = 1024
    num_attention_heads: int = 16


@dataclass
class DiTXLConfig(DiTConfig):
    """DiT-XL"""

    num_layers: int = 28
    hidden_size: int = 1152
    num_attention_heads: int = 16


@dataclass
class DiT7BConfig(DiTConfig):
    """DiT-7B"""

    num_layers: int = 32
    hidden_size: int = 4096
    num_attention_heads: int = 32


@dataclass
class DiTLlama30BConfig(DiTConfig):
    """MovieGen 30B"""

    num_layers: int = 48
    hidden_size: int = 6144
    ffn_hidden_size: int = 16384
    num_attention_heads: int = 48
    num_query_groups: int = 8
    gated_linear_unit: bool = True
    bias_activation_fusion: bool = True
    activation_func: Callable = F.silu
    normalization: str = "RMSNorm"
    layernorm_epsilon: float = 1e-5

    init_method_std: float = 0.01
    add_bias_linear: bool = False

    masked_softmax_fusion: bool = True
    persist_layer_norm: bool = True
    bias_dropout_fusion: bool = True


@dataclass
class DiTLlama5BConfig(DiTLlama30BConfig):
    """MovieGen 5B"""

    num_layers: int = 32
    hidden_size: int = 3072
    ffn_hidden_size: int = 8192
    num_attention_heads: int = 24


@dataclass
class DiTLlama1BConfig(DiTLlama30BConfig):
    """MovieGen 1B"""

    num_layers: int = 16
    hidden_size: int = 2048
    ffn_hidden_size: int = 8192
    num_attention_heads: int = 32


@dataclass
class ECDiTLlama1BConfig(DiTLlama1BConfig):
    """EC-DiT 1B"""

    moe_router_load_balancing_type: str = 'expert_choice'
    moe_token_dispatcher_type: str = 'alltoall'
    moe_grouped_gemm: bool = True
    moe_expert_capacity_factor: float = 8.0
    moe_pad_expert_input_to_capacity: bool = True
    moe_router_topk: int = 1
    num_moe_experts: int = 64
    ffn_hidden_size: int = 1024


class DiTModel(GPTModel):
    """
    Diffusion Transformer Model
    """

    def __init__(
        self,
        config: Optional[DiTConfig] = None,
        optim: Optional[OptimizerModule] = None,
        model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
        tokenizer: Optional[Any] = None,
    ):
        super().__init__(config or DiTConfig(), optim=optim, model_transform=model_transform)

        self.vae = None

        self._training_loss_reduction = None
        self._validation_loss_reduction = None

        self.diffusion_pipeline = EDMPipeline(net=self, vp_stage=self.config.vp_stage)

        self._noise_generator = None
        self.seed = 42

        self.vae = None

    def load_state_dict(self, state_dict, strict=False):
        self.module.load_state_dict(state_dict, strict=False)

    def data_step(self, dataloader_iter) -> Dict[str, Any]:
        return self.config.data_step_fn(dataloader_iter)

    def forward(self, *args, **kwargs):
        return self.module.forward(*args, **kwargs)

    def forward_step(self, batch) -> torch.Tensor:
        if parallel_state.is_pipeline_last_stage(ignore_virtual=False, vp_stage=self.config.vp_stage):
            output_batch, loss = self.diffusion_pipeline.training_step(batch, 0)
            loss = torch.mean(loss, dim=-1)
            return loss
        else:
            output_tensor = self.diffusion_pipeline.training_step(batch, 0)
            return output_tensor

    def training_step(self, batch, batch_idx=None) -> torch.Tensor:
        return self.forward_step(batch)

    def on_validation_start(self):
        if self.vae is None:
            if self.config.vae_path is None:
                warnings.warn('vae_path not specified skipping validation')
                return None
            self.vae = self.config.configure_vae()
        self.vae.to('cuda')

    def on_validation_end(self):
        """Move video tokenizer to CPU after validation."""
        if self.vae is not None:
            self.vae.to('cpu')

    def validation_step(self, batch, batch_idx=None) -> torch.Tensor:
        """Generated validation video sample and logs to wandb."""
        state_shape = batch['video'].shape
        sample = self.diffusion_pipeline.generate_samples_from_batch(
            batch,
            guidance=7,
            state_shape=state_shape,
            num_steps=35,
            is_negative_prompt=True if 'neg_t5_text_embeddings' in batch else False,
        )

        C, T, H, W = batch['latent_shape'][0]
        seq_len_q = batch['seq_len_q'][0]

        # Un-patchify the sampled token sequence back into a latent video tensor.
        sample = rearrange(
            sample[0, None, :seq_len_q],
            'B (T H W) (ph pw pt C) -> B C (T pt) (H ph) (W pw)',
            ph=self.config.patch_spatial,
            pw=self.config.patch_spatial,
            C=C,
            T=T,
            H=H // self.config.patch_spatial,
            W=W // self.config.patch_spatial,
        )

        video = (1.0 + self.vae.decode(sample / self.config.sigma_data)).clamp(0, 2) / 2  # [B, 3, T, H, W]
        video = (video * 255).to(torch.uint8).cpu().numpy().astype(np.uint8)
        result = rearrange(video, 'b c t h w -> (b t) c h w')

        # Gather generated clips across data-parallel ranks and log them from the wandb rank.
        wandb_rank = 0

        if parallel_state.get_data_parallel_src_rank() == torch.distributed.get_rank():
            if torch.distributed.get_rank() == wandb_rank:
                gather_list = [None for _ in range(parallel_state.get_data_parallel_world_size())]
            else:
                gather_list = None
            torch.distributed.gather_object(
                result, gather_list, wandb_rank, group=parallel_state.get_data_parallel_group()
            )

            if gather_list is not None:
                videos = []
                for video in gather_list:
                    try:
                        videos.append(wandb.Video(video, fps=24, format='mp4'))
                    except Exception as e:
                        warnings.warn(f'Error saving video as mp4: {e}')
                        videos.append(wandb.Video(video, fps=24))

                wandb.log({'prediction': videos})

        return None

    @property
    def training_loss_reduction(self) -> MaskedTokenLossReduction:
        if not self._training_loss_reduction:
            self._training_loss_reduction = MaskedTokenLossReduction()
        return self._training_loss_reduction

    @property
    def validation_loss_reduction(self) -> MaskedTokenLossReduction:
        if not self._validation_loss_reduction:
            self._validation_loss_reduction = DummyLossReduction()
        return self._validation_loss_reduction

    def on_validation_model_zero_grad(self) -> None:
        """
        Small hack to avoid first validation on resume.
        This will NOT work if the gradient accumulation step should be performed at this point.
        https://github.com/Lightning-AI/pytorch-lightning/discussions/18110
        """
        super().on_validation_model_zero_grad()
        if self.trainer.ckpt_path is not None and getattr(self, '_restarting_skip_val_flag', True):
            self.trainer.sanity_checking = True
            self._restarting_skip_val_flag = False


class DummyLossReduction(MegatronLossReduction):
    """
    Diffusion Loss Reduction
    """

    def __init__(self, validation_step: bool = False, val_drop_last: bool = True) -> None:
        super().__init__()
        self.validation_step = validation_step
        self.val_drop_last = val_drop_last

    def forward(
        self, batch: Dict[str, torch.Tensor], forward_out: torch.Tensor
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
        return torch.tensor(0.0, device=torch.cuda.current_device()), {
            'avg': torch.tensor(0.0, device=torch.cuda.current_device())
        }

    def reduce(self, losses_reduced_per_micro_batch) -> torch.Tensor:
        return torch.tensor(0.0, device=torch.cuda.current_device())


def dynamic_import(full_path):
    """
    Dynamically import a class or function from a given full path.

    :param full_path: The full path to the class or function (e.g., "package.module.ClassName")
    :return: The imported class or function
    :raises ImportError: If the module or attribute cannot be imported
    :raises AttributeError: If the attribute does not exist in the module
    """
    try:
        module_path, attribute_name = full_path.rsplit('.', 1)
    except ValueError as e:
        raise ImportError(
            f"Invalid full path '{full_path}'. It should contain both module and attribute names."
        ) from e

    try:
        module = importlib.import_module(module_path)
    except ImportError as e:
        raise ImportError(f"Cannot import module '{module_path}'.") from e

    try:
        attribute = getattr(module, attribute_name)
    except AttributeError as e:
        raise AttributeError(f"Module '{module_path}' does not have an attribute '{attribute_name}'.") from e

    return attribute
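

# Example usage of `dynamic_import` with a hypothetical stand-in target; in practice the dotted
# path would typically be something like the `DiTConfig.vae_module` default defined above.
if __name__ == "__main__":
    ordered_dict_cls = dynamic_import("collections.OrderedDict")
    assert ordered_dict_cls is importlib.import_module("collections").OrderedDict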