o
    }oi?                     @   s  d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z$ dd Z%G dd dej&Z'G dd dej&Z(G dd deZ)dS )    )DictLiteralOptionalN)	Timesteps)	rearrangerepeat)parallel_statetensor_parallel)ShardedStateDict)VisionModule)PackedSeqParams)	ModelType)TransformerBlock)TransformerConfig)"make_sharded_tensor_for_checkpoint)Tensor)dit_embeddings)ParallelTimestepEmbedding)0get_dit_adaln_block_with_transformer_engine_specc                 C   s   | d| d  | d S )N   )	unsqueeze)xshiftscale r   c/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/models/dit/dit_model.pymodulate)   s   r   c                       s8   e Zd Zd
dedef fddZdd Zdd	 Z  ZS )RMSNormư>channelepsc                    s&   t    || _tt|| _d S N)super__init__r    nn	Parametertorchonesweight)selfr   r    	__class__r   r   r#   .   s   
zRMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r&   rsqrtpowmeanr    )r)   r   r   r   r   _norm3   s   $zRMSNorm._normc                 C   s   |  | |}|| j S r!   )r2   floattype_asr(   )r)   r   outputr   r   r   forward6   s   
zRMSNorm.forward)r   )	__name__
__module____qualname__intr3   r#   r2   r6   __classcell__r   r   r*   r   r   -   s    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )
FinalLayerz!
    The final layer of DiT.
    c                    s`   t    tj|ddd| _tj||| | | dd| _tt tj|d| dd| _	d S )NFr   )elementwise_affiner    )biasr,   )
r"   r#   r$   	LayerNorm
norm_finalLinearlinear
SequentialSiLUadaLN_modulation)r)   hidden_sizespatial_patch_sizetemporal_patch_sizeout_channelsr*   r   r   r#   @   s   
&zFinalLayer.__init__c                 C   sj   |  |jddd\}}|jd |jd  }t|d|dt|d|d}}t| |||}| |}|S )Nr,   r   )dimr   zb d -> (b t) d)t)rE   chunkshaper   r   r@   rB   )r)   	x_BT_HW_Demb_B_D	shift_B_D	scale_B_DT
shift_BT_D
scale_BT_Dr   r   r   r6   H   s   
zFinalLayer.forward)r7   r8   r9   __doc__r#   r6   r;   r   r   r*   r   r<   ;   s    r<   c                       s   e Zd ZdZddddddddddddeejd	fd
ededededede	d de
de
de
de
de
de
de
dee
 f fddZ				d2dededededed efd!d"Zd#ed d	fd$d%Z		d3d(ed)ed*ee d ef fd+d,Zd-ed.ed/ed d	fd0d1Z  ZS )4DiTCrossAttentionModela"
  
    DiTCrossAttentionModel is a VisionModule that implements a DiT model with a cross-attention block.
    Attributes:
        config (TransformerConfig): Configuration for the transformer.
        pre_process (bool): Whether to apply pre-processing steps.
        post_process (bool): Whether to apply post-processing steps.
        fp16_lm_cross_entropy (bool): Whether to use fp16 for cross-entropy loss.
        parallel_output (bool): Whether to use parallel output.
        position_embedding_type (Literal["learned_absolute", "rope"]): Type of position embedding.
        max_img_h (int): Maximum image height.
        max_img_w (int): Maximum image width.
        max_frames (int): Maximum number of frames.
        patch_spatial (int): Spatial patch size.
        patch_temporal (int): Temporal patch size.
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        transformer_decoder_layer_spec (DiTLayerWithAdaLNspec): Specification for the transformer decoder layer.
        add_encoder (bool): Whether to add an encoder.
        add_decoder (bool): Whether to add a decoder.
        share_embeddings_and_output_weights (bool): Whether to share embeddings and output weights.
        concat_padding_mask (bool): Whether to concatenate padding mask.
        pos_emb_cls (str): Class of position embedding.
        model_type (ModelType): Type of the model.
        decoder (TransformerBlock): Transformer decoder block.
        t_embedder (torch.nn.Sequential): Time embedding layer.
        x_embedder (nn.Conv3d): Convolutional layer for input embedding.
        pos_embedder (dit_embeddings.SinCosPosEmb3D): Position embedding layer.
        final_layer_linear (torch.nn.Linear): Final linear layer.
        affline_norm (RMSNorm): Affine normalization layer.
    Methods:
        forward(x: Tensor, timesteps: Tensor, crossattn_emb: Tensor, packed_seq_params: PackedSeqParams = None, pos_ids: Tensor = None, **kwargs) -> Tensor:
            Forward pass of the model.
        set_input_tensor(input_tensor: Tensor) -> None:
            Sets input tensor to the model.
        sharded_state_dict(prefix: str = 'module.', sharded_offsets: tuple = (), metadata: Optional[Dict] = None) -> ShardedStateDict:
            Sharded state dict implementation for backward-compatibility.
        tie_embeddings_weights_state_dict(tensor, sharded_state_dict: ShardedStateDict, output_layer_weight_key: str, first_stage_word_emb_key: str) -> None:
            Ties the embedding and output weights in a given sharded state dict.
    TFropeP   "   r      Nconfigpre_processpost_processfp16_lm_cross_entropyparallel_outputposition_embedding_type)learned_absoluterW   	max_img_h	max_img_w
max_framespatch_spatialpatch_temporalin_channelsrI   vp_stagec                    s  t t| j|d || _||jd| _|| _|| _d| _d| _	|| _
|| _|| _d| _d| _d| _|
| _|| _|| _tj| _t| j| j| jdd|d| _tjt| jjdddtj| jj| jjd	d
| _ttddddtddd	d
| _ | jrtj!||
d  | jj| _"|tj#u r| jr|||	| ||
 ||
 d| _$n$|||	| ||
 ||
 d	d| _$t%& dkr| j$' D ]}t(|dd q| jrtj!| jj|
d | | | _)t*| jj| _+t%& dkrt(| j+j,dd d S d S )N)r[   )attn_mask_typeTFsincos)r[   specr\   r]   post_layer_normrh   r   )flip_sin_to_cosdownscale_freq_shifti  )seed   r   )num_channelsrm   rn   r,   )rK   hw)rK   rr   rs   ro   pipeline_parallel)-r"   rV   r#   r[   ri   transformer_decoder_layer_specr\   r]   add_encoderadd_decoderr^   r_   r`   #share_embeddings_and_output_weightsconcat_padding_maskpos_emb_clsre   rf   rh   r   encoder_or_decoder
model_typer   decoderr&   r$   rC   r   rF   r   r   
t_embedderfps_embedderrA   
x_embedderSinCosPosEmb3Dpos_embedderr   &get_pipeline_model_parallel_world_size
parameterssetattrfinal_layer_linearr   affline_normr(   )r)   r[   r\   r]   r^   r_   r`   rb   rc   rd   re   rf   rg   rI   ru   r   rh   kwargspr*   r   r   r#   z   s~   	
zDiTCrossAttentionModel.__init__r   	timestepscrossattn_embpacked_seq_paramspos_idsreturnc                 K   s  |j d }|dtjdg| tj|jdd}| jrD| |}	t	| j
tjr2d}
|	| 
|7 }	n
| 
|}
t|
d}
t|	d }nd}t| drRt	| j
tjrUd}
n
t| 
|d }
| | tj}|}| |}tj|d| jj|j d	  f}||7 }| |}t|d }| jjr| jrt|}t| drt	| j
tjrt|
}
t|}| jjr| jr| }| }| j |||d|
|d
}| j!s|S | jjrt"|}| #|}t|dS )a<  Forward pass.

        Args:
            x (Tensor): vae encoded data (b s c)
            encoder_decoder_attn_mask (Tensor): cross-attention mask between encoder and decoder
            inference_params (InferenceParams): relevant arguments for inferencing

        Returns:
            Tensor: loss tensor
        r   fps   )dtypedevicer-   NzB S D -> S B Dr   r   )hidden_statesattention_maskcontextcontext_maskrotary_pos_embr   zS B D -> B S D)$rM   getr&   tensorbfloat16r   viewr\   r   
isinstancer   r   r   r   
contiguoushasattrr~   flattentor   r$   
functionalpadr[   rF   r   sequence_parallelr	   #scatter_to_sequence_parallel_regionFactorizedLearnable3DEmbedding!clone_scatter_output_in_embeddingcloner}   r]   $gather_from_sequence_parallel_regionr   )r)   r   r   r   r   r   r   Br   x_B_S_Dpos_embx_S_B_Dtimesteps_B_Daffline_emb_B_Dfps_B_Dr   r   r   r6      sv   





 



	


zDiTCrossAttentionModel.forwardinput_tensorc                 C   s8   t |ts|g}t|dksJ d| j|d  dS )zSets input tensor to the model.

        See megatron.model.transformer.set_input_tensor()

        Args:
            input_tensor (Tensor): Sets the input tensor for the model.
        r   z1input_tensor should only be length 1 for gpt/bertr   N)r   listlenr}   set_input_tensor)r)   r   r   r   r   r   9  s   

z'DiTCrossAttentionModel.set_input_tensormodule.r   prefixsharded_offsetsmetadatac           	         sV   t  |||}dD ]}t| | D ]\}}| | d| }| ||| qq
|S )a  Sharded state dict implementation for GPTModel backward-compatibility (removing extra state).

        Args:
            prefix (str): Module name prefix.
            sharded_offsets (tuple): PP related offsets, expected to be empty at this module level.
            metadata (Optional[Dict]): metadata controlling sharded state dict creation.

        Returns:
            ShardedStateDict: sharded state dict for the GPTModel
        )r~   .)r"   sharded_state_dictgetattrnamed_parameters _set_embedder_weights_replica_id)	r)   r   r   r   r   module
param_nameparam
weight_keyr*   r   r   r   I  s   z)DiTCrossAttentionModel.sharded_state_dictr   r   embedder_weight_keyc           	      C   sr   t  }| jdur| jnd}| jdd}t  }||v r ||= ||||  t jddf}t|||dd||< dS )	aZ  set replica ids of the weights in t_embedder for sharded state dict.

        Args:
            sharded_state_dict (ShardedStateDict): state dict with the weight to tie
            weight_key (str): key of the weight in the state dict.
                This entry will be replaced with a tied version

        Returns: None, acts in-place
        Nr   $virtual_pipeline_model_parallel_sizer   T)with_context_parallelF)r   key
replica_idallow_shape_mismatch)r   get_tensor_model_parallel_rankrh   r[   r    get_pipeline_model_parallel_rankget_data_parallel_rankr   )	r)   r   r   r   tp_rankrh   vp_worldpp_rankr   r   r   r   r   ^  s    

z7DiTCrossAttentionModel._set_embedder_weights_replica_id)NN)r   r   N)r7   r8   r9   rU   DiTLayerWithAdaLNspecr   r   r   boolr   r:   r   r#   r   r   r6   r   strtupler   r
   r   r   r;   r   r   r*   r   rV   Q   s    +	
e
_rV   )*typingr   r   r   r&   torch.nnr$   diffusers.models.embeddingsr   einopsr   r   megatron.corer   r	   (megatron.core.dist_checkpointing.mappingr
   7megatron.core.models.common.vision_module.vision_moduler   megatron.core.packed_seq_paramsr   megatron.core.transformer.enumsr   +megatron.core.transformer.transformer_blockr   ,megatron.core.transformer.transformer_configr   megatron.core.utilsr   r   %nemo.collections.diffusion.models.ditr   4nemo.collections.diffusion.models.dit.dit_embeddingsr   4nemo.collections.diffusion.models.dit.dit_layer_specr   r   r   Moduler   r<   rV   r   r   r   r   <module>   s*   