o
    }oiPC                     @   s   d dl mZ d dlmZ d dlZd dlmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ zd d	lmZ W n eyP   d d
lmZ ed Y nw eG dd dZG dd deZG dd de	ZdS )    )	dataclass)UnionN)apply_rotary_pos_emb)	AttentionSelfAttentionSelfAttentionSubmodules)AttnMaskType)
ModuleSpecbuild_module)TransformerConfig)SplitAlongDim)loggingzFailed to import Transformer Engine dependencies. `from megatron.core.transformer.custom_layers.transformer_engine import *`If using NeMo Run, this is expected. Otherwise, please verify the Transformer Engine installation.c                   @   s   e Zd ZU dZdZeeef ed< dZ	eeef ed< dZ
eeef ed< dZeeef ed< dZeeef ed< dZeeef ed< dZeeef ed	< dZeeef ed
< dS )JointSelfAttentionSubmodulesz4
    Submodules for Joint Self-attention layer.
    N
linear_qkvadded_linear_qkvcore_attentionlinear_projq_layernormk_layernormadded_q_layernormadded_k_layernorm)__name__
__module____qualname____doc__r   r   r	   type__annotations__r   r   r   r   r   r   r    r   r   g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/diffusion/models/dit/dit_attention.pyr      s   
 r   c                	       sj   e Zd ZdZejdfdededede	f fddZ
d	d
 ZdddZdddZ					dddZ  ZS )JointSelfAttentionzQJoint Self-attention layer class

    Used for MMDIT-like transformer block.
    Fconfig
submoduleslayer_numbercontext_pre_onlyc                    s  d|_ t jd||||dd| t|j| jj| jd| j  | j| jj	d| jj
p,| jjdddd
| _|jd urUt|j| jj| jd| j  | j| jj	d| jjdddd
| _|sot|j| j| jj| j| jj| jj
dddd	d

| _|jd urt|j| j| j| jjd| _nd | _|jd urt|j| j| j| jjd| _nd | _|jd urt|j| j| j| jjd| _nd | _|jd urt|j| j| j| jjd| _d S d | _d S )NRMSNormself)r    r!   r"   attn_mask_typeattention_type   Fqkv)r    init_methodgather_outputbiasskip_bias_add	is_experttp_comm_buffer_nameTprojr    r*   r,   input_is_parallelr-   r.   r/   )hidden_sizer    epsr   )normalizationsuper__init__r
   r   r    r3   query_projection_sizekv_projection_sizer*   add_bias_linearadd_qkv_biasr   r   output_layer_init_methodadded_linear_projr   hidden_size_per_attention_headlayernorm_epsilonr   r   r   )r%   r    r!   r"   r&   r#   kwargs	__class__r   r   r7   1   s   
	








zJointSelfAttention.__init__c                 C   s   |  d d | j| j| j d | j f }|j| }| j| j | j | j| jg}td ur7t|d|\}}}ntj||dd\}}}|| d| dd| j}|||fS )Nr(      dimr      )	sizenum_query_groups_per_partition!num_attention_heads_per_partitionr>   viewr   torchsplitreshape)r%   	mixed_qkvnew_tensor_shapesplit_arg_listquerykeyvaluer   r   r   
_split_qkv   s8   


zJointSelfAttention._split_qkvNc                 C   `   |  |\}}| |\}}}| jjr|   | jdur!| |}| jdur+| |}|||fS zR
        Derives `query`, `key` and `value` tensors from `hidden_states`.
        N)r   rU   r    	test_moderun_realtime_testsr   r   )r%   hidden_stateskey_value_statesrO   _rR   rS   rT   r   r   r   get_query_key_value_tensors      




z.JointSelfAttention.get_query_key_value_tensorsc                 C   rV   rW   )r   rU   r    rX   rY   r   r   )r%   added_hidden_statesr[   rO   r\   rR   rS   rT   r   r   r   !get_added_query_key_value_tensors   r^   z4JointSelfAttention.get_added_query_key_value_tensorsc                 C   s  |d urt |ts|fd }| |\}}	}
| |\}}}tj||gdd}tj||	gdd}	tj||
gdd}
| |||	|
|^}}	}
}}}|d ur[|d}|	d}	|
d}
|d ur|\}}|d urn|j}|j	}nd  }}t
||| j|d}t
|	|| j|d}	| jr| jr| j||	|
|||d}n| j||	|
|||d}|d ur||ddd}|d |jd d d d d f }||jd d d d d d f }| |\}}| |\}}|| }|| }||fS )Nr(   r   rE   rG   r    
cu_seqlensr&   packed_seq_paramsrC   )
isinstancetupler]   r`   rL   cat_adjust_key_value_for_inferencesqueezecu_seqlens_qcu_seqlens_kvr   r    checkpoint_core_attentiontraining_checkpointed_attention_forwardr   rN   rH   shaper   r=   )r%   rZ   attention_maskr[   inference_paramsrotary_pos_embrd   additional_hidden_statesrR   rS   rT   added_query	added_keyadded_valuer&   r\   	q_pos_emb	k_pos_embrj   rk   core_attn_outencoder_attention_outputattention_outputoutputr,   encoder_outputencoder_biasr   r   r   forward   st   




		  zJointSelfAttention.forward)N)NNNNN)r   r   r   r   r   paddingr   r   intboolr7   rU   r]   r`   r   __classcell__r   r   rA   r   r   +   s,    
e
*
r   c                	       sL   e Zd ZdZejdfdededede	f fddZ
				dd	d
Z  ZS )FluxSingleAttentionzSelf-attention layer class

    Self-attention layer takes input with size [s, b, h]
    and returns output of the same size.
    Nr    r!   r"   cp_comm_typec                    sT   d|_ t jd|||||d| t|j| j| jj| j| jjdddddd
| _d S )Nr$   )r    r!   r"   r&   r   FTr0   r1   r   )	r5   r6   r7   r
   r   r8   r    r3   r<   )r%   r    r!   r"   r&   r   r@   rA   r   r   r7   _  s,   
zFluxSingleAttention.__init__c                 C   s,  |d urt |ts|fd }| ||\}}}	| ||||	|^}}}	}}
}|d ur9|d}|d}|	d}	|d urb|\}}|d urL|j}|j}nd  }}t||| j|d}t||| j|d}| j	rt| j
rt| j|||	||
|d}n| j|||	||
|d}|d ur||ddd}| |\}}|S )Nr(   rG   ra   rc   r   rC   )re   rf   r]   rh   ri   rj   rk   r   r    rl   rm   rn   r   rN   rH   r   )r%   rZ   rp   r[   rq   rr   rd   rR   rS   rT   r&   r\   rw   rx   rj   rk   ry   r|   r   r   r   r     sb   




		zFluxSingleAttention.forward)NNNN)r   r   r   r   r   r   r   r   r   strr7   r   r   r   r   rA   r   r   X  s$    $r   )dataclassesr   typingr   rL   ;megatron.core.models.common.embeddings.rotary_pos_embeddingr   #megatron.core.transformer.attentionr   r   r   megatron.core.transformer.enumsr   $megatron.core.transformer.spec_utilsr	   r
   ,megatron.core.transformer.transformer_configr   :megatron.core.transformer.custom_layers.transformer_enginer   ImportError
nemo.utilsr   warningr   r   r   r   r   r   r   <module>   s,   
  /