o
    }oiM                     @   s  d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZ d dlmZ z
d dlmZ dZ W n e!yo   dZdZ Y nw dd Z"dd Z#ddee$ defddZ%G dd deZ&dS )    )deepcopy)OptionalTupleUnionN)	rearrange)parallel_state)BaseInferenceContext)apply_rotary_pos_emb)get_gpt_decoder_block_spec)PackedSeqParams)SelfAttention)
ModuleSpec)L2Norm)deprecate_inference_paramsis_fa_min_version)Tensor)_flash_attn_forwardTFc                 C   s   | d g}|d g}t dt| D ]N}| |d  }| | }||d  }|| }	|| }
|
| }t d|d D ]}|||  }|| |||  }|| q6|d |kr_|| ||	 q||fS )a  
    Splits cumulative sequence lengths into chunks based on attention_chunk_size.

    Args:
        cu_seqlens (list[int]): List of cumulative sequence lengths.
        cu_seqlens_padded (list[int]): List of padded cumulative sequence lengths.
        attention_chunk_size (int): The maximum size of each chunk.

    Returns:
        Tuple[list[int], list[int]]: A tuple containing the new chunked cumulative
        sequence lengths and the new chunked padded cumulative sequence lengths.
    r      )rangelenappend)
cu_seqlenscu_seqlens_paddedattention_chunk_sizenew_cu_seqlensnew_cu_seqlens_paddedistartendstart_padded
end_paddedsegment_lengthnum_full_chunksj	new_indexnew_index_padded r'   _/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/llama4_utils.pychunkify_cu_seqlens(   s&   




r)   c                 C   s   | j d }|| d | | }||kr4|| }tj|g| j dd R | j| jd}tj| |gdd} || }| j||g| j dd R  } | dd} | j| j d dg| j dd R   } | S )a  
    Pads and reshapes a tensor for chunked processing.

    This function takes an input tensor `x` (typically representing query, key, or value
    in attention mechanisms) and pads its sequence dimension (dim 0) to be a multiple
    of `attention_chunk_size`. It then reshapes the tensor so that the sequence dimension
    is split into chunks, and the chunk dimension is combined with the batch dimension.

    Args:
        x (torch.Tensor): Input tensor, expected shape [seq_length, batch_size, ...].
        attention_chunk_size (int): The desired size of chunks along the sequence dimension.

    Returns:
        torch.Tensor: The reshaped tensor with shape
                      [attention_chunk_size, num_chunks * batch_size, ...].
    r   r   N)devicedtype)dimr      )	shapetorchzerosr*   r+   catreshape	transpose
contiguous)xr   
seq_lengthpad_seq_lenpad_size
pad_tensor
num_chunksr'   r'   r(   chunkifyM   s   
((r;   vp_stagereturnc                 C   s   ddl m} ddlm} t| d|d}g }|| |d}t|jD ]M\}}|| }	t|}
| jduo9|	d | j dk}t	|
j
j_|| j|jd	|
j
j_| jr\|s\t|
j
jj
_t|
j
jj
_nd|
j
jj
_d|
j
jj
_||
 q ||_|S )
zGet llama4 layer specr   )AttnMaskType)get_transformer_layer_offsetT)use_transformer_enginer<   )r<   Nr   )is_nope_layerr   attn_mask_type)megatron.core.transformer.enumsr>   +megatron.core.transformer.transformer_layerr?   r
   	enumeratelayer_specsr   nope_layer_intervalLlama4SelfAttention
submodulesself_attentionmoduler   causalparams
qk_l2_normr   q_layernormk_layernormr   )configr<   r>   r?   llama4_layer_specupdated_layer_specsoffsetidx
layer_speclayer_noupdated_layer_specrA   r'   r'   r(   get_llama4_layer_spec|   s,   

rY   c                       s   e Zd ZdZd fdd	Z								dddded	ed
ee dee deeee	eef f  dee dee dee dee
 dee dee de	eef fddZ  ZS )rH   z<Updated Transformer Layer to enable skip rope in some layersF    c                    s&   || _ || _tt| j|i | d S N)rA   r   superrH   __init__)selfrA   r   argskwargs	__class__r'   r(   r]      s   zLlama4SelfAttention.__init__N)inference_paramshidden_statesattention_maskkey_value_statesinference_contextrotary_pos_embrotary_pos_cosrotary_pos_sinattention_biaspacked_seq_paramssequence_len_offsetrc   r=   c          +      C   s  t ||}|r| rtrtdustdsJ d| jjr'| js'|dur'd}n
|du r/|du s1J |dur?t|t	s?|fd }| 
||\}}}| jjr|dur| r| js|dur| j|jv scJ |jdusjJ |j| j \}}| j|
|||||||d}|dd }||d|dd}| |\}}||fS | ||||||||
\}}}}}}d}d}|	dur|d}|d}|d}t|	j|	j}|| jkrt|	}| j |	_|	_t|	j|	j| j\|	_|	_t|	j|	j| j\|	_|	_n.|j d }|| jkr(|j }t!|| j}t!|| j}t!|| j}|dur&|d| j nd}t"# dkrT|| jkrT|t"# d  dksBJ |t"# d  }|| j dksTJ | j$s|dur| jjs|\}}|	dur|	jduru|	j}n|	j}|	jdur|	j}n|	j}nd }}|dur|du s|% rt&||| j|| j'j(d	}n|)||| j|| j'j(}|durt&||| j|| j'j(d	}| j*r| jr| j+|||||||	d
}n@|du s|% r| j,|||||||	d
}n)|||} }!}"|- \}#}$|. \}%}&}'}(| /| |!|"|$|(|#|%|&|'|
}t0|d}|	durL|	j1dkrL|2|ddd}|| jkrK|j|	_|j|	_|j|	_|j|	_|j|	_|j|	_n0|| jkr||d })|j d |) }*|2| j|*|)d}|dd}|2|*| j |)d}|d| }| |\}}||fS )a  
        Perform a forward pass through the attention module.

        Args:
            hidden_states (Tensor): Hidden states.
            attention_mask (Tensor): Attention mask.
            key_value_states (Optional[Tensor]): Key/value states (for cross attention).
            inference_context (Optional[BaseInferenceContext]): Inference context that manages
                KV cache.
            rotary_pos_emb (Optional[Union[Tensor, Tuple[Tensor, Tensor]]]): Rotary
                embedding tensor(s).
            rotary_pos_cos (Optional[Tensor]): Rotary embedding cosine.
            rotary_pos_sin (Optional[Tensor]): Rotary embedding sine.
            attention_bias (Optional[Tensor]): Attention bias.
            packed_seq_params (Optional[PackedSeqparams]): Parameters used for THD format.
            sequence_len_offset (Optional[int]): Sequence length offset used for
                inference CUDA graphs.

        Return:
            (Tuple[Tensor, Tensor]) Attention output and bias.

        Nz2.7.3zDflash attn verion v2.7.3 and above is required for dynamic batching.   )rm   query_layer	key_layervalue_layerinference_key_memoryinference_value_memory
rotary_cos
rotary_sinr   r   r   )rQ   r   cp_group)rB   rk   rl   zs b h d -> s b (h d)thd)3r   is_dynamic_batchingHAVE_FA3r   r   rQ   flash_decodetraining
isinstancetupleget_query_key_value_tensorsis_decode_onlylayer_numberkey_value_memory_dictrm   r3   r4   viewsizelinear_proj_adjust_key_value_for_inferencesqueezemaxmax_seqlen_qmax_seqlen_kvr   r   r)   cu_seqlens_qcu_seqlens_q_paddedcu_seqlens_kvcu_seqlens_kv_paddedr.   r;   r   get_context_parallel_world_sizerA   is_static_batchingr	   model_comm_pgscpapply_rotary_emb_querycheckpoint_core_attention_checkpointed_attention_forwardcore_attentioncu_query_lengthscu_kv_lengthsflash_decode_and_prefillr   
qkv_formatr2   )+r^   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rc   querykeyvaluerr   rs   outputoutcontext_layerbiasrB   block_tableoriginal_shapeoriginal_packed_seq_paramsoriginal_seq_lencp_chunk_len	q_pos_emb	k_pos_embr   r   core_attn_outqkvr   r   r   
kv_lengthskv_lengths_decode_onlymax_seqlen_k
batch_sizer:   r'   r'   r(   forward   sB  
&
	










zLlama4SelfAttention.forward)FrZ   )NNNNNNNN)__name__
__module____qualname____doc__r]   r   r   r   r   r   r   intr   __classcell__r'   r'   ra   r(   rH      sL    		

rH   r[   )'copyr   typingr   r   r   r/   einopsr   megatron.corer    megatron.core.inference.contextsr   1megatron.core.models.common.embeddings.rope_utilsr	   (megatron.core.models.gpt.gpt_layer_specsr
   megatron.core.packed_seq_paramsr   #megatron.core.transformer.attentionr   MCoreSelfAttention$megatron.core.transformer.spec_utilsr   $megatron.core.transformer.torch_normr   megatron.core.utilsr   r   r   %flashattn_hopper.flash_attn_interfacer   ry   ImportErrorr)   r;   r   rY   rH   r'   r'   r'   r(   <module>   s2   %/#