# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import re
from dataclasses import dataclass
from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, TypeVar, Union, overload

import numpy as np
import torch
from torch import nn

from nemo.lightning.pytorch.utils import extract_dtypes
from nemo.utils import logging

SourceModuleT = TypeVar("SourceModuleT", bound=nn.Module)
TargetModuleT = TypeVar("TargetModuleT", bound=nn.Module)
F = TypeVar("F", bound=Callable[..., Any])


@dataclass
class TransformCTX:
    """Transform Data class Definition."""

    source: nn.Module
    source_state: dict
    target: nn.Module
    target_state: dict


class _ModelState:
    """
    Helper class used to modify the state dict of a source model during model conversion.
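
    Example (illustrative; keys and dtypes are placeholders, not taken from a real source model):
        >>> sd = {"weight": torch.zeros(2, 2, dtype=torch.float32)}
        >>> ms = _ModelState(sd)
        >>> ms.to(torch.bfloat16)
        >>> ms.state_dict()["weight"].dtype
        torch.bfloat16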
    """

    def __init__(self, state_dict, config=None):
        self._state_dict = state_dict
        self.config = config

    def state_dict(self):
        """Return the wrapped state dict."""
        return self._state_dict

    def to(self, dtype):
        """Cast every tensor in the wrapped state dict to ``dtype`` in place, logging any conversions."""
        for k, v in self._state_dict.items():
            if v.dtype != dtype:
                logging.warning(f"Converting {k} from {v.dtype} (source model) to {dtype} (target model)")
            self._state_dict[k] = v.to(dtype)


@torch.no_grad
def apply_transforms(
    source: Union[nn.Module, _ModelState],
    target: TargetModuleT,
    mapping: Dict[str, str],
    transforms: Optional[List[Callable[[TransformCTX], TransformCTX]]] = None,
    state_dict_ignored_entries: Optional[List[str]] = None,
) -> TargetModuleT:
    """
    Applies a series of transformations to adapt the state dictionary of a source module to
    match the structure of a target module's state dictionary.

    This function renames keys according to a provided mapping and modifies values using a list
    of transformation functions. Each transformation function is typically created with
    the `io.state_transform` decorator.

    Args:
        source (Union[nn.Module, _ModelState]): The source module (or wrapped state dict)
            from which parameters and buffers are taken.
        target (TargetModuleT): The target module to which parameters and buffers are adapted.
        mapping (Dict[str, str]): Key-value pairs where each key from the source state dictionary
            is mapped to a corresponding key in the target state dictionary.
        transforms (Optional[List[Callable[[TransformCTX], TransformCTX]]]): A list of functions
            that modify the `TransformCTX` object. If None, no transformations beyond key renaming
            are applied. Defaults to None.
        state_dict_ignored_entries (Optional[List[str]]): Entries to ignore in `_target.state_dict()`.
            There are cases where multiple entries in a model's state dict point to a single entry
            in the model's named parameters, e.g., in the T5 Hugging Face implementation,
            `encoder.embed_tokens.weight`, `decoder.embed_tokens.weight`, and `shared.weight`
            all point to `shared.weight`. In these cases, ignore the redundant entries.
            Defaults to None.

    Returns
    -------
        TargetModuleT: The modified target module with its state dictionary adjusted according to
        the specified mappings and transformations.

    Raises
    ------
        ValueError: If there's a mismatch in shape between corresponding source and target parameters
            or buffers.
        RuntimeError: If the target state dictionary contains keys that are not present in the source
            state dictionary after all transformations.

    Examples
    --------
        >>> source_module = nn.Linear(10, 5)
        >>> target_module = nn.Linear(10, 5)
        >>> mapping = {'weight': 'weights', 'bias': 'biases'}
        >>> @io.state_transform(
        ...     source_key="weight",
        ...     target_key="weights"
        ... )
        ... def scale_weights(ctx):
        ...     ctx.target_state['weights'] = ctx.source_state['weight'] * 2
        ...     return ctx
        >>> transformed_target = apply_transforms(
        ...     source_module, target_module, mapping, [scale_weights]
        ... )
        >>> print(transformed_target.state_dict()['weights'])
    See Also
    --------
        - `TransformCTX`: For more details on the context object used in transformations.
        - `StateDictTransform`: For creating complex transformations.

    Note:
        This function is particularly useful when adapting models from different frameworks or
        when consolidating models with different architectural changes.
    """
    from megatron.core.transformer.module import MegatronModule

    transforms = transforms or []
    state_dict_ignored_entries = state_dict_ignored_entries or []

    # TODO: How can we improve this?
    _source = source
    if hasattr(source, "module") and isinstance(source.module, MegatronModule):
        _source = source.module
    _target = target
    if hasattr(target, "module") and isinstance(target.module, MegatronModule):
        _target = target.module

    # Track dtypes to make sure they weren't modified during conversion.
    target_orig_dtypes = extract_dtypes(_target.named_parameters())

    target_state = _target.state_dict()
    ctx = TransformCTX(
        source=_source,
        source_state=_source.state_dict(),
        target=_target,
        target_state=target_state,
    )

    for key, val in mapping.items():
        logging.debug(f"Mapping {key} -> {val}")
        ctx = StateDictTransform(key, val)(ctx)

    for transform in transforms:
        logging.debug(f"Transforming {transform.source_key} -> {transform.target_key}")
        ctx = transform(ctx)

    _params: Dict[str, nn.Parameter] = {}
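    # Re-register each converted tensor as an nn.Parameter on the target module so the
    # module's parameters point at the transformed data (original requires_grad preserved).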
    for name, param in _target.named_parameters():
        if name in target_state:
            target_param = target_state[name]
            if param.data.shape != target_param.shape:
                raise ValueError(
                    f"Shape mismatch for parameter {name}: target shape {param.shape} vs "
                    f"converted source shape {target_param.shape}"
                )

            _params[name] = nn.Parameter(target_param, requires_grad=param.requires_grad)
            target_state.pop(name)
        else:
            print(f"Unexpected key: {name} not in checkpoint but in model.")

    for key, val in _params.items():
        _module, _key = _target, key
        if "." in key:
            for part in key.split(".")[:-1]:
                _module = getattr(_module, part)
            _key = key.split(".")[-1]

        _module.register_parameter(_key, val)

    _buffers = {}
    for name, buffer in _target.named_buffers():
        if name in target_state:
            if buffer.shape != target_state[name].shape:
                raise ValueError(f"Shape mismatch for buffer {name}: {buffer.shape} vs {target_state[name].shape}")

            _buffers[name] = nn.Parameter(target_state[name], requires_grad=False)
            target_state.pop(name)

    for key, val in _buffers.items():
        _module, _key = _target, key
        if "." in key:
            for part in key.split(".")[:-1]:
                _module = getattr(_module, part)
            _key = key.split(".")[-1]

        _module.register_buffer(_key, val)

    keys = list(filter(lambda x: x is not None and not x.endswith("_extra_state"), target_state.keys()))
    keys = [key for key in keys if key not in state_dict_ignored_entries]
    if len(keys) != 0:
        raise RuntimeError(f"Additional keys: {keys} in checkpoint but not in model.")

    # TODO: Is this correct?
    # for key in target.state_dict():
    #     if key.endswith("_extra_state"):
    #         del target.state_dict()[key]

    """finally:
        cls._set_model_restore_state(is_being_restored=False)"""

    meta_tensor_keys = []
    for name, param in target.named_parameters():
        if param.is_meta:
            meta_tensor_keys.append(name)

    assert not meta_tensor_keys, (
        f"{meta_tensor_keys}\nThere are meta tensors in the model after conversion. "
        f"Did you forget to include these parameters in the mapping or transforms in `convert_state`?"
    )

    assert target_orig_dtypes == extract_dtypes(_target.named_parameters()), (
        f"dtype mismatch between target parameters before and after conversion "
        f"(entries with dtype torch.bfloat16 omitted below). "
        f"Before: { {k: v for k, v in target_orig_dtypes.items() if v != torch.bfloat16} }, "
        f"after: { {k: v for k, v in extract_dtypes(_target.named_parameters()).items() if v != torch.bfloat16} }"
    )
    if hasattr(target, "module") and isinstance(target.module, MegatronModule):
        target.module = _target

        return target

    return _target


def _default_transform(inp):
    return inp


class StateDictTransform(Generic[F]):
    """
    A transformation class for state dictionaries, allowing for flexible key matching and
    transformation of values between source and target state dictionaries.

    Attributes
    ----------
        source_key: A string, tuple of strings, or a dictionary specifying the keys in the source
            state dictionary to match. Wildcards are supported: `*` matches within a dotted name
            component, `**` matches across components.
        target_key: A string or tuple of strings specifying the keys in the target state dictionary
            to match. The same wildcards are supported.
        transform: A callable that performs the transformation on matched keys' values.

    Examples
    --------
        >>> def example_transform(ctx, *args):
        ...     return sum(args)
        >>> transform = StateDictTransform(
        ...     source_key="model.layers.*.self_attn.*_proj.weight",
        ...     target_key="decoder.layers.*.self_attention.linear_qkv.weight",
        ...     transform=example_transform
        ... )
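        >>> # Splitting one source key into several target keys (an illustrative sketch;
        >>> # the key names are placeholders, not from this module):
        >>> split = StateDictTransform(
        ...     source_key="decoder.layers.*.mlp.linear_fc1.weight",
        ...     target_key=(
        ...         "model.layers.*.mlp.gate_proj.weight",
        ...         "model.layers.*.mlp.up_proj.weight",
        ...     ),
        ...     transform=lambda ctx, x: torch.chunk(x, 2, dim=0),
        ... )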
    """

    def __init__(
        self,
        source_key: Union[str, Tuple[str, ...], Dict[str, str]],
        target_key: Union[str, Tuple[str, ...]],
        transform: F = _default_transform,
    ):
        self.source_key = source_key
        self.target_key = target_key
        self.transform = transform

    def __call__(self, ctx: TransformCTX) -> TransformCTX:
        source_key = self.source_key
        target_key = self.target_key
        source_dict, target_dict = ctx.source_state, ctx.target_state

        fn_params = dict(inspect.signature(self.transform).parameters)
        fn_params.pop("ctx", None)

        if isinstance(source_key, (dict, tuple)):
            if isinstance(source_key, tuple):
                source_key_dict = {param: source_key[i] for i, param in enumerate(fn_params)}
            else:
                source_key_dict = source_key
            source_matches_dict = {k: _match_keys(list(source_dict.keys()), v) for k, v in source_key_dict.items()}
            target_matches = _match_keys(list(target_dict.keys()), target_key)
            param_names = list(filter(lambda x: x in source_matches_dict, fn_params))
            source_matches = [
                source_matches_dict[v] if source_matches_dict[v].ndim > 0 else [source_matches_dict[v].item()]
                for v in param_names
            ]
            target_matches = [target_matches if target_matches.ndim > 0 else [target_matches.item()]]
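            # Iterate the matched names in lockstep: each group pairs one matched name per
            # source pattern with the corresponding target name (the target name comes last).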
            for layer_names_group in zip(*(source_matches + target_matches)):
                # Wrap in a list if it's a single layer (i.e., non-expert)
                if isinstance(layer_names_group[0], str):
                    layer_names_group = [[x] for x in layer_names_group]
                for layer_names in zip(*layer_names_group):
                    target_dict[layer_names[-1]] = self.call_transform(
                        ctx, **dict(zip(param_names, [source_dict[x] for x in layer_names[:-1]]))
                    )
                logging.debug(f"Matched (transform)! {layer_names_group=}")
        else:
            source_keys = list(source_dict.keys())
            target_keys = list(target_dict.keys())

            source_matches = _match_keys(source_keys, source_key)
            if source_matches.size == 1 and source_matches == np.array(None):
                raise ValueError(f"No matches found for source key: {source_key}")

            if isinstance(target_key, str):
                target_matches = _match_keys(target_keys, target_key)
                if target_matches.size == 1 and target_matches == np.array(None):
                    raise ValueError(f"No matches found for target key: {target_key}")
            else:
                if isinstance(target_key, dict):
                    raise ValueError("Target key must be a string or a tuple of strings.")
                _matches = [_match_keys(target_keys, key) for key in target_key]
                target_matches = np.stack(_matches, axis=-1)

            # Determine if we are dealing with multiple source matches or multiple target matches
            multiple_sources = source_matches.ndim >= target_matches.ndim
            accepts_var_args = any(
                param.kind == param.VAR_POSITIONAL for param in inspect.signature(self.transform).parameters.values()
            )

            if multiple_sources:
                for target_index, target_match in np.ndenumerate(target_matches):
                    try:
                        source_match = source_matches[target_index]
                    except IndexError as e:
                        logging.error(f"Enountered IndexError during transform.\n{source_matches=}\n{target_matches=}")
                        raise e
                    if accepts_var_args:
                        source_values = [source_dict[k] for k in source_match]
                        target_dict[target_match] = self.call_transform(ctx, *source_values)
                    else:
                        _source_match_list = [source_match] if isinstance(source_match, str) else list(source_match)
                        if len(fn_params) != len(_source_match_list):
                            raise ValueError(
                                f"Mismatch between the transform's parameters and the matched source keys: "
                                f"{source_match} vs {target_match}"
                            )

                        kwargs = {param: source_dict[k] for param, k in zip(fn_params, _source_match_list)}
                        target_dict[target_match] = self.call_transform(ctx, **kwargs)
                    logging.debug(f"Matched (multi source)! {target_match=} {source_match=}")
            else:
                for source_index, source_match in np.ndenumerate(source_matches):
                    target_match = target_matches[source_index]
                    source_values = (
                        [source_dict[source_match]]
                        if np.isscalar(source_match)
                        else [source_dict[k] for k in source_match]
                    )
                    if accepts_var_args:
                        outputs = self.call_transform(ctx, *source_values)
                    else:
                        kwargs = {param: val for param, val in zip(fn_params, source_values)}
                        outputs = self.call_transform(ctx, **kwargs)

                    if isinstance(target_match, str):
                        target_dict[target_match] = outputs
                    else:
                        for i, t in enumerate(outputs):
                            target_dict[target_match[i]] = t
                    logging.debug(f"Matched (single source)! {target_match=} {source_match=}")

        return ctx

    def call_transform(self, ctx: TransformCTX, *args, **kwargs):
        """Perform transform and check if the given args valid."""
        func_params = inspect.signature(self.transform).parameters
        expected_num_args = len([p for p in func_params if p not in ['self', 'ctx']])
        provided_num_args = len(args) + len(kwargs)
        accepts_var_args = any(param.kind == param.VAR_POSITIONAL for param in func_params.values())

        if not accepts_var_args and provided_num_args != expected_num_args:
            raise ValueError(
                f"Expected {expected_num_args} arguments for the transformation function, but got {provided_num_args}."
            )

        if 'ctx' in func_params:
            return self.transform(ctx, *args, **kwargs)

        return self.transform(*args, **kwargs)


def _match_keys(keys: List[str], pattern: str) -> np.ndarray:
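    """
    Match `pattern` against `keys` and return an object ndarray of the matching keys.

    `*` matches a single dotted name component (no dots); `**` matches across dots. Each
    wildcard contributes one axis to the result, indexed by the sorted unique values
    captured for that wildcard.

    Example (illustrative):
        >>> _match_keys(["layers.0.w", "layers.1.w"], "layers.*.w")
        array(['layers.0.w', 'layers.1.w'], dtype=object)
    """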
    escaped_pattern = ''
    i = 0
    wildcard_positions = []
    while i < len(pattern):
        if pattern[i : i + 2] == '**':
            escaped_pattern += r'(.+)'  # Match any characters including dots
            wildcard_positions.append('**')
            i += 2
        elif pattern[i] == '*':
            escaped_pattern += r'([^.]+)'  # Match any characters except dots
            wildcard_positions.append('*')
            i += 1
        else:
            if pattern[i] == '.':
                escaped_pattern += r'\.'  # Escape the dot
            else:
                escaped_pattern += pattern[i]
            i += 1

    regex_pattern = re.compile("^" + escaped_pattern + "$")
    num_wildcards = len(wildcard_positions)
    wildcard_matches = [[] for _ in range(num_wildcards)]

    for key in filter(lambda x: x is not None, keys):
        match = regex_pattern.match(key)
        if match:
            for i, group in enumerate(match.groups()):
                if group not in wildcard_matches[i]:
                    wildcard_matches[i].append(group)

    # Sort the wildcard matches to maintain consistent ordering
    for i in range(len(wildcard_matches)):
        wildcard_matches[i].sort(key=lambda x: int(x) if x.isdigit() else x)

    # Determine the shape of the output array based on the unique matches for each wildcard
    shape = [len(matches) for matches in wildcard_matches]

    if len(wildcard_matches) == 0:
        # If there are no wildcards, assume a single match
        shape = [1]
    # Initialize an empty array with the determined shape
    output_array = np.empty(shape, dtype=object)

    # Populate the array with the keys, now that we have the correct shape and ordering
    for key in filter(lambda x: x is not None, keys):
        match = regex_pattern.match(key)
        if match:
            # Convert match groups to indices based on their position in wildcard_matches
            indices = [wildcard_matches[i].index(group) for i, group in enumerate(match.groups())]
            output_array[tuple(indices)] = key  # Place the key in the array based on the indices

    return output_array


@overload
def state_transform(
    source_key: Union[str, Tuple[str, ...], Dict[str, str]],
    target_key: Union[str, Tuple[str, ...]],
) -> Callable[[F], StateDictTransform[F]]: ...


@overload
def state_transform(
    source_key: Union[str, Tuple[str, ...], Dict[str, str]], target_key: Union[str, Tuple[str, ...]], fn: F
) -> StateDictTransform[F]: ...


def state_transform(
    source_key: Union[str, Tuple[str, ...], Dict[str, str]],
    target_key: Union[str, Tuple[str, ...]],
    fn: Optional[F] = None,
):
    """
    A decorator for creating StateDictTransform instances with specified source and target keys,
    and a transformation function. This allows for concise definition of state dictionary
    transformations.

    Args:
        source_key: A string, tuple of strings, or a dictionary specifying the keys in the source
            state dictionary to match. Wildcards are supported: `*` matches within a dotted name
            component, `**` matches across components.
        target_key: A string or tuple of strings specifying the keys in the target state dictionary
            to match. The same wildcards are supported.
        fn: An optional callable that performs the transformation on matched keys' values. If not
            provided, the decorator can be used to wrap a function definition.

    Returns
    -------
        A StateDictTransform instance if `fn` is provided, otherwise returns a decorator that
        takes a function and returns a StateDictTransform instance.

    Examples
    --------
        >>> @state_transform(
        ...     source_key="model.layers.*.self_attn.*_proj.weight",
        ...     target_key="decoder.layers.*.self_attention.linear_qkv.weight"
        ... )
        ... def sum_transform(ctx, *args):
        ...     return sum(args)
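        >>> # Equivalent non-decorator form, passing the function directly (illustrative):
        >>> sum_transform = state_transform(
        ...     "model.layers.*.self_attn.*_proj.weight",
        ...     "decoder.layers.*.self_attention.linear_qkv.weight",
        ...     lambda ctx, *args: sum(args),
        ... )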
    """

    def wrapper(fn) -> StateDictTransform:
        return StateDictTransform(source_key, target_key, fn)

    if fn is None:
        return wrapper

    return wrapper(fn)


class TransformFns:
    """
    A collection of common functions used in state dict transformation.
    """

    @staticmethod
    def split_qkv(ctx: TransformCTX, linear_qkv: torch.Tensor):
        """
        Split interleave-concatenated qkv to q, k, v

        Example: export layer linear_qkv to HF {q|k|v}_proj
        """
        megatron_config = ctx.source.config

        head_num = megatron_config.num_attention_heads
        num_query_groups = megatron_config.num_query_groups
        heads_per_group = head_num // num_query_groups
        # hidden_size = megatron_config.hidden_size
        head_size = megatron_config.kv_channels
        qkv_total_dim = head_num + 2 * num_query_groups

        linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, -1])
        # when converting base model (linear_qkv), hidden size = megatron_config.hidden_size
        # when converting lora (linear_qkv.adapter.linear_out), hidden size = lora_r
        hidden_size = linear_qkv.size(-1)
        q_slice = torch.cat(
            [
                torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
                for i in range(num_query_groups)
            ]
        )
        k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
        v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

        q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
        k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
        v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()

        return q_proj, k_proj, v_proj

    @staticmethod
    def split_qkv_bias(ctx: TransformCTX, qkv_bias: torch.Tensor):
        """
        Split interleave-concatenated qkv bias to separate q, k, v bias

        Example: export layer linear_qkv bias to HF {q|k|v}_proj bias
        """
        megatron_config = ctx.source.config

        head_num = megatron_config.num_attention_heads
        num_query_groups = megatron_config.num_query_groups
        heads_per_group = head_num // num_query_groups
        head_size = megatron_config.kv_channels
        qkv_total_dim = head_num + 2 * num_query_groups

        qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size])
        q_slice = torch.cat(
            [
                torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
                for i in range(num_query_groups)
            ]
        )
        k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
        v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

        q_bias = qkv_bias[q_slice].reshape(-1).cpu()
        k_bias = qkv_bias[k_slice].reshape(-1).cpu()
        v_bias = qkv_bias[v_slice].reshape(-1).cpu()

        return q_bias, k_bias, v_bias

    @staticmethod
    def merge_qkv(ctx: TransformCTX, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """
        Merge q, k, v to interleave-concatenated qkv.

        Example: import HF {q|k|v}_proj to layer linear_qkv
        """
        megatron_config = ctx.target.config

        head_num = megatron_config.num_attention_heads
        num_query_groups = megatron_config.num_query_groups
        heads_per_group = head_num // num_query_groups
        hidden_size = megatron_config.hidden_size
        head_size = megatron_config.kv_channels
        old_tensor_shape = q.size()
        new_q_tensor_shape = (head_num, head_size) + old_tensor_shape[1:]
        new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]

        q = q.view(*new_q_tensor_shape)
        k = k.view(*new_kv_tensor_shape)
        v = v.view(*new_kv_tensor_shape)

        qkv_weights_l = []
        for i in range(num_query_groups):
            qkv_weights_l.append(q[i * heads_per_group : (i + 1) * heads_per_group, :, :])
            qkv_weights_l.append(k[i : i + 1, :, :])
            qkv_weights_l.append(v[i : i + 1, :, :])
        qkv_weights = torch.cat(qkv_weights_l)
        assert qkv_weights.ndim == 3, qkv_weights.shape
        assert qkv_weights.shape[0] == (heads_per_group + 2) * num_query_groups, qkv_weights.shape
        assert qkv_weights.shape[1] == head_size, qkv_weights.shape
        assert qkv_weights.shape[2] == old_tensor_shape[1], qkv_weights.shape

        qkv_weights = qkv_weights.reshape([head_size * (head_num + 2 * num_query_groups), hidden_size])

        return qkv_weights

    @staticmethod
    def merge_qkv_bias(ctx: TransformCTX, qb: torch.Tensor, kb: torch.Tensor, vb: torch.Tensor):
        """
        Merge q, k, v bias to interleave-concatenated qkv bias.

        Example: import HF {q|k|v}_proj bias to layer linear_qkv bias
        """
        megatron_config = ctx.target.config

        head_num = megatron_config.num_attention_heads
        num_query_groups = megatron_config.num_query_groups
        heads_per_group = head_num // num_query_groups
        head_size = megatron_config.kv_channels

        new_q_tensor_shape = (head_num, head_size)
        new_kv_tensor_shape = (num_query_groups, head_size)

        qb = qb.view(*new_q_tensor_shape)
        kb = kb.view(*new_kv_tensor_shape)
        vb = vb.view(*new_kv_tensor_shape)

        qkv_bias = torch.empty((0, head_size)).type_as(qb)
        for i in range(num_query_groups):
            qkv_bias = torch.cat((qkv_bias, qb[i * heads_per_group : (i + 1) * heads_per_group, :]))
            qkv_bias = torch.cat((qkv_bias, kb[i : i + 1, :]))
            qkv_bias = torch.cat((qkv_bias, vb[i : i + 1, :]))
        qkv_bias = qkv_bias.reshape(
            [
                head_size * (head_num + 2 * num_query_groups),
            ]
        )
        return qkv_bias

    @staticmethod
    def merge_fc1(gate: torch.Tensor, up: torch.Tensor):
        """
        Merge gate and up proj into concatenated fc1

        Example: import HF {gate|up}_proj to layer linear_fc1
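
        Illustrative shapes (placeholders, not from the source): gate (ffn, hidden)
        and up (ffn, hidden) concatenate along dim 0 into fc1 (2 * ffn, hidden).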
        """
        return torch.cat((gate, up), dim=0)

    @staticmethod
    def split_fc1(linear_fc1: torch.Tensor):
        """
        Split concatenated fc1 to gate and up proj

        Example: export layer linear_fc1 to HF {gate|up}_proj
        """
        gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
        return gate_proj, up_proj

    @staticmethod
    def duplicate2(param: torch.Tensor):
        """
        Duplicate the source parameter to two target parameters

        Example: export Performant LoRA linear_fc1.adapter.linear_in to HF {gate|up}_proj.lora_A
        """
        return param, param

    @staticmethod
    def duplicate3(param: torch.Tensor):
        """
        Duplicate the source parameter to three target parameters

        Example: export Performant LoRA linear_qkv.adapter.linear_in to HF {q|k|v}_proj.lora_A
        """
        return param, param, param

    @staticmethod
    def prune_padding(ctx: TransformCTX, embedding: torch.Tensor):
        """
        Prune the embedding size to vocab size

        Example: export embedding/output layer to HF with non-padded vocab size
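
        Illustrative (shapes are placeholders): a (padded_vocab, hidden) embedding
        weight is truncated to (vocab_size, hidden) by dropping the padded rows.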
        """
        megatron_config = ctx.target.config
        return embedding[: megatron_config.vocab_size, :]
