o
    wi7                     @   s(  d dl Z d dlZd dlmZ d dlmZmZmZ d dlm	Z
 d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlm Z m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+ erd dl,m-Z- d dl.m/Z/ G dd dZ0	d4dedej1de
j2defddZ3ej4dddfdedej1dej5d e6d!e6d"e7d#e8ee0f fd$d%Z9			&		d5dede0d'e:e; d(ee:e;  d)e7d*e6d+ee6 d,ee d#e<fd-d.Z=ej4dd/d0d1dfdedej1dej5d e6d!e6d"e7d*e6d+ee6 d#e8eee0f fd2d3Z>dS )6    N)Path)TYPE_CHECKINGAnyOptional)	TrainerFn)CommonInferenceParams)MCoreEngine)AbstractModelInferenceWrapper)TextGenerationController)AttnBackend)MegatronModule)io)ADAPTER_META_FILENAMEckpt_to_context_subdir)ckpt_to_weights_subdir)PEFT)MegatronStrategy)RestoreConfig)logging)GPTModel)T5Modelc                   @   sP   e Zd ZdZdddZdddZdd	 Zed
d Zedd Z	edd Z
dS )MCoreTokenizerWrappperz
    We need this wrapper since mcore generate uses methods/properties such as
    tokenizer.detokenize, tokenizer.tokenize, tokenizer.bos, tokenizer.pad, etc. to encode and decode prompts
    Nc                 C   s   || _ |j| _|p|j| _d S N)	tokenizereod
vocab_size)selfr   r    r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/inference/base.py__init__3   s   zMCoreTokenizerWrappper.__init__Fc                 C   s.   dt | jjjv r| j||S | j|S )a-  
        Detokenizes a list of tokens into a string.

        Args:
            tokens (list): The list of tokens to detokenize.
            remove_special_tokens (bool, optional): Whether to remove special tokens. Defaults to False.

        Returns:
            str: The detokenized string.
        remove_special_tokens)inspect	signaturer   ids_to_text
parameters)r   tokensr    r   r   r   
detokenize8   s   z!MCoreTokenizerWrappper.detokenizec                 C   s   | j |S )z
        Tokenizes a prompt into a list of tokens.

        Args:
            prompt (str): The prompt to tokenize.

        Returns:
            list: The list of tokens.
        )r   text_to_ids)r   promptr   r   r   tokenizeH   s   
zMCoreTokenizerWrappper.tokenizec                 C      | j jS )z
        Gets the IDs of additional special tokens.

        Returns:
            list: The IDs of additional special tokens.
        )r   additional_special_tokens_idsr   r   r   r   r+   T      z4MCoreTokenizerWrappper.additional_special_tokens_idsc                 C   r*   )z
        Gets the ID of the beginning of sequence token.

        Returns:
            int: The ID of the beginning of sequence token.
        )r   bos_idr,   r   r   r   bos^   r-   zMCoreTokenizerWrappper.bosc                 C   r*   )zs
        Gets the ID of the padding token.

        Returns:
            int: The ID of the padding token.
        )r   pad_idr,   r   r   r   padh   r-   zMCoreTokenizerWrappper.padr   )F)__name__
__module____qualname____doc__r   r&   r)   propertyr+   r/   r1   r   r   r   r   r   -   s    


	
	r   pathtrainermodelr   c                 C   s  t |jts
J d|jjdksJ dddlm} |||  t| ddt  } rOt	|d}t
|}W d	   n1 s@w   Y  t|d
 ddd}nt| ddd}|d	uretd|  ||_||j_d|j_d	|_|j| ||_|jjd	ur|jjjdd |d |j  | s|  tj|j_|jj|d ||j_|j  |j }	t |	t!r|	|}|jj"#| }
t$j%||
d}dd |& D }|jj'j(t| dd|d}|jj)|dd d	S d	S )a  
    Sets up the trainer and restores the model from the given checkpoint path.

    It does the following:
    - Defines a RestoreConfig to restore only model weights
    - Disables setting up optimizers in the Trainer
    - Calls strategy.setup_environment(), model.configure_model() and strategy.setup_megatron_parallel(trainer=trainer)
    - Finally loads the model weights

    Args:
        path (Path): The path to the checkpoint file.
        trainer (nl.Trainer): The trainer object.
        model (pl.LightningModule): The model object.
        tokenizer (Any): The tokenizer object to override the tokenizer in the model.
    Returns:
        None
    z8Only MegatronStrategy is supported for trainer.strategy.   z3Context parallelism is not supported for inference.r   )#set_modelopt_spec_if_exists_in_ckptF)	is_savingrNmodel_ckpt_pathT)r7   load_model_stateload_optim_statezOverriding model.tokenizer to: c                   S   s   d S r   r   r   r   r   r   <lambda>   s    z2_setup_trainer_and_restore_model.<locals>.<lambda>)r8   )metadatac                 S   s   i | ]\}}d |v r||qS )z	.adapter.r   ).0kvr   r   r   
<dictcomp>   s    z4_setup_trainer_and_restore_model.<locals>.<dictcomp>)sharded_state_dict)strict)*
isinstancestrategyr   context_parallel_sizenemo.collections.llm.modeloptr;   r   r   existsopenjsonloadr   r   infor   restore_config_setup_optimizers	ckpt_pathconnectr8   launcherlaunchsetup_environment
state_dictconfigure_modelr   TESTINGstatefnsetup_megatron_parallelselective_restoremodel_transformr   unwrapped_checkpoint_ioload_content_metadatar   rG   itemscheckpoint_ioload_checkpointload_model_state_dict)r7   r8   r9   r   r;   adapter_meta_pathfrB   rR   peftsharded_sd_metadatarG   adapter_sharded_state_dictadapter_stater   r   r    _setup_trainer_and_restore_modelt   s\   




rm   i  i 
  Fparams_dtype&inference_batch_times_seqlen_thresholdinference_max_seq_lengthenable_flash_decodereturnc                 K   s   t jt| dd}|r*|tjks|tjkr$td d|j_	t
j|j_ntd| | D ]\}}	t|j|r@t|j||	 q.td| d q.t| ||d ||||}
|
t|jt|jd	d
fS )a  
    Sets up the model and tokenizer for inference.

    This function loads the model and tokenizer from the given checkpoint path,
    sets up the trainer, and returns the Megatron inference-wrapped model and tokenizer.

    Args:
        path (Path): The path to the checkpoint file.
        trainer (nl.Trainer): The trainer object.
        params_dtype (torch.dtype, optional): The data type of the model parameters.
            Defaults to torch.bfloat16.
        inference_batch_times_seqlen_threshold (int, optional): If batch-size times sequence-length is smaller
           than this threshold then we will not use pipelining, otherwise we will.
        inference_max_seq_length (int, optional): max_seq_length for inference. Required by MCoreEngine(>=0.12).
        Necessary for CUDA graphs. Defaults to 2560.
        enable_flash_decode (bool, optional): Whether to enable flash decode. Defaults to True.
        **kwargs: Additional keyword arguments to set in the model config.
    Returns:
        tuple[AbstractModelInferenceWrapper, MCoreTokenizerWrappper]:
            A tuple containing the inference-wrapped model and Mcore wrapped tokenizer.
    r9   )r7   subpathz0Enabling Flash Decode for in-framework inferenceTzZFlash Decode is not supported for params_dtype %s, defaulting to MCore's attention backendzConfig attribute zA not found in model.config, ignoring in setup_model_and_tokenizer)r7   r8   r9   r   N)r   load_contextr   torchbfloat16float16r   rQ   configflash_decoder   flashattention_backendwarningrc   hasattrsetattrrm   get_inference_wrapperr   r   getattr)r7   r8   rn   ro   rp   rq   kwargsr9   keyvalueinference_wrapped_modelr   r   r   setup_model_and_tokenizer   s*   
r      promptsencoder_promptsadd_BOSmax_batch_sizerandom_seedinference_paramsc                 C   sb   ddl m} |dur|| |d}	nt| |d}	t|	||d}
|p%tddd}|
j||||d	}|S )
a
  
    Runs generate on the model with the given prompts.

    This function uses the loaded model, loaded tokenizer, and prompts to generate text.
    It returns a dictionary containing the generated text.

    Args:
        model (AbstractModelInferenceWrapper): The inference-wrapped model.
        tokenizer (MCoreTokenizerWrappper): The tokenizer.
        prompts (list[str]): The list of prompts to generate text for.
        encoder_prompts (Optional[list[str]], optional): The list of encoder prompts. Defaults to None.
        add_BOS (bool, optional): Whether to add the beginning of sequence token. Defaults to False.
        max_batch_size (int, optional): The maximum batch size. Defaults to 4.
        random_seed (Optional[int], optional): The random seed. Defaults to None.
        inference_params (Optional[CommonInferenceParams], optional): The inference parameters defined in
            Mcore's CommonInferenceParams. Defaults to None.

    Returns:
        dict: A dictionary containing the generated results.
    r   )&EncoderDecoderTextGenerationControllerNr   r   text_generation_controllerr   r   i   r:   )num_tokens_to_generatetop_k)r   r   r   common_inference_params)^megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controllerr   r
   r   r   generate)r9   r   r   r   r   r   r   r   r   r   mcore_enginer   resultsr   r   r   r      s"   r   i   T    c                 C   s<   t | |||||d\}}	t||	d}
t|
||d}|||	fS )a  
    Sets up and returns a Megatron Core Engine for text generation inference.

    Args:
        path (Path): Path to the model checkpoint
        trainer (nl.Trainer): NeMo Lightning trainer instance
        params_dtype (torch.dtype): Data type for model parameters. Defaults to torch.bfloat16
        inference_batch_times_seqlen_threshold (int): Batch size * sequence length threshold. Defaults to 1000
        inference_max_seq_length (int): Maximum sequence length for inference. Defaults to 4096
        enable_flash_decode (bool): Whether to enable flash attention decoding. Defaults to False
        max_batch_size (int): Maximum batch size for inference. Defaults to 32
        random_seed (Optional[int]): Random seed for reproducibility. Defaults to None

    Returns:
        Tuple[MCoreEngine, AbstractModelInferenceWrapper, MCoreTokenizerWrapper]:
            - Configured Megatron Core Engine instance
            - Inference-wrapped model
            - Tokenizer wrapper
    )r7   r8   rn   ro   rp   rq   r   r   )r   r
   r   )r7   r8   rn   ro   rp   rq   r   r   r9   r   r   r   r   r   r   setup_mcore_engine3  s   

r   r   )NFr   NN)?r!   rO   pathlibr   typingr   r   r   lightning.pytorchpytorchpltorch.distributedru    lightning.pytorch.trainer.statesr   /megatron.core.inference.common_inference_paramsr   ,megatron.core.inference.engines.mcore_enginer   Qmegatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapperr	   Nmegatron.core.inference.text_generation_controllers.text_generation_controllerr
   megatron.core.transformer.enumsr    megatron.core.transformer.moduler   nemo.lightning	lightningnlr   nemo.lightning.ckpt_utilsr   r   nemo.lightning.io.plr    nemo.lightning.pytorch.callbacksr   3nemo.lightning.pytorch.strategies.megatron_strategyr   'nemo.lightning.pytorch.strategies.utilsr   
nemo.utilsr   #nemo.collections.llm.gpt.model.baser    nemo.collections.llm.t5.model.t5r   r   TrainerLightningModulerm   rv   dtypeintbooltupler   liststrdictr   r   r   r   r   r   <module>   s   H
O

?
	
;	