o
    }oi
J                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlZd dlZd dlZd dlZd dlm  mZ d dlZd dlmZ d dlmZ d dlmZ d dl m!Z" d d	l#m$Z$ d d
l%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJ d dlKmLZL d dlMmNZN d dlOmPZP d dlQmRZR d dlSmTZTmUZU d dlVmWZWmXZX d dlYmZZZ d dl[m\Z\m]Z]m^Z^m_Z_m`Z` d dlambZb d dlcmdZdmeZe d dlfmgZg d dlhmiZi d dljmkZkmlZlmmZmmnZnmoZompZp d dlqmrZr d dlsmtZtmuZumvZv d dlwmxZx dZyz
d dlzm{Z{m|Z| W n e}yU   dZyY nw e~dZejdd  ZdZeZzd d!lmZmZ d d"lmZ W n e}y   dZY nw G d#d$ d$ePZdS )%    N)glob)Path)AnyDictListOptional)check_max_num_tokens)numpy_to_torch)BuildConfig)build)Mapping)%BaichuanForCausalLMBertForQuestionAnsweringBertForSequenceClassification	BertModelBloomForCausalLMChatGLMForCausalLMCogVLMForCausalLMCohereForCausalLMDbrxForCausalLMDeciLMForCausalLMDecoderModelDeepseekForCausalLMDeepseekV2ForCausalLMDiTEagleForCausalLMEncoderModelFalconForCausalLMGemmaForCausalLMGPTForCausalLMGPTJForCausalLMGPTNeoXForCausalLMGrokForCausalLMLLaMAForCausalLMMambaForCausalLMMedusaForCausalLmMLLaMAForCausalLMMPTForCausalLMOPTForCausalLMPhi3ForCausalLMPhiForCausalLMQWenForCausalLMRecurrentGemmaForCausalLMReDrafterForCausalLMRobertaForQuestionAnswering RobertaForSequenceClassificationRobertaModelWhisperEncoder)PluginConfig)PreTrainedTokenizerBase)ITritonDeployable)TarPath)determine_quantization_settingsmodel_to_trtllm_ckpt)dist_model_to_trt_llm_ckptget_layer_prefix)init_model_parallel_from_nemo)build_tokenizerget_model_typeget_tokenizerget_weights_dtypeload_nemo_model)qnemo_to_tensorrt_llm)TOKENIZER_CONFIG_FILEget_nmt_tokenizer)is_qnemo_checkpoint)build_and_save_engine)generategenerate_streamingloadload_distributedrefitunload_engine)is_rank)is_nemo_tarfileprepare_directory_for_exporttorch_dtype_from_precision)TRTLLM_ENGINE_DIRT)cast_outputstr_ndarray2listFNeMoc                    s    fdd}|S )zNo op decoratorc                     s    | i |S )N )argskwargsfuncrS   L/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/tensorrt_llm.pywrappery   s   znoop_decorator.<locals>.wrapperrS   )rW   rY   rS   rV   rX   noop_decoratoru   s   rZ   )batchfirst_value)Tensorc                D   @   s$  e Zd ZdZ						ddedee deded	ed
edefddZ																																ddede	e dededede	e dede	e dede	e ded ed!ed"ed#ed$e	e ded%ed&ee d'ed(e	e d)e	e d*e	e d+ed,ed-ed.ed/ed0e	e d1e	e d2e	e d3e	e d4e	e fBd5d6Z
													7						dd8ededededed(e	e d)e	e d$e	e d*e	e d-ed"ed#ed!ed9ed+ed/ed:ed;ede	e def(d<d=Zded>e	e fd?d@ZdAeeef defdBdCZdDdE Z								Fddede	e dededededed ed$efdGdHZdIdJ ZdKdL ZedMdN Z	O	O	P			ddededed;edQed.efdRdSZddTdUZ			V	W											ddXee dedYedZed[ed\ee d]ee d^ed_ee d`ee daedbedceddedeefdfdgZdhedaefdidjZdhefdkdlZdmdn Zedodp Zedqdr Zedsdt Z edudv Z!edwdx Z"e#e$ddYdZd[dyd^dedddze%j&fd{d|Z'e#e$ddYdZd[dyd^dze%j&fd}d~Z(dd Z)dd Z*dd Z+		dddZ,dd Z-dd Z.dd Z/dS )TensorRTLLMa%  
    Exports nemo and huggingface checkpoints to TensorRT-LLM and run fast inference.

    Example:
        from nemo.export.tensorrt_llm import TensorRTLLM

        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
        trt_llm_exporter.export(
            nemo_checkpoint_path="/path/for/nemo/checkpoint",
            model_type="llama",
            tensor_parallelism_size=1,
        )

        output = trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
        print("output: ", output)

    NTF	model_dirlora_ckpt_list
load_modeluse_python_runtimeenable_chunked_contextmax_tokens_in_paged_kv_cachemulti_block_modec                 C   s   |r|dus
|durt d|| _tj|t| _|| _|| _|dur%|nd| _	|| _
|| _d| _d| _d| _g | _d| _d| _g | _i | _|rN|   dS dS )a  
        Args:
            model_dir (str): path for storing the TensorRT-LLM model files.
            lora_ckpt_list (List[str]): lora checkpoint paths.
            load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir.
            use_python_runtime (bool): whether to use python or c++ runtime.
            multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. Only available when using c++ runtime
        Nzenable_chunked_context and max_tokens_in_paged_kv_cache options work only with the TensorRT-LLM C++ runtime. Please set use_python_runtime=False to use these options.Fr   )	Exceptionr_   ospathjoinrO   
engine_dirr`   rb   rc   rd   re   model	tokenizerconfigptuning_tablesp_tabletask_vocab_sizetask_vtoken_countstask_ids_load)selfr_   r`   ra   rb   rc   rd   re   rS   rS   rX   __init__   s.   zTensorRTLLM.__init__         @      autor   nemo_checkpoint_path
model_typedelete_existing_filestensor_parallelism_sizepipeline_parallelism_sizegpus_per_nodemax_input_lenmax_output_lenmax_batch_sizemax_prompt_embedding_table_sizeuse_parallel_embeddinguse_embedding_sharingpaged_kv_cacheremove_input_paddingpaged_context_fmhadtypeuse_lora_pluginlora_target_modulesmax_lora_rankmax_num_tokensopt_num_tokensmax_seq_lenmultiple_profilesgpt_attention_plugingemm_pluginuse_mcore_pathreduce_fusionfp8_quantizedfp8_kvcachegather_context_logitsgather_generation_logits
build_rankc"           ?      C   s  |s	t jddd |du r|n|}t| j|td |
du rd}
d| _|durKt jdtdd |dur4|nd}|du r?|| }nt jd	| d
tdd |durQ|nd}|	dk r`t jddd d}	t|!}"|"r6t	 }#t
|#j}$t|rtj|r}|}$ntdtjtj|trtd t|| _nt|| _d}%tdai d|d| jd|d|d|	d|
d|d|d|d|d|d|d|d|d|d|d |d!|d"| n|du rt|}|du rtd#|| jvrtd$| d%| j d
|du r
t|}|du rtd&t||$|\}&}%| _|r%dd'lm }' dd(l!m"}( dd)l#m$}) dd*l%m&}* dd+l'm(}+ dd,l)m*}, |%+d-d.}-t,|%||\}}| -|%}.t.|)|}/|*}0d/d0 |0/ D d1d0 |0/ D B }1|%+d2d3}2|2d4kr|/jd5krd6}2|+|.|/|1|%+d7|%+d8|%+d9d:|%+d;d<|%+d=d|%+d>d.|2|%+d?|%+d@|,j0j1|-dA}3t.|'|}4|(||||-}5|3j2|&|5|4d.||dB\}6}7t3|6|7D ]R\}8}9|3j4dai d|dC|d|	d| jdD|8dE|9dF| j5d|d|d|d|
d|d|d|dGd.d|d|d |dHdIdJdKd!|dL|dM| qn||dNks/|dOkr1dP}|dQkr8dR}t6|&|%|$|||||||||dS\}6}7t3|6|7D ]Q\}8}9t4dai d|dC|d|	dT|9dU|8dV| jdW|dF| j5d|d|d|d|
d|d|d|d|d |d|d!|dL|dM|dX|dY|  qOtj|$dZ}:tj|$d[};tj|$d\}<t7| jt8r| j9| j nXtj|:rt:;|:| j nItj|;r
t<tj|;d]d^D ]
}=t:;|=| j qt<tj|;d_D ]}=t:;|=tj| jdZ qntj|<rt:;|<tj| jd\ tj|$d`}>tj|>r2t:;|>| j |#=  |"rD|%durD| >|%| t?@ dIkrOt?A  |"r[|r]| B  dS dS dS )ba  
        Exports nemo checkpoints to TensorRT-LLM.

        Args:
            nemo_checkpoint_path (str): path for the nemo checkpoint.
            model_type (Optional[str]): type of the model (optional for NeMo 2.0 and quantized checkpoints).
            delete_existing_files (bool): if True, deletes all the files in model_dir.
            tensor_parallelism_size (int): tensor parallelism.
            pipeline_parallelism_size (int): pipeline parallelism.
            gpus_per_node (int): number of gpus per node.
            max_input_len (int): max input length.
            max_output_len (int): max output length.
            max_batch_size (int): max batch size.
            max_prompt_embedding_table_size (int): max prompt embedding size.
            use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not
            use_embedding_sharing (bool):
            paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM.
            paged_context_fmha (bool): whether to use paged context fmha feature of TRT-LLM or not
            remove_input_padding (bool): enables removing input padding or not.
            dtype (Optional[str]): Floating point type for model weights (supports 'bfloat16', 'float16' or 'float32').
                If None, try to autodetect the type from model config.
            load_model (bool): load TensorRT-LLM model after the export.
            use_lora_plugin (str): use dynamic lora or not.
            lora_target_modules (List[str]): list of the target lora modules.
            max_lora_rank (int): maximum lora rank.
            max_num_tokens (int):
            opt_num_tokens (int):
            max_seq_len (int): the maximum sequence length of a single request.
            multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False
            gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto"
            gemm_plugin (str): enable the gpt plugin. Default = "auto"
            use_mcore_path (bool) : Use the more recent mcore path for export
            reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce
            fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type.
            fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type.
            gather_context_logits (Optional[bool]): if True, enables gather_context_logits while building trtllm engine. Default: False
            gather_generation_logits (Optional[bool]): if True, enables gather_generation_logits while building trtllm engine. Default: False
            build_rank (Optional[int]): rank to export the model on. If None, builds on all ranks.
        zExporting models using the local codebase with use_mcore_path=False is deprecated. Please install megatron-core and set use_mcore_path to True.   )
stacklevelNr~   subdirr   z;Parameter max_output_len is deprecated and will be removed.rw   z<Parameter max_output_len will be overwritten by max_seq_len=.rz      zeTensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models. Force set to 4z#Checkpoint path must be a directoryzCDetected legacy tokenizer_config.yaml, using it to build tokenizer.r|   rj   r   r   r   r   tensor_parallel_sizepipeline_parallel_sizer   r   r   r   r   r   r   r   r   r   r   ztParameter model_type needs to be provided and cannot be inferred from the checkpoint. Please specify it explicitely.Model E is not currently a supported model type. Supported model types are: zoParameter dtype needs to be provided and cannot be inferred from the checkpoint. Please specify it explicitely.DataType)ExportConfig	ModelTypeDEFAULT_CONVERSION_DICTTRTLLMHelper	MoeConfig#share_embeddings_and_output_weightsFc                 S      i | ]
\}}d | |qS )zmodel.rS   .0keyvaluerS   rS   rX   
<dictcomp>      z&TensorRTLLM.export.<locals>.<dictcomp>c                 S   r   )zmodule.rS   r   rS   rS   rX   r     r   
activationgeluzopenai-gelugemmagegluposition_embedding_typemax_position_embeddingsrotary_percentage      ?rotary_base'  moe_tp_modemulti_query_modeseq_len_interpolation_factormoe_renorm_modetransformer_configr}   trtllm_conversion_dictr   r   r   r   r   r   r   r   r   r   )model_state_dictexport_configr   !state_dict_split_by_layer_numbersr   r   r   trtllm_model_weightstrtllm_model_configr`   	use_refitmax_beam_widthrv   tokens_per_block   r   r   gpt	starcodergptnextmixtralllama)rk   nemo_model_confignemo_export_dirdecoder_typer   r   r   r   r   r   r   r   model_configmodel_weightsr_   r}   r   r   tokenizer.modelnemo_contextz
vocab.jsonnemo_tokenizer*.json*.modelmodel_config.yamlrS   )CwarningswarnrM   r_   rO   rk   DeprecationWarningrK   tempfileTemporaryDirectoryr   namerC   rg   rh   isdir
ValueErrorexistsri   rA   LOGGERwarningrB   rl   r=   r@   rj   r<   get_supported_models_listr>   r?   megatron.core.export.data_typer   "megatron.core.export.export_configr   megatron.core.export.model_typer   Jmegatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dictr   )megatron.core.export.trtllm.trtllm_helperr   tensorrt_llm.layersr   getr6   get_transformer_configgetattritemsExpertScaleNormalizationModeRENORMALIZE.get_trtllm_pretrained_config_and_model_weightsziprD   r`   r7   
isinstancer3   save_pretrainedshutilcopyr   cleanup_export_to_nim_formattensorrt_llmmpi_world_sizempi_barrierrs   )?rt   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   is_export_ranktmp_dirr   r   rk   r   r   r   r   r   r   r   r   input_model_typemcore_model_conversion_dictnemo_model_conversion_dictr   trtllm_helperinput_dtyper   trtllm_model_weights_listtrtllm_model_config_listr   r   tokenizer_pathtokenizer_path_nemo2
vocab_pathrh   r   rS   rS   rX   export   sh  K




	










	


	
zTensorRTLLM.exportr   hf_model_pathr   r   r   c                 C   s  t d || jvrtd| d| j  d|du r)| |}|du r)tdt| j|td |dk r;t	d	 d}t
 }|
|_|rJ|j|d
 nd|_||_||_||_||_|| }	t|||	|||||j||d
\}}|||||	||dd||d}tj||d}t|D ])}t d|  t|||d}| j| }|j|||d}t||}|| j qttj|dD ]	}t !|| j qttj|dD ]	}t !|| j qt d| j  t d| j  | "  dS )a7  
        Export a Hugging Face model checkpoint to TensorRT-LLM format.

        Args:
            hf_model_path (str): Path to the Hugging Face model directory
            max_batch_size (int, optional): Maximum batch size for inference. Defaults to 8.
            tensor_parallelism_size (int, optional): Size of tensor parallelism. Defaults to 1.
            max_input_len (int, optional): Maximum input sequence length. Defaults to 256.
            max_output_len (int, optional): Maximum output sequence length. Defaults to 256.
            max_num_tokens (int, optional): Maximum number of tokens. Defaults to None.
            opt_num_tokens (int, optional): Optimal number of tokens. Defaults to None.
            dtype (str, optional): Data type for model weights. If None, inferred from model config.
            max_seq_len (int, optional): Maximum total sequence length. Defaults to 512.
            gemm_plugin (str, optional): GEMM plugin type. Defaults to "auto".
            remove_input_padding (bool, optional): Whether to remove input padding. Defaults to True.
            paged_context_fmha (bool, optional): Whether to use paged context FMHA. Defaults to False.
            paged_kv_cache (bool, optional): Whether to use paged KV cache. Defaults to True.
            tokens_per_block (int, optional): Number of tokens per block for paged KV cache. Defaults to 128.
            multiple_profiles (bool, optional): Whether to use multiple TensorRT profiles. Defaults to False.
            reduce_fusion (bool, optional): Whether to reduce operator fusion. Defaults to False.
            max_beam_width (int, optional): Maximum beam width for beam search. Defaults to 1.
            use_refit (bool, optional): Whether to use TensorRT refitting. Defaults to False.
            model_type (str, optional): Type of the model architecture. Defaults to None.
            delete_existing_files (bool, optional): Whether to delete existing files in export dir. Defaults to True.

        Raises:
            ValueError: If model_type is not supported or dtype cannot be determined
        zStarting HF export to TRT-LLMr   r   r   Nz:No dtype found in hf model config. Please specify a dtype.r   r   zTTensorRT-LLM may hit runtime issue with batch size is smaller than 4. Force set to 4)r   F)
r   r   r   r   r   r   r   enable_context_fmhar   r   )r   r   r   r   r   r   r   strongly_typedbuilder_optr   r   )plugin_configzIterating over rank:)
world_sizeranktp_size)mappingr   r   z&Generarated TRT-LLM checkpoint at dir:zLoading the TRT-LLM checkpoint:)#r   infoget_supported_hf_model_mappingr   keysget_hf_model_dtyperM   r_   rO   printr2   r   enable_paged_kv_cacher   r   use_paged_context_fmhar   r   r   context_fmhar
   	from_dictranger   from_hugging_facebuild_trtllmsaverj   r   rg   rh   ri   r   r   rs   )rt   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   r~   r  
build_dictbuild_configr  r  trtllm_model_classrk   enginerh   rS   rS   rX   export_hf_model@  s   
3




zTensorRTLLM.export_hf_modelreturnc              
   C   sr  t |d }| std| zt|dt}t|}d|v r-|d W  d   W S d|v r=|d W  d   W S d|v rUd|d v rU|d d W  d   W S d|v rg|d rg	 W d   W d	S d
|v ry|d
 r	 W d   W dS W d   W dS W d   W dS 1 sw   Y  W dS  tjy   td|  ty } z	t	dt
| d}~ww )a)  
        Read the config file from a Hugging Face model directory and identify the model's data type.

        Args:
            model_dir (str): Path to the Hugging Face model directory

        Returns:
            Optional[str]: The model's data type if found in config, None otherwise
        config.jsonzConfig file not found at rtorch_dtypeNr   pretrained_configfp16float16bf16bfloat16zInvalid JSON in config file at zError reading config file: )r   r   FileNotFoundErroropenjsonrG   JSONDecodeErrorr   rf   RuntimeErrorstr)rt   r_   config_pathfrm   erS   rS   rX   r    sF   




zTensorRTLLM.get_hf_model_dtyper   c                 C   s   t j| jddd}t j|rt|| j |d}|ddg|du r(dn|dd	|d
}tt j| jdd}t	j
||dd |d W d   dS 1 sTw   Y  dS )a!  
        Exports the model configuration to a specific format required by NIM.
        This method performs the following steps:

        1. Copies the generation_config.json (if present) from the nemo_context directory to the root model directory.
        2. Creates a dummy Hugging Face configuration file based on the provided model configuration and type.

        Args:
            model_config (dict): A dictionary containing the model configuration parameters.
            model_type (str): The type of the model (e.g., "llama").
        r   	artifactszgeneration_config.jsonr   encoder_seq_lengthr#   Ndefault)factor	rope_type)r   architecturesrope_scalingr}   r/  wr   indent
)rg   rh   ri   r_   isfiler   r   r   r8  r9  dumpwrite)rt   r   r}   generation_config_pathr   	hf_configr>  rS   rS   rX   r    s"   
"z!TensorRTLLM._export_to_nim_formatc                 C   s   ddl m} |dd}d}|dd}|dkrd	}n|d
kr!d}|dd}||d|dd|d|d|d |dd|d|d|d|d|dkrW|nd|||ddd}|S )z.Given nemo model config get transformer configr   )TransformerConfignormalization	layernorm	LayerNormlayernorm_zero_centered_gammaFlayernorm1pTrmsnormRMSNormnum_moe_experts
num_layersmoe_router_topknum_attention_headsnum_query_groupskv_channelsNhidden_sizeffn_hidden_sizelayernorm_epsilonbiasgated_linear_unit)rY  rZ  r[  r\  r]  r^  r_  r`  add_bias_linearrX  rQ  rT  rb  ),megatron.core.transformer.transformer_configrP  r   )rt   r   rP  rQ   transformer_config_normalizationrT  rX  confrS   rS   rX   r     s2   


z"TensorRTLLM.get_transformer_configr6  c
                 C   sd  |du r|n|}t | j r_|rPtt| jdkrPt| jD ]}
tj| j|
}zt	| W q! t
y@   t| Y q!w tt| jdkrOtdntt| jdkr^tdn
t | jjddd |dksq|dkrsd	}|d
kryd}t dkr#t }t |j}t||\}}| _t|||||	|||||d
\}}t||D ]3\}}|jj}| D ]\}}t|tjrt|||< q|||< qtj |tj| jd| d q|d !tj| jd tj|d}tj|rt"|| j n| jdur
| j#| j tj|d}tj|rt"|| j |$  t% dkr0t&  dS dS )zConvert to safe tensorNr   zCouldn't delete all files.zGThere are files in this folder. Try setting delete_existing_files=True.T)parentsexist_okr   r   r   r   r   )
rk   r   r   r   r   r   r   r   r   r   r  z.safetensorsr/  r   r   rv   )'r   r_   r   lenrg   listdirrh   ri   r   rmtreeOSErrorremoverf   mkdirr  mpi_rankr   r   r   r?   rl   r7   r   r  tp_rankr   r   npndarrayr	   safetensorstorch	save_fileto_json_filer   r   r  r  r  )rt   r|   r}   r~   r   r   r   r   r   r   filesrh   r  r   rk   r   weights_dictsmodel_configsweight_dictr  kvr  r   rS   rS   rX   convert_to_safe_tensors)  sr   


$z#TensorRTLLM.convert_to_safe_tensorsc                     s  dd   fdd} fdd}ddl m} | }| }| }	| }
| }| | }|s6d	}| j	}| j
}d
}||ksF||kr^td |d	krZ|d	krZ||krZd}ntd|d }|| }|| }i }i }|d	krt|D ]2\}}|  D ]'\}trd|vrd|v r|||||| |  }||< q||< qqvn!|  D ]\}trd|vrd|v rƈ||< q||< q|d	ks|r4i }| D ]Y\}fddt|D }tjj|d t|D ]<}||||  }|||}|s*|| }||d	  d	 }||kr)||kr)|||| }|| ||< q|| ||< qq|}|| fdd}|rd}|||
}|durW||< d}|||
}|durh||< d}|||	}|dury||< d}|||
}|dur||< S )z
        Accumulate all vp model chunks together, and reshard model (i.e) gather all pp ranks
        if required and return the final model state dict
        c                 S   s4   t | D ]\}}|dkr|d   S qtd|  )Nlayersrv   zUnknown layer name format: )	enumerater   )	split_keyindexr   rS   rS   rX   _get_layer_index  s
   z>TensorRTLLM.gather_and_reshard_model.<locals>._get_layer_indexc                    s,   |  d}t |}t|||< d|S Nr   )splitintr<  ri   )
param_name	layer_numr  layer_indexr  rS   rX   rename_layer_num  s   

z>TensorRTLLM.gather_and_reshard_model.<locals>.rename_layer_numc                    s"   |  d}t |}t|| S r  )r  r  )r  r  r  r  rS   rX   get_layer_num  s   
z;TensorRTLLM.gather_and_reshard_model.<locals>.get_layer_numr   )parallel_staterv   Fz8Training/Generation model parallelism resharding enabledTzmNeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases.rY  _extra_stater~  zdecoder.layersc                    s   g | ]}t  qS rS   )rt  
zeros_like)r   _)valrS   rX   
<listcomp>      z8TensorRTLLM.gather_and_reshard_model.<locals>.<listcomp>groupc                    s     | }|d ur|jg}nd g}tjj|||d |d d u r"d S tj |kr4tj|d d }tjj|	 |d |S )Nr  r   r   )
r   shapert  distributedbroadcast_object_listget_rankemptycuda	broadcast
contiguous)r   
pp_src_idxr  tensortensor_shape)r   pp_groupstorage_dtyperS   rX   get_tensor_if_available  s   

zETensorRTLLM.gather_and_reshard_model.<locals>.get_tensor_if_availablezdecoder.final_layernorm.weightNzdecoder.final_layernorm.biasz embedding.word_embeddings.weightzoutput_layer.weight)megatron.corer  $get_tensor_model_parallel_world_size get_pipeline_model_parallel_rank&get_pipeline_model_parallel_first_rank%get_pipeline_model_parallel_last_rank&get_pipeline_model_parallel_world_size!get_pipeline_model_parallel_group.get_virtual_pipeline_model_parallel_world_sizer  pp_sizer   r  NotImplementedErrorr  
state_dictr   rt  	is_tensorr%  r  
all_gatherupdate) rt   r   rk   r  r  r  r  r  pp_rankpp_first_rankpp_last_rankr  vp_sizeinference_tp_sizeinference_pp_sizereshard_modelrY  layers_per_pplayers_per_chunk	tl_paramsmodel_level_paramsidxmodel_chunkr   key2gathered_paramsweight_listr  layers_start
layers_endr  r  rS   )r  r   r  r  r  rX   gather_and_reshard_model{  s   









z$TensorRTLLM.gather_and_reshard_modelc                 C   s@   ddl m} |tjkr|jS |tjkr|jS |tjkr|jS dS )z=
        Return mcore export dtype given torch dtype
        r   r   N)r   r   rt  r6  float32r4  )rt   r  r   rS   rS   rX   get_input_dtype  s   


zTensorRTLLM.get_input_dtypec                 C   sX   ddl m} t|  dd\}}i }| D ]\}}|r%||| | < q|||< q|S )a  MCore export supports some default conversion dictionaries
        All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models sometimes start with "model.decoder.layers.4.blahblah". so we append model prefix. to the keys
        r   r   T)layer_namesis_mcore)r   r   r9   r  r   )r   r   model_prefixr  r
  r   r   rS   rS   rX   "get_nemo_to_trtllm_conversion_dict  s   
z.TensorRTLLM.get_nemo_to_trtllm_conversion_dict   r   r  c                 C   s  t  tj ksJ |	||| _| _| _t|
\| _	| _
| _| _| _t|| _| jdkr;tj| jd| j
 | _|rddlm} ddlm} ddlm} t|j}| |||}| |}t||}| |}||||| d| d| d	d
| dd| dd| dd| dd| d| d|j!j"| ddd| _#| $|}| j#j%||dd| jj&|d\}}|d }|d }|
r| jdksJ d| j| j }| j|_||_'t j(|| j	| j| jd|_)| j#j*|||| |||| j|	d}n+t+||| j|| j| j|dd| j	| jj&d\}}t*|||| ||d |d | j||	d	}tj,  t-tj| jdtj  d}t.|d d!d"}t/j0|j12 |d#d$ W d%   n	1 sVw   Y  t3| j| j	| d%S )&zF
        Convert a model parallel nemo model to TensorRT-LLM.
        rv   dp_rankr   r   r   r   r   r   r   r   r   r   r   r   r   Fr   r   r   r   r   r   T)r   r   r    on_device_distributed_conversion
vocab_sizer   z'Reshard is true, but pp size is not one)r  r  r  r  )r   r   r   r   r   r   rj   r   )rk   r   r   r   r   r   r   r   use_distributed_convertmodel_parallel_rankr  )	r   r   r   r   r   r   r_   r}   r   config_z.jsonrG  utf-8encodingr   rH  N)4r  ro  rt  r  r  r   r}   r   r:   mp_rankr  r  r  dp_sizer;   rl   rg   rh   ri   r_   r   r   r   r   r   r   rN   	precisionr  r   r   r  r   r   r   r  r  r   r  r  r   r  rD   r7   barrierr   r8  r9  rL  rm   to_dictrH   )rt   rk   r   r}   r   rl   r   r   r   r   r  r   r   r   r   r  r   r   r  r
  r  r  r  r   r   r  r,  weightscfg_pathr>  rS   rS   rX   r     s   















"zTensorRTLLM.buildc                 C   s   d}|r't |j}| |||}| |}| jjj|| jj|d | jjj	}nt
||| j| j| jjd}t| j| j| j t  tj  t| dS )z
        Refits an TensorRT engine using an instantiated nemo model.
        This function should only be used after calling build()
        N)r   tokenizer_vocab_sizer   )rk   r   r  r  r  )rN   r  r  r  r  weights_converterconvertrl   r  r   r8   r  r  rH   r_   r  r   gccollectrt  r  empty_cacherI   )rt   rk   r   r   weights_dictr  r   r
  rS   rS   rX   rI     s,   


zTensorRTLLM.refit        r   input_textstop_ktop_ptemperaturestop_words_listbad_words_listno_repeat_ngram_sizerr   	lora_uids!prompt_embeddings_checkpoint_path	streamingoutput_log_probsoutput_context_logitsoutput_generation_logitsc                    s*   j du r	td|dus|dur! ||}|jdd}|g}nt jdkr2 j} j} j}nd}d}d}du rG|du sDJ dd}ng|du rNd}n`tdkr`tt|ks`J dtdkrd  j	
 v sxJ dd  fd	d
tt|D }n'g }tt|D ]}|  j	
 v sJ d| | j	|   q|stj st dkrd}nd}td i d|d|d j d|d|d|d|d|d|d|d|
d|d|d|d|d|d|d||S td || j ||||||||
|||d|S )!a`  
        Exports nemo checkpoints to TensorRT-LLM.

        Args:
            input_texts (List(str)): list of sentences.
            max_output_len (int): max generated tokens.
            top_k (int): limits us to a certain number (K) of the top tokens to consider.
            top_p (float): limits us to the top tokens within a certain probability mass (p).
            temperature (float): A parameter of the softmax function, which is the last layer in the network.
            stop_words_list (List(str)): list of stop words.
            bad_words_list (List(str)): list of bad words.
            no_repeat_ngram_size (int): no repeat ngram size.
            task_ids (List(str)): list of the task ids for the prompt tables.
            prompt_embeddings_table (List(float)): prompt embeddings table.
            prompt_embeddings_checkpoint_path (str): path for the nemo checkpoint for the prompt embedding table.
            output_generation_logits (bool): if True returns generation_logits in the outout of generate method.
            sampling_kwargs: Additional kwargs to set in the SamplingConfig.
        NiA nemo checkpoint should be exported to TensorRT-LLM and then it should be loaded first to run inference.r   dimz=There is a prompt embedding table and task_ids cannot be Nonerv   zSEither len of the task_ids has to be 1 orit needs to match with len of input_texts.z)Task: {0} doesn't exist in the task list.c                    s   g | ]	} j d   qS r   )rr   )r   irt   rr   rS   rX   r    s    z'TensorRTLLM.forward.<locals>.<listcomp>TFr  r   host_contextr  r  r  prompt_tablerp   rq   rr   r  r  r  r  r  multiprocessed_envr  r  )r  r   r  r  r  r  r  rp   rq   rr   r  r  r  r  rS   )rk   rf   _get_prompt_embedding_tablesizeri  rn   ro   rp   rq   rr   r  formatr%  appendrt  r  is_initializedr  r  rE   rF   )rt   r  r   r  r  r  r  r  r  rr   r  prompt_embeddings_tabler  r  r  r  r  sampling_kwargsr  tv_sizerq   input_task_idsr  r  rS   r  rX   forward  s   
'	
zTensorRTLLM.forward	task_namec                 C   s   | j du r	td| jD ]}|d |krtd|q| j|d}| j||d ttj	| j
dd}t| j| W d   n1 sHw   Y  |   dS )	zAdd prompt tableNr  r  zATask name: {0} has already added. Please pass a unique task name.)r  )tabler  prompt_tables.pklwb)rk   rf   rn   r  r  r  r8  rg   rh   ri   r_   picklerL  _prep_ptuning_table)rt   r  r  ptr  r>  rS   rS   rX   add_prompt_tableD  s    

zTensorRTLLM.add_prompt_tablec              	   C   s   | j durKtt| j D ]8}| j | d |krD| j | ttj| jdd}t	
| j | W d    dS 1 s<w   Y   dS q|   dS dS )zRemove prompt tableNr  r  r  )rn   r%  ri  popr8  rg   rh   ri   r_   r  rL  r	  )rt   r  r  r>  rS   rS   rX   remove_prompt_tableZ  s   

zTensorRTLLM.remove_prompt_tablec                 C   s`   t dd |D }t|D ] \}}|jd }||k r-|| }tj|ddd|fddd||< q|S )z>
        Pads the logits tensor with 0's on the right
        c                 S   s   g | ]}|j d  qS r  )r  r   logit_tensorrS   rS   rX   r  i  r  z+TensorRTLLM._pad_logits.<locals>.<listcomp>r   constant)moder   )maxr  r  Fpad)rt   logits_tensorpadding_lenr  r  
tensor_lenpadding_diffrS   rS   rX   _pad_logitse  s   
zTensorRTLLM._pad_logitsc                 C   s   g dS )zSupported model list)r   r   r   falconr   r   r   rS   rt   rS   rS   rX   r   r  s   z%TensorRTLLM.get_supported_models_listc                 C   s  i dt dt dt dt dt dt dt dtd	td
tdtdtdtdtdtdtdti dtdtdt	dt	dt
dt
dt
dt
dtdtdtdtdtdtd td!td"ti d#td$td%td&td'td(td)td*td+td,td-td.td/td0td1td2td3ti d4td5td6td7td8td9td:td;td<td=td>td?td@tdAtdBtdCtdDt t!t"t#t$dE}|S )FzSupported HF Model MappingGPT2LMHeadModelGPT2LMHeadCustomModelGPTBigCodeForCausalLMStarcoder2ForCausalLMJAISLMHeadModelr   NemotronForCausalLMr(   r   RWForCausalLMr   r*   r)   Phi3VForCausalLMPhi3SmallForCausalLMPhiMoEForCausalLMr$   r!   r    MptForCausalLMr'   GLMModelChatGLMModelr   ChatGLMForConditionalGenerationLlamaForCausalLMLlavaLlamaModelExaoneForCausalLMMistralForCausalLMMixtralForCausalLMArcticForCausalLMGrok1ModelForCausalLMInternLMForCausalLMInternLM2ForCausalLMInternLMXComposer2ForCausalLMGraniteForCausalLMGraniteMoeForCausalLMMedusaForCausalLMMedusaLlamaForCausalLMr-   r   BaiChuanForCausalLMSkyworkForCausalLMGEMMAGEMMA2QWenLMHeadModelr+   Qwen2ForCausalLMQwen2MoeForCausalLMQwen2ForSequenceClassificationQwen2VLForConditionalGenerationQwen2VLModelr1   r   r   r   r,   r   r   r   r   r   r   r   MLLaMAModelMllamaForConditionalGenerationr   r   )r   r0   r.   r/   )%r   r(   r   r   r*   r)   r$   r!   r    r'   r   r#   r"   r%   r-   r   r   r+   r1   r   r   r   r,   r   r   r   r   r   r   r   r&   r   r   r   r0   r.   r/   )rt   HF_MODEL_CLASS_MAPrS   rS   rX   r  x  s  	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEJz*TensorRTLLM.get_supported_hf_model_mappingc                 C   s   | j du rdS | j d d S )zGet hidden sizeNr2  r^  )rm   r  rS   rS   rX   get_hidden_size  s   
zTensorRTLLM.get_hidden_sizec                 C   s   t ddtdt ddtjddt ddtjddt ddtjddt d	dtjddt d
dtjddt ddtddt ddtddt ddtjddt ddtddt ddtddt ddtjddt ddtjddf}|S )zGet triton inputpromptsr   r  r   r   T)r   r  r   optionalr  r  r  random_seedr  r  r  task_idr  r  Fr  )r]   bytesrq  int_singlebool_)rt   inputsrS   rS   rX   get_triton_input  s   zTensorRTLLM.get_triton_inputc                 C   s0   t ddtdt ddtjdt ddtjdf}|S )NoutputsrG  rI  generation_logitscontext_logits)r]   rM  rq  rO  )rt   rS  rS   rS   rX   get_triton_output  s
   zTensorRTLLM.get_triton_outputrK  rQ  c              
   K   s  i }d}d}t |d}d|i}zd|v r|d|d< d|v r)|d|d< d|v r4|d|d< d|v r?|d|d< d|v rJ|d|d< d	|v r^t |d	}d
d |D |d	< d|v rrt |d}dd |D |d< d|v r}|d|d< d|v rtjj|dddd}	|	d |d< d|v rtjj|dddd}
|
d  |d< d|v r|d }|d|d< d|v r|d }|d|d< |r| jdi |\}}tdd |D |d< n)|r| jdi |\}}| 	|}tdd |D }||d< n| jdi |}t
|tj|d< W |S  tyA } zdt|}t
|gt| tj|d< W Y d}~|S d}~ww ) #Triton infer function for streamingFrF  r  r   r  r  r  rK  r  c                 S      g | ]}|gqS rS   rS   r   	stop_wordrS   rS   rX   r        z/TensorRTLLM.triton_infer_fn.<locals>.<listcomp>r  c                 S   rX  rS   rS   r   bad_wordrS   rS   rX   r    r[  r  rL  rM  r  r  r   rr   r  r  r  c                 S   s   g | ]}|   qS rS   )cpunumpy)r   generation_logitrS   rS   rX   r  %  s    rT  c                 S   s   g | ]}| d   qS r  )	unsqueezer^  r_  r  rS   rS   rX   r  /  s    rU  rS  An error occurred: {0}NrS   )rQ   r  rq  chardecodeastypetolistr  arrayr  rP   bytes_rf   r  r<  ri  )rt   rQ  output_dictcontext_logits_availablegeneration_logits_availablerF  infer_inputr  r  rL  r  output_textsrT  rU  errorerr_msgrS   rS   rX   triton_infer_fn  sn   


$zTensorRTLLM.triton_infer_fnc              
   k   s   zdt |di}d|v r|d|d< d|v r!|d|d< d|v r,|d|d< d|v r7|d|d< d|v rB|d|d< d|v rVt |d}d	d
 |D |d< d|v rjt |d}dd
 |D |d< d|v ru|d|d< d|v rtjj|dddd}|d |d< d|v rtjj|dddd}|d  |d< | jdi |ddi}|D ]}dt|tj	iV  qW dS  t
y }	 zdt|	}
t|
gtj	}d|iW  Y d}	~	S d}	~	ww )rW  r  rF  r   r  r  r  rK  r  c                 S   rX  rS   rS   rY  rS   rS   rX   r  L  r[  z9TensorRTLLM.triton_infer_fn_streaming.<locals>.<listcomp>r  c                 S   rX  rS   rS   r\  rS   rS   rX   r  O  r[  r  rL  rM  r  r  r   rr   r  r  TrS  rb  NrS   )rQ   r  rq  rc  rd  re  rf  r  rP   rh  rf   r  r<  )rt   rQ  rl  r  r  rL  r  partial_outputsrm  rn  ro  outputrS   rS   rX   triton_infer_fn_streaming:  sJ   z%TensorRTLLM.triton_infer_fn_streamingc                 C   s:  d| _ | jD ]}| j |d jddk r|d jdd| _ qg }g | _i | _d}t| jD ]:\}}|d }|jdd}tj| j | jf|j	d}||d |d d f< |
| || j|d < | j
| |d }q,t|dkrtj|ddd| j| _| jd d	 }	| jjd }
|
|	krtd
|
 d|	 dd S d | _d S )Nr   r  r  r  r  rv   rH  r*  r   z1The size of the combined prompt embedding table (z3) is greater than max_prompt_embedding_table_size (z).)rp   rn   r  rq   rr   r  rt  zerosrE  r   r  ri  stackviewro   rm   r  rf   )rt   r
  vtokens_embeddingstidr  ptuning_tableoriginal_tablevtoken_countpadded_tabler   actual_prompt_table_sizerS   rS   rX   r	  c  s8   



zTensorRTLLM._prep_ptuning_tablec                 C   sv   | j d ur9ttj| j d}| r4t|d}t|| _	W d    n1 s)w   Y  | 
  d S g | _	d S d S )Nr  rb)r_   r   rg   rh   ri   r   r8  r  rG   rn   r	  )rt   pt_pathr>  rS   rS   rX   _load_prompt_tables  s   

zTensorRTLLM._load_prompt_tablesc              	   C   s   t |r}|d }| s|d }| std||d}t|}W d    n1 s1w   Y  d}d|v rA|d }nd|v rJ|d }nd|v r^d	|d v r[|d d	 }nd
}nd
}|sitd|| 	 W  d    S 1 syw   Y  d S )Nzmodel_weights.ckptzmp_rank_00/model_weights.ckptz|File: {0} could not be found in the nemo checkpoint. Please check the nemo checkpoint format for the prompt embedding table.r~  Tz=model.embedding.adapter_layer.ptuning_adapter.inference_tablezqmodel.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weightr  z.prompt_table.taskname.prompt_embeddings.weightFzPCould not find the embedding table in the {0}. Please check the nemo file format)
r5   r   r7  r  r8  rt  rG   rf   r^  detach)rt   r  checkpoint_archivemw_pathmw_filer  weights_foundrS   rS   rX    _get_prompt_embedding_table_ckpt  sD   


$z,TensorRTLLM._get_prompt_embedding_table_ckptc                 C   s   |d ur|d urt d d}n|d urd}n	|d urd}ndS |dkr?t|tjs.tdt|jdkr9tdt	
|}n|dkrRt|sMt|d | |}| jd	 d
 }|jtj|d }|jdd| jd	 d kr~td| jd	 d |S )Nzgprompt_embeddings_table will be used and prompt_embeddings_checkpoint_path will be ignored for ptuning.	use_tableuse_checkpointNNz<Only numpy array is allowed for the prompt embeddings table.r   zNA two dimensional prompt embeddings table for a single task is only supported.z is not a nemo file.r2  r   r  rv   r  r^  z_Hidden dimension of the model is {0} and does not match with the dimension of the prompt table.)r   r   r   rq  rr  	TypeErrorri  r  rf   rt  
from_numpyrL   r  rm   tor  _utilsstr_dtype_to_torchr  r  r  )rt   r  r  p_tuningr   rS   rS   rX   r    sB   

z'TensorRTLLM._get_prompt_embedding_tablec                 C   sb   t | jd }| r)t|d}t|| _W d    d S 1 s"w   Y  d S td| d)Nr/  r0  zFile: z could not be found.)r   rj   r   r8  r9  rG   rm   r7  )rt   r=  r>  rS   rS   rX   _load_config_file  s   "zTensorRTLLM._load_config_filec              
   C   s   d | _ d | _d | _g | _t| j rUt| j}t	|dkrWz$| 
  t| j| _t| j| j| j| j| j| j| jd| _ |   W d S  tyT } ztd|d }~ww d S d S )Nr   )rl   rj   r`   rb   rc   rd   re   zXFiles in the TensorRT-LLM folder are corrupted and the model needs to be exported again.)rk   rl   rm   rn   r   r_   r   rg   rj  ri  r  r=   rG   rj   r`   rb   rc   rd   re   r  rf   r;  )rt   foldersrn  rS   rS   rX   rs     s<   	zTensorRTLLM._loadc                 C   s
   t   dS )zUnload engineN)rJ   r  rS   rS   rX   rJ   
  s   
zTensorRTLLM.unload_engine)NTTNNF) NTrv   rv   Nrw   Nrx   NFFTTFNTNNry   NNrz   Fr{   r{   TTNNFFr   )rx   rv   rw   rw   NNNrz   r{   TFTr   FFrv   FNT)NTrv   rv   NFFr6  )r  r  r   TFT)T)ry   rv   r  r   NNNNNNNFFFFr  )0__name__
__module____qualname____doc__r<  r   boolr  ru   r   r  r-  r  r   r   r  r   r}  r  r  staticmethodr  r   rI   floatr  r  r  r  propertyr   r  rE  rR  rV  r[   r\   rq  rr  rp  rs  r	  r  r  r  r  rs   rJ   rS   rS   rS   rX   r^      s   
1	
 !"
  {	

 &$!	

R 
	

 
#	

 

N



C'"
,
.r^   )r  r9  loggingrg   r  r   r   r   r   pathlibr   typingr   r   r   r   r_  rq  rs  r  rt  torch.nn.functionalnn
functionalr  wrapttensorrt_llm._commonr   tensorrt_llm._utilsr	   tensorrt_llm.builderr
   tensorrt_llm.commands.buildr   r'  tensorrt_llm.mappingr   tensorrt_llm.modelsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   tensorrt_llm.pluginr2   transformersr3   nemo.deployr4   nemo.export.tarutilsr5   -nemo.export.trt_llm.converter.model_converterr6   r7   3nemo.export.trt_llm.converter.model_to_trt_llm_ckptr8   r9   #nemo.export.trt_llm.converter.utilsr:   .nemo.export.trt_llm.nemo_ckpt_loader.nemo_filer;   r<   r=   r>   r?   nemo.export.trt_llm.qnemor@   )nemo.export.trt_llm.qnemo.tokenizer_utilsrA   rB   nemo.export.trt_llm.qnemo.utilsrC   &nemo.export.trt_llm.tensorrt_llm_buildrD   $nemo.export.trt_llm.tensorrt_llm_runrE   rF   rG   rH   rI   rJ   nemo.export.trt_llm.utilsrK   nemo.export.utilsrL   rM   rN   nemo.export.utils.constantsrO   
use_deploynemo.deploy.utilsrP   rQ   rf   	getLoggerr   	decoratorrZ   use_pytritonr[   pytriton.decoratorsr\   pytriton.model_configr]   r^   rS   rS   rS   rX   <module>   sr   ' 

	