o
    }oi_                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ e
rd dl0m1Z2 d dl3m4Z4 d dl5m6Z6 e-d\Z7Z8e-d\Z9Z:e-d\Z;Z<e:oe8oe<Z=e Z>g dZ?g dZ@dddZAeeeBf ZCeG dd  d ZDeG d!d" d"ZEG d#d$ d$ZF	dDd%eCd&eCd'ed( d)edB fd*d+ZGd,d- ZH	0dEd1eBd2eId3eId4eIfd5d6ZJd7d8 ZKejLd9fejMd:fejNd;fejOd<fejPd=fejQd>fejRd>fejSd>fejTd?fejUd@fejVd?fejWd?fejXdAfgZYd'eejZej[f d)eeB fdBdCZ\dS )F    N)	dataclass)Path)TYPE_CHECKINGOptionalUnion)load_dataset)CommonInferenceParams)tqdm)PreTrainedTokenizerBase)llm)MCoreTokenizerWrapppergenerate)get_quant_cfg_choices)load_quant_cfg)barriertorch_dtype_from_precision)ckpt_to_context_subdir) load_connector_from_trainer_ckpt)TrainerContextckpt_to_weights_subdir)logging)is_global_rank_zero)safe_import)unwrap_model)Trainer)MegatronParallelzmodelopt.torch.exportzmodelopt.torch.quantizationzmodelopt.torch.opt)   16bf16)trtllmnemohf
FP8_KV_CFGNVFP4_KV_CFG)fp8nvfp4c                   @   s   e Zd ZU dZdZee ed< dZe	ed< dZ
eed< dZee ed	< dZeed
< dZeed< dZe	ed< dZe	ed< dZe	ed< dS )QuantizationConfiga  Quantization parameters.

    Available quantization methods are listed in `QUANT_CFG_CHOICES` dictionary above.
    Please consult Model Optimizer documentation https://nvidia.github.io/TensorRT-Model-Optimizer/ for details.

    Quantization algorithm can also be conveniently set to None to perform only weights export step
    for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model.
    r$   	algorithm   awq_block_size      ?sq_alphaNenable_kv_cachekv_cache_qformatcnn_dailymailcalibration_dataset   calibration_dataset_size@   calibration_batch_sizecalibration_seq_len)__name__
__module____qualname____doc__r'   r   str__annotations__r)   intr+   floatr,   boolr-   r/   r1   r3   r4    r>   r>   h/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/llm/modelopt/quantization/quantizer.pyr&   A   s   
 	r&   c                   @   sv   e Zd ZU dZeed< dZeed< dZeee	f ed< dZ
ee ed< d	Ze	ed
< d	Ze	ed< dZeed< dd ZdS )ExportConfigzInference configuration for the quantized TensorRT-LLM checkpoint.

    Available export formats methods are listed in `SUPPORTED_EXPORT_FMT` dictionary above.
    pathr   export_formatr   dtypeNdecoder_type   inference_tpinference_ppFgenerate_samplec                 C   s   t | j| _d S N)r   rA   )selfr>   r>   r?   __post_init__g   s   zExportConfig.__post_init__)r5   r6   r7   r8   r9   r:   rB   rC   r   r;   rD   r   rF   rG   rH   r=   rK   r>   r>   r>   r?   r@   X   s   
 r@   c                   @   s   e Zd ZdZdedefddZed'dd	Zd(de	de
e fddZedd Zdd Zdd Zd)d*ddZ	d+ddZededede	fddZdeded efd!d"Zd)ded#e
d$ ddfd%d&ZdS ),	Quantizera  Post-training quantization (PTQ) and export of NeMo 2.0 checkpoints.

    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
    The process consist of several steps:

        1. Loading a Nemo model from disk using appropriate parallelism strategy
        2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
        3. Producing an output directory with a quantized checkpoint and a tokenizer

    By default, the output directory produced is intended to be consumed by TensorRT-LLM toolbox
    for efficient inference. This can be achieved using nemo.export.tensorrt_llm module.
    This can be changed to export a standard NeMo 2.0 checkpoint instead using `ExportConfig`.
    quantization_configexport_configc                 C   s|   t stdtj std|| _|| _|j}|j	r(|j
tv s(J d|j
 |dur7|tv s7J d| t|| _dS )zAInitialize Quantizer with quantization and export configurations.z*nvidia-modelopt is needed to use Quantizerz%GPU is required for the quantization.z*Unsupported kv cache quantization format: NzUnsupported export dtype: )HAVE_MODELOPTRuntimeErrortorchcudais_availableEnvironmentErrorrM   rN   rC   r,   r-   KV_QUANT_CFG_CHOICESSUPPORTED_DTYPEr   torch_dtype)rJ   rM   rN   rC   r>   r>   r?   __init__z   s   

zQuantizer.__init__returnNc                 C   s(   t | tjrdS | jj| j_|   dS )zSetup model for quantization.N)
isinstancer   HFAutoModelForCausalLM	tokenizer
vocab_sizeconfigfreezemodelr>   r>   r?   _setup   s   zQuantizer._setupFoptionalc                 C   sb   | j jdur
| j jS |}t|tjtjfs!|j}t|tjtjfrt| }r)|S |s/tddS )a8  
        Determines the decoder type for the given model. It is used for exporting a model to
        a TensorRT-LLM checkpoint and for configuring certain parameters in the quantization algorithm.

        Args:
            model: The model instance for which the decoder type needs to be determined.
            optional (bool): Allow to return None if the decoder type cannot be inferred.
                Otherwise an exception will be raised in such cases.

        Returns:
            Optional[str]: The decoder type as a string if it can be determined.
        NzxCould not infer the decoder type for the provided model. Please provide the decoder type explicitly in the ExportConfig.)	rN   rD   rZ   r   GPTModelr[   moduleget_modelopt_decoder_type
ValueError)rJ   ra   rc   unwrapped_modelrD   r>   r>   r?   _get_decoder_type   s   zQuantizer._get_decoder_typec           
   	      s   ddg}g }t  tjrB|D ]2} jj|dd} fdd| D } jjdi |ddi} jjj|d	 d
d}|| qn(t	 j} j
tjdd}dd t|||tddddD }	dd t||	D }td|  d S )Nz-Born in north-east France, Soyer trained as az&Born in California, Soyer trained as apt)return_tensorsc                    s    i | ]\}}||  jjqS r>   )tora   device).0kvr`   r>   r?   
<dictcomp>        z.Quantizer._generate_sample.<locals>.<dictcomp>max_new_tokens   r   T)skip_special_tokens)params_dtype&inference_batch_times_seqlen_thresholdc                 S   s   g | ]}|j qS r>   )generated_text)rn   rr>   r>   r?   
<listcomp>   s    z.Quantizer._generate_sample.<locals>.<listcomp>rE   )top_knum_tokens_to_generate)inference_paramsc                 S   s   g | ]\}}|| qS r>   r>   )rn   prompt
generationr>   r>   r?   rz      s    z,Sample generation after PTQ (with prompts): r>   )rZ   r   r[   r\   itemsra   r   decodeappendr   get_inference_wrapperrQ   bfloat16r   zipr   info)
ra   promptsoutputsr~   	input_idsoutputdecodedmcore_tokenizermcore_inference	generatedr>   r`   r?   _generate_sample   s0   

	zQuantizer._generate_samplec                    sr   t || jj| jj| jj| jjd| jj| jj }t|tjr,|j	j
  fdd}|S | j|| jj| jjdS )N)datasetseq_len
batch_sizecalibration_sizec                    s"    }|D ]	}| |   qd S rI   )rl   )ra   
dataloaderbatchrm   get_dataloaderr>   r?   huggingface_forward_loop   s   z=Quantizer._get_forward_loop.<locals>.huggingface_forward_loop)num_batches
seq_lengthmicro_batch_size)create_data_iterator_getterrM   r/   r4   r3   r1   rZ   r   r[   ra   rm   create_megatron_forward_loop)rJ   ra   number_of_batchesr   r>   r   r?   _get_forward_loop   s&   zQuantizer._get_forward_loopc                 C   sz  | j |dd}| jj}tj|rt|S |tv s J d| t| }d|v rWt	|}|d d }t
|tr<|d }| jjrH| jj|d d	< d
|krW|dv rWddd|d< | jjd u rfd|vod|dk}n| jj}| jjd u rx|rxtd nt|r~dnd d |r|ddddii|d< |d ttt| jj d  |dd sd|d< |dkrd|v rddd|d< |S )NTrc   z!Unsupported quantization format: awq	quant_cfgz*weight_quantizerr   block_sizesw4a8_awq)gemmamptawq_literE   )method
alpha_stepr'   int8gptzPEnabled KV cache quantization but enable_kv_cache is None in quantization_configEnabledDisabledz KV cache quantizationdefaultenableFmaxr   int8_sqsmoothquantr*   )r   alpha)ri   rM   r'   osrA   isfiler   QUANT_CFG_CHOICEScopydeepcopyrZ   listr)   r,   r   warningr   getupdategetattrmtqrU   r-   )rJ   ra   rD   r'   r   weight_quantizerenable_quant_kv_cacher>   r>   r?   _get_quant_cfg   s>   

zQuantizer._get_quant_cfgra   r   c                    s
  | j j}|du rtd |S td| d | | | j|dd}| |}tdt|  |du rCt	j
|rC| |}t	t|||}|dkrm| d	krY d
 n	dkr_d n	 d t	|d fdd}t rut	| | jjrtd | | |S )zQuantize the model and calibrate using given forward loop.

        If forward_loop is not provided, a forward loop will be created using the calibration dataset.
        NzEQuantization algorithm set to None, returning the non-quantized modelzQuantizing model to z...Tr   zUsing quant_cfg:
r   r$   i  r      r   z*input_quantizerc                    s   t j| d  dS )Ng{Gz?)min)rQ   clamp)amaxmaxboundr>   r?   <lambda>D  s    z$Quantizer.quantize.<locals>.<lambda>z4Generating a sample output after model quantization.)rM   r'   r   r   rb   ri   r   pprintpformatr   r^   need_calibrationr   quantizeunwrap_for_modelopt_operationspostprocess_amaxr   print_quant_summaryrN   rH   r   )rJ   ra   forward_loopr'   rD   r   rh   r>   r   r?   r   #  s:   







zQuantizer.quantizec                    s6   ddl m} | dd  fdd}|S )z5Create a forward loop for over a given data iterator.r   )get_forward_backward_funcc                 S   sH   t | }|j\}}tj||jd||f}|||d }dd }||fS )Nrm   c                 S   s   t di fS )NrE   )rQ   zeros)tensorr>   r>   r?   _mock_loss_function^  s   z^Quantizer.create_megatron_forward_loop.<locals>.forward_step_func.<locals>._mock_loss_function)nextshaperQ   arangerm   expand)data_iteratorra   data	batch_lenr   position_idsoutput_tensorr   r>   r>   r?   forward_step_funcX  s   
zAQuantizer.create_megatron_forward_loop.<locals>.forward_step_funcc              
      s"    }||  dd d S )NT)r   r   ra   num_microbatchesr   r   decoder_seq_lengthforward_onlyr>   )ra   r   r   forward_backward_funcr   r   r   r   r   r>   r?   loopc  s   
z4Quantizer.create_megatron_forward_loop.<locals>.loop))megatron.core.pipeline_parallel.schedulesr   )rJ   r   r   r   r   r   r   r   r>   r   r?   r   P  s
   z&Quantizer.create_megatron_forward_loopcheckpoint_dirtensor_parallelism_sizec                 C   sP   | d   }d}t|D ]}|| d| d   M }q|o|}|s&td |S )z(Basic validation of the model structure.zconfig.jsonTrankz.safetensorsz%Failed to export the quantized model.)existsranger   error)r   r   saved_configsaved_weightsiexport_successfulr>   r>   r?   _validate_quantized_checkpointr  s   
z(Quantizer._validate_quantized_checkpoint	model_dir
export_dir
export_fmtc                 C   s   t  r|dkr	d S t|tj}|r#|dkr|d }|jt| d S |dkrDt|drDt|jdrDt|jjtrD|jjt| d S t	j
t|tj|ddd d S )Nr    r!   huggingface_tokenizerr\   nemo_contextT)dirs_exist_ok)r   rZ   r   r[   r\   save_pretrainedr9   hasattrr
   shutilcopytreer   r   rA   join)rJ   ra   r   r   r   is_automodelr>   r>   r?   _save_tokenizer  s$   

zQuantizer._save_tokenizertrainerr   c              
   C   s  ddl m} | jj}| jj}|tv sJ d| t|tj}| jjdkrY|r+J d|dus3J d|	| t
  t rXt|jt|dgd	 tt|d
d  sXJ nY| jjdkrgt|||d nK| jj}| jj}	| ow|jjdk}
t   ||dd tjt|| || j|||	|
d W d   n1 sw   Y  t
  t r| ||sJ t r|  |||| t!"d| d dS dS )z2Export model to a TensorRT-LLM or NeMo checkpoint.r   )remove_hook_from_modulezUnsupported export format: r    zXNeMo export format can only be used with native NeMo checkpoints, not HuggingFace modelsNz!Trainer required for NeMo export.ra   )
yaml_attrsFmodelopt_stater!   r`   rE   T)recurse)ra   rD   rC   r   inference_tensor_parallelinference_pipeline_paralleluse_nfs_workspacez-Export succeeded, model has been exported to .)#accelerate.hooksr  rN   rA   rB   SUPPORTED_EXPORT_FMTrZ   r   r[   save_checkpointr   r   r   from_trainerio_dumpr   r   r   r   export_hf_checkpointrF   rG   r^   pipeline_model_parallel_sizerQ   inference_modemteexport_tensorrt_llm_checkpointr   ri   rW   r   r   r   r   )rJ   ra   r   r   r  r   r   r   rF   rG   r  r>   r>   r?   export  sR   

zQuantizer.export)rY   N)FrI   )ra   r   )NNN)r5   r6   r7   r8   r&   r@   rX   staticmethodrb   r=   r   r9   ri   r   r   r   r   r   r   r;   r   r   r  r>   r>   r>   r?   rL   k   s"    
3.
" rL   r   r   ra   zpl.LightningModulerY   c              	   K   sF  t |tjr:t|}tj|sdS t  t	j
|fdt|i| W d   t|S 1 s1w   Y  t|S t| d}|du rJ|| \}}t|}tj|sVdS t = t }|j| t	j|f|t|d| W d   n1 s~w   Y  W d   t|S W d   t|S 1 sw   Y  t|S )zHExport a GPTModel or HFAutoModelForCausalLM to a HuggingFace checkpoint.Nr   r!   )pretrained_model_name_or_pathr   )rZ   r   r[   r   mtoModeloptStateManageris_convertedrQ   r  r  r  r9   r   	nemo_loadtempfileTemporaryDirectoryr^   r   export_mcore_gpt_to_hfr   )r   r   ra   kwargsrh   exporter_tmp_dirr>   r>   r?   r    sD   






r  c                 C   s   t | tjr	| jS t| S )zUnwraps the model to expose the underlying architecture that Model Optimizer can work with.
    For HuggingFace models, returns the base model. For MCore models, returns the unwrapped version.)rZ   r   r[   ra   r   r`   r>   r>   r?   r     s   r   r.   r2   r0   r   r   
calib_sizemax_sequence_lengthc           	      c   s    | dkrt dddd}d}n| dkrt dddd}d	}n	t d
| dd}d}ttt|||}t|| D ]&}||| |d |  | }tt|D ]}|| d| ||< qL|V  q6dS )z/Creates a sample data iterator for calibration.wikitextzwikitext-103-v1train)splittextr.   z3.0.0)namer%  articlejson)
data_filesr%  rE   N)r   r   r   lenr   )	r   r   r!  r"  r   text_columnr   r   jr>   r>   r?   get_calib_data_iter  s    r.  c                    s    fdd}|S )z>Create a function that provides iterator over a given dataset.c                     sp   d} t |   d}g }|D ]!}fdd|D }fdd|D }|tj|jd qtt|S )N   )r   r"  r   r!  c                    s    g | ]} j |d  qS rI   )r\   text_to_ids)rn   r&  ra   r   r>   r?   rz     rr   zFcreate_data_iterator_getter.<locals>._get_iterator.<locals>.<listcomp>c                    s&   g | ]}|t |  jjg  qS r>   )r+  r\   eos)rn   idsr1  r>   r?   rz     s   & r   )r.  r   rQ   r   rm   iterr	   )CHARACTERS_PER_TOKENr   r   r   r   r   r   ra   r   r>   r?   _get_iterator  s   z2create_data_iterator_getter.<locals>._get_iteratorr>   )ra   r   r   r   r   r7  r>   r6  r?   r     s   r   baichuanchatglmgemma2gemma3r   llamar   qwenphi3c                 C   s>   t | tjrtj| jS tD ]\}}t | |r|  S qdS )a  Infers the modelopt decoder type from GPTModel or HFAutoModelForCausalLM.

    Args:
        model (GPTModel | HFAutoModelForCausalLM): The model to infer the decoder type from.
    Returns:
        Optional[str]: The inferred decoder type or None if no match is found.
    N)rZ   r   r[   r  model_utilsget_model_typera   gpt_model_type)ra   config_classrD   r>   r>   r?   rf   .  s   
rf   rI   )r.   r2   r0   r0   )]r   r   r   r   r  dataclassesr   pathlibr   typingr   r   r   rQ   datasetsr   /megatron.core.inference.common_inference_paramsr   r	   transformersr
   nemo.collectionsr   nemo.collections.llm.inferencer   r   <nemo.collections.llm.modelopt.quantization.quant_cfg_choicesr   0nemo.collections.llm.modelopt.quantization.utilsr   nemo.collections.llm.utilsr   r   nemo.lightning.ckpt_utilsr   nemo.lightning.io.apir   nemo.lightning.io.plr   r   
nemo.utilsr   nemo.utils.get_rankr   nemo.utils.import_utilsr   nemo.utils.model_utilsr   lightning.pytorchpytorchplnemo.lightningr    nemo.lightning.megatron_parallelr   r  HAVE_MODELOPT_MTEr   HAVE_MODELOPT_MTQr  HAVE_MODELOPT_MTOrO   r   rV   r
  rU   r9   AnyPathr&   r@   rL   r  r   r;   r.  r   Baichuan2ModelChatGLMModelGemma2ModelGemma3Model
GemmaModel
LlamaModelMistralModelMixtralModelNemotronModel
Qwen2ModelStarcoderModelStarcoder2Model	Phi3ModelrA  rd   r[   rf   r>   r>   r>   r?   <module>   s     `


&