o
    }o™iA,  ã                
   @   s<  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ z$d dlm  mZ d dl m!Z! ej"ej#ej$ej%ej&ej'ej(dœZ)dZ*W n e+e,fy Z- z
dZ*e-Z.W Y dZ-[-ndZ-[-ww g d¢Z/G dd„ dƒZ0dS )é    N)Únullcontext)ÚCallableÚOptional)Úparallel_state)ÚFloat16Module)Ú
DictConfigÚ	open_dict)ÚMegatronGPTModel)Útorch_dtype_from_precision)Úlogging)Útemporary_directory)Úsave_artifactsÚunwrap_model)Úexport_tensorrt_llm_checkpoint)Úint8Úint8_sqÚfp8Úint4_awqÚw4a8_awqÚint4Únvfp4TF)é   Ú16Úbf16c                   @   sŽ   e Zd ZdZdee dee fdd„Zedefdd„ƒZ	ed	ed
efdd„ƒZ
edefdd„ƒZdedeegdf fdd„Zdefdd„ZdS )Ú	Quantizera?  Post-training quantization (PTQ) and TRT-LLM export of Nemo checkpoints.

    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
    The process consist of several steps:

        1. Loading a Nemo model from disk using appropriate parallelism strategy
        2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
        3. Producing output directory or .qnemo tarball with model config (json),
           quantized weights (safetensors) and tokenizer config (yaml).

    The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox
    for efficient inference. This can be achieved using Nemo inference containers.

    Currently supported and tested model family is Llama2. Model type needs to be specified in
    the quantization command with decoder_type parameter on exporting (see below). Quantizing other
    model families is experimental and might not be fully supported.

    Available quantization methods are listed in `QUANT_CFG_CHOICES` dictionary above.
    Please consult Model Optimizer documentation https://nvidia.github.io/TensorRT-Model-Optimizer/ for details.
    You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
    for quantization algorithms and calibration data as well as recommended settings.

    Quantization algorithm can also be conveniently set to 'null' to perform only weights export step
    for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model.
    Úquantization_configÚexport_configc                 C   sN  t stdƒt‚|| _|| _|jdu s|jtv sJ d|j› ƒ‚|jdurt|j }d|jv rD|d d }t|tƒr=|d }|j	|d d	< | 
d
d¡}|du rXd|jvoW|jdk}t |r^dnd› d¡ |jdkrkdndd|dœ|d d< |jdkr‹t d|j› ¡ d|jdœ|d< || _nd| _|dur£|jtv s¥J d|j› ƒ‚dS dS )a  Initialize Quantizer with quantization and export configurations.

        Expected keys in `quantization_config`:
            - algorithm: str
            - decoder_type: str
            - awq_block_size: int (only for awq algorithms)
            - sq_alpha: float (only for smooth quant algorithms)
            - enable_kv_cache: bool (default: None i.e. auto-detect based on algorithm and decoder_type)

        Expected keys in `export_config`:
            - dtype: str/int
            - decoder_type: str
            - inference_tensor_parallel: int
            - inference_pipeline_parallel: int
            - save_path: str
        z*nvidia-modelopt is needed to use QuantizerNz$Unsupported quantization algorithm: ÚawqÚ	quant_cfgz*weight_quantizerr   Úblock_sizeséÿÿÿÿÚenable_kv_cacher   ÚgptÚEnabledÚDisabledz KV cache quantizationr   é   )é   é   )Únum_bitsÚaxisÚenablez*output_quantizerzUsing int8_sq alpha = Úsmoothquant)ÚmethodÚalphaÚ	algorithmzUnsupported export dtype: )ÚHAVE_MODELOPTÚRuntimeErrorÚHAVE_MODELOPT_ERRORr   r   r.   ÚQUANT_CFG_CHOICESÚ
isinstanceÚlistÚawq_block_sizeÚgetÚdecoder_typer   ÚinfoÚsq_alphar   ÚdtypeÚSUPPORTED_DTYPE)Úselfr   r   r   Úweight_quantizerÚenable_quant_kv_cache© r?   úR/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/quantize/quantizer.pyÚ__init__S   s@   

ÿ


ÿý
ÿzQuantizer.__init__Úmodelc                 C   sn   z	d| j jjj_W n	 ty   Y nw t ¡ s5dd„ }| jj	j
dur-| jj	j
j|| jd | jj	 ¡  dS dS )zSetup model for quantization.Nc                   S   s   d S )Nr?   r?   r?   r?   r@   Údummy›   s   zQuantizer._setup.<locals>.dummy)Útrainer)rB   ÚmoduleÚlanguage_modelÚencoderÚactivations_checkpoint_methodÚAttributeErrorr   Úis_initializedrD   ÚstrategyÚlauncherÚlaunchÚsetup_environment)rB   rC   r?   r?   r@   Ú_setup‘   s   ÿùzQuantizer._setupÚ	model_cfgÚreturnc                 C   sV   t | ƒ |  dd¡rt d¡ d| _d| _d| _W d  ƒ | S 1 s$w   Y  | S )z%Modify model config for quantization.Úsequence_parallelFz2Disabling sequence parallelism for quantization...ÚmodeloptN)r   r6   r   ÚwarningrR   ÚnameÚapply_rope_fusion)rP   r?   r?   r@   Úmodify_model_config¢   s   


ûùzQuantizer.modify_model_configc                 C   s<   t  d¡ | jddgdddœd}t  d|d › d	¡ d
S )z,Generate sample output for a model instance.z)Generating sample output for the model...z-Born in north-east France, Soyer trained as az&Born in California, Soyer trained as aéd   )Ú
max_lengthÚ
min_length)ÚinputsÚlength_paramsz#Example NeMo output before export: Ú	sentencesú"N)r   r8   Úgenerate)rB   Úresponser?   r?   r@   Ú_sample_output®   s   
þþûzQuantizer._sample_outputÚforward_loopNc                    s¤   | j dus	J dƒ‚t d| jj› d¡ |  |¡ t || j |¡}| jjdkrEd‰ | jjdkr2d‰ n| jjd	kr:d
‰ t 	|d‡ fdd„¡}t
 ¡ dkrPt |¡ |S )z:Quantize the model and calibrate using given forward loop.Nz!Quantization algorithm is not setzQuantizing model to ú...r"   r   r   iÀ  r   é   z*input_quantizerc                    s   t j| dˆ  dS )Ng{®Gáz„?)Úmin)ÚtorchÚclamp)Úamax©Úmaxboundr?   r@   Ú<lambda>Ò   s    z$Quantizer.quantize.<locals>.<lambda>)r   r   r8   r   r.   rO   ÚmtqÚquantizer7   Úpostprocess_amaxÚdistÚget_rankÚprint_quant_summary)r<   rB   rb   r?   ri   r@   rm   À   s    
ÿ
zQuantizer.quantizec              
   C   sˆ  | j dus	J dƒ‚t| j jƒ}| j  dd¡r|  |¡ |jjr&t|jt	ƒ|_| j  dd¡}|r3t
ƒ }nt| j jd}|~}t|| j j||| j j| j j|jjdkd	 t ¡  t d
| j j› d¡ t ¡ dkr¢t||ƒ |rªtjtj | j j¡dd t | j jd¡}|j|dd W d  ƒ n 1 s•w   Y  W d  ƒ dS W d  ƒ dS W d  ƒ dS W d  ƒ dS 1 s½w   Y  dS )z>Export model to '.qnemo' format for TensorRT-LLM engine build.NzExport config is not setÚsample_outputTÚcompressF)Úenter_resulté   )rB   r7   r:   Ú
export_dirÚinference_tensor_parallelÚinference_pipeline_parallelÚuse_nfs_workspacezFExporting quantized weights, model artifacts, and tokenizer config to rc   r   )Úexist_okÚwz./)Úarcname) r   r
   r:   r6   ra   ÚcfgÚmegatron_amp_O2r   rB   r   r   r   Ú	save_pathr   r7   rw   rx   rD   Ú	num_nodesro   Úbarrierr   r8   rp   r   ÚosÚmakedirsÚpathÚdirnameÚtarfileÚopenÚadd)r<   rB   Útorch_dtypers   Úexport_handlerrv   Útarr?   r?   r@   ÚexportÚ   sP   

ù	ÿÿ
ÿíñï"ízQuantizer.export)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   rA   Ústaticmethodr	   rO   rW   ra   r   rm   rŒ   r?   r?   r?   r@   r   8   s    >r   )1r‚   r†   Ú
contextlibr   Útypingr   r   rf   Útorch.distributedÚdistributedro   Úmegatron.corer   Ú megatron.core.transformer.moduler   Úomegaconf.omegaconfr   r   Ú@nemo.collections.nlp.models.language_modeling.megatron_gpt_modelr	   Ú&nemo.collections.nlp.parts.utils_funcsr
   Ú
nemo.utilsr   Únemo.utils.distributedr   Únemo.utils.model_utilsr   r   Úmodelopt.torch.quantizationÚquantizationrl   Úmodelopt.torch.exportr   ÚINT8_DEFAULT_CFGÚINT8_SMOOTHQUANT_CFGÚFP8_DEFAULT_CFGÚINT4_AWQ_CFGÚW4A8_AWQ_BETA_CFGÚINT4_BLOCKWISE_WEIGHT_ONLY_CFGÚNVFP4_DEFAULT_CFGr2   r/   ÚImportErrorÚModuleNotFoundErrorÚer1   r;   r   r?   r?   r?   r@   Ú<module>   sB   ù
€þ