o
    }oiA,                  
   @   s<  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ z$d dlm  mZ d dl m!Z! ej"ej#ej$ej%ej&ej'ej(dZ)dZ*W n e+e,fy Z- z
dZ*e-Z.W Y dZ-[-ndZ-[-ww g dZ/G dd dZ0dS )    N)nullcontext)CallableOptional)parallel_state)Float16Module)
DictConfig	open_dict)MegatronGPTModel)torch_dtype_from_precision)logging)temporary_directory)save_artifactsunwrap_model)export_tensorrt_llm_checkpoint)int8int8_sqfp8int4_awqw4a8_awqint4nvfp4TF)   16bf16c                   @   s   e Zd ZdZdee dee fddZedefddZ	ed	ed
efddZ
edefddZdedeegdf fddZdefddZdS )	Quantizera?  Post-training quantization (PTQ) and TRT-LLM export of Nemo checkpoints.

    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
    The process consist of several steps:

        1. Loading a Nemo model from disk using appropriate parallelism strategy
        2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
        3. Producing output directory or .qnemo tarball with model config (json),
           quantized weights (safetensors) and tokenizer config (yaml).

    The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox
    for efficient inference. This can be achieved using Nemo inference containers.

    Currently supported and tested model family is Llama2. Model type needs to be specified in
    the quantization command with decoder_type parameter on exporting (see below). Quantizing other
    model families is experimental and might not be fully supported.

    Available quantization methods are listed in `QUANT_CFG_CHOICES` dictionary above.
    Please consult Model Optimizer documentation https://nvidia.github.io/TensorRT-Model-Optimizer/ for details.
    You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
    for quantization algorithms and calibration data as well as recommended settings.

    Quantization algorithm can also be conveniently set to 'null' to perform only weights export step
    for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model.
    quantization_configexport_configc                 C   sN  t stdt|| _|| _|jdu s|jtv sJ d|j |jdurt|j }d|jv rD|d d }t|tr=|d }|j	|d d	< |
d
d}|du rXd|jvoW|jdk}t|r^dnd d |jdkrkdndd|d|d d< |jdkrtd|j  d|jd|d< || _nd| _|dur|jtv sJ d|j dS dS )a  Initialize Quantizer with quantization and export configurations.

        Expected keys in `quantization_config`:
            - algorithm: str
            - decoder_type: str
            - awq_block_size: int (only for awq algorithms)
            - sq_alpha: float (only for smooth quant algorithms)
            - enable_kv_cache: bool (default: None i.e. auto-detect based on algorithm and decoder_type)

        Expected keys in `export_config`:
            - dtype: str/int
            - decoder_type: str
            - inference_tensor_parallel: int
            - inference_pipeline_parallel: int
            - save_path: str
        z*nvidia-modelopt is needed to use QuantizerNz$Unsupported quantization algorithm: awq	quant_cfgz*weight_quantizerr   block_sizesenable_kv_cacher   gptEnabledDisabledz KV cache quantizationr      )      )num_bitsaxisenablez*output_quantizerzUsing int8_sq alpha = smoothquant)methodalpha	algorithmzUnsupported export dtype: )HAVE_MODELOPTRuntimeErrorHAVE_MODELOPT_ERRORr   r   r.   QUANT_CFG_CHOICES
isinstancelistawq_block_sizegetdecoder_typer   infosq_alphar   dtypeSUPPORTED_DTYPE)selfr   r   r   weight_quantizerenable_quant_kv_cache r?   R/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/quantize/quantizer.py__init__S   s@   






zQuantizer.__init__modelc                 C   sn   z	d| j jjj_W n	 ty   Y nw t s5dd }| jj	j
dur-| jj	j
j|| jd | jj	  dS dS )zSetup model for quantization.Nc                   S   s   d S )Nr?   r?   r?   r?   r@   dummy   s   zQuantizer._setup.<locals>.dummy)trainer)rB   modulelanguage_modelencoderactivations_checkpoint_methodAttributeErrorr   is_initializedrD   strategylauncherlaunchsetup_environment)rB   rC   r?   r?   r@   _setup   s   zQuantizer._setup	model_cfgreturnc                 C   sV   t |  | ddrtd d| _d| _d| _W d   | S 1 s$w   Y  | S )z%Modify model config for quantization.sequence_parallelFz2Disabling sequence parallelism for quantization...modeloptN)r   r6   r   warningrR   nameapply_rope_fusion)rP   r?   r?   r@   modify_model_config   s   


zQuantizer.modify_model_configc                 C   s<   t d | jddgdddd}t d|d  d	 d
S )z,Generate sample output for a model instance.z)Generating sample output for the model...z-Born in north-east France, Soyer trained as az&Born in California, Soyer trained as ad   )
max_length
min_length)inputslength_paramsz#Example NeMo output before export: 	sentences"N)r   r8   generate)rB   responser?   r?   r@   _sample_output   s   
zQuantizer._sample_outputforward_loopNc                    s   | j dus	J dtd| jj d | | t|| j |}| jjdkrEd | jjdkr2d n| jjd	kr:d
 t	|d fdd}t
 dkrPt| |S )z:Quantize the model and calibrate using given forward loop.Nz!Quantization algorithm is not setzQuantizing model to ...r"   r   r   i  r      z*input_quantizerc                    s   t j| d  dS )Ng{Gz?)min)torchclamp)amaxmaxboundr?   r@   <lambda>   s    z$Quantizer.quantize.<locals>.<lambda>)r   r   r8   r   r.   rO   mtqquantizer7   postprocess_amaxdistget_rankprint_quant_summary)r<   rB   rb   r?   ri   r@   rm      s    

zQuantizer.quantizec              
   C   s  | j dus	J dt| j j}| j ddr| | |jjr&t|jt	|_| j dd}|r3t
 }nt| j jd}|~}t|| j j||| j j| j j|jjdkd	 t  td
| j j d t dkrt|| |rtjtj| j jdd t| j jd}|j|dd W d   n 1 sw   Y  W d   dS W d   dS W d   dS W d   dS 1 sw   Y  dS )z>Export model to '.qnemo' format for TensorRT-LLM engine build.NzExport config is not setsample_outputTcompressF)enter_result   )rB   r7   r:   
export_dirinference_tensor_parallelinference_pipeline_paralleluse_nfs_workspacezFExporting quantized weights, model artifacts, and tokenizer config to rc   r   )exist_okwz./)arcname) r   r
   r:   r6   ra   cfgmegatron_amp_O2r   rB   r   r   r   	save_pathr   r7   rw   rx   rD   	num_nodesro   barrierr   r8   rp   r   osmakedirspathdirnametarfileopenadd)r<   rB   torch_dtypers   export_handlerrv   tarr?   r?   r@   export   sP   

	
"zQuantizer.export)__name__
__module____qualname____doc__r   r   rA   staticmethodr	   rO   rW   ra   r   rm   r   r?   r?   r?   r@   r   8   s    >r   )1r   r   
contextlibr   typingr   r   rf   torch.distributeddistributedro   megatron.corer    megatron.core.transformer.moduler   omegaconf.omegaconfr   r   @nemo.collections.nlp.models.language_modeling.megatron_gpt_modelr	   &nemo.collections.nlp.parts.utils_funcsr
   
nemo.utilsr   nemo.utils.distributedr   nemo.utils.model_utilsr   r   modelopt.torch.quantizationquantizationrl   modelopt.torch.exportr   INT8_DEFAULT_CFGINT8_SMOOTHQUANT_CFGFP8_DEFAULT_CFGINT4_AWQ_CFGW4A8_AWQ_BETA_CFGINT4_BLOCKWISE_WEIGHT_ONLY_CFGNVFP4_DEFAULT_CFGr2   r/   ImportErrorModuleNotFoundErrorer1   r;   r   r?   r?   r?   r@   <module>   sB   
