o
    wi`                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZ d dlm  mZ d dlm  mZ d dlm  mZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
l m!Z!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 e
rd dl7m8Z9 d dl:m;Z; d dl<m=Z= e$ Z>g dZ?g dZ@dddZAeeeBf ZCeG dd dZDeG dd dZEG dd  d ZF	d@d!eCd"eCd#ed$ d%edB fd&d'ZGd(d) ZH	,dAd-eBd.eId/eId0eIfd1d2ZJd3d4 ZKejLd5fejMd6fejNd7fejOd8fejPd9fejQd:fejRd:fejSd:fejTd;fejUd<fejVd;fejWd;fejXd=fgZYd#eejZej[f d%eeB fd>d?Z\dS )B    N)	dataclass)Path)TYPE_CHECKINGOptionalUnion)load_dataset)CommonInferenceParams)tqdm)PreTrainedTokenizerBase)llm)MCoreTokenizerWrapppergenerate)get_quant_cfg_choices)load_quant_cfg)barriertorch_dtype_from_precision)ckpt_to_context_subdir) load_connector_from_trainer_ckpt)TrainerContextckpt_to_weights_subdir)logging)is_global_rank_zero)unwrap_model)Trainer)MegatronParallel)   16bf16)trtllmnemohf
FP8_KV_CFGNVFP4_KV_CFG)fp8nvfp4c                   @   s   e Zd ZU dZdZee ed< dZe	ed< dZ
eed< dZee ed	< dZeed
< dZeed< dZe	ed< dZe	ed< dZe	ed< dS )QuantizationConfiga  Quantization parameters.

    Available quantization methods are listed in `QUANT_CFG_CHOICES` dictionary above.
    Please consult Model Optimizer documentation https://nvidia.github.io/TensorRT-Model-Optimizer/ for details.

    Quantization algorithm can also be conveniently set to None to perform only weights export step
    for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model.
    r#   	algorithm   awq_block_size      ?sq_alphaNenable_kv_cachekv_cache_qformatcnn_dailymailcalibration_dataset   calibration_dataset_size@   calibration_batch_sizecalibration_seq_len)__name__
__module____qualname____doc__r&   r   str__annotations__r(   intr*   floatr+   boolr,   r.   r0   r2   r3    r=   r=   q/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/modelopt/quantization/quantizer.pyr%   ?   s   
 	r%   c                   @   sv   e Zd ZU dZeed< dZeed< dZeee	f ed< dZ
ee ed< d	Ze	ed
< d	Ze	ed< dZeed< dd ZdS )ExportConfigzInference configuration for the quantized TensorRT-LLM checkpoint.

    Available export formats methods are listed in `SUPPORTED_EXPORT_FMT` dictionary above.
    pathr   export_formatr   dtypeNdecoder_type   inference_tpinference_ppFgenerate_samplec                 C   s   t | j| _d S N)r   r@   )selfr=   r=   r>   __post_init__e   s   zExportConfig.__post_init__)r4   r5   r6   r7   r8   r9   rA   rB   r   r:   rC   r   rE   rF   rG   r<   rJ   r=   r=   r=   r>   r?   V   s   
 r?   c                   @   s   e Zd ZdZdedefddZed'dd	Zd(de	de
e fddZedd Zdd Zdd Zd)d*ddZ	d+ddZededede	fddZdeded efd!d"Zd)ded#e
d$ ddfd%d&ZdS ),	Quantizera  Post-training quantization (PTQ) and export of NeMo 2.0 checkpoints.

    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
    The process consist of several steps:

        1. Loading a Nemo model from disk using appropriate parallelism strategy
        2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
        3. Producing an output directory with a quantized checkpoint and a tokenizer

    By default, the output directory produced is intended to be consumed by TensorRT-LLM toolbox
    for efficient inference. This can be achieved using nemo.export.tensorrt_llm module.
    This can be changed to export a standard NeMo 2.0 checkpoint instead using `ExportConfig`.
    quantization_configexport_configc                 C   sp   t j s	td|| _|| _|j}|jr"|jt	v s"J d|j |dur1|t
v s1J d| t|| _dS )zAInitialize Quantizer with quantization and export configurations.z%GPU is required for the quantization.z*Unsupported kv cache quantization format: NzUnsupported export dtype: )torchcudais_availableEnvironmentErrorrL   rM   rB   r+   r,   KV_QUANT_CFG_CHOICESSUPPORTED_DTYPEr   torch_dtype)rI   rL   rM   rB   r=   r=   r>   __init__x   s   

zQuantizer.__init__returnNc                 C   s(   t | tjrdS | jj| j_|   dS )zSetup model for quantization.N)
isinstancer   HFAutoModelForCausalLM	tokenizer
vocab_sizeconfigfreezemodelr=   r=   r>   _setup   s   zQuantizer._setupFoptionalc                 C   s|   | j jdur
| j jS |}t|tjtjfs.t|dr"|jjdkr"dS |j	}t|tjtjfrt
| }r6|S |s<tddS )a8  
        Determines the decoder type for the given model. It is used for exporting a model to
        a TensorRT-LLM checkpoint and for configuring certain parameters in the quantization algorithm.

        Args:
            model: The model instance for which the decoder type needs to be determined.
            optional (bool): Allow to return None if the decoder type cannot be inferred.
                Otherwise an exception will be raised in such cases.

        Returns:
            Optional[str]: The decoder type as a string if it can be determined.
        N	__class__Llama4OmniModelllamazxCould not infer the decoder type for the provided model. Please provide the decoder type explicitly in the ExportConfig.)rM   rC   rW   r   GPTModelrX   hasattrra   r4   moduleget_modelopt_decoder_type
ValueError)rI   r^   r`   unwrapped_modelrC   r=   r=   r>   _get_decoder_type   s   zQuantizer._get_decoder_typec           
   	      s   ddg}g }t  tjrB|D ]2} jj|dd} fdd| D } jjdi |ddi} jjj|d	 d
d}|| qn(t	 j} j
tjdd}dd t|||tddddD }	dd t||	D }td|  d S )Nz-Born in north-east France, Soyer trained as az&Born in California, Soyer trained as apt)return_tensorsc                    s    i | ]\}}||  jjqS r=   )tor^   device).0kvr]   r=   r>   
<dictcomp>        z.Quantizer._generate_sample.<locals>.<dictcomp>max_new_tokens   r   T)skip_special_tokens)params_dtype&inference_batch_times_seqlen_thresholdc                 S   s   g | ]}|j qS r=   )generated_text)ro   rr=   r=   r>   
<listcomp>   s    z.Quantizer._generate_sample.<locals>.<listcomp>rD   )top_knum_tokens_to_generate)inference_paramsc                 S   s   g | ]\}}|| qS r=   r=   )ro   prompt
generationr=   r=   r>   r{      s    z,Sample generation after PTQ (with prompts): r=   )rW   r   rX   rY   itemsr^   r   decodeappendr   get_inference_wrapperrN   bfloat16r   zipr   info)
r^   promptsoutputsr   	input_idsoutputdecodedmcore_tokenizermcore_inference	generatedr=   r]   r>   _generate_sample   s0   

	zQuantizer._generate_samplec                    sr   t || jj| jj| jj| jjd| jj| jj }t|tjr,|j	j
  fdd}|S | j|| jj| jjdS )N)datasetseq_len
batch_sizecalibration_sizec                    s"    }|D ]	}| |   qd S rH   )rm   )r^   
dataloaderbatchrn   get_dataloaderr=   r>   huggingface_forward_loop   s   z=Quantizer._get_forward_loop.<locals>.huggingface_forward_loop)num_batches
seq_lengthmicro_batch_size)create_data_iterator_getterrL   r.   r3   r2   r0   rW   r   rX   r^   rn   create_megatron_forward_loop)rI   r^   number_of_batchesr   r=   r   r>   _get_forward_loop   s&   zQuantizer._get_forward_loopc                 C   s  | j |dd}| jj}tj|rt|S |tv s J d| t| }d|v rWt	|}|d d }t
|tr<|d }| jjrH| jj|d d	< d
|krW|dv rWddd|d< | jjd u rfd|vod|dk}n| jj}| jjd u rx|rxtd nt|r~dnd d |r|ddddii|d< |d ttt| jj d  |dd sd|d< |dkrd|v rddd|d< ddi|d d< |S ) NTr`   z!Unsupported quantization format: awq	quant_cfgz*weight_quantizerr   block_sizesw4a8_awq)gemmamptawq_literD   )method
alpha_stepr&   int8gptzPEnabled KV cache quantization but enable_kv_cache is None in quantization_configEnabledDisabledz KV cache quantizationdefaultenableFmaxr   int8_sqsmoothquantr)   )r   alphazvision_projection.*)rj   rL   r&   osr@   isfiler   QUANT_CFG_CHOICEScopydeepcopyrW   listr(   r+   r   warningr   getupdategetattrmtqrR   r,   )rI   r^   rC   r&   r   weight_quantizerenable_quant_kv_cacher=   r=   r>   _get_quant_cfg   sB   

zQuantizer._get_quant_cfgr^   r   c                    s
  | j j}|du rtd |S td| d | | | j|dd}| |}tdt|  |du rCt	j
|rC| |}t	t|||}|dkrm| d	krY d
 n	dkr_d n	 d t	|d fdd}t rut	| | jjrtd | | |S )zQuantize the model and calibrate using given forward loop.

        If forward_loop is not provided, a forward loop will be created using the calibration dataset.
        NzEQuantization algorithm set to None, returning the non-quantized modelzQuantizing model to z...Tr   zUsing quant_cfg:
r   r#   i  r      r   z*input_quantizerc                    s   t j| d  dS )Ng{Gz?)min)rN   clamp)amaxmaxboundr=   r>   <lambda>F  s    z$Quantizer.quantize.<locals>.<lambda>z4Generating a sample output after model quantization.)rL   r&   r   r   r_   rj   r   pprintpformatr   r[   need_calibrationr   quantizeunwrap_for_modelopt_operationspostprocess_amaxr   print_quant_summaryrM   rG   r   )rI   r^   forward_loopr&   rC   r   ri   r=   r   r>   r   %  s:   







zQuantizer.quantizec                    s6   ddl m} | dd  fdd}|S )z5Create a forward loop for over a given data iterator.r   )get_forward_backward_funcc                 S   sH   t | }|j\}}tj||jd||f}|||d }dd }||fS )Nrn   c                 S   s   t di fS )NrD   )rN   zeros)tensorr=   r=   r>   _mock_loss_function`  s   z^Quantizer.create_megatron_forward_loop.<locals>.forward_step_func.<locals>._mock_loss_function)nextshaperN   arangern   expand)data_iteratorr^   data	batch_lenr   position_idsoutput_tensorr   r=   r=   r>   forward_step_funcZ  s   
zAQuantizer.create_megatron_forward_loop.<locals>.forward_step_funcc              
      s"    }||  dd d S )NT)r   r   r^   num_microbatchesr   r   decoder_seq_lengthforward_onlyr=   )r^   r   r   forward_backward_funcr   r   r   r   r   r=   r>   loope  s   
z4Quantizer.create_megatron_forward_loop.<locals>.loop))megatron.core.pipeline_parallel.schedulesr   )rI   r   r   r   r   r   r   r   r=   r   r>   r   R  s
   z&Quantizer.create_megatron_forward_loopcheckpoint_dirtensor_parallelism_sizec                 C   sP   | d   }d}t|D ]}|| d| d   M }q|o|}|s&td |S )z(Basic validation of the model structure.zconfig.jsonTrankz.safetensorsz%Failed to export the quantized model.)existsranger   error)r   r   saved_configsaved_weightsiexport_successfulr=   r=   r>   _validate_quantized_checkpointt  s   
z(Quantizer._validate_quantized_checkpoint	model_dir
export_dir
export_fmtc                 C   s   t  r|dkr	d S t|tj}|r#|dkr|d }|jt| d S |dkrDt|drDt|jdrDt|jjtrD|jjt| d S t	j
t|tj|ddd d S )Nr   r    huggingface_tokenizerrY   nemo_contextT)dirs_exist_ok)r   rW   r   rX   rY   save_pretrainedr8   re   r
   shutilcopytreer   r   r@   join)rI   r^   r   r   r   is_automodelr=   r=   r>   _save_tokenizer  s$   

zQuantizer._save_tokenizertrainerr   c              
   C   s  ddl m} | jj}| jj}|tv sJ d| t|tj}| jjdkro|r+J d|dus3J d|j	
| |j	  |j	j|d ||j	_|| t  t rnt|jt|d	gd
 tt|dd  snJ nY| jjdkr}t|||d nK| jj}| jj}	| o|jjdk}
t   ||dd tj t!|| "|| j#|||	|
d W d   n1 sw   Y  t  t r| $||sJ t r| %|||| t&'d| d dS dS )z2Export model to a TensorRT-LLM or NeMo checkpoint.r   )remove_hook_from_modulezUnsupported export format: r   zXNeMo export format can only be used with native NeMo checkpoints, not HuggingFace modelsNz!Trainer required for NeMo export.)r   r^   )
yaml_attrsFmodelopt_stater    r]   rD   T)recurse)r^   rC   rB   r   inference_tensor_parallelinference_pipeline_paralleluse_nfs_workspacez-Export succeeded, model has been exported to .)(accelerate.hooksr  rM   r@   rA   SUPPORTED_EXPORT_FMTrW   r   rX   strategyconnectsetup_environmentsetup_megatron_parallelr   save_checkpointr   r   r   from_trainerio_dumpr   r   r   r   export_hf_checkpointrE   rF   r[   pipeline_model_parallel_sizerN   inference_modemteexport_tensorrt_llm_checkpointr   rj   rT   r   r   r   r   )rI   r^   r   r   r  r   r   r   rE   rF   r  r=   r=   r>   export  sZ   


zQuantizer.export)rV   N)FrH   )r^   r   )NNN)r4   r5   r6   r7   r%   r?   rU   staticmethodr_   r<   r   r8   rj   r   r   r   r   r   r   r:   r   r   r  r=   r=   r=   r>   rK   i   s"    "
6.
" rK   r   r   r^   zpl.LightningModulerV   c              	   K   sF  t |tjr:t|}tj|sdS t  t	j
|fdt|i| W d   t|S 1 s1w   Y  t|S t| d}|du rJ|| \}}t|}tj|sVdS t = t }|j| t	j|f|t|d| W d   n1 s~w   Y  W d   t|S W d   t|S 1 sw   Y  t|S )zHExport a GPTModel or HFAutoModelForCausalLM to a HuggingFace checkpoint.Nr   r    )pretrained_model_name_or_pathr   )rW   r   rX   r   mtoModeloptStateManageris_convertedrN   r  r  r  r8   r   	nemo_loadtempfileTemporaryDirectoryr[   r   export_mcore_gpt_to_hfr   )r   r   r^   kwargsri   exporter_tmp_dirr=   r=   r>   r    sD   






r  c                 C   s   t | tjr	| jS t| S )zUnwraps the model to expose the underlying architecture that Model Optimizer can work with.
    For HuggingFace models, returns the base model. For MCore models, returns the unwrapped version.)rW   r   rX   r^   r   r]   r=   r=   r>   r     s   r   r-   r1   r/   r   r   
calib_sizemax_sequence_lengthc           	      c   s    | dkrt dddd}d}n| dkrt dddd}d	}n	t d
| dd}d}ttt|||}t|| D ]&}||| |d |  | }tt|D ]}|| d| ||< qL|V  q6dS )z/Creates a sample data iterator for calibration.wikitextzwikitext-103-v1train)splittextr-   z3.0.0)namer)  articlejson)
data_filesr)  rD   N)r   r   r   lenr   )	r   r   r%  r&  r   text_columnr   r   jr=   r=   r>   get_calib_data_iter  s    r2  c                    s    fdd}|S )z>Create a function that provides iterator over a given dataset.c                     sp   d} t |   d}g }|D ]!}fdd|D }fdd|D }|tj|jd qtt|S )N   )r   r&  r   r%  c                    s    g | ]} j |d  qS rH   )rY   text_to_ids)ro   r*  r^   r   r=   r>   r{     rs   zFcreate_data_iterator_getter.<locals>._get_iterator.<locals>.<listcomp>c                    s&   g | ]}|t |  jjg  qS r=   )r/  rY   eos)ro   idsr5  r=   r>   r{     s   & r   )r2  r   rN   r   rn   iterr	   )CHARACTERS_PER_TOKENr   r   r   r   r   r   r^   r   r=   r>   _get_iterator  s   z2create_data_iterator_getter.<locals>._get_iteratorr=   )r^   r   r   r   r   r;  r=   r:  r>   r     s   r   baichuanchatglmgemma2gemma3r   rc   r   qwenphi3c                 C   s>   t | tjrtj| jS tD ]\}}t | |r|  S qdS )a  Infers the modelopt decoder type from GPTModel or HFAutoModelForCausalLM.

    Args:
        model (GPTModel | HFAutoModelForCausalLM): The model to infer the decoder type from.
    Returns:
        Optional[str]: The inferred decoder type or None if no match is found.
    N)rW   r   rX   r  model_utilsget_model_typer^   gpt_model_type)r^   config_classrC   r=   r=   r>   rg   4  s   
rg   rH   )r-   r1   r/   r/   )]r   r   r   r   r  dataclassesr   pathlibr   typingr   r   r   modelopt.torch.exportrN   r  r  modelopt.torch.optoptr  modelopt.torch.quantizationquantizationr   datasetsr   /megatron.core.inference.common_inference_paramsr   r	   transformersr
   nemo.collectionsr   nemo.collections.llm.inferencer   r   <nemo.collections.llm.modelopt.quantization.quant_cfg_choicesr   0nemo.collections.llm.modelopt.quantization.utilsr   nemo.collections.llm.utilsr   r   nemo.lightning.ckpt_utilsr   nemo.lightning.io.apir   nemo.lightning.io.plr   r   
nemo.utilsr   nemo.utils.get_rankr   nemo.utils.model_utilsr   lightning.pytorchpytorchplnemo.lightningr    nemo.lightning.megatron_parallelr   r   rS   r
  rR   r8   AnyPathr%   r?   rK   r  r   r:   r2  r   Baichuan2ModelChatGLMModelGemma2ModelGemma3Model
GemmaModel
LlamaModelMistralModelMixtralModelNemotronModel
Qwen2ModelStarcoderModelStarcoder2Model	Phi3ModelrD  rd   rX   rg   r=   r=   r=   r>   <module>   s     h


&