o
    ٷiǍ                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlmZ ddlmZ edZ	d*d	d
Zd+ddZd,ddZd,ddZd+ddZdd Zi dddi ddgdfddZdd Zd d! Z				"	#	$	$	%d-d&d'Zi dddd"d#d"d$d$d%ddgfd(d)ZdS ).zWeightOnly for onnxrt adaptor.    N)numpy_helper)np_dtype_to_tensor_dtype   )	ONNXModel)simple_progress_barneural_compressorc	                 C   s.  || d }	t j|jd |	fdd}
| jd d|d| }| jd |g}g }i }d}|d	kr[|d
d
d
d
df |d
d
dd
df d	> B }|d
d
d
|	f |
d
d
d
d
f< n|dkrb|}
n	td| d t |
d||	f}
t |d|f}|jt jks|jt j	ksJ t
jj| jd d t|j|j| dd}||j || |d
urB|dkr|d}n`|d	krt j|jd d d ddd}t |jd | | d}|d
d
d }|dd
d }||d  d@ ||  B ||d < ||d  d@ ||  d	> B ||d < ntd| dt ||d df}t
jj| jd d d|j| dd}||j || |d |d< |d |d< ||d< ||d< |dkr_||d< t
jj|d|
j|
 dd}|| t
jj|f|| j| jr| jd t| ndt| dd|}||fS )aB  Build MatMulNBits node.

    Args:
        node: original matmul node
        weight_shape: original weight shape
        num_bits (int): num_bits
        group_size (int): how many elements share one scale/zp
        k_blocks (int): block number
        q_weight (array): quantized weight
        scale (array): scale
        zero_point (array): zero point
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).

    Returns:
        matmul_weight_only_node: MatMulNBits node
        new_inits: initializers of the new node
       r   uint8dtyper   _QGMatMulNBits   N   z8MatMulNBits does not have kernel support for num_bits = ._scaleTname	data_typedimsvalsraw         _zpKNbits
block_sizeaccuracy_levelzcom.microsoft)inputsoutputsr   domain)npzerosshapeinputloggererrorreshaper   float32float16onnxhelpermake_tensorr   tobytesappendr   astypefullarangeravel
ValueError	make_nodeoutputstr)nodeweight_shapenum_bits
group_sizek_blocksq_weightscale
zero_pointr"   	blob_sizepackedq_weight_nameinput_names	new_initskwargsop_typeq_weight_pairsscale_tensor	packed_zpidxeven_idxodd_idx	zp_tensorq_weight_tensormatmul_weight_only_node rT   j/home/ubuntu/.local/lib/python3.10/site-packages/onnxruntime/quantization/neural_compressor/weight_only.pymake_matmul_weight_only_node,   s   0&


$*


$rV   r       asymint      ?c              	      s  t | d|f} |dks|dkrd| d  dn|dkr8|dkr)d|d  d nd |dkr6d|d   ndt j| ddd	| }t j| ddd	| }|dkrt t |t |}t |j}	|dk}
||
 d
 t j	   |	|
< |dkrt 
|	jnt j|jddd|d >  }nHt |j}	t  fdd|| ||k   D |	||k< |dkrt 
|	j| |	  nt dt  t 
|	j| |	  d}t j| |	jd}t j| |	|d t j|||d t j||d t j| |d ||	|fS )a	  Quantize tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   rX   uintr   r   r   symTaxiskeepdimsg       @rY   r	   r
   c                    s   g | ]
}t |   qS rT   )float.0imaxqminqrT   rU   
<listcomp>       z quant_tensor.<locals>.<listcomp>out)r&   r,   minmaxmaximumabsonesr(   r4   float64r'   arrayflattentolistroundminimum
empty_liker   divideaddclip)datar>   r?   schemer   ratiorminrmax	max_rangerB   maskrC   rA   rT   rd   rU   quant_tensor   s<    .&,
r   c           &      C   s@  t | d|ft j} d| d }d}t j| d ddd}t || }t |t | }t j| ddd}t j	| ddd}	t j|ddd}
t j||  ddd}t j
|	j| jd}||	k}|| |	| ||   ||< d| }t t || |  ||}|| | |  }t j||d  ddd}d}d	}d}t|D ]}t j
|	j| jd}t |||  | | g| jd }||	k}||	| ||   ||< t t || |  ||}|| }t j|ddd}t j|| ddd}t j||  ddd}t |
| |d }|
| ||  | }|| ||  | }|| | |  }t j||d  ddd} t | }!t |}"t |!|"k d }#||#d
d
f ||#d
d
f< | |# ||#< ||# ||#< ||# ||#< qt | |  d|d}$|t j}t j| |jd}%t j| ||%d t j|%|$|%d t j|%|%d t j|%|||%d |%||$fS )a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 32.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   r   r   r   Tr]   r
      皙?Nr	   ri   )r&   r,   r4   r-   sumsqrtrx   rn   rk   rl   ro   r(   r   ry   rt   rangerq   subtractwhererp   rv   rw   )&rz   r>   r?   re   rf   sum_x2av_xweightsr}   r~   sum_wsum_xiscaler   rB   
quant_datadiffbest_madnsteprdeltarrminis_
iscale_newfactorquant_data_newmul_weights_quant_data_newsum_lsum_l2sum_xlD
this_scalethis_minmadmad_1
best_mad_1idx_to_replacerC   rA   rT   rT   rU   quant_tensor_k_quant_cpu   sd   (


r   c           (      C   s  zddl }ddl}|j r|| } | d|f|j} d| d }d}|j| d ddd}|	|| }|
||| }	|j| ddd}
|j| ddd}|j|	ddd}|j|	|  ddd}|j|j| jd}|
|k}|| || |
|   ||< d| }|||| |
  ||}|| |
 |  }|j|	|d  ddd}d	}d
}d}t|D ]}|j|j| jd}||||  | | g| jd }|
|k}||| |
|   ||< |||| |
  ||}|	| }|j|ddd}|j|| ddd}|j||  ddd}||| |d }|| ||  | } || ||  | }!| | |! |  }|j|	|d  ddd}"||"}#||}$||#|$k d }%||%ddf ||%ddf< |"|% ||%< | |% ||%< |!|% |
|%< q||
 |  d|d}&||j}|j| |jd}'|j| ||'d |j
|'|&|'d |j|'|'d |j|'|||'d |' | |& fW S td t| ||W S  ty   td t| || Y S w )a  Quantize tensor per group based on k quant.

    Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.

    Returns:
        output: quantized weight
        scale: scale
        zero_point: zero point
    r   Nr   r   r   Tr]   r
   r   r   r	   ri   zqTry to use k-quant quantization on CUDA. However, CUDA is not available.Fall back to k-quant quantization on CPU.zNow we are using k-quant quantization on cpu, which is time consuming.Please consider install cupy to speed up on CUDA. See https://cupy.dev/Please also install torch to check CUDA availability.) cupytorchcudais_availableasarrayr,   r4   r-   r   r   rx   rn   rk   rl   ro   r(   r   ry   rt   r   rq   r   r   rp   rv   rw   getr*   warningr   ImportErrorinfo)(rz   r>   r?   cpr   re   rf   r   r   r   r}   r~   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rC   rA   rT   rT   rU   quant_tensor_k_quant_cuda  s   
(

r   c           
      C   s2   | j }t| |||||\}}}	t|||	  |S )a  Quant dequant tensor per group.

    Args:
        data : input weight
        num_bits (int, optional): num_bits. Defaults to 4.
        group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
        scheme (str, optional): quantization scheme. Defaults to "asym".
        dtype (str, optional): data type. Defaults to "int".
        ratio (float, optional): percentile of clip. Defaults to 1.0.

    Returns:
        output: quant-dequant weight
    )r(   r   r&   r,   )
rz   r>   r?   r{   r   r|   	org_shapeweightrB   zprT   rT   rU   
qdq_tensory  s   r   c                 C   sH   |dkr| S | j }|| }||d  }|dkr"t| d|fdfd} | S )a  Pad tensor rowi so that it can be is divisible by group_size.

    Args:
        weight (array): weight
        group_size (int): how many elements share one scale/zp
        k_blocks (int): the number of block

    Returns:
        weight: paded weight
    r   r   )r   r   constant)r(   r&   pad)r   r?   r@   org_w_shapepadded_rowspad_lenrT   rT   rU   
pad_tensor  s   r   CPUExecutionProviderk_quantc	                 C   s  t | } | jdurtj| jnd}	g }
g }tdd |  D }d}|  D ]B}|jdv r8|d7 }t|| |jdv rj| 	|j
d durj||ji dkrj| 	|j
d }tj||	d	 }t|jd
krlq'|j}|j|v r||j d }||j d }||j d }|j}|dkr|n|d }|d d | d }| |j
d }t|||}|dkp|dk}|r|dkrt|j||\}}}nt|j|||d||j
d d\}}}t||||||d|||dks|dkr|nd|d	\}}| | || |
| nUt|j|||d||j
d d}t||d df}t|}|d|d ddf |}tj j!|j
d d|d| t"||j|# dd}| $| |j|j
d< |dkrj| %| q'| &|
 | '| | (  | S )a  Quant the model with round to nearst method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'RTN'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        ratios (dict, optional): percentile of clip. Defaults to {}.
        accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    N c                 S   s   g | ]	}|j d v r|qS )MatMul)rJ   ra   rT   rT   rU   rg     s    z rtn_quantize.<locals>.<listcomp>r   r   r   fp32)base_dirr   r    r?   r{   r   r   r   r   r[   r	   rX   	r<   r=   r>   r?   r@   rA   rB   rC   r"   rY   r   r   Tr   ))r   
model_pathospathdirnamelennodesrJ   r   get_initializerr)   r   r   r   to_arraycopyr(   r   get_initializer_share_numr   r   Tr   rV   r4   add_initializersr3   r   r&   r,   	transposer/   r0   r1   r   r2   add_initializerremove_initializer	add_nodesremove_nodestopological_sort)modelweight_configr>   r?   r{   ratiosr"   	providers	algorithmr   	new_nodesr   	total_numcurr_idr<   weight_tensorr   r   r   r@   init_share_numsatisfy_MatMulNBits_conditionrA   rB   r   q_matmul_noderH   rR   rT   rT   rU   rtn_quantize  s   $






"





r   c              	   C   sX   | j }|dkrt| d|fn| } tjtt| tjt| ddd |dd}|S )zGet the scale of weight.r   r   Tr]   r   r^   )r(   r&   r,   meanrn   rl   )r   r?   r   rB   rT   rT   rU   get_weight_scale  s   2r   c              
      s  ddl m} ddlm  t }tjdk r%|dr%ddlm	} |
|  | jr6tj| j| jd d	d	d
d | jsDtj| j ||dn
tj| jd ||d}dd | D }~g }	t|D ]\}
}|dkru|
d |j |kru |	|fS t|dkst|d trt|d t|ksJ dt| dt|d  t|d tr|	t fdd|d  D  q_t|d tjr|	tdd t||d gd
dD  q_|	t fddt||d d
dD  q_|	|fS )as  Prepare inputs for weight only quantization.

    Args:
        model (ModelProto or ONNXModel): onnx model
        n_samples (int, optional): calibration sample number. -1 means all samples.
        dataloader (object): dataloader for calibration.
        providers (list): providers to use

    Returns:
        inputs: prepared inputs.
        so: session options
    r   )	find_specr   to_numpy)      onnxruntime_extensions)get_library_path_augment.onnxTFsave_as_external_dataall_tensors_to_one_fileconvert_attributer   c                 S      g | ]}|j qS rT   r   ra   rT   rT   rU   rg   G      z"prepare_inputs.<locals>.<listcomp>r   zInput number mismatch, require z	 but get c                       g | ]
\}}| |fqS rT   rT   )rb   r   inp_datar   rT   rU   rg   T  rh   c                 S   s   g | ]\}}||fqS rT   rT   rb   r   inprT   rT   rU   rg   V  s    strictc                    r   rT   rT   r   r   rT   rU   rg   X  rh   )importlib.utilr   utilr   ortSessionOptionssysversion_infor   r   register_custom_ops_libraryis_large_modelr/   
save_modelr   r   InferenceSessionSerializeToString
get_inputs	enumerate
batch_sizer   
isinstancedictr3   itemsr&   ndarrayzip)r   	n_samples
dataloaderr   r   sor   sessioninputs_namesr#   rc   rz   rT   r   rU   prepare_inputs#  sF   
&*,r     {Gz?FTc
           #         sZ  d| d d dd fdd}
| j }|
| \}}t|dk}d|||f< d| |d	d	f< |r[tt|d	d	d
 }| |d	d	f } ||d	d	f d	d	|f }t| }t| }|tt| }t|d }|||f  |7  < tjtj	|j
}|}td|d |D ]}t|| |d }|| }t| ||d	d	f }t|}t|}t|}|||||f }t|D ]}||d	d	f }|||f }|d
kr|| | dkr|
| || || | d	d	f \}}|tt|d	d	tjf | | d|   } | ||d	d	f< ||  d |d  ||d	d	f< ||  | }!||d	d	d	f  ttj||d	|f ddtj|!dd8  < |!||d	d	f< q||||d	d	f< |d |||d	d	f< | |d	d	d	f  t||d	||f |8  < q|rt|}"||"d	d	f }t|| j }~ |S )a  Quant the weight with GPTQ method.

    Args:
        W (array): weight.
        H (array): Hessian matrix.
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        blocksize (int, optional): blocksize to quantize weight.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.

    Returns:
        Q: fake quantized weight
    r   r   d   g?g333333@c                    s>  | j }stj|  dd} t| j d }ttj| dd|}ttj| dd|}dkrItt	||}|dk }t
|rI||  ||< |dk|dk@ }d||< d||< ||  }dkrpt|j d  d }nt| | }rt| j d gtd }tt  D ]`}d|   }	|	| }
|	| }||
  }dkrt|
 | n|}tt| | | d}|| 8 }tt	|}t|d}||k }t
|r|| ||< || ||< || ||< qs|d }t||}t||}dgdgt|d   }t||}t||}||fS )Nr   r   r   r\   r   r   inf)r(   r&   expand_dimsrr   r'   ru   rk   rm   rl   rn   anyro   rt   r`   r   rY   ry   powerr   repeatr   r,   )r   r   tmpxminxmaxrB   zerobestrc   pxmin1xmax1scale1zero1qerrr(   gridre   	maxshrinkmsenorm
perchannelr{   rT   rU   find_params~  sX   

zgptq.<locals>.find_paramsr   Nr   r   )r(   r&   diagargsort
zeros_liker   r6   linalgcholeskyinvr   r   rk   r   deepcopyry   rt   newaxisrr   matmulr  r,   )#WHr>   r?   r{   	blocksizepercdampactorderr*  r,  r-  r(   rB   r   deadpermLossesQdampr.  Hinvi1i2countW1Q1Err1Losses1Hinv1rc   wdr%  err1invpermrT   r'  rU   gptq\  sd   0




(6 D6
rN  c           *         s  t | } | jdurtj| jnd}t| |||\}}~t| jj	j
}| dd |D  g }|  D ]%}|jdv rY||ji dkrY||ji dddkrY||jd	  q4tt|}| | | jrvtj| j| jd
 dddd | jstj| j ||dn
tj| jd
 ||d}t|D ]\}}tt||d  g }g }| j| D ]O}|jdv r||ji dkr||ji dddkr| |jd durt !| | "|jjd | }t|j#dkrq|| || "|j qt|d	krqdd |D }d	|D ]<}|$|g|d	   j#d	 t%& d j#d f fdd|D }7 t%'d     fdd|D }qt(|||ddD ]\}}}|j|v rr||j d }||j d }||j d }|dkry|n|j#d	 }|j)}t*||||||||	|
|d
}| |jd }| +|jd } |dk}!|!r|j#}"|"d	 | d | }#t,|||#}t-|j.|||d\}}$}%t/||"|||#|0d|$0||dkr|%nd|d	\}&}'| 1|' | 2| | 3|& n)tj4j5|jd d|d | t6||j#|0|7 dd!}(| 8|( |(j|jd< | dkr$| 9| qRq| | | jj	j
:| | ;  | jrOd	d"l<m=}) |)| jtj>| jd	  | S )#a  Quant the model with GPTQ method.

    Args:
        model (ModelProto or ONNXModel): onnx model
        dataloader (object): dataloader for calibration.
        weight_config (dict): quantization config
                For example,
                weight_config = {
                    'fc2':
                        {
                            'bits': 4,
                            'group_size': 32,
                            'scheme': 'sym',
                            'algorithm': 'GPTQ'
                        }
                }
        num_bits (int, optional): num_bits. Default is 4.
        group_size (int, optional): how many elements share one scale/zp. Default is 32.
        scheme (str, optional): sym or asym. Defaults to "asym".
        n_samples (int, optional): calibration sample number.
        percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
        blocksize (int, optional): blocksize to quantize weight.
        actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
        mse (bool, optional): whether get scale and zero point with mse error.
        perchannel (bool, optional): whether quantize weight per-channel.
        accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
        providers (list): providers to use

    Returns:
        model: fake quantized ONNXModel
    Nr   c                 S   r   rT   r   ra   rT   rT   rU   rg     r   z!gptq_quantize.<locals>.<listcomp>r   r   r   GPTQr   r   TFr   r   r   r   c                 S   s&   g | ]}t |jd  |jd  fqS r   )r&   r'   r(   ra   rT   rT   rU   rg   O  s   & r   c                    s   g | ]
}|     qS rT   rT   ra   )nsamplesr  rT   rU   rg   U  rh   c                    s   g | ]}|t  j  qS rT   )r&   r6  r   ra   )r   rT   rU   rg   X  s    r   r    r?   r{   )r>   r?   r{   r9  r:  r;  r*  r,  r   r[   r	   rX   r   r   r   r   )load_external_data_for_model)?r   r   r   r   r   r  r   r4  r   graphr:   remove_tensors_from_outputsr   rJ   r   r   r3   r)   listsetadd_tensors_to_outputsr  r/   r  r   r  r  r  r   r   input_name_to_nodesr   r   r   get_noder(   runr&   r,   r   r  r   rN  r   r   r   r   rV   r4   r   remove_nodeadd_noder0   r1   r   r2   r   r   	MergeFromr   onnx.external_data_helperrR  split)*r   r  r   r>   r?   r{   r  r:  r9  r;  r*  r,  r"   r   r   r#   r  
org_outputoutput_namesr<   r  rN   
input_name	node_listr   r   Hsrz   r8  r   rA   r   r   r   r   r@   rB   r   r   rH   rR   rR  rT   )r   rQ  r  rU   gptq_quantize  s   /












?re  rP  )r   rW   rX   rY   rZ   )r   rW   )r   rW   rX   r  r  FFT)__doc__r   loggingr   r   numpyr&   r/   r   onnx.helperr   onnxruntimer   
onnx_modelr   r   r   	getLoggerr*   rV   r   r   r   r   r   r   r   r  rN  re  rT   rT   rT   rU   <module>   sl   


s
6
I
[
w<
 