o
    biN                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ dd ZddddedddZG dd dZdS )    N)partial)ops)
quantizers)Dense)EinsumDense)linalg)
GPTQConfig)GPTQQuantizer)compute_quantization_parameters)dequantize_with_zero_point)quantize_with_zero_pointc                 C   s^   t | d }t jd|dd}t t |dt |d}t | t |d}t t |S )zReturn a stable permutation that sorts `metric` in descending order.
    Uses an index-based jitter to break ties deterministically.r   int32dtypefloat32-q=)	r   shapearangedividecastaddmultiplyargsortnegative)metricnidxjittermetric_jittered r   M/home/ubuntu/.local/lib/python3.10/site-packages/keras/src/quantizers/gptq.py_stable_permutation   s
   r!      F)	blocksize
group_sizeactivation_orderorder_metriccompute_scale_zeroc          .      C   s  t | d }|rM|du rt t t |d}nt |d}t t ||t |}t	|}t 
|}	t j| |dd} t jt j||dd|dd}nd }}	| }
t j| dd}g }g }|d	krd|n|}td||D ]D}t|| |}|| }|
dd||f }t |}|||||f }d}d}d}d	}t|D ]}|| }|dd|f }||ks|| | }||krt|| |}|
dd||f }||\}}}|| || |}|||}} }!n|du r||
\}}}|| || d}|||}} }!tt |d|| |!}"t |d|ft |"d}t|"|| dddf }#|||f }$t t ||#|$}%t |d|ft |%d}||d k rt t |%dt |||d df d}&|dd|d df }'t |d|d ft |'|&}q||k rt |||||df }(t j|
ddd|f t |
dd|df |(gdd}
ql|})t jd|dd}*t |*|)}*t |*d}*|rt j|*|	dd}*t j||	dd}t|dkr|| \}+},}-|+}|,} nt j|dd}t j|dd} ||| |*fS )
ab  
    Implements the GPTQ error correction updates.

    For a single column update (column j):
        e = invH[j, j] * (w_j - q_j)
        W[:, j+1:] -= e * invH[j, j+1:]
    where:
    - w_j is the original column,
    - q_j is the quantized column,
    - invH is the inverse Hessian,
    - e is the propagated error term.

    Across entire blocks:
        W[:, future] -= E_block * invH[block, future]
    where:
    - E_block is the quantization error accumulated for the current block,
    - invH[block, future] denotes the cross-block slice of the inverse Hessian,
    - W[:, future] are the columns yet to be quantized.

    Args:
        weights_transpose: Transposed weight matrix [out_features, in_features]
         to quantize.
        inv_hessian: Inverse Hessian matrix [in_features, in_features] for
         error propagation.
        blocksize: Size of the blocks to process (default: 128).
        group_size: Size of the groups for parameter reuse
         (default: -1, no grouping).
        activation_order: Whether to apply activation-order permutation
         (default: False).
        order_metric: Metric for ordering features
         (default: None, uses 1 / diag(invH)).
        compute_scale_zero: Function to compute scale and zero for
         quantization.

    Returns:
        quantized_weights: Quantized weight matrix [out_features, in_features].
        scale: float32. Scale parameters for quantization
         [out_features, num_groups].
        zero: Zero-point parameters for quantization [out_features, num_groups].
        g_idx: int32. Group indices for each feature [in_features].
       Nr   r   )axisr   r   r   r#   )r   r   
reciprocalr   diagonalr   whereisfinite
zeros_liker!   r   takerangeminappendr   expand_dimsslice_updater   r   subtractmatmulconcatenater   len).weights_transposeinv_hessianr$   r%   r&   r'   r(   in_featuresperminv_permweights_bufferquantized_weights_bufferscale_chunkszero_chunkseffective_groupblock_start	block_end
block_sizeblock_weightsblock_errorblock_inv_hessiancached_scalecached_zerocached_maxqcached_group_start	block_idx
global_idxweight_columngroup_start	group_endgroup_slicescalezeromaxqquantized_columndequantized_colcurrent_block_influenceerrupdatetailtotal_update
base_groupg_idxsz_r   r   r    gptq_quantize_matrix   s   3











	



rc   c                   @   s<   e Zd ZedddfddZdd Z	ddd	Zd
d ZdS )GPTQN)	tokenizerdatasetc           	      C   sF  || _ d| _|| _t||jd| _t|ts!t|tr6|j	j
dkr6|j	j| _| jd | _| jd | _|| _n_t|tr|j	j
dkr|j	j| _t| j}|t|}|dkrg|\}}}|t||| _| _n|dv r{|\}}}t|||| _| _tjt|j	| j| jfd| _n	tdt| tj| j| jfd	d
| _d S )Nr   )compute_dtype   r)      )r)   rh   )kernelz!Unsupported layer type for GPTQ: r   r   )original_layernum_samplesconfigr	   variable_dtype	quantizer
isinstancer   r   rj   ndimr   kernel_shaperowscolumnslayerlistindexmaxr   r   typesSimpleNamespacereshape	TypeErrortypezeroshessian)	selfru   rm   r   d_model_dim_indexr<   headshead_dimout_featuresr   r   r    __init__  sB   









zGPTQ.__init__c              	   C   sp  |du rt dt|jdk rt dt|j dt|dkr%t dt|jdkr7t|d|jd f}t|d	}t|d }| j}t||}t| j	d t|d krrt d
t| j	d  dt|d  dt
t||}tt|t|d}| jdkrt| j	t||| _	t| j	ttd||| _	| jt|d  pd| _dS )a  
        Updates the running average of the Hessian matrix with a new batch.

        This method computes the Hessian matrix for a given batch of input
        activations and updates the accumulated Hessian (`self.hessian`) using a
        numerically stable running average. This allows the Hessian to be
        computed over a large dataset without loading all samples into memory
        at once.

        The input tensor is first reshaped into a 2D matrix [num_samples,
        num_features] before the Hessian is calculated.

        Args:
            input_batch: A 2D or higher-dimensional tensor of input activations
                from a calibration batch.

        Raises:
            ValueError: If the feature dimension of the input tensor
                `input_batch` does not match the dimensions of the
                pre-initialized Hessian matrix `self.hessian`.
        NzInput tensor cannot be None.rh   z+Input tensor must have rank >= 2 (got rank z).r   zInput tensor cannot be empty.r#   r   zHessian dimensions (z) do not match input features (g       @)
ValueErrorr9   r   r   sizer{   r   rl   r   r   r7   	transposer   r   )r   input_batchxnum_new_samplesnum_prev_samplestotal_samplesgram_matrixr   r   r    update_hessian_with_batchD  sF   
zGPTQ.update_hessian_with_batchr"   c                 C   s`  t | jj}t | j}t |d}t |d|}t | jt 	t |dt 
|}t | jjt |}t ||}t t |t 	t |t 	|}t|}t|||| jj| jjt |t| jjddd\}}	}
}t || jjj}| jjdkrtj|ddd	\}}}| j`| jj | | jj! |	 | jj" |
 | jj# | d| j_$d
S )a8  
        Performs GPTQ quantization and correction on the layer's weights.

        This method implements the core logic of the "Optimal Brain Quant"
        (OBQ) method, as applied by GPTQ, to quantize the weights of a single
        layer. It iteratively quantizes blocks of weights and corrects for the
        quantization error by updating the remaining weights.

        The algorithm follows these main steps:
        1.  Initialization: It optionally reorders the weight columns based
            on activation magnitudes (`activation_order=True`) to protect more
            salient
            weights.
        2.  Hessian Modification: The Hessian matrix, pre-computed from
            calibration data, is dampened to ensure its invertibility and
            stability.
        3.  Iterative Quantization: The function iterates through the
            weight columns in blocks (`blocksize`). In each iteration, it:
            a. Quantizes one column.
            b. Calculates the quantization error.
            c. Updates the remaining weights in the *current* block by
                distributing the error, using the inverse Hessian.
        4.  Block-wise Correction: After a block is quantized, the total
            error from that block is propagated to the *next* block of weights
            to be processed.
        5.  Finalization: The quantized weights are reordered back if
            `activation_order` was used, and the layer's weights are updated.
        This implementation is based on the official GPTQ paper and repository.
        For more details, see:
        - Paper: https://arxiv.org/abs/2210.17323
        - Original Code: https://github.com/IST-DASLab/gptq


        Args:
            blocksize: (int, optional) The size of the weight block to process
             at a time. Defaults to 128.
        g        g      ?T)weight)r;   r$   r%   r&   r'   r(      r   uint8)r*   r   N)%r   r   ru   rj   r,   r   equalr-   r   diagr/   r   rm   hessian_dampingmeanr6   r   invrc   r%   r&   r   ro   find_paramsr   rk   quantized_kernelr   weight_bitsr   	pack_int4_kernelassignkernel_scalekernel_zeror_   is_gptq_calibrated)r   r$   weights_matrixhessian_diagonaldead_diagonalhessian_matrixdamping_factorinverse_hessian	quantizedrT   rU   r_   rb   r   r   r    quantize_and_correct_layer  sV   )
	
zGPTQ.quantize_and_correct_layerc                 C   s   | ` | `d S )N)r   ru   )r   r   r   r    free  s   z	GPTQ.free)r"   )__name__
__module____qualname__r   r   r   r   r   r   r   r   r    rd     s    2G
_rd   )ry   	functoolsr   	keras.srcr   r   keras.src.layersr   r   keras.src.opsr    keras.src.quantizers.gptq_configr   keras.src.quantizers.quantizersr	   r
   r   r   r!   rc   rd   r   r   r   r    <module>   s*     v