o
    
۾i9                     @   s2  d dl Z d dl mZ d dlmZ d dlmZmZmZ d dlZd dl	m
Z d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZmZmZm Z m!Z! d dl"m#Z# d dl$m%Z% eryd dl&m'Z' d dl(m)Z) ne*Z'ee+Z,G dd deZ-G dd deZ.G dd deZ/dS )    N)Enum)Fraction)TYPE_CHECKINGAnyUnion)_TYPES)	Parameter)_custom_ops)init_logger)FusedMoE)LinearMethodBase)QuantizationConfigQuantizeMethodBase)get_linear_quant_method)ChannelQuantScaleParameterGroupQuantScaleParameterPackedColumnParameterPackedvLLMParameterRowvLLMParameter)get_safetensors_params_metadata)
is_list_of)QuantizationMethods)WeightsMapperc                       s,  e Zd ZdZ			d)dededededeeeeeeB f f d	ed
ee dB deddf fddZ	defddZ
edefddZedeej fddZedefddZedee fddZedeeef dd fddZdejjdeded dB fdd Zd*d#d$Zd+d%ed&edB fd'd(Z  ZS ),
GPTQConfigzLConfig class for GPTQ.

    Reference: https://arxiv.org/abs/2210.17323
     Nweight_bits
group_sizedesc_actlm_head_quantizeddynamicautoround_versionmodules_in_block_to_quantizecheckpoint_formatreturnc	           	         s   t    || _|| _|| _|| _|| _td| j| _| jdvr)t	d| j d| jdkr3t
d |p6g | _|| _|| _d S )N    )            zOCurrently, only 2/3/4/8-bit weight quantization is supported for GPTQ, but got z bits.r'   zVCurrently, the 4-bit gptq_gemm kernel for GPTQ is buggy. Please switch to gptq_marlin.)super__init__r   r   r   r   r   r   pack_factor
ValueErrorloggerwarning_oncer!   r    r"   )	selfr   r   r   r   r   r    r!   r"   	__class__ `/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/gptq.pyr*   1   s(   
"



zGPTQConfig.__init__c                 C   s>   d| j  d| j d| j d| j d| j d| j d| j dS )	NzGPTQConfig(weight_bits=z, group_size=z, desc_act=z), lm_head_quantized=z
, dynamic=z, modules_in_block_to_quantize=z), checkpoint_format=))r   r   r   r   r   r!   r"   r/   r2   r2   r3   __repr__r   s   
zGPTQConfig.__repr__c                 C      dS )Ngptqr2   clsr2   r2   r3   get_name}   s   zGPTQConfig.get_namec                 C   s   t jgS N)torchhalfr9   r2   r2   r3   get_supported_act_dtypes   s   z#GPTQConfig.get_supported_act_dtypesc                 C   r7   )N<   r2   r9   r2   r2   r3   get_min_capability   s   zGPTQConfig.get_min_capabilityc                 C   s   dgS )Nzquantize_config.jsonr2   r9   r2   r2   r3   get_config_filenames   s   zGPTQConfig.get_config_filenamesconfigc           
   	   C   s   | j |dgi d}|d u ri n|}| |dg}| |dg}| |dg}| j |dgdd}| j |dgd	d}| j |d
gd d}| j |dgd	d}	| ||||||||	S )Nr   )defaultbitsr   r   lm_headFr    r   r!   r"   )get_from_keys_orget_from_keys)
r:   rC   r   r   r   r   r   r    r!   r"   r2   r2   r3   from_config   s2   zGPTQConfig.from_configlayerprefix)GPTQLinearMethodr   c                 C   sJ   t |trddlm} d| j| jddd}||||S t| ||t	S )N   )MoeWNA16Configr8   TF)quant_methodrE   r   symrF   )

isinstancer   	moe_wna16rN   r   r   rI   get_quant_methodr   rL   )r/   rJ   rK   rN   rC   r2   r2   r3   rS      s   
zGPTQConfig.get_quant_methodhf_to_vllm_mapperr   c                 C   s    | j d ur|| j | _ d S d S r<   )r!   
apply_list)r/   rT   r2   r2   r3   apply_vllm_mapper   s
   

zGPTQConfig.apply_vllm_mapper
model_namerevisionc                    sj   | j rt| j trdd | j D | _ d S tjtjtjgt||d} fdd| D }t|| _ d S )Nc                 S   s   g | ]	}|D ]}|qqS r2   r2   ).0sublistitemr2   r2   r3   
<listcomp>   s    z2GPTQConfig.maybe_update_config.<locals>.<listcomp>)rX   c                    s<   h | ]\}}| d d  rt  vr|ddd qS )dtypeN.rM   r   )get_SAFETENSORS_TO_TORCH_DTYPErsplit)rY   
param_nameinfor]   unquant_dtypesr2   r3   	<setcomp>   s    z1GPTQConfig.maybe_update_config.<locals>.<setcomp>)	r!   r   listr=   float16bfloat16float32r   items)r/   rW   rX   metadataquant_layersr2   rd   r3   maybe_update_config   s   zGPTQConfig.maybe_update_config)r   Nr   )rT   r   r<   )__name__
__module____qualname____doc__intbooldictstrrg   r*   r6   classmethodr   r;   r=   r]   r?   rA   rB   r   rI   nnModuler   rS   rV   rn   __classcell__r2   r2   r0   r3   r   +   sX    
	
A


 r   c                   @   s$   e Zd Ze Ze Ze ZdS )ExllamaStateN)ro   rp   rq   enumautoUNUSEDUNINITIALIZEDREADYr2   r2   r2   r3   r{      s    r{   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZdejjddfddZ	ddejjdejdejdB dejfddZdS )rL   z[Linear method for GPTQ.

    Args:
        quant_config: The GPTQ quantization config.
    quant_configc                 C   s   || _ |jdk| _d S )Ngptq_v2)r   r"   use_v2_format)r/   r   r2   r2   r3   r*      s   zGPTQLinearMethod.__init__rJ   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                    s  ~| d}| jj dkrtdt|}	|	 jjj dkr#td jjdkr. jj}
n|}
tj}||
 }d }||krQ jjdkrQ jj	rKtj
}n||
 }d}ttj| jj |	tjdddd jj|d}ttj fd	d
t|D tjdd|d}tj||	 jj tjd|d}tj||	|d|d}|d u rtdddi|}tddd jjd|}ntdddd|}tdddd jjd|}|d| |d| |d| |d| ||_d S )Nweight_loaderr   ztThe input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.zuThe output size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.)r]   rM   )data	input_dim
output_dim
packed_dimpacked_factorr   c                    s   g | ]}| j j qS r2   )r   r   )rY   ir5   r2   r3   r\   *  s    
z3GPTQLinearMethod.create_weights.<locals>.<listcomp>)r   r   r   )r   r   r   )r   r   r   )r   r   )r   r   r   r   qweightg_idxqzerosscalesr2   )r_   r   r   r,   sumr+   	numeratorr{   r   r   r~   r   r=   emptyint32r   tensorranger   r   r   register_parameterexllama_state)r/   rJ   r   r   r   r   r   extra_weight_attrsr   output_size_per_partitionr   r   scale_and_zero_sizescale_and_zero_input_dimr   r   qzeros_argsweight_scale_argsr   r   r2   r5   r3   create_weights   s   





	
zGPTQLinearMethod.create_weightsr#   Nc                 C   s   t |jjdd|_t |jjdd|_t |jjdd|_t |jjdd|_|jtjkrY| j	j
r;t|jtj|j_ntjdtj|jjd|j_tj|_t|j|j| j	j d S d S )NF)requires_grad)r   )r]   device)r   r   r   r   r   r   r   r{   r   r   r   r=   argsorttors   r   r   r   opsgptq_shuffler   )r/   rJ   r2   r2   r3   process_weights_after_loading_  s   
z.GPTQLinearMethod.process_weights_after_loadingxbiasc              
   C   sx   |j d d |jj d f }|d|j d }t||j|j|j|j|jt	j
k| j| jj}|d ur7|| ||S )Nr   )shaper   reshaper   	gptq_gemmr   r   r   r   r{   r   r   r   r   add_)r/   rJ   r   r   	out_shape
reshaped_xoutputr2   r2   r3   applyr  s   



zGPTQLinearMethod.applyr<   )ro   rp   rq   rr   r   r*   r=   rx   ry   rs   rg   r]   r   r   Tensorr   r2   r2   r2   r3   rL      s6    
qrL   )0r|   r   	fractionsr   typingr   r   r   r=   safetensors.torchr   r`   torch.nn.parameterr   vllmr	   r   vllm.loggerr
   *vllm.model_executor.layers.fused_moe.layerr   !vllm.model_executor.layers.linearr   3vllm.model_executor.layers.quantization.base_configr   r   8vllm.model_executor.layers.quantization.utils.gptq_utilsr   vllm.model_executor.parameterr   r   r   r   r   vllm.transformers_utils.configr   vllm.utils.collection_utilsr   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.utilsr   rv   ro   r-   r   r{   rL   r2   r2   r2   r3   <module>   s2    1