o
    
۾iW'                     @   s   d dl mZmZmZ d dlZd dlmZ d dlm	Z
 d dlmZ d dlmZ d dlmZmZmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZ erZd dlmZ d dlm Z  ee!Z"G dd deZ#G dd deZ$dS )    )TYPE_CHECKINGAnyUnionN)_TYPES)_custom_ops)init_logger)FusedMoE)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationConfigQuantizeMethodBase)is_layer_skipped)GroupQuantScaleParameterPackedvLLMParameter)get_safetensors_params_metadata)QuantizationMethods)WeightsMapperc                       s   e Zd ZdZ	d%dedededee dB ddf
 fdd	Zdefd
dZ	d&ddZ
deej fddZedefddZedee fddZedeeef dd fddZdejjdeded dB fddZd'dd Zd%d!ed"edB fd#d$Z  ZS )(	AWQConfigzKConfig class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978
    Nweight_bits
group_size
zero_pointmodules_to_not_convertreturnc                    sR   t    || _|| _|| _|pg | _| jdkr!td| j dd| j | _d S )N   zHCurrently, only 4-bit weight quantization is supported for AWQ, but got z bits.    )super__init__r   r   r   r   
ValueErrorpack_factor)selfr   r   r   r   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/awq.pyr   &   s   


zAWQConfig.__init__c              	   C   s&   d| j  d| j d| j d| j d	S )NzAWQConfig(weight_bits=z, group_size=z, zero_point=z, modules_to_not_convert=))r   r   r   r   r    r#   r#   r$   __repr__:   s   
zAWQConfig.__repr__r   c                 C      dS )Nawqr#   r&   r#   r#   r$   get_nameB   s   zAWQConfig.get_namec                 C   s   t jgS N)torchhalfr&   r#   r#   r$   get_supported_act_dtypesE   s   z"AWQConfig.get_supported_act_dtypesc                 C   r(   )NK   r#   )clsr#   r#   r$   get_min_capabilityH   s   zAWQConfig.get_min_capabilityc                   C   s   ddgS )Nzquant_config.jsonzquantize_config.jsonr#   r#   r#   r#   r$   get_config_filenamesM   s   zAWQConfig.get_config_filenamesconfigc                 C   sL   |  |ddg}|  |ddg}|  |dg}| |dgd }| ||||S )Nw_bitbitsq_group_sizer   r   r   )get_from_keysget_from_keys_or)r0   r3   r   r   r   r   r#   r#   r$   from_configU   s   zAWQConfig.from_configlayerprefix)r
   r   c           	      C   s   t |trt|| j| jddrt S t| S t |trjddlm	} ddl
m} ddlm} ||| jsRtd| d d	| j| j| jd
| jd}||||S d	| j| j| jd
| jd}||}|||S d S )NT)skip_with_substr   )AWQMarlinConfig)MoeWNA16Config)check_moe_marlin_supports_layerzLayer 'zF' is not supported by AWQMoeMarlin. Falling back to Moe WNA16 kernels.r)   F)quant_methodr5   r   r   lm_headr   )
isinstancer	   r   r   packed_modules_mappingr   AWQLinearMethodr   
awq_marlinr>   	moe_wna16r?   utils.marlin_utilsr@   r   loggerwarning_oncer   r   r9   get_quant_method)	r    r:   r;   r>   r?   r@   r3   marlin_compatible_config_dictawq_marlin_configr#   r#   r$   rK   _   sN   



zAWQConfig.get_quant_methodhf_to_vllm_mapperr   c                 C   s   | j r|| j | _ d S d S r+   )r   
apply_list)r    rN   r#   r#   r$   apply_vllm_mapper   s
   
zAWQConfig.apply_vllm_mapper
model_namerevisionc                    s^   | j rd S tjtjtjgt||d}dd |D } fdd| D }t|| | _ d S )N)rR   c                 S   s   h | ]
}| d dd qS ).r=   r   )rsplit).0
param_namer#   r#   r$   	<setcomp>   s    z0AWQConfig.maybe_update_config.<locals>.<setcomp>c                    s<   h | ]\}}| d d  rt  vr|ddd qS )dtypeNrS   r=   r   )get_SAFETENSORS_TO_TORCH_DTYPErT   )rU   rV   inforX   unquant_dtypesr#   r$   rW      s    )r   r,   float16bfloat16float32r   itemslist)r    rQ   rR   metadatalayersquant_layersr#   r\   r$   maybe_update_config   s   zAWQConfig.maybe_update_configr+   )r   r   )rN   r   )__name__
__module____qualname____doc__intboolrb   strr   r'   r*   r,   rX   r.   classmethodr1   staticmethodr2   dictr   r9   nnModuler   rK   rP   rf   __classcell__r#   r#   r!   r$   r       s@    


	


0 r   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZdejjddfddZ	ddejjdejdejdB dejfddZdS )rE   zYLinear method for AWQ.

    Args:
        quant_config: The AWQ quantization config.
    quant_configc                 C   s
   || _ d S r+   )rt   )r    rt   r#   r#   r$   r      s   
zAWQLinearMethod.__init__r:   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 K   s   | j jdkr| j j}n|}|| dkrtdt|}	|	| j j dkr'td|d}
ttj||	| j j tj	dddd| j j|
d}|| }ttj||	| j j tj	dddd| j j|
d}t
tj||	|ddd|
d	}|d
| |d| |d| d S )Nr   ztThe input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.zuThe output size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.weight_loader)rX   r=   )data	input_dim
output_dim
packed_dimpacked_factorr{   )r|   r}   r~   r{   qweightqzerosscales)rt   r   r   sumr   rY   r   r,   emptyint32r   register_parameter)r    r:   ru   rv   rw   rx   ry   extra_weight_attrsr   output_size_per_partitionr{   r   
num_groupsr   r   r#   r#   r$   create_weights   sf   



zAWQLinearMethod.create_weightsr   Nc                 C   sF   t jj|jjdd|_t jj|jjdd|_t jj|jjdd|_d S )NF)requires_grad)r,   rq   	Parameterr   r|   r   r   )r    r:   r#   r#   r$   process_weights_after_loading   s   z-AWQLinearMethod.process_weights_after_loadingxbiasc                 C   s   |j }|j}|j}| jj}|jd d |jd | f }|d|jd }	|jd d  dk}
|
rCt	|||ddd}t
|	|}n	t|	||||}|d urU|| ||S )Nrz      r   )r   r   r   rt   r   shapereshapenumelopsawq_dequantizer,   matmulawq_gemmadd_)r    r:   r   r   r   r   r   r   	out_shape
reshaped_xFP16_MATMUL_HEURISTIC_CONDITIONoutr#   r#   r$   apply   s   

zAWQLinearMethod.applyr+   )rg   rh   ri   rj   r   r   r,   rq   rr   rk   rb   rX   r   r   Tensorr   r#   r#   r#   r$   rE      s6    
K	rE   )%typingr   r   r   r,   safetensors.torchr   rZ   vllmr   r   vllm.loggerr   *vllm.model_executor.layers.fused_moe.layerr   !vllm.model_executor.layers.linearr	   r
   r   3vllm.model_executor.layers.quantization.base_configr   r   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.model_executor.parameterr   r   vllm.transformers_utils.configr   'vllm.model_executor.layers.quantizationr    vllm.model_executor.models.utilsr   rg   rI   r   rE   r#   r#   r#   r$   <module>   s$    