o
    پiك                     @  sf  d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZmZ d dlmZmZmZmZ d dlmZ d dlmZmZ d d	lmZmZmZmZ d d
lm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 erd dl3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9m:Z: e7 Z;e8 Z<e: Z=e9 Z>e>rd dl?Z?e;rd dl@mAZAmBZBmCZC ne<rd dlDmEZA ne=rd dl@mAZA eFd neFd eGeHZIe/ \ZJZKd.ddZLG dd deZMG dd deZNG dd  d eZOG d!d" d"eZPG d#d$ d$eOZQG d%d& d&eZRG d'd( d(eRZSe;r1e2d)d*d+ ZTe2d,d-d+ ZTdS dS )/    )annotationsN)TYPE_CHECKINGAnyDictListOptional)npu_fused_experts)
LinearBaseset_weight_attrs)	MoeRunnerMoeRunnerBackendMoeRunnerConfigget_moe_runner_backend)MarlinMoeQuantInfo)GroupQuantScaleParameterPackedvLLMParameter)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)apply_awq_marlin_linearawq_to_marlin_zero_pointscheck_marlin_supportedcheck_marlin_supports_layercheck_moe_marlin_supports_layermarlin_make_empty_g_idxmarlin_make_workspacemarlin_moe_permute_scalesmarlin_permute_scalesmoe_awq_to_marlin_zero_pointsverify_marlin_supportedverify_marlin_supports_shape)UnquantizedLinearMethod)get_scalar_typesreplace_parameter)register_fake_if_exists)CombineInputStandardDispatchOutput)is_cudais_hipis_npuis_xpu)awq_dequantizeawq_marlin_moe_repackawq_marlin_repack)awq_dequantize_triton)r,   z0XPU does not support fused_marlin_moe currently.z-Only CUDA, HIP and XPU support AWQ currently.prefixstrmodules_to_not_convert	List[str]c                   s   t  fdd|D S )Nc                 3  s    | ]}| v V  qd S N ).0module_namer0   r5   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/awq.py	<genexpr>U   s    z'is_layer_skipped_awq.<locals>.<genexpr>)any)r0   r2   r5   r8   r9   is_layer_skipped_awqT   s   r<   c                      s   e Zd ZdZ	d'd( fddZd)ddZd*ddZd)ddZd+ddZe	d,ddZ
ed*ddZe	d-dd Zd.d%d&Z  ZS )/	AWQConfigzKConfig class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978
    Nweight_bitsint
group_size
zero_pointboolr2   Optional[List[str]]returnNonec                   sR   t    || _|| _|| _|pg | _| jdkr!td| j dd| j | _d S )N   zHCurrently, only 4-bit weight quantization is supported for AWQ, but got z bits.    )super__init__r>   r@   rA   r2   
ValueErrorpack_factor)selfr>   r@   rA   r2   	__class__r5   r9   rI   ^   s   


zAWQConfig.__init__r1   c              	   C  s&   d| j  d| j d| j d| j d	S )NzAWQConfig(weight_bits=, group_size=, zero_point=, modules_to_not_convert=))r>   r@   rA   r2   rL   r5   r5   r9   __repr__r   s   
zAWQConfig.__repr__r3   c                 C     g S r4   r5   rS   r5   r5   r9   get_scaled_act_namesz      zAWQConfig.get_scaled_act_namesc                 C     dS )Nawqr5   rS   r5   r5   r9   get_name}   rW   zAWQConfig.get_nameList[torch.dtype]c                 C  s   t stjgS tjtjgS r4   )_is_nputorchfloat16bfloat16rS   r5   r5   r9   get_supported_act_dtypes   s   z"AWQConfig.get_supported_act_dtypesc                 C  s   t rtddS )Nz;NPU hardware does not support "get_min_capability" feature.K   )r\   NotImplementedErrorclsr5   r5   r9   get_min_capability   s
   zAWQConfig.get_min_capabilityc                   C  s   ddgS )Nzquant_config.jsonquantize_config.jsonr5   r5   r5   r5   r9   get_config_filenames   s   zAWQConfig.get_config_filenamesconfigDict[str, Any]c                 C  sL   |  |ddg}|  |ddg}|  |dg}| |dgd }| ||||S )Nw_bitbitsq_group_sizer@   rA   r2   get_from_keysget_from_keys_or)rd   rh   r>   r@   rA   r2   r5   r5   r9   from_config   s   zAWQConfig.from_configlayertorch.nn.Moduler0   Optional[LinearMethodBase]c                 C  s~   ddl m} ddlm} tr+t||r t|| jrt S t	| S t||r)t
| S d S t||r=t|| jr9t S t| S d S )Nr   )r	   FusedMoE)sglang.srt.layers.linearr	   &sglang.srt.layers.moe.fused_moe_tritonru   r\   
isinstancer<   r2   r"   AWQLinearAscendMethodAWQMoEAscendMethodAWQLinearMethod)rL   rq   r0   r	   ru   r5   r5   r9   get_quant_method   s   


zAWQConfig.get_quant_methodr4   )
r>   r?   r@   r?   rA   rB   r2   rC   rD   rE   rD   r1   rD   r3   )rD   r[   rD   r?   )rh   ri   rD   r=   )rq   rr   r0   r1   rD   rs   )__name__
__module____qualname____doc__rI   rT   rV   rZ   r`   classmethodre   staticmethodrg   rp   r|   __classcell__r5   r5   rM   r9   r=   X   s    




		r=   c                      s   e Zd ZdZejejdZd0 fddZd1ddZ	d2ddZ
ed1ddZed3ddZed4ddZed5dd Zed6d"d#Zed7d%d&Zd8d+d,Zed9d.d/Z  ZS ):AWQMarlinConfigzConfig class for AWQ Marlin)rF      r>   r?   r@   rA   rB   lm_head_quantizedr2   Optional[list[str]]full_configdict[str, Any]rD   rE   c                   s   t    trtd d| | _|| _|| _|| _|| _	|p g | _
|| _| j	| jvr9td| j	 d| j  | j| j	 | _t| j| j| jd d S )Nz0HIP does not support fused_marlin_moe currently.rG   zUnsupported num_bits = z. Supported num_bits = )r@   has_zp)rH   rI   _is_hipwarningswarnrK   r@   rA   r   r>   r2   r   TYPE_MAPrJ   keys
quant_typer    )rL   r>   r@   rA   r   r2   r   rM   r5   r9   rI      s(   
	




zAWQMarlinConfig.__init__r1   c                 C  s.   d| j  d| j d| j d| j d| j dS )NzAWQMarlinConfig(quant_type=rO   rP   z, lm_head_quantized=rQ   rR   )r   r@   rA   r   r2   rS   r5   r5   r9   rT      s   
zAWQMarlinConfig.__repr__r3   c                 C  rU   r4   r5   rS   r5   r5   r9   rV      rW   z$AWQMarlinConfig.get_scaled_act_namesc                 C  rX   )N
awq_marlinr5   rc   r5   r5   r9   rZ         zAWQMarlinConfig.get_namelist[torch.dtype]c                 C  s   t jt jgS r4   )r]   halfr_   rc   r5   r5   r9   r`      s   z(AWQMarlinConfig.get_supported_act_dtypesc                 C  rX   )NP   r5   rc   r5   r5   r9   re      r   z"AWQMarlinConfig.get_min_capability	list[str]c                 C  s   dgS )Nrf   r5   rc   r5   r5   r9   rg      s   z$AWQMarlinConfig.get_config_filenamesrh   c                 C  s^   |  |dg}|  |dg}|  |dg}| j|dgdd}| |dgd }| ||||||S )Nrk   r@   rA   lm_headF)defaultr2   rm   )rd   rh   r>   r@   rA   r   r2   r5   r5   r9   rp      s   zAWQMarlinConfig.from_configOptional[str]c                 C  sj   |  |}|d u p|dkp|dk}|r(|r(d|  |  }t| |  S |r3|dkr3td d S )Nmarlinr   z?The model is convertible to {} during runtime. Using {} kernel.rY   zDetected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference)is_awq_marlin_compatibleformatrZ   loggerinfo)rd   hf_quant_cfg
user_quantcan_convertis_valid_user_quantmsgr5   r5   r9   override_quantization_method  s   

z,AWQMarlinConfig.override_quantization_methodrq   rr   r0   Optional[QuantizeMethodBase]c                 C  s   ddl m} ddlm} t|tst||r<| jr<t|| jr"t	 S t
|| js8td| t| j||S t| S t||rdddlm} t|| js`td| d || j||S t| S d S )Nr   rt   )ParallelLMHeadzRLayer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.)MoeWNA16ConfigzLayer 'zF' is not supported by AWQMoeMarlin. Falling back to Moe WNA16 kernels.)rw   ru   *sglang.srt.layers.vocab_parallel_embeddingr   rx   r	   r   r<   r2   r"   r   r@   r   warning_oncer=   rp   r   r|   AWQMarlinLinearMethod(sglang.srt.layers.quantization.moe_wna16r   r   AWQMoEMethod)rL   rq   r0   ru   r   r   r5   r5   r9   r|   $  s:   


z AWQMarlinConfig.get_quant_methodquant_configc                 C  s   | dd }| d}| d}| d}tsdS |dkr!dS |d u s-|d u s-|d u r/dS || jvr6dS t| j| ||dS )	Nquant_method rk   r@   rA   FrY   )r   r@   r   )getlower_is_cudar   r   )rd   r   r   num_bitsr@   rA   r5   r5   r9   r   G  s   



z(AWQMarlinConfig.is_awq_marlin_compatible)r>   r?   r@   r?   rA   rB   r   rB   r2   r   r   r   rD   rE   r}   r~   )rD   r   r   )rD   r   )rh   r   rD   r   )rD   r   )rq   rr   r0   r1   rD   r   )r   r   )r   r   r   r   scalar_typesuint4uint8r   rI   rT   rV   r   rZ   r`   re   rg   rp   r   r|   r   r   r5   r5   rM   r9   r      s.    
 
	
#r   c                   @  s<   e Zd ZdZdddZdddZdddZ	d d!ddZdS )"r{   zYLinear method for AWQ.

    Args:
        quant_config: The AWQ quantization config.
    r   r=   c                 C  
   || _ d S r4   r   rL   r   r5   r5   r9   rI   h     
zAWQLinearMethod.__init__rq   rr   input_size_per_partitionr?   output_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  s   || j j dkrtdt|}|| j j dkrtd|d}	ttj||| j j tj	dddd| j j|	d}
ttj|| j j || j j tj	dddd| j j|	d}t
tj|| j j ||ddd|	d}|d	|
 |d
| |d| d S )Nr   ztThe input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.zuThe output size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.weight_loaderdtype   data	input_dim
output_dim
packed_dimpacked_factorr   r   r   r   r   qweightqzerosscales)r   r@   rJ   sumrK   r   r   r]   emptyint32r   register_parameter)rL   rq   r   r   r   r   r   extra_weight_attrsoutput_size_per_partitionr   r   r   r   r5   r5   r9   create_weightsk  s^   





zAWQLinearMethod.create_weightsrD   rE   c                 C  sF   t jj|jjdd|_t jj|jjdd|_t jj|jjdd|_d S )NFrequires_grad)r]   nn	Parameterr   r   r   r   )rL   rq   r5   r5   r9   process_weights_after_loading  s   z-AWQLinearMethod.process_weights_after_loadingNxtorch.TensorbiasOptional[torch.Tensor]c                 C  s~   |j }|j}|j}| jj}|jd d |jd | f }|d|jd }	t|||}
t	|	|
}
|d ur:|

| |
|S )N)r   r   r   r   rK   shapereshaper,   r]   matmuladd_rL   rq   r   r   r   r   r   rK   	out_shape
reshaped_xoutr5   r5   r9   apply  s   

zAWQLinearMethod.applyr   r=   )rq   rr   r   r?   r   r   r   r?   r   r?   r   r   rq   rr   rD   rE   r4   rq   rr   r   r   r   r   rD   r   r   r   r   r   rI   r   r   r   r5   r5   r5   r9   r{   a  s    


C	r{   c                   @  s<   e Zd ZdZdddZdddZdddZ	d d!ddZdS )"r   zgLinear method for AWQ Marlin.

    Args:
        quant_config: The AWQ Marlin quantization config.
    r   r   rD   rE   c                 C  r   r4   r   r   r5   r5   r9   rI     r   zAWQMarlinLinearMethod.__init__rq   rr   r   r?   r   	list[int]r   r   r   r   c                 K  s   ~t |}|d}	| jjdkr| jj}
n|}
t||||
d ttj||| jj tj	dddd| jj|	d}||
 }ttj||| jj tj	dddd| jj|	d}t
tj|||ddd|	d}|d	| |d
| |d| ||_||_||_d S )Nr   r   )r   r   r   r@   r   r   r   r   r   r   r   r   )r   r   r   r@   r!   r   r]   r   rK   r   r   r   r   r   
num_groups)rL   rq   r   r   r   r   r   r   r   r   r@   r   r   r   r   r5   r5   r9   r     sj   





z$AWQMarlinLinearMethod.create_weightsc                 C  s   |j j}tjj|j jdd|_ tjj|jjdd|_tjj|jjdd|_t||_	t
|j |j|j| jjjd}t|d| t|j|j|j| jjd}t|d| t|j|j|j| jjjd}t|d| t||_t||_d S )NFr   size_ksize_nr   r   )r   r   r@   r   r   )r   devicer]   r   r   r   r   r   r   	workspacer.   r   r   r   r   	size_bitsr$   r   r@   r   r   r   g_idxg_idx_sort_indices)rL   rq   r   marlin_qweightmarlin_scales	marlin_zpr5   r5   r9   r     s8   

z3AWQMarlinLinearMethod.process_weights_after_loadingNr   r   r   r   c                 C  s2   t ||j|j|j|j|j|j| jj|j	|j
|dS )N)inputweightweight_scale	weight_zpr   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )rL   rq   r   r   r5   r5   r9   r   G  s   zAWQMarlinLinearMethod.apply)r   r   rD   rE   )rq   rr   r   r?   r   r   r   r?   r   r?   r   r   rD   rE   r   r4   r   r   r5   r5   r5   r9   r     s    


N,r   c                   @  s(   e Zd ZdZdddZ	ddddZdS )ry   zcLinear method for AWQ on Ascend.

    Args:
        quant_config: The AWQ quantization config.
    rq   rr   rD   rE   c                 C  s  t jj|jjdd|_t |jj}|jj}g }g d}td| j	j
D ]*}|| d }||dd|? d@  ||jj|? d	d|   dd| > @  q#|d
 t j|dd|jd d}|d  }||jjj}t jj|dd|_t jj|dd|_d S )NFr   r   rF   r                  r   rF   r   r      r      dimr   )r]   r   r   r   r   
zeros_liker   r   ranger   rK   appendr   bitwise_or_bitwise_xor_catr   tor   zerosr   )rL   rq   qweight_tmp
qzeros_tmpqzeros_listshiftsi	shift_numr5   r5   r9   r   c  s"   "

z3AWQLinearAscendMethod.process_weights_after_loadingNr   r   r   r   c                 C  s   |j }|j}|j}| jj}|jd d |jd | f }|d|jd }	|d ur3|jtj	kr3|
 }tj|	|||| jj|d}
|
|S )Nr   )antiquant_scaleantiquant_offsetantiquant_group_sizer   )r   r   r  r   rK   r   r   r   r]   r_   float	torch_npunpu_weight_quant_batchmatmulr@   r   r5   r5   r9   r   z  s"   
	zAWQLinearAscendMethod.applyr   r4   r   )r   r   r   r   r   r   r5   r5   r5   r9   ry   \  s
    
ry   c                   @  s>   e Zd ZdddZdddZdddZd ddZd!ddZdS )"r   r   r   c                 C  s&   || _ | j jdkrtdtj| _d S )NrF   z$AWQMoEMethod only supports 4bit now.)r   r>   rJ   r   r   r   r   r5   r5   r9   rI     s   zAWQMoEMethod.__init__rq   rr   num_expertsr?   hidden_sizeintermediate_size_per_partitionr   r   c                 K  s  ddl m} |d|jjd tjjtj||d| | j	j
 tjddd}|d	| t|| tjjtj|||| j	j
 tjddd}	|d
|	 t|	| || j	j }
|| j	j }tjjtj||
|d |ddd}|d| t|| tjjtj||||ddd}|d| t|| tjjtj||
d| | j	j
 tjddd}|d| t|| tjjtj|||| j	j
 tjddd}|d| t|| d S )Nr   )FusedMoeWeightScaleSupportedT)is_transposedr   r  r   Fr   w13_qweight
w2_qweight
w13_scales	w2_scales
w13_qzeros	w2_qzeros)rw   r#  updateGROUPvaluer]   r   r   r   r   rK   r   r   r
   r@   )rL   rq   r   r!  r"  r   r   r#  r%  r&  num_groups_w13num_groups_w2r'  r(  r)  r*  r5   r5   r9   r     s   
	

	
	

	

	zAWQMoEMethod.create_weightsrD   rE   c           
      C  s  |j jd }|j j}tjjtj|dftj|ddd|_tjjtj|dftj|ddd|_	t
|j |j|j jd |j jd | jj | jjd}t|d| t
|j|j	|jjd |jjd | jj | jjd}t|d	| t|j|j|jjd | jjd
}t|d| t|j|j|jjd | jjd
}t|d| t|j|jjd |jjd | jj | jjd}t|d| t|j|jjd |jjd | jj | jjd}	t|d|	 d S )Nr   )r   r   Fr   r   r  r   r%  r&  )sr   r   r@   r'  r(  r)  r*  )r%  r   r   r]   r   r   r   r   w13_g_idx_sort_indicesw2_g_idx_sort_indicesr-   r   rK   r>   r$   r&  r   r'  r"  r@   r(  r   r)  r*  )
rL   rq   r   r   marlin_w13_qweightmarlin_w2_qweightmarlin_w13_scalesmarlin_w2_scalesmarlin_w13_zpmarlin_w2_zpr5   r5   r9   r     sl   





z*AWQMoEMethod.process_weights_after_loadingmoe_runner_configr   c                 C  s&   t   sJ || _ttj|| _d S r4   )r   is_autor9  r   r   MARLINrunnerrL   rq   r9  r5   r5   r9   create_moe_runner<  s   zAWQMoEMethod.create_moe_runnerdispatch_outputr'   r&   c                 C  s<   t |j|j|j|j|j|j|j|j| j	j
d	}| j||S )N)	r%  r&  r'  r(  r1  r2  r)  r*  r>   )r   r%  r&  r'  r(  r1  r2  r)  r*  r   r>   r<  run)rL   rq   r?  
quant_infor5   r5   r9   r   C  s   zAWQMoEMethod.applyN)r   r   )
rq   rr   r   r?   r!  r?   r"  r?   r   r   r   rq   rr   r9  r   )rq   rr   r?  r'   rD   r&   )r   r   r   rI   r   r   r>  r   r5   r5   r5   r9   r     s    


]
Ar   c                   @  s4   e Zd ZdddZdd	d
ZdddZdddZdS )rz   r   r=   c                 C  r   r4   r   r   r5   r5   r9   rI   Y  r   zAWQMoEAscendMethod.__init__rq   rr   rD   rE   c                 C  s  t |jj}t |jj}g }g }g d}td| jjD ]P}|| d }||j	j
dd|? d@  ||jj
dd|? d@  ||jj|? dd|   dd| > @  ||jj|? dd|   dd| > @  q|d |d t j|dd	
|j	jd |j	jd d}	|	d
  }	|	|jjj}	t j|dd	
|jjd |jjd d}
|
d
  }
|
|jjj}
|dt jj|	dd |dt jj|dd |dt jj|
dd |dt jj|dd d S )Nr  r   rF   r   r   r  r  r	  r
  r   r)  Fr   r%  r*  r&  )r]   r  r%  r   r&  r  r   rK   r  r)  r   r*  r  r  r  r   r  r'  r   r(  r   r   r   )rL   rq   w13_qweight_tmpw2_qweight_tmpw13_qzeros_listw2_qzeros_listr  r  r  w13_qzeros_tmpw2_qzeros_tmpr5   r5   r9   r   \  s^   





z0AWQMoEAscendMethod.process_weights_after_loadingr9  r   c                 C  s
   || _ d S r4   )r9  r=  r5   r5   r9   r>    s   
z$AWQMoEAscendMethod.create_moe_runnerr?  r'   r   c           
      C  s   ddl m} | jjdksJ d|j}|j}|\}}}|tj}||j	}t
||j|j|j|j|j|j|||jd dd}	||	dS )	Nr   )StandardCombineInputsiluz"Only SiLU activation is supported.r   T)hidden_statesw13	w13_scale
w13_offsetw2w2_scale	w2_offsettopk_weightstopk_idstop_k	use_wna16)rK  )&sglang.srt.layers.moe.token_dispatcherrI  r9  
activationrK  topk_outputr  r]   r   r   r   r%  r'  r)  r&  r(  r*  r   )
rL   rq   r?  rI  r   rX  rR  rS  _outputr5   r5   r9   r     s.   

zAWQMoEAscendMethod.applyNr   r   rB  )rq   rr   r?  r'   rD   r   )r   r   r   rI   r   r>  r   r5   r5   r5   r9   rz   X  s
    


2rz   zsgl_kernel::awq_dequantizec                 C  s2   | j d d | j d d | f }| j||jdS )Nr   rG   r   )r   	new_emptyr   )r   r   r   ch_axisr@   r   r   r5   r5   r9   rY    s   "	rY  zsgl_kernel::awq_marlin_repackc                 C  s    | j |d ||d  f| jdS )N   r  r   )r[  r   )
b_q_weightr   r   r   r5   r5   r9   rY    s   )r0   r1   r2   r3   )U
__future__r   loggingr   typingr   r   r   r   r   r]   Asglang.srt.hardware_backend.npu.quantization.fused_moe_method_npur   rv   r	   r
   sglang.srt.layers.moer   r   r   r   'sglang.srt.layers.moe.moe_runner.marlinr   sglang.srt.layers.parameterr   r   *sglang.srt.layers.quantization.base_configr   r   r   r   +sglang.srt.layers.quantization.marlin_utilsr   r   r   r   r   r   r   r   r   r   r    r!   &sglang.srt.layers.quantization.unquantr"   $sglang.srt.layers.quantization.utilsr#   r$   sglang.srt.utils.patch_torchr%   rV  r&   r'   sglang.srt.utilsr(   r)   r*   r+   r   r   _is_xpur\   r  
sgl_kernelr,   r-   r.   )sglang.srt.layers.quantization.awq_tritonr/   r   	getLoggerr   r   
ScalarTyper   r<   r=   r   r{   r   ry   r   rz   rY  r5   r5   r5   r9   <module>   sf   8



] -f : C]
