o
    پiĲ                     @  sZ  d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlZd dlmZmZmZmZ d dlmZ d dlmZmZmZmZmZmZmZ d d	lm Z m!Z!m"Z"m#Z# d d
l$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z; erd dl<m=Z=m>Z> e7 Z?e?rd dl@mAZAmBZB d dlCmDZD e8 ZEeErd dlFZFeGeHZIe3 \ZJZKd:ddZLd;ddZMeG d d! d!ZNG d"d# d#e"ZOG d$d% d%e"ZPG d&d' d'e!ZQG d(d) d)e!ZR	*d<d=d-d.ZSG d/d0 d0eQZTG d1d2 d2e ZUe?r+e;d3d4d5 ZVe;d6d7d5 ZVe;d8d9d5 ZVdS dS )>    )annotationsN)	dataclass)Fraction)TYPE_CHECKINGAnyCallableDictListOptionalUnion)	MoeRunnerMoeRunnerBackendMoeRunnerConfigget_moe_runner_backend)MarlinMoeQuantInfo)BasevLLMParameterChannelQuantScaleParameterGroupQuantScaleParameterPackedColumnParameterPackedvLLMParameterRowvLLMParameterpermute_param_layout_)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)apply_gptq_marlin_linearcheck_marlin_supportedcheck_marlin_supports_shapemarlin_is_k_fullmarlin_make_empty_g_idxmarlin_make_workspacemarlin_moe_permute_scalesmarlin_permute_scales!marlin_repeat_scales_on_all_ranksmarlin_sort_g_idxmarlin_zero_pointsverify_marlin_supported)get_linear_quant_methodget_scalar_typesreplace_parameterunpack_cols)is_cudais_npuset_weight_attrs)register_fake_if_exists)CombineInputStandardDispatchOutput)	gptq_gemmgptq_shuffle)gptq_marlin_repackhf_quant_cfgDict[str, Any]returnboolc                 C  s   |  ddkp|  ddS )Ncheckpoint_formatmarlinis_marlin_formatF)get)r5    r=   W/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/gptq.pycheck_marlin_formatM   s   r?   
b_q_weighttorch.Tensorpermsize_kintsize_nnum_bitsc                 C  sn   | j d }|d dksJ tj||d ||d  f| j| jd}t|D ]}t| | || |||||< q$|S )Nr         devicedtype)shapetorchemptyrJ   rK   ranger4   )r@   rB   rC   rE   rF   num_expertsoutputer=   r=   r>   gptq_marlin_moe_repackU   s   
rS   c                   @  sF   e Zd ZU ded< ded< ded< ded< ded	< d
ed< d
ed< dS )MarlinLinearLayerConfigztuple[int, int]full_weight_shapepartition_weight_shape
ScalarTypeweight_typetorch.dtypeact_typerD   
group_sizer8   zero_points	has_g_idxN)__name__
__module____qualname____annotations__r=   r=   r=   r>   rT   h   s   
 rT   c                      s   e Zd ZdZ	d)d* fddZd+ddZd,ddZed+ddZed-ddZ	ed.ddZ
ed,ddZed/d!d"Zd0d'd(Z  ZS )1
GPTQConfigzLConfig class for GPTQ.

    Reference: https://arxiv.org/abs/2210.17323
     weight_bitsrD   r[   desc_actr8   lm_head_quantizeddynamic&Dict[str, Dict[str, Union[int, bool]]]r9   strr7   Nonec                   s\   t    || _|| _|| _|| _|| _td| j| _|| _	| jdvr,t
d| j dd S )N    )rH            zOCurrently, only 2/3/4/8-bit weight quantization is supported for GPTQ, but got z bits.)super__init__rg   rd   r[   re   rf   r   pack_factorr9   
ValueError)selfrd   r[   re   rf   rg   r9   	__class__r=   r>   rp   y   s   
 
zGPTQConfig.__init__c              
   C  s:   d| j  d| j d| j d| j d| j 
d| j dfS )NzGPTQConfig(weight_bits=, group_size=, desc_act=z),lm_head_quantized=), dynamic=zcheckpoint_format=))rd   r[   re   rf   rg   r9   rs   r=   r=   r>   __repr__   s   
zGPTQConfig.__repr__	List[str]c                 C     t zvReturns the activation function names that should be post-scaled.

        For now, this is only used by AWQ.
        NotImplementedErrorrz   r=   r=   r>   get_scaled_act_names      zGPTQConfig.get_scaled_act_namesc                 C     dS )Ngptqr=   clsr=   r=   r>   get_name      zGPTQConfig.get_nameList[torch.dtype]c                 C  s   t stjgS tjtjgS N)_is_npurM   halfbfloat16r   r=   r=   r>   get_supported_act_dtypes   s   z#GPTQConfig.get_supported_act_dtypesc                 C  s   t rtddS )Nz;NPU hardware does not support "get_min_capability" feature.<   )r   r   r   r=   r=   r>   get_min_capability   s
   zGPTQConfig.get_min_capabilityc                 C     dgS Nzquantize_config.jsonr=   r   r=   r=   r>   get_config_filenames      zGPTQConfig.get_config_filenamesconfigr6   c                 C  s   | j |dgi d}|d u ri n|}| |dg}| |dg}| |dg}| j |dgdd}| j |dgd	d}| ||||||S )
Nrg   defaultbitsr[   re   lm_headFr9   rc   get_from_keys_orget_from_keys)r   r   rg   rd   r[   re   rf   r9   r=   r=   r>   from_config   s"   zGPTQConfig.from_configlayertorch.nn.ModuleprefixOptional[LinearMethodBase]c                 C  sf   ddl m} ddlm} tr"t||rt| S t||r tdd S t||r+tdt	| ||t
dS )Nr   )
LinearBaseFusedMoEz%GPTQ Method does not support MoE yet.z8GPTQ Method does not support MoE, please use gptq_marlin)r   linear_method_cls)sglang.srt.layers.linearr   &sglang.srt.layers.moe.fused_moe_tritonr   r   
isinstanceGPTQLinearAscendMethodr   	TypeErrorr(   GPTQLinearMethod)rs   r   r   r   r   r=   r=   r>   get_quant_method   s   


zGPTQConfig.get_quant_method)rc   )rd   rD   r[   rD   re   r8   rf   r8   rg   rh   r9   ri   r7   rj   r7   ri   r7   r|   r7   r   r7   rD   )r   r6   r7   rb   )r   r   r   ri   r7   r   )r^   r_   r`   __doc__rp   r{   r   classmethodr   r   r   r   r   r   __classcell__r=   r=   rt   r>   rb   s   s"    
2

rb   c                      s   e Zd ZdZejejdZd0 fddZd1ddZ	d2ddZ
ed1ddZed3ddZed4ddZed2dd Zed5d"d#Zed6d%d&Zd7d+d,Zed8d.d/Z  ZS )9GPTQMarlinConfigzConfig class for GPTQ Marlin))rm   T)rn   Trd   rD   r[   re   r8   is_symrf   rg   rh   full_configr6   r7   rj   c                   s   t    |r|dkrd}|| _|| _|| _d| | _|| _|| _|| _|| _	||f| j
vr8td| d| | j
||f | _d S )NFrk   z&Unsupported quantization config: bits=z, sym=)ro   rp   rg   rd   r   rq   r[   re   rf   r   TYPE_MAPrr   
quant_type)rs   rd   r[   re   r   rf   rg   r   rt   r=   r>   rp     s    


zGPTQMarlinConfig.__init__ri   c              
   C  s,   d| j  d| j d| j d| j d| j 
S )NzGPTQMarlinConfig(quant_type=rv   rw   z, lm_head_quantized=rx   )r   r[   re   rf   rg   rz   r=   r=   r>   r{   A  s   
zGPTQMarlinConfig.__repr__r|   c                 C  r}   r~   r   rz   r=   r=   r>   r   J  r   z%GPTQMarlinConfig.get_scaled_act_namesc                 C  r   )Ngptq_marlinr=   r   r=   r=   r>   r   Q  r   zGPTQMarlinConfig.get_namer   c                 C  s   t jt jgS r   )rM   r   r   r   r=   r=   r>   r   U  s   z)GPTQMarlinConfig.get_supported_act_dtypesc                 C  r   )NP   r=   r   r=   r=   r>   r   Y  r   z#GPTQMarlinConfig.get_min_capabilityc                 C  r   r   r=   r   r=   r=   r>   r   ]  r   z%GPTQMarlinConfig.get_config_filenamesr   c                 C  s   | j |dgi d}|d u ri n|}| |dg}| |dg}| |dg}| |dg}| j |dgdd}| |||||||S )	Nrg   r   r   r[   re   symr   Fr   )r   r   rg   rd   r[   re   r   rf   r=   r=   r>   r   a  s    zGPTQMarlinConfig.from_configOptional[str]c                 C  sz   t |}| |}|d u p|dkp|dk}|s.|r.|r.d|  |  }t| |  S |s;|r;|dkr;td d S )Nr:   r   z?The model is convertible to {} during runtime. Using {} kernel.r   zDetected that the model can run with gptq_marlin, however you specified quantization=gptq explicitly, so forcing gptq. Use quantization=gptq_marlin for faster inference)r?   is_gptq_marlin_compatibleformatr   loggerinfo)r   r5   
user_quantr;   can_convertis_valid_user_quantmsgr=   r=   r>   override_quantization_methodu  s   

z-GPTQMarlinConfig.override_quantization_methodr   r   r   Optional[QuantizeMethodBase]c                 C  s,   ddl m} t||rt| S t| ||tS )Nr   r   )r   r   r   GPTQMarlinMoEMethodr(   GPTQMarlinLinearMethod)rs   r   r   r   r=   r=   r>   r     s   
z!GPTQMarlinConfig.get_quant_methodquant_configc                 C  s   | dd }| d}| d}| d}| d}ts dS |dkr&dS |d u s6|d u s6|d u s6|d u r8dS ||f| jvrAdS t| j||f |d	S )
Nquant_methodrc   r   r[   r   re   Fr   r   r[   )r<   lower_is_cudar   r   )r   r   r   rF   r[   r   re   r=   r=   r>   r     s    



 z*GPTQMarlinConfig.is_gptq_marlin_compatible)rd   rD   r[   rD   re   r8   r   r8   rf   r8   rg   rh   r   r6   r7   rj   r   r   r   r   )r   r6   r7   r   )r7   r   )r   r   r   ri   r7   r   )r   r6   )r^   r_   r`   r   scalar_typesuint4b8	uint8b128r   rp   r{   r   r   r   r   r   r   r   r   r   r   r   r=   r=   rt   r>   r      s.    
:
	

r   c                   @  s<   e Zd ZdZdddZdddZdddZ	d d!ddZdS )"r   z[Linear method for GPTQ.

    Args:
        quant_config: The GPTQ quantization config.
    r   rb   c                 C  s   || _ |jdk| _d S )Ngptq_v2)r   r9   use_v2_formatrs   r   r=   r=   r>   rp     s   zGPTQLinearMethod.__init__r   r   input_size_per_partitionrD   output_partition_sizes	list[int]
input_sizeoutput_sizeparams_dtyperY   c                   s  ~| d}| jj dkrtdt|}	|	 jjj dkr#td jjdkr. jj}
n|}
d _||
 }d }||krQ jjdkrQ jjrKd _n||
 }d}t	t
j| jj |	t
jddd	d jj|d
}tt
j fddt|D t
jdd|d}t
j||	 jj t
jd|d}t
j||	|d|d}|d u rtddd	i|}tdd	d	 jjd|}ntdd	dd|}t	ddd	d	 jjd|}|d| |d| |d| |d| d S )Nweight_loaderr   ztThe input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.zuThe output size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size.r   TFrK      data	input_dim
output_dim
packed_dimpacked_factorr   c                   s   g | ]}| j j qS r=   )r   r[   ).0irz   r=   r>   
<listcomp>  s    
z3GPTQLinearMethod.create_weights.<locals>.<listcomp>r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   qweightg_idxqzerosscalesr=   )r<   r   r[   rr   sumrq   	numeratoruse_shufflere   r   rM   rN   int32r   tensorrO   r   r   r   register_parameter)rs   r   r   r   r   r   r   extra_weight_attrsr   output_size_per_partitionr[   scale_and_zero_sizescale_and_zero_input_dimr   r   qzeros_argsweight_scale_argsr   r   r=   rz   r>   create_weights  s   





	zGPTQLinearMethod.create_weightsr7   rj   c                 C  s   t jj|jjdd|_t jj|jjdd|_t jj|jjdd|_t jj|jjdd|_| jrY| j	j
r@t |jt j|j_nt jdt j|jjd|j_t|j|j| j	j d S d S )NFrequires_grad)r   rK   rJ   )rM   nn	Parameterr   r   r   r   r   r   r   re   argsorttorD   rN   rJ   r3   rd   )rs   r   r=   r=   r>   process_weights_after_loading/  s   
z.GPTQLinearMethod.process_weights_after_loadingNxrA   biasOptional[torch.Tensor]c                 C  sl   |j d d |jj d f }|d|j d }t||j|j|j|j| j| jj	}|d ur1|
| ||S )Nr   )rL   r   reshaper2   r   r   r   r   r   rd   add_)rs   r   r  r  	out_shape
reshaped_xrQ   r=   r=   r>   applyA  s   	

zGPTQLinearMethod.apply)r   rb   r   r   r   rD   r   r   r   rD   r   rD   r   rY   r   r   r7   rj   r   r   r   r  rA   r  r  r7   rA   )r^   r_   r`   r   rp   r   r  r  r=   r=   r=   r>   r     s    


or   c                   @  sL   e Zd ZU dZe Zded< ddd	Zd ddZd!ddZ		d"d#ddZ
dS )$r   ziLinear method for GPTQ Marlin.

    Args:
        quant_config: The GPTQ Marlin quantization config.
    zset[str]_kernel_backends_being_usedr   r   r7   rj   c                 C  s   || _ t| j j| j jd d S )Nr   )r   r'   r   r[   r   r=   r=   r>   rp   a  s
   
zGPTQMarlinLinearMethod.__init__r   r   r   rD   r   r   r   r   r   rY   c              	   K  s  t |}||k}	|d}
t||f||f| jj|| jjd| jjd| _| jjdkr.| jj}n|}t| jj| jj|	rAd }|| }nd}|| }t	t
j|| jj |t
jdddd| jj|
d}tt
j|t
jdd|
d	}t
j||| jj t
jd|
d
}t
j|||d|
d
}|d u rtdddi|}tddd| jjd|}ntdddd|}t	dddd| jjd|}|d| |d| |d| |d| d S )Nr   F)rU   rV   rX   rZ   r[   r\   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r=   )r   r<   rT   r   r   r[   re   kernel_configr$   r   rM   rN   rq   r   r   r   r   r   r   )rs   r   r   r   r   r   r   r   r   is_row_parallelr   r[   scales_and_zp_input_dimscales_and_zp_sizer   r   r   r   r   r   r=   r=   r>   r   j  s   






		z%GPTQMarlinLinearMethod.create_weightsc                   s`  t dj}| j t jd  jd  jd  j  jd  jd k}t j|| _	t
|| _d| _d| _d| _d| _ddd} fdd} fdd} jrjtt | j\}|| jfdd |_nt| jt| t|_ jr jdkr jd  j nd|| j fdd n	t| jt| || j| || j| d S )Nr   r   r   r   r   r   r   r   namer   fnr   r7   rj   c                 S  sP   |d ur$t | |d d ur&t | |}||}t| |tjj|jdd d S d S d S )NFr   )getattrr*   rM   r   r  r   )r   r  r  	old_param	new_paramr=   r=   r>   _transform_param  s   
zNGPTQMarlinLinearMethod.process_weights_after_loading.<locals>._transform_paramc                   sN   t | tsJ t| dddd t| j j jd  jd  jj	d| _| S )Nr   r   )r   r   r   )rB   rC   rE   rF   )
r   r   r   r4   r   
contiguousg_idx_sort_indicesrV   rX   	size_bitsr  )cr   r=   r>   transform_w_q   s   zKGPTQMarlinLinearMethod.process_weights_after_loading.<locals>.transform_w_qc                   sF   t | tsJ t| ddd t| j  jd  jd  jd| _| S )Nr   r   )r   r   )rC   rE   r[   )r   r   r   r#   r   r  rV   r[   r  )r  r=   r>   transform_w_s  s   zKGPTQMarlinLinearMethod.process_weights_after_loading.<locals>.transform_w_sc                   s    S r   r=   )_)r   r=   r>   <lambda>  s    zFGPTQMarlinLinearMethod.process_weights_after_loading.<locals>.<lambda>r   c                   s2   t t|   jj jd  jd  jjdS )Nr   )rC   rE   rF   )r&   r+   trX   r  rV   r  )r  	grouped_kr=   r>   r#  (  s    )r   r   r  r   r  r   r7   rj   )r  rJ   r  r   rV   rU   r[   r   r]   	is_k_fullr!   	workspacew_q_namew_s_name	w_zp_namew_gidx_namer%   r  setattrr    r\   )rs   r   rJ   row_parallelr  r   r!  r  r=   )r  r   r%  r   r>   r    sJ   



z4GPTQMarlinLinearMethod.process_weights_after_loadingNr  rA   r  r  c           
        sX    j }d
 fdd}||\}}}}	t|||||	|j j|j|jd |jd  j|d	S )Nr   r   r7   Qtuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]c                   s8   t |  jt |  jt |  jpdd t |  jpdd fS )Nrc   )r  r(  r)  r*  r+  )r   rz   r=   r>   _get_weight_paramsA  s
   
	
z8GPTQMarlinLinearMethod.apply.<locals>._get_weight_paramsr   r   )inputweightweight_scale	weight_zpr   r  r'  wtyper   r   r&  r  )r   r   r7   r.  )r  r   r  r'  rX   rV   r&  )
rs   r   r  r  r  r/  w_qw_sw_zpw_gidxr=   rz   r>   r  9  s"   zGPTQMarlinLinearMethod.applyr   r   r7   rj   )r   r   r   rD   r   r   r   rD   r   rD   r   rY   r7   rj   r  r   r  )r^   r_   r`   r   setr  ra   rp   r   r  r  r=   r=   r=   r>   r   X  s   
 

	
qbr   r   r1  r   c                 C  s&  | j tjksJ d| j  d|dksJ d| dd| }d|> d }|dkrVtj| jd | jd | f| jtjd}t|D ]}| || ? |@ |d	d	|d	|f< qAn,tj| jd | | jd f| jtjd}t|D ]}| || ? |@ ||d	|d	d	f< qntd
|d
 }|| tj	}|S )a  
    Unpacks quantized weights from int32 format back to original bits.

    :param weight: The packed int32 tensor containing quantized weights
    :param num_bits: The number of bits used for quantization (<= 8)
    :param packed_dim: Dimension along which weights are packed (0 or 1), defaults to 1
    :return: Unpacked tensor with int8 dtype after applying offset correction
    z0Expecting `weight.dtype` is torch.int32 but got .rn   z9Expecting `num_bits` should not be larger than 8 but got rk   r   r   rI   NrH   )
rK   rM   r   zerosrL   rJ   rO   powr  int8)r1  rF   r   rq   maskunpacked_weightr   offsetr=   r=   r>   unpack_from_int32d  s6   

$$rB  c                      s:   e Zd ZdZd fddZdddZ	ddddZ  ZS )r   z%Linear method for GPTQ on Ascend NPU.r   r   r   rD   r   r   r   r   r   rY   c                   sZ   t  j||||||fi | t|jd| jji t|jd| jji | jjr+tdd S )Nrq   zHCurrently, desc_act (True) is not supported by GPTQ quantization on npu.)	ro   r   r.   r   r   rq   r   re   rr   )rs   r   r   r   r   r   r   r   rt   r=   r>   r     s"   
	z%GPTQLinearAscendMethod.create_weightsr7   rj   c                 C  s   t jjt|jj | jjdd	|j
jdd|_| js#| jd7  _t|jj | jjdd}| jjdkrAt jj|dd|_d S t jjt|	t jdd|_d S )Nr   )r   Fr   r   rm   )rM   r   r  rB  r   r   r  r   rd   r  r   rK   r   r   	torch_npunpu_convert_weight_to_int4packr   )rs   r   qweight_tmpr=   r=   r>   r    s0   

z4GPTQLinearAscendMethod.process_weights_after_loadingNr  rA   r  r  c           
      C  s   |j }|j}|j}|d|jd }|d ur |jtjkr | }| j	j
dkr6|jd d |jd d f }n|jd d |jd f }tj||||| j	j|d}	|	|S )Nr   rm   rn   )antiquant_scaleantiquant_offsetantiquant_group_sizer  )r   r   r   r  rL   rK   rM   r   floatr   rd   rC  npu_weight_quant_batchmatmulr[   )
rs   r   r  r  r   r   r   r  r
  outr=   r=   r>   r    s$    
	zGPTQLinearAscendMethod.applyr  r  r   r  )r^   r_   r`   r   r   r  r  r   r=   r=   rt   r>   r     s    
"r   c                   @  sB   e Zd ZdZdddZdddZd ddZd!ddZd"ddZdS )#r   z$MoE Marlin method with quantization.r   r   r7   rj   c                 C  s
   || _ d S r   )r   r   r=   r=   r>   rp     s   
zGPTQMarlinMoEMethod.__init__r   r   rP   rD   hidden_sizeintermediate_size_per_partitionr   rY   c                 K  s  ddl m} ddlm} | jj p|jdk| _| jjdkr:|| jj }	| jjr*|}
n||j }
|
| jj }|j	j
}nd}	d}|jj
}||dd tjjtj||| jj d| tjd	d
d}|d| ||| tjjtj||| jj |tjd	d
d}|d| ||| tjjtj||	d| tjd	d
d}|d| ||| tjjtj|||tjd	d
d}|d| ||| ||d| jji tjjtj||	d| | jj |d	d
d}|d| ||| tjjtj|||| jj |d	d
d}|d| ||| ||d| jji tjjtj||tjd	d
d}|d| ||| tjjtj||tjd	d
d}|d| ||| tjjtj||tjd	d
d}|d| ||| tjjtj||tjd	d
d}|d| ||| d S )Nr   )r.   )FusedMoeWeightScaleSupportedr   r   T)r   is_transposedrH   r   Fr   w13_qweight
w2_qweight
w13_scales	w2_scalesload_full_w2
w13_qzeros	w2_qzeros	w13_g_idxw2_g_idxw13_g_idx_sort_indicesw2_g_idx_sort_indices)r   r.   r   rN  r   re   moe_tp_sizer&  r[   GROUPvalueCHANNELupdaterM   r   r  rN   rq   r   r   r   )rs   r   rP   rL  rM  r   r   r.   rN  scales_size13w2_scales_sizescales_size2strategyrP  rQ  rR  rS  rU  rV  rW  rX  rY  rZ  r=   r=   r>   r     s   



	

	
	

	

	



z"GPTQMarlinMoEMethod.create_weightsc                 C  s  | j jrt|jjd }t|j}t|j}t|j}t|j}t|D ]4}t|j| 	tj
||< t|j| 	tj
||< |j| ||  ||< |j| ||  ||< q&t|d| t|d| t|d| t|d| nR|jjd }|jj}tjjtj|dftj
|ddd|_tjjtj|dftj
|ddd|_tjjtj|dftj
|ddd|_tjjtj|dftj
|ddd|_t|j|j|jjd	 | j j |jjd
 | j j}	t|d|	 t|j|j|jjd	 | j j |jjd
 | j j}
t|d|
 t|j|j|jjd
 | j jd}t|d| t|j|jjd	 | j jdkr+| j jn| j j |jjd
 | j jd}t|d| d S )Nr   rW  rX  rY  rZ  r   Fr   r   rH   rP  rQ  )srC   rE   r[   rR  r   rS  )r   re   rW  rL   rM   
empty_likerX  rO   r  r  r   r*   rJ   r   r  rN   rY  rZ  rS   rP  rq   rd   rQ  r"   rR  rM  r[   rS  )rs   r   rP   rY  rZ  w13_sorted_g_idxw2_sorted_g_idxrR   rJ   marlin_w13_qweightmarlin_w2_qweightmarlin_w13_scalesmarlin_w2_scalesr=   r=   r>   r    s   




z1GPTQMarlinMoEMethod.process_weights_after_loadingmoe_runner_configr   c                 C  s&   t   sJ || _ttj|| _d S r   )r   is_autorl  r   r   MARLINrunner)rs   r   rl  r=   r=   r>   create_moe_runner  s   z%GPTQMarlinMoEMethod.create_moe_runnerdispatch_outputr1   r0   c                 C  s@   t |j|j|j|j|j|j|j|j| j	j
| jd
}| j||S )N)
rP  rQ  rR  rS  rW  rX  rY  rZ  rd   r&  )r   rP  rQ  rR  rS  rW  rX  rY  rZ  r   rd   r&  ro  run)rs   r   rq  
quant_infor=   r=   r>   r    s   zGPTQMarlinMoEMethod.applyNr9  )
r   r   rP   rD   rL  rD   rM  rD   r   rY   r  )r   r   rl  r   )r   r   rq  r1   r7   r0   )	r^   r_   r`   r   rp   r   r  rp  r  r=   r=   r=   r>   r     s    

 

Qr   zsgl_kernel::gptq_gemmc                 C  s    | j | jd |jd f| jdS )Nr   r   r   )	new_emptyrL   rK   )ar@   b_gptq_qzerosb_gptq_scalesb_g_idxr   bitr=   r=   r>   r"    s    r"  zsgl_kernel::gptq_marlin_repackc                 C  s    | j |d ||d  f| jdS )NrG   rH   r   )rt  rK   )r@   rB   rC   rE   rF   r=   r=   r>   r"    s   zsgl_kernel::gptq_shufflec                 C  s   d S r   r=   )q_weightq_permry  r=   r=   r>   r"    r   )r5   r6   r7   r8   )r@   rA   rB   rA   rC   rD   rE   rD   rF   rD   r7   rA   )r   )r1  rA   rF   rD   r   rD   r7   rA   )W
__future__r   loggingdataclassesr   	fractionsr   typingr   r   r   r   r	   r
   r   rM   sglang.srt.layers.moer   r   r   r   'sglang.srt.layers.moe.moe_runner.marlinr   sglang.srt.layers.parameterr   r   r   r   r   r   r   *sglang.srt.layers.quantization.base_configr   r   r   r   +sglang.srt.layers.quantization.marlin_utilsr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   $sglang.srt.layers.quantization.utilsr(   r)   r*   r+   sglang.srt.utilsr,   r-   r.   sglang.srt.utils.patch_torchr/   &sglang.srt.layers.moe.token_dispatcherr0   r1   r   
sgl_kernelr2   r3   $sglang.jit_kernel.gptq_marlin_repackr4   r   rC  	getLoggerr^   r   rW   r   r?   rS   rT   rb   r   r   r   rB  r   r   r"  r=   r=   r=   r>   <module>   sh    $$	8




  7 %  ,]  

