o
    پi9                     @   sx   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZe 	e
Zd dlmZ e \ZZd dlmZ G dd deZdS )    N)Fraction)AnyOptionalUnion)get_scalar_types)QuantizationConfigc                       sv  e Zd ZdZh dZdhZddhZh dZ							d1d
edede	de
deee
ee
 f  deee
ef  de
de
ddf fddZde
fddZedd Zedeej fddZedefddZedee
 fddZedee
ef dd fd d!Zdee
 fd"d#Zd$e
fd%d&Zd
ede	fd'd(Zd2d)e
de
fd*d+Zd2d)e
de
fd,d-Zd.ejj d)e
fd/d0Z!  Z"S )3AutoRoundConfigzPConfig class for AutoRound.
    Reference: https://arxiv.org/pdf/2309.05516
    >               intauto_round:auto_gptqzauto_round:auto_awq>   
awq:marlingptq:marlinawqautogptqmarlinTNr   weight_bits
group_sizesympacking_formatblock_name_to_quantizeextra_config	data_typebackendreturnc	           	         s   t    || jvrtd| d| j || jvr%td| d| j || jvr5td| d| j || jvrEtd| d| j || _|| _|| _	|| _
t|tr[|dn|| _|| _|| _|| _td|| _d S )	NzUnsupported weight_bits: z, currently only support  zUnsupported data_type: zUnsupported packing_format: zUnsupported backend: z,  currently only support  ,    )super__init__SUPPORTED_BITS
ValueErrorSUPPORTED_DTYPESSUPPORTED_FORMATSSUPPORTED_BACKENDSr   r   r   r   
isinstancestrsplitr   r   r   r   r   pack_factor)	selfr   r   r   r   r   r   r   r   	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/auto_round.pyr!      sJ   




zAutoRoundConfig.__init__c                 C   s   d| j  d| j d| j dS )NzAutoRoundConfig(weight_bits=z, group_size=z, sym=))r   r   r   r+   r.   r.   r/   __repr__L   s   
zAutoRoundConfig.__repr__c                 C      dS )Nz
auto-roundr.   clsr.   r.   r/   get_nameR      zAutoRoundConfig.get_namec                 C   s   t jt jgS N)torchhalfbfloat16r4   r.   r.   r/   get_supported_act_dtypesV   s   z(AutoRoundConfig.get_supported_act_dtypesc                 C   r3   )N<   r.   r4   r.   r.   r/   get_min_capabilityZ   r7   z"AutoRoundConfig.get_min_capabilityc                 C   s   dgS )Nzquantization_config.jsonr.   r4   r.   r.   r/   get_config_filenames^   s   z$AutoRoundConfig.get_config_filenamesconfigc                 C   sv   | |  |dg|  |dg|  |dg| |dgd| |ddgd | |dgd | |d	gd
| |g dddS )Nbitsr   r   r   r   r   to_quant_block_namesr   r   r   )r   vllm_backendsglang_backendr   )r   r   r   r   r   r   r   r   )get_from_keysget_from_keys_or)r5   r@   r.   r.   r/   from_configb   s$   

zAutoRoundConfig.from_configc                 C   s   t )zvReturns the activation function names that should be post-scaled.

        For now, this is only used by AWQ.
        )NotImplementedErrorr1   r.   r.   r/   get_scaled_act_namesw   s   z$AutoRoundConfig.get_scaled_act_names
layer_namec                    sR  ddl m} ddtdtffddjrjv rS t|| jr3tfdd	jD jr`d
|jj	
 v r`fddjD }|r`tt|dkrX|d S td djrj D ];\ } v r dkr fdd|D }fdd|D }tt|dkr|d   S td d| qhS )Nr   ParallelLMHeadTname	quantizedc              	      sZ  j s|rjnd|rjnd|rjfS dfS | j v rDj |  }|d|r*jnd|d|r4jnd|d|r@jfS dfS td j  D ]J\}}t|trat	 fdd	|D sbqMz+t
|| r|d|rqjnd|d|r{jnd|d|rjndfW   S W qM t
jy   Y qMw |rjnd|rjnd|rjfS dfS )
N   TrA   r   r   z*+?^$()[]{}|\\c                 3   s    | ]}| v V  qd S r8   r.   ).0cREGEX_SPECIAL_CHARSr.   r/   	<genexpr>   s    
zGAutoRoundConfig.get_layer_config.<locals>.get_config.<locals>.<genexpr>)r   r   r   r   getsetitemsr'   r(   anyre	fullmatcherror)rM   rN   cfgpatternr1   rS   r/   
get_config   sJ   


z4AutoRoundConfig.get_layer_config.<locals>.get_configc                 3   s    | ]}  |V  qd S r8   
startswithrQ   rM   )rJ   r.   r/   rU      s    

z3AutoRoundConfig.get_layer_config.<locals>.<genexpr>fusedmoec                    s    g | ]}| r |qS r.   r`   rb   )r_   rJ   rN   r.   r/   
<listcomp>   s    z4AutoRoundConfig.get_layer_config.<locals>.<listcomp>   zFused MoE layer 'z5' requires consistent quant config for all sub-layersc                    s   g | ]}  |qS r.   )replace)rQ   sub_key)
fusion_keyrJ   r.   r/   rd      s    c                    s   g | ]} |qS r.   r.   rb   )r_   rN   r.   r/   rd      s    zFused module 'z'' requires consistent quant config for )T)*sglang.srt.layers.vocab_parallel_embeddingrL   r(   boolr   r'   r   rY   r-   __name__lowerlenrW   r#   packed_modules_mappingrX   count)r+   layerrJ   rL   moe_configssub_keys	sub_namessub_configsr.   )rh   r_   rJ   rN   r+   r/   get_layer_config~   sF   *

z AutoRoundConfig.get_layer_configc                 C   s   |dk S )NrO   r.   )r+   r   r.   r.   r/   check_quantized   s   zAutoRoundConfig.check_quantizedprefixc                 C   s  ddl m} ddlm} ddlm}m} ddlm} ddl	m
}	 | ||\}
}}| |
s:t|||	fr8| S d S td||jj|
|| |dksNd	|v rotjtjd
}|
|v oa|||
 || }t||rn|om|||}nd}|rddlm}m}m} ||
|| di g d}nddlm}m} ||
|| d}t||r|r||S ddlm} d|
|| dd}||||S t|||	fr|r||S ||S d S )Nr   
LinearBaseFusedMoEcheck_marlin_supportedcheck_moe_marlin_supports_layerUnquantizedLinearMethodrK   0[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %sr   r   )r   r   F)AWQMarlinConfigAWQMarlinLinearMethodAWQMoEMethod)r   r   
zero_pointlm_head_quantizedfull_configmodules_to_not_convert)	AWQConfigAWQLinearMethod)r   r   r   MoeWNA16Configr   )quant_methodrA   r   r   lm_head)sglang.srt.layers.linearry   &sglang.srt.layers.moe.fused_moe_tritonr{   +sglang.srt.layers.quantization.marlin_utilsr}   r~   &sglang.srt.layers.quantization.unquantr   ri   rL   ru   rv   r'   loggerdebugr-   rk   scalar_typesuint4uint8"sglang.srt.layers.quantization.awqr   r   r   r   r   (sglang.srt.layers.quantization.moe_wna16r   rG   get_quant_method)r+   rp   rw   r   ry   r{   r}   r~   r   rL   r   r   r   AWQ_TYPE_MAP
use_marlinr   r   r   quant_args_marlinr   r   
quant_argsr   r@   r.   r.   r/   apply_awq_quant_layer   s~   


	
z%AutoRoundConfig.apply_awq_quant_layerc              	   C   s  ddl m} ddlm} ddlm}m} ddlm} ddl	m
}	 | ||\}
}}| |
s:t|||	fr8| S d S td||jj|
|| |dksNd	|v rttjtjd
}|
|f|v of|||
|f || d}t||rs|or|||}nd}|rddlm}m}m} ||
||ddi i d}nddlm}m} ||
|ddi d}t||r|rddlm} d|
||dd}||||S ||S t|||	fr|r||S ||S d S )Nr   rx   rz   r|   r   rK   r   r   r   ))r   T)r   T)has_zpF)GPTQMarlinConfigGPTQMarlinLinearMethodGPTQMarlinMoEMethod)r   r   is_symr   desc_actdynamicr   )
GPTQConfigGPTQLinearMethod)r   r   r   r   r   r   r   )r   rA   r   r   r   )r   ry   r   r{   r   r}   r~   r   r   ri   rL   ru   rv   r'   r   r   r-   rk   r   uint4b8	uint8b128#sglang.srt.layers.quantization.gptqr   r   r   r   r   r   r   rG   r   )r+   rp   rw   r   ry   r{   r}   r~   r   rL   r   r   r   GPTQ_TYPE_MAPr   r   r   r   r   r   r   r   r   r@   r.   r.   r/   apply_gptq_quant_layer-  s   




z&AutoRoundConfig.apply_gptq_quant_layerrp   c                 C   sD   d| j v s
d| jv r| ||S d| j v sd| jv r | ||S d S )Nr   r   )r   r   r   r   )r+   rp   rw   r.   r.   r/   r     s
   z AutoRoundConfig.get_quant_method)Tr   NNr   r   )r   )#rk   
__module____qualname____doc__r"   r$   r%   r&   r   rj   r(   r   r   listdictr   r!   r2   classmethodr6   r9   dtyper<   r>   r?   rG   rI   ru   rv   r   r   nnModuler   __classcell__r.   r.   r,   r/   r      s`    	
/
XTZr   )loggingrZ   	fractionsr   typingr   r   r   r9   	getLoggerrk   r   $sglang.srt.layers.quantization.utilsr   
ScalarTyper   *sglang.srt.layers.quantization.base_configr   r   r.   r.   r.   r/   <module>   s   

