o
    پi                     @  s   d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZmZ d dlmZmZmZ d dlmZ d dlmZ e ZerId d	lmZmZ d
gZddgZG dd deZG dd deZdS )    )annotations)AnyDictListOptionalN)	Parameter)ChannelQuantScaleParameterGroupQuantScaleParameterModelWeightParameter)LinearMethodBaseQuantizationConfigQuantizeMethodBase)per_token_quant_int8)is_cuda)qserve_w4a8_per_chn_gemmqserve_w4a8_per_group_gemm      c                   @  s~   e Zd ZdZd#ddZd$d
dZed%ddZed&ddZed$ddZ	ed'ddZ
ed(ddZd)ddZd'd d!Zd"S )*	QoQConfigzConfig class for QoQ Quantization.

    - Weight: static, per-channel/group, asymmetric
    - Activation: dynamic, per-token, symmetric

    Reference: https://arxiv.org/abs/2405.04532
    https://github.com/mit-han-lab/omniserve
    weight_bitsint
group_sizereturnNonec                 C  s`   || _ || _| j tvrtd| j  dt d| jtvr(td| j dt dd| j  | _d S )Nz#QoQ does not support weight_bits = z. Only weight_bits = z are supported.z"QoQ does not support group_size = z. Only group_sizes =    )r   r   QoQ_SUPPORTED_WEIGHT_BITS
ValueErrorQoQ_SUPPORTED_GROUP_SIZESpack_factor)selfr   r    r!   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/qoq.py__init__(   s   



zQoQConfig.__init__strc                 C  s   d | j| jS )Nz(QoQConfig(weight_bits={}, group_size={}))formatr   r   r    r!   r!   r"   __repr__=   s   zQoQConfig.__repr__List[torch.dtype]c                 C  s   t jgS N)torchfloat16clsr!   r!   r"   get_supported_act_dtypesB   s   z"QoQConfig.get_supported_act_dtypesc                 C     dS )NP   r!   r,   r!   r!   r"   get_min_capabilityF      zQoQConfig.get_min_capabilityc                 C  r/   )Nqoqr!   r,   r!   r!   r"   get_nameJ   r2   zQoQConfig.get_name	List[str]c                 C  s   ddgS )z7List of filenames to search for in the model directory.zquant_config.jsonzquantize_config.jsonr!   r,   r!   r!   r"   get_config_filenamesN   s   zQoQConfig.get_config_filenamesconfigDict[str, Any]c                 C  s&   |  |dg}|  |dg}| ||S )Nwbitsr   )get_from_keys)r-   r7   r   r   r!   r!   r"   from_configV   s   
zQoQConfig.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  s"   ddl m} t||rt| S d S )Nr   )
LinearBase)sglang.srt.layers.linearr@   
isinstanceQoQLinearMethod)r    r<   r>   r@   r!   r!   r"   get_quant_method\   s   
zQoQConfig.get_quant_methodc                 C  s   g S r)   r!   r&   r!   r!   r"   get_scaled_act_namesg   s   zQoQConfig.get_scaled_act_namesN)r   r   r   r   r   r   )r   r$   )r   r(   )r   r   )r   r5   )r7   r8   r   r   )r<   r=   r>   r$   r   r?   )__name__
__module____qualname____doc__r#   r'   classmethodr.   r1   r4   r6   r;   rD   rE   r!   r!   r!   r"   r      s     
	

r   c                   @  s<   e Zd ZdZdddZdddZdddZ	d d!ddZdS )"rC   zYLinear method for QoQ.

    Args:
        quant_config: The QoQ quantization config.
    quant_configr   c                 C  s
   || _ d S r)   )rK   )r    rK   r!   r!   r"   r#   r   s   
zQoQLinearMethod.__init__r<   r=   input_size_per_partitionr   output_partition_sizes	List[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  s  | d}t|}	|	d dkrtd|	 d|| jj dkr,td| d| jj d| jjd	krG|| jj dkrGtd| d
| jj dttj|	|| jj tj	ddd|d}
|
d|
 ttj|	tjdd|d}|
d| | jjd	krttj|	tjdd|d}|
d| d S ttj|| jj |	ftj	ddd|d}|
d| ttj|| jj |	ftj	ddd|d}|
d| d S )Nweight_loader    r   z#Weight output_size_per_partition = z is not divisible by 32.z"Weight input_size_per_partition = z# is not divisible by pack_factor = .r   z" is not divisible by group_size = )dtype   )data	input_dim
output_dimrS   qweight)rX   rZ   rS   	s1_scales	s1_szeros	s2_scaless2_zeros)getsumr   rK   r   r   r
   r*   emptyint8register_parameterr   r+   r	   )r    r<   rL   rM   rO   rP   rQ   extra_weight_attrsrS   output_size_per_partitionr[   r\   r]   r^   r_   r!   r!   r"   create_weightsu   s   




zQoQLinearMethod.create_weightsr   r   c                 C  sn   t |jjdd|_t |jjdd|_| jjdkr#t |jjdd|_d S t |jjdd|_t |jjdd|_d S )NF)requires_gradr   )	r   r[   rX   r\   rK   r   r]   r^   r_   )r    r<   r!   r!   r"   process_weights_after_loading   s   z-QoQLinearMethod.process_weights_after_loadingNxtorch.TensorbiasOptional[torch.Tensor]c                 C  s   |j tjks
J d| jjdkr(t||j dd\}}}t||j|j||j	|}nt||j d\}}t
||j|j|j|j|}|d urF|| }|S )Nz#QoQ only supports float16 input nowr   T)scale_dtypecal_sum)rn   )rV   r*   r+   rK   r   r   r   r[   r\   r]   r   r_   r^   )r    r<   rj   rl   x_qx_scalex_sumoutr!   r!   r"   apply   s(   zQoQLinearMethod.apply)rK   r   )r<   r=   rL   r   rM   rN   rO   r   rP   r   rQ   rR   )r<   r=   r   r   r)   )r<   r=   rj   rk   rl   rm   )rF   rG   rH   rI   r#   rg   ri   rt   r!   r!   r!   r"   rC   k   s    


]rC   )
__future__r   typingr   r   r   r   r*   torch.nn.parameterr   sglang.srt.layers.parameterr   r	   r
   *sglang.srt.layers.quantization.base_configr   r   r   *sglang.srt.layers.quantization.int8_kernelr   sglang.srt.utilsr   _is_cuda
sgl_kernelr   r   r   r   r   rC   r!   r!   r!   r"   <module>   s    M