o
    پiQ                  
   @  s   d dl mZ d dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZmZ erDd d	lmZmZ G d
d deZd%ddZdd ZG dd deZd&ddZd&dd Zzed!edged" ejjj Z W n e!y Z" ze"dZ"["ww G d#d$ d$eZ#dS )'    )annotations)TYPE_CHECKINGAnyOptionalN)version)
LinearBase)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)UnquantizedLinearMethod)direct_register_custom_opset_weight_attrs)CombineInputStandardDispatchOutputc                      s   e Zd ZdZ										d5d6 fddZd7ddZd7ddZd8d d!Zd9d#d$Ze	d:d&d'Z
ed8d(d)Ze	d;d-d.Zd<d3d4Z  ZS )=BitsAndBytesConfigzaConfig class for BitsAndBytes Quantization.

    Reference: https://arxiv.org/abs/2305.14314
    FTfloat32uint8fp4N      @load_in_8bitboolload_in_4bitbnb_4bit_compute_dtypestrbnb_4bit_quant_storagebnb_4bit_quant_typebnb_4bit_use_double_quant llm_int8_enable_fp32_cpu_offloadllm_int8_has_fp16_weightllm_int8_skip_moduleslist[str] | Nonellm_int8_thresholdfloatreturnNonec                   sh   t    || _|| _|| _|| _|| _|| _|| _|| _	|	p g | _
|
| _| jdvr2td| j d S )N)r   z$Unsupported bnb_4bit_quant_storage: )super__init__r   r   r   r   r   r   r   r   r    r"   
ValueError)selfr   r   r   r   r   r   r   r   r    r"   	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/bitsandbytes.pyr'   !   s    



zBitsAndBytesConfig.__init__c                 C  s6   d| j  d| j d| j d| j d| j d| j dS )Nz BitsAndBytesConfig(load_in_8bit=z, load_in_4bit=z, bnb_4bit_compute_dtype=z, bnb_4bit_quant_storage=z, bnb_4bit_quant_type=z, llm_int8_skip_modules=))r   r   r   r   r   r    r)   r,   r,   r-   __repr__?   s   
zBitsAndBytesConfig.__repr__c                 C     dS )Nbitsandbytesr,   r/   r,   r,   r-   get_nameI      zBitsAndBytesConfig.get_name	list[str]c                 C     g S Nr,   r/   r,   r,   r-   get_scaled_act_namesL   r4   z'BitsAndBytesConfig.get_scaled_act_nameslist[torch.dtype]c                 C  s   t jt jt jgS r7   )torchr   float16bfloat16r/   r,   r,   r-   get_supported_act_dtypesO   s   z+BitsAndBytesConfig.get_supported_act_dtypesintc                 C  r1   )NF   r,   )clsr,   r,   r-   get_min_capabilityR      z%BitsAndBytesConfig.get_min_capabilityc                   C  r6   r7   r,   r,   r,   r,   r-   get_config_filenamesV   rB   z'BitsAndBytesConfig.get_config_filenamesconfigdict[str, Any]'BitsAndBytesConfig'c                 C  s   ddd}||dgdd}||dgdd}||dgd	d}||d
gdd}||dgdd}||dgdd}||dgdd}	||dgdd}
||dgg d}||dgdd}| |||||||	|
||d
S )Nc                 S  s8   zt | |}|d ur|W S |W S  ty   | Y S w r7   )r
   get_from_keysr(   )rD   keysdefault_valuevaluer,   r,   r-   get_safe_value\   s   z6BitsAndBytesConfig.from_config.<locals>.get_safe_valuer   F)rI   r   Tr   r   r   r   r   r   r   r   r   r    r"   r   )
r   r   r   r   r   r   r   r   r    r"   r7   r,   )r@   rD   rK   r   r   r   r   r   r   r   r   r    r"   r,   r,   r-   from_configZ   sN   
zBitsAndBytesConfig.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  sF   ddl m} t|trt|| jrt S t| S t||r!t| S d S )Nr   )FusedMoE)	,sglang.srt.layers.moe.fused_moe_triton.layerrQ   
isinstancer   is_layer_skipped_bnbr    r   BitsAndBytesLinearMethodBitsAndBytesMoEMethod)r)   rM   rO   rQ   r,   r,   r-   get_quant_method   s   

z#BitsAndBytesConfig.get_quant_method)
FTr   r   r   FFFNr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   )r$   r   )r$   r5   )r$   r9   )r$   r>   )rD   rE   r$   rF   )rM   rN   rO   r   r$   rP   )__name__
__module____qualname____doc__r'   r0   r3   r8   r=   classmethodrA   staticmethodrC   rL   rW   __classcell__r,   r,   r*   r-   r      s0    




0r   rO   r   r    r5   c                   s^   |  d t fdd|D }t fddtt D }t|}t||@ dk}|p.|S )N.c                 3  s    | ]}| v V  qd S r7   r,   ).0module_name
componentsr,   r-   	<genexpr>   s    
z'is_layer_skipped_bnb.<locals>.<genexpr>c                 3  s&    | ]}d   d|d  V  qdS )r_   N   )join)r`   irb   r,   r-   rd      s   $ r   )splitanysetrangelen)rO   r    substr_checkset_componentsset_llm_int8_skip_modulesprefix_checkr,   rb   r-   rT      s   
rT   c                 C  s:   | j rt| jttjj S t| jttjj S r7   )is_floating_pointr:   finfobitsiinfor   dtyper,   r,   r-   calculate_quant_ratio   s   rw   c                   @  sN   e Zd ZdZdddZdddZ	d d!ddZ	d d!ddZ	d d!ddZdS )"rU   zjLinear method for BitsAndBytes.

    Args:
       quant_config: The BitsAndBytes quantization config.
    quant_configr   c              
   C  sX   zdd l }t|jtdk rtdW n ty& } ztd|d }~ww || _d S Nr   z0.46.1zCbitsandbytes version is wrong. Please install bitsandbytes>=0.46.1.ziPlease install bitsandbytes>=0.46.1 via `pip install bitsandbytes>=0.46.1` to use bitsandbytes quantizer.)r2   r   parse__version__ImportErrorrx   r)   rx   r2   errr,   r,   r-   r'      s    
z!BitsAndBytesLinearMethod.__init__rM   rN   input_size_per_partitionr>   output_partition_sizes	list[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                   s^   ddl m   fdd}fdd}	jjr| }
n|	 }
|d|
 t|
| d S )Nr   )
Int8Paramsc                    s>    t jtt jdjjdd} t| dddddd | S )Nru   F)datahas_fp16_weightsrequires_gradr   re   T)	input_dim
output_dimpack_factoruse_bitsandbytes_8bit
generation)r:   emptysumint8rx   r   r   )qweight)r   r   r   r)   r,   r-   create_qweight_for_8bit   s&   	
zHBitsAndBytesLinearMethod.create_weights.<locals>.create_qweight_for_8bitc                    sb   t }  t }||  dkrtdtjjtj||  dtjddd}t|dd| dd |S )	Nr   z>The input size is not aligned with the quantized weight shape.re   ru   Fr   T)r   r   r   use_bitsandbytes_4bit)	rw   r   r(   r:   nn	Parameterr   r   r   )quant_ratio
total_sizer   )r   r   r   r,   r-   create_qweight_for_4bit   s&   	zHBitsAndBytesLinearMethod.create_weights.<locals>.create_qweight_for_4bitweight)bitsandbytes.nnr   rx   r   register_parameterr   )r)   rM   r   r   r   r   r   extra_weight_attrsr   r   r   r,   )r   r   r   r   r)   r-   create_weights   s   
z'BitsAndBytesLinearMethod.create_weightsNxtorch.Tensorbiastorch.Tensor | Noner$   c                 C  s$   | j jr| |||S | |||S r7   )rx   r   _apply_8bit_weight_apply_4bit_weight)r)   rM   r   r   r,   r,   r-   apply  s   zBitsAndBytesLinearMethod.applyc                 C  sN  ddl m}m} |j}|j}d}|jdkr |d|d}d}|t	j
}	|j}
|
j}|
j}|
j}|
j}|jd }tdd | D }t	j||t	j|jd	}d}tt|D ]}|| jd }|dksi|d
kr| ||< |
|| ||d
   || _|| |j|| _| jj|| _| jj|| _d|| _|| jdkr|| jsd|| _|	 d}|||
|| ||d
   || d|d d ||| f< ||7 }|dkr| jjs|| jd ur|| j!d ur|| `|| j!|
|| ||d
  < qX||}|r|j"g |d d |dR  }|d ur||7 }|
 jd
7  _|S )Nr   )MatmulLtStatematmulF   Tc                 S     g | ]	}|d  j d qS re   r   shaper`   quant_stater,   r,   r-   
<listcomp>1      z?BitsAndBytesLinearMethod._apply_8bit_weight.<locals>.<listcomp>rv   devicere   g        )state)#r2   r   r   rv   r   ndimreshapesizetor:   r<   r   bnb_shard_offsetsbnb_quant_statematmul_stater   r   itemsr   r;   r   rk   rl   CBSCBrx   r"   	thresholdr   r   is_traininguse_pool	unsqueezeCxBview)r)   rM   r   r   r   r   original_typeoriginal_shapereshape_after_matmulbf_xr   offsetsquant_statesmatmul_statesr   	out_dim_0	out_dim_1outcurrent_indexrg   r   new_xr,   r,   r-   r     sj   






"
z+BitsAndBytesLinearMethod._apply_8bit_weightc                 C  s   |j }|j}d}|jdkr|d|d}d}|tj}|j}|j	}	|j
}
|jd }tdd |	 D }tj||tj|jd}t|||
| ||}|ra|jg |d d |dR  }|d uri||7 }|S )	NFr   r   Tr   c                 S  r   r   r   r   r,   r,   r-   r   |  r   z?BitsAndBytesLinearMethod._apply_4bit_weight.<locals>.<listcomp>r   )rv   r   r   r   r   r   r:   r<   r   r   r   r   r   r   r   apply_bnb_4bitr   )r)   rM   r   r   r   r   r   r   r   r   r   r   r   r   r,   r,   r-   r   h  s,   


"z+BitsAndBytesLinearMethod._apply_4bit_weightrx   r   )rM   rN   r   r>   r   r   r   r>   r   r>   r   r   r7   )rM   rN   r   r   r   r   r$   r   )	rX   rY   rZ   r[   r'   r   r   r   r   r,   r,   r,   r-   rU      s    

GTrU   r   r   r   r   r   r$   r%   c           	      C  s~   ddl m} |j}d}tt|D ]+}|| jd }|| ||| ||d    || |d d ||| f< ||7 }qd S )Nr   )matmul_4bitre   )r2   r   r   rk   rl   r   t)	r   r   r   r   r   r   r   rg   r   r,   r,   r-   _apply_bnb_4bit  s   "
r   c                 C  s   d S r7   r,   )r   r   r   r   r,   r,   r-   _apply_bnb_4bit_fake  s   r   r   )op_nameop_funcmutates_args	fake_implc                      sh   e Zd ZdZd! fddZd"ddZd#ddZd$ddZd"ddZd"ddZ	d%ddZ
d%dd Z  ZS )&rV   zgMoE method for BitsAndBytes.

    Args:
       quant_config: The BitsAndBytes quantization config.
    rx   r   c              
     sb   t    zdd l}t|jtdk rtdW n ty+ } ztd|d }~ww || _d S ry   )r&   r'   r2   r   rz   r{   r|   rx   r}   r*   r,   r-   r'     s"   

zBitsAndBytesMoEMethod.__init__rM   rN   num_expertsr>   hidden_sizeintermediate_size_per_partitionr   r   c                 K  s2   | j jr| j}n| j}||||||fi | d S r7   )rx   r   _create_weights_8bit_create_weights_4bit)r)   rM   r   r   r   r   r   call_funr,   r,   r-   r     s   	
z$BitsAndBytesMoEMethod.create_weightsc                 C  s
   || _ d S r7   )moe_runner_config)r)   rM   r   r,   r,   r-   create_moe_runner  s   
z'BitsAndBytesMoEMethod.create_moe_runnerdispatch_outputr   r$   r   c                 C  sl   ddl m} ddlm} |j}|j}| jjr| |\}}n| 	|\}}| j
}	||||||	d}
||
dS )Nr   )	fused_moe)StandardCombineInput)hidden_statesw1w2topk_outputr   )r   )0sglang.srt.layers.moe.fused_moe_triton.fused_moer   &sglang.srt.layers.moe.token_dispatcherr   r   r   rx   r   _apply_8bit_dequant_apply_4bit_dequantr   )r)   rM   r   r   r   r   r   w13r   r   outputr,   r,   r-   r     s    
zBitsAndBytesMoEMethod.applyc              	   K  s   t |}|d | | }tjjtj||dtjddd}	|d|	 t|	| t|	||d| ||d |f|dd || | }
tjjtj||
dtjddd}t|||||||f|dd |d	| t|| d S )
Nr   re   ru   Fr   
w13_weightT)r   r   r   experts_shaper   r   	w2_weight)rw   r:   r   r   r   r   r   r   )r)   rM   r   r   r   r   r   r   w13_total_sizew13_qweightw2_total_size
w2_qweightr,   r,   r-   r     sj   	
	
	z*BitsAndBytesMoEMethod._create_weights_4bitc                 K     t r7   NotImplementedError)r)   rM   r   r   r   r   r   r,   r,   r-   r   M  s   	z*BitsAndBytesMoEMethod._create_weights_8bit!tuple[torch.Tensor, torch.Tensor]c                 C  s`   ddl m} ||jdd|jj}||jdd|jj}||jj}||jj}||fS )Nr   )dequantize_4bitr   re   )bitsandbytes.functionalr   r   r   r   r   r   )r)   rM   r   r   r   r,   r,   r-   r   X  s   z)BitsAndBytesMoEMethod._apply_4bit_dequantc                 C  r   r7   r   )r)   rM   r,   r,   r-   r   i  s   z)BitsAndBytesMoEMethod._apply_8bit_dequantr   )
rM   rN   r   r>   r   r>   r   r>   r   r   )rM   rN   )rM   rN   r   r   r$   r   )rM   rN   r$   r   )rX   rY   rZ   r[   r'   r   r   r   r   r   r   r   r^   r,   r,   r*   r-   rV     s    




E
rV   )rO   r   r    r5   )
r   r   r   r   r   r   r   r   r$   r%   )$
__future__r   typingr   r   r   r:   	packagingr   sglang.srt.layers.linearr   *sglang.srt.layers.quantization.base_configr   r	   r
   r   &sglang.srt.layers.quantization.unquantr   sglang.srt.utilsr   r   r   r   r   r   rT   rw   rU   r   r   opssglangr   AttributeErrorerrorrV   r,   r,   r,   r-   <module>   s<   
~ 
[
	