o
    
۾iR                  
   @   sd  d dl mZmZ d dlZd dlmZ d dlmZmZ d dl	m
Z
mZ d dlmZmZmZmZ d dlmZmZ d dlmZ d d	lmZ G d
d deZdedee fddZdd ZG dd deZdejdejdejdejddf
ddZdejdejdejdejddf
ddZ zededge ej!d ej"j#j$Z$W n e%y Z& ze&dZ&[&ww G dd  d eZ'dS )!    )AnyUnionN)version)FusedMoEConfigFusedMoEQuantConfig)FusedMoEFusedMoEMethodBase)
LinearBaseLinearMethodBaseUnquantizedLinearMethodset_weight_attrs)QuantizationConfigQuantizationMethods)current_platform)direct_register_custom_opc                       s  e Zd ZdZ										d(d	ed
edededededededee dB deddf fddZdefddZ	e
defddZe
deej fddZe
defddZedee fddZe
d eeef dd fd!d"Zd#ejjd$eded% dB fd&d'Z  ZS ))BitsAndBytesConfigzaConfig class for BitsAndBytes Quantization.

    Reference: https://arxiv.org/abs/2305.14314
    FTfloat32uint8fp4N      @load_in_8bitload_in_4bitbnb_4bit_compute_dtypebnb_4bit_quant_storagebnb_4bit_quant_typebnb_4bit_use_double_quant llm_int8_enable_fp32_cpu_offloadllm_int8_has_fp16_weightllm_int8_skip_modulesllm_int8_thresholdreturnc                    sh   t    || _|| _|| _|| _|| _|| _|| _|| _	|	p g | _
|
| _| jdvr2td| j d S )N)r   z$Unsupported bnb_4bit_quant_storage: )super__init__r   r   r   r   r   r   r   r   r   r   
ValueError)selfr   r   r   r   r   r   r   r   r   r   	__class__ h/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/bitsandbytes.pyr"   %   s    



zBitsAndBytesConfig.__init__c                 C   s6   d| j  d| j d| j d| j d| j d| j dS )Nz BitsAndBytesConfig(load_in_8bit=z, load_in_4bit=z, bnb_4bit_compute_dtype=z, bnb_4bit_quant_storage=z, bnb_4bit_quant_type=z, llm_int8_skip_modules=))r   r   r   r   r   r   r$   r'   r'   r(   __repr__C   s   
zBitsAndBytesConfig.__repr__c                 C      dS )Nbitsandbytesr'   r*   r'   r'   r(   get_nameM      zBitsAndBytesConfig.get_namec                 C   s   t jt jt jgS N)torchr   float16bfloat16r*   r'   r'   r(   get_supported_act_dtypesQ   s   z+BitsAndBytesConfig.get_supported_act_dtypesc                 C   r,   )NF   r'   clsr'   r'   r(   get_min_capabilityU   r/   z%BitsAndBytesConfig.get_min_capabilityc                   C   s   g S r0   r'   r'   r'   r'   r(   get_config_filenamesY   r/   z'BitsAndBytesConfig.get_config_filenamesconfigc                    s   d fdd	}||dgdd}||dgdd}||dgd	d}||d
gdd}||dgdd}||dgdd}||dgdd}	||dgdd}
||dgg d}||dgdd} |||||||	|
||d
S )Nc                    s8   z  | |}|d ur|W S |W S  ty   | Y S w r0   )get_from_keysr#   )r:   keysdefault_valuevaluer6   r'   r(   get_safe_value_   s   z6BitsAndBytesConfig.from_config.<locals>.get_safe_valuer   F)r=   r   Tr   r   r   r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   r0   r'   )r7   r:   r?   r   r   r   r   r   r   r   r   r   r   r'   r6   r(   from_config]   sN   zBitsAndBytesConfig.from_configlayerprefix)r
   BitsAndBytesMoEMethodc                 C   s>   t |trt|| jrt S t| S t |trt| |jS d S r0   )	
isinstancer	   is_layer_skipped_bnbr   r   BitsAndBytesLinearMethodr   rC   
moe_config)r$   rA   rB   r'   r'   r(   get_quant_method   s   

z#BitsAndBytesConfig.get_quant_method)
FTr   r   r   FFFNr   )__name__
__module____qualname____doc__boolstrlistfloatr"   r+   classmethodr   r.   r1   dtyper4   intr8   staticmethodr9   dictr   r@   nnModuler   rH   __classcell__r'   r'   r%   r(   r      sj    	


0
r   rB   r   c                    s^   |  d t fdd|D }t fddtt D }t|}t||@ dk}|p.|S )N.c                 3   s    | ]}| v V  qd S r0   r'   ).0module_name
componentsr'   r(   	<genexpr>   s    
z'is_layer_skipped_bnb.<locals>.<genexpr>c                 3   s&    | ]}d   d|d  V  qdS )rY   N   )join)rZ   ir\   r'   r(   r^      s   $ r   )splitanysetrangelen)rB   r   substr_checkset_componentsset_llm_int8_skip_modulesprefix_checkr'   r\   r(   rE      s   
rE   c                 C   s:   | j rt| jttjj S t| jttjj S r0   )is_floating_pointr1   finfobitsiinfor   rR   r'   r'   r(   calculate_quant_ratio   s   rp   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZ	ddejjdejdejdB dejfddZ	ddejjdejdejdB dejfddZ	ddejjdejdejdB dejfddZdS )rF   zjLinear method for BitsAndBytes.

    Args:
       quant_config: The BitsAndBytes quantization config.
    quant_configc              
   C   sX   zdd l }t|jtdk rtdW n ty& } ztd|d }~ww || _d S Nr   z0.46.1zCbitsandbytes version is wrong. Please install bitsandbytes>=0.46.1.ziPlease install bitsandbytes>=0.46.1 via `pip install bitsandbytes>=0.46.1` to use bitsandbytes quantizer.)r-   r   parse__version__ImportErrorrq   )r$   rq   r-   errr'   r'   r(   r"      s    
z!BitsAndBytesLinearMethod.__init__rA   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                    s^   ddl m   fdd}fdd}	jjr| }
n|	 }
|d|
 t|
| d S )Nr   )
Int8Paramsc                     s>    t jtt jdjjdd} t| dddddd | S )Nro   F)datahas_fp16_weightsrequires_gradr   r_   T)	input_dim
output_dimpack_factoruse_bitsandbytes_8bit
generation)r1   emptysumint8rq   r   r   )qweight)r|   rw   rx   r$   r'   r(   create_qweight_for_8bit   s&   	
zHBitsAndBytesLinearMethod.create_weights.<locals>.create_qweight_for_8bitc                     sb   t }  t }||  dkrtdtjjtj||  dtjddd}t|dd| dd |S )	Nr   z>The input size is not aligned with the quantized weight shape.r_   ro   Fr   T)r   r   r   use_bitsandbytes_4bit)	rp   r   r#   r1   rV   	Parameterr   r   r   )quant_ratio
total_sizer   )rw   rx   r{   r'   r(   create_qweight_for_4bit   s&   	zHBitsAndBytesLinearMethod.create_weights.<locals>.create_qweight_for_4bitweight)bitsandbytes.nnr|   rq   r   register_parameterr   )r$   rA   rw   rx   ry   rz   r{   extra_weight_attrsr   r   r   r'   )r|   rw   rx   r{   r$   r(   create_weights   s   
z'BitsAndBytesLinearMethod.create_weightsNxbiasr    c                 C   s$   | j jr| |||S | |||S r0   )rq   r   _apply_8bit_weight_apply_4bit_weight)r$   rA   r   r   r'   r'   r(   apply  s   zBitsAndBytesLinearMethod.applyc                 C   sN  ddl m}m} |j}|j}d}|jdkr |d|d}d}|t	j
}	|j}
|
j}|
j}|
j}|
j}|jd }tdd | D }t	j||t	j|jd	}d}tt|D ]}|| jd }|dksi|d
kr| ||< |
|| ||d
   || _|| |j|| _| jj|| _| jj|| _d|| _|| jdkr|| jsd|| _|	 d}|||
|| ||d
   || d|d d ||| f< ||7 }|dkr| jjs|| jd ur|| j!d ur|| `|| j!|
|| ||d
  < qX||}|r|j"g |d d |dR  }|d ur||7 }|
 jd
7  _|S )Nr   )MatmulLtStatematmulF   Tc                 S      g | ]	}|d  j d qS r_   r   shaperZ   quant_stater'   r'   r(   
<listcomp>2      z?BitsAndBytesLinearMethod._apply_8bit_weight.<locals>.<listcomp>rR   devicer_   g        )state)#r-   r   r   rR   r   ndimreshapesizetor1   r3   r   bnb_shard_offsetsbnb_quant_statematmul_stater   r   itemsr   r2   r   re   rf   CBSCBrq   r   	thresholdr   r~   is_traininguse_pool	unsqueezeCxBview)r$   rA   r   r   r   r   original_typeoriginal_shapereshape_after_matmulbf_xr   offsetsquant_statesmatmul_statesr   	out_dim_0	out_dim_1outcurrent_indexra   rz   new_xr'   r'   r(   r     sp   






"
z+BitsAndBytesLinearMethod._apply_8bit_weightc                 C   s   |j }|j}d}|jdkr|d|d}d}|tj}|j}|j	}	|j
}
|jd }tdd |	 D }tj||tj|jd}t|||
| ||}|ra|jg |d d |dR  }|d uri||7 }|S )	NFr   r   Tr   c                 S   r   r   r   r   r'   r'   r(   r   }  r   z?BitsAndBytesLinearMethod._apply_4bit_weight.<locals>.<listcomp>r   )rR   r   r   r   r   r   r1   r3   r   r   r   r   r   r   r   apply_bnb_4bitr   )r$   rA   r   r   r   r   r   r   r   r   r   r   r   r   r'   r'   r(   r   i  s,   


"z+BitsAndBytesLinearMethod._apply_4bit_weightr0   )rI   rJ   rK   rL   r   r"   r1   rV   rW   rS   rO   rR   r   Tensorr   r   r   r'   r'   r'   r(   rF      s\    
G

TrF   r   r   r   r   r    c           	      C   s~   ddl m} |j}d}tt|D ]+}|| jd }|| ||| ||d    || |d d ||| f< ||7 }qd S )Nr   )matmul_4bitr_   )r-   r   r   re   rf   r   t)	r   r   r   r   r   r   r   ra   rz   r'   r'   r(   _apply_bnb_4bit  s   "
r   c                 C      d S r0   r'   )r   r   r   r   r'   r'   r(   _apply_bnb_4bit_fake  s   r   r   )op_nameop_funcmutates_args	fake_impldispatch_keyc                       s*  e Zd ZdZdedef fddZdejj	de
de
d	e
d
ejf
ddZdejj	dedB fddZdedejdejdejdejeejejf B f
ddZdejj	de
de
d	e
d
ejf
ddZdejj	de
de
d	e
d
ejf
ddZdejj	deejejf fddZdejj	deejejf fddZ  ZS )rC   zgMoE method for BitsAndBytes.

    Args:
       quant_config: The BitsAndBytes quantization config.
    rq   moec              
      sd   t  | zdd l}t|jtdk rtdW n ty, } ztd|d }~ww || _d S rr   )r!   r"   r-   r   rs   rt   ru   rq   )r$   rq   r   r-   rv   r%   r'   r(   r"     s"   
zBitsAndBytesMoEMethod.__init__rA   num_expertshidden_sizeintermediate_size_per_partitionr{   c                 K   s2   | j jr| j}n| j}||||||fi | d S r0   )rq   r   _create_weights_8bit_create_weights_4bit)r$   rA   r   r   r   r{   r   call_funr'   r'   r(   r     s   	
z$BitsAndBytesMoEMethod.create_weightsr    Nc                 C   r   r0   r'   r$   rA   r'   r'   r(   get_fused_moe_quant_config     z0BitsAndBytesMoEMethod.get_fused_moe_quant_configr   topk_weightstopk_idsc                 C   s`   ddl m} | jjr| |\}}n| |\}}||||||| jj |j|j	|j
|j| jdS )Nr   )fused_experts)hidden_statesw1w2r   r   inplace
activationapply_router_weight_on_inputglobal_num_experts
expert_maprq   )$vllm.model_executor.layers.fused_moer   rq   r   _apply_8bit_dequant_apply_4bit_dequntr   disable_inplacer   r   r   r   moe_quant_config)r$   rA   r   r   r   r   w13r   r'   r'   r(   r     s"   zBitsAndBytesMoEMethod.applyc              	   K   s   t |}|d | | }tjjtj||dtjddd}	|d|	 t|	| t|	||d| ||d |f|dd || | }
tjjtj||
dtjddd}t|||||||f|dd |d	| t|| d S )
Nr   r_   ro   Fr   
w13_weightT)r   r   r   experts_shaper   r   	w2_weight)rp   r1   rV   r   r   r   r   r   )r$   rA   r   r   r   r{   r   r   w13_total_sizew13_qweightw2_total_size
w2_qweightr'   r'   r(   r     sj   	
	
	z*BitsAndBytesMoEMethod._create_weights_4bitc                 K      t r0   NotImplementedError)r$   rA   r   r   r   r{   r   r'   r'   r(   r   S  s   	z*BitsAndBytesMoEMethod._create_weights_8bitc                 C   s`   ddl m} ||jdd|jj}||jdd|jj}||jj}||jj}||fS )Nr   )dequantize_4bitr   r_   )bitsandbytes.functionalr   r   r   r   r   r   )r$   rA   r   r   r   r'   r'   r(   r   ^  s   z(BitsAndBytesMoEMethod._apply_4bit_dequntc                 C   r   r0   r   r   r'   r'   r(   r   o  r   z)BitsAndBytesMoEMethod._apply_8bit_dequant)rI   rJ   rK   rL   r   r   r"   r1   rV   rW   rS   rR   r   r   r   r   r   tupler   r   r   r   r   rX   r'   r'   r%   r(   rC     s    



E

rC   )(typingr   r   r1   	packagingr   +vllm.model_executor.layers.fused_moe.configr   r   *vllm.model_executor.layers.fused_moe.layerr   r   !vllm.model_executor.layers.linearr	   r
   r   r   'vllm.model_executor.layers.quantizationr   r   vllm.platformsr   vllm.utils.torch_utilsr   r   rN   rO   rE   rp   rF   r   r   r   r   opsvllmr   AttributeErrorerrorrC   r'   r'   r'   r(   <module>   sd   { [

	