o
    پi6                     @  s"  d dl mZ d dlZd dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZmZmZmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# ernd dl$m%Z%m&Z& ddgZ'e(e)Z*G dd deZ+G dd deZ,G dd deZ-dS )    )annotationsN)TYPE_CHECKINGAnyDictListOptional)Module)$get_tensor_model_parallel_world_size)	MoeRunnerMoeRunnerBackendMoeRunnerConfig)TritonMoeQuantInfo)BlockQuantScaleParameterModelWeightParameter)FusedMoEMethodBaseLinearMethodBaseQuantizationConfigQuantizeMethodBase)apply_w8a8_block_int8_linear)UnquantizedLinearMethod)is_layer_skipped)set_weight_attrs)CombineInputStandardDispatchOutputstaticdynamicc                   @  s~   e Zd ZdZ				d(d)ddZed*ddZed+ddZed,ddZed-ddZ	ed.ddZ
d/d$d%Zd-d&d'ZdS )0BlockInt8ConfigzConfig class for INT8.Fr   Nis_checkpoint_int8_serializedboolactivation_schemestrignored_layersOptional[List[str]]weight_block_size	List[int]returnNonec                 C  s   || _ |r
td |tvrtd| || _|pg | _|d urC|s'tdt|dkr7tdt| d|dkrCtd| d	|| _d S )
Nz\Detected int8 checkpoint. Please note that the format is experimental and subject to change.zUnsupported activation scheme zMThe block-wise quantization only supports int8-serialized checkpoint for now.   zFThe quantization block size of weight must have 2 dimensions, but got z dimensions.r   zUThe block-wise quantization only supports dynamic activation scheme for now, but got z activation scheme.)	r   loggerwarningACTIVATION_SCHEMES
ValueErrorr   r!   lenr#   )selfr   r   r!   r#    r.   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/blockwise_int8.py__init__(   s.   


zBlockInt8Config.__init__c                 C     dS )Nblockwise_int8r.   clsr.   r.   r/   get_nameH      zBlockInt8Config.get_nameList[torch.dtype]c                 C  s   t jt jgS N)torchbfloat16halfr3   r.   r.   r/   get_supported_act_dtypesL   s   z(BlockInt8Config.get_supported_act_dtypesintc                 C  r1   )NP   r.   r3   r.   r.   r/   get_min_capabilityP   r6   z"BlockInt8Config.get_min_capability	List[str]c                 C     g S r8   r.   r3   r.   r.   r/   get_config_filenamesT   r6   z$BlockInt8Config.get_config_filenamesconfigDict[str, Any]c                 C  sT   |  |dg}d|v }|  |dg}| |dgd }| |dgd }| ||||dS )Nquant_methodint8r   r!   r#   )r   r   r!   r#   )get_from_keysget_from_keys_or)r4   rC   rE   r   r   r!   r#   r.   r.   r/   from_configX   s   zBlockInt8Config.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  sR   ddl m} ddlm} t||rt|| jrt S t| S t||r't	| S d S )Nr   )
LinearBase)FusedMoE)
sglang.srt.layers.linearrN   &sglang.srt.layers.moe.fused_moe_tritonrO   
isinstancer   r!   r   BlockInt8LinearMethodBlockInt8MoEMethod)r-   rJ   rL   rN   rO   r.   r.   r/   get_quant_methodf   s   

z BlockInt8Config.get_quant_methodc                 C  rA   r8   r.   )r-   r.   r.   r/   get_scaled_act_namest   s   z$BlockInt8Config.get_scaled_act_names)Fr   NN)
r   r   r   r    r!   r"   r#   r$   r%   r&   )r%   r    )r%   r7   )r%   r=   )r%   r@   )rC   rD   r%   r   )rJ   rK   rL   r    r%   rM   )__name__
__module____qualname____doc__r0   classmethodr5   r<   r?   rB   rI   rU   rV   r.   r.   r.   r/   r   %   s&     
r   c                   @  s<   e Zd ZdZdddZdddZd ddZ	d!d"ddZdS )#rS   a  Linear method for INT8.
    Supports loading INT8 checkpoints with static weight scale and
    dynamic activation scale.

    Limitations:
    Only support block-wise int8 quantization and int8 checkpoint

    Args:
        quant_config: The quantization config.
    quant_configr   c                 C  &   || _ | j jd usJ | j jsJ d S r8   r\   r#   r   r-   r\   r.   r.   r/   r0         zBlockInt8LinearMethod.__init__rJ   rK   input_size_per_partitionr=   output_partition_sizesr$   
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  s~  t |}|d}	t }
| jjd | jjd }}|
dkr4|| |
kr4|| dkr4td| d| d|
dkr>|| |
ksDt|dkrZ|D ]}|| dkrYtd| d| dqF||_||_||_	||_
| jjrmtjn|}ttj|||d	dd|	d
}|d| ttj|| d | || d | tjd	dd|	d
}ttjj|d d < |d| | jjdksJ |dd  d S )Nweight_loaderr      z"Weight input_size_per_partition = 3 is not divisible by weight quantization block_k = .zWeight output_partition_size = 3 is not divisible by weight quantization block_n = dtype)data	input_dim
output_dimrg   weightweight_scale_invr   input_scale)sumgetr	   r\   r#   r+   r,   logical_widthsra   output_size_per_partition
orig_dtyper   r9   rF   r   emptyregister_parameterr   float32finfominr   )r-   rJ   ra   rb   rc   rd   re   extra_weight_attrsrw   rg   tp_sizeblock_nblock_koutput_partition_sizeweight_dtyperq   scaler.   r.   r/   create_weights   sz   




z$BlockInt8LinearMethod.create_weightsr   r%   r&   c                 C  s0   t jj|jjdd|_t jj|jjdd|_d S )NFrequires_grad)r9   nn	Parameterrq   rn   rr   r-   rJ   r.   r.   r/   process_weights_after_loading   s   z3BlockInt8LinearMethod.process_weights_after_loadingNxtorch.TensorbiasOptional[torch.Tensor]c                 C  s   t ||j| jj|jd |dS )N)inputrq   
block_sizeweight_scalers   r   )r   rq   r\   r#   rr   )r-   rJ   r   r   r.   r.   r/   apply   s   zBlockInt8LinearMethod.applyr\   r   )rJ   rK   ra   r=   rb   r$   rc   r=   rd   r=   re   rf   rJ   r   r%   r&   r8   )rJ   rK   r   r   r   r   r%   r   )rW   rX   rY   rZ   r0   r   r   r   r.   r.   r.   r/   rS   x   s    


QrS   c                   @  sB   e Zd ZdZdddZd ddZd!ddZd"ddZd#ddZdS )$rT   a  MoE method for INT8.
    Supports loading INT8 checkpoints with static weight scale and
    dynamic activation scale.

    Limitations:
    Only support block-wise int8 quantization and int8 checkpoint

    Args:
        quant_config: The quantization config.
    r\   r   c                 C  r]   r8   r^   r_   r.   r.   r/   r0      r`   zBlockInt8MoEMethod.__init__rJ   r   num_expertsr=   hidden_sizeintermediate_size_per_partitionre   rf   c                 K  s  ddl m} | jjrtj}t }| jjd | jjd }	}
||	 dkr.td| d|	 d|dkrC||
 dkrCtd| d|
 dtj	j
tj|d	| ||d
dd}|d| t|| tj	j
tj||||d
dd}|d| t|| tj	j
tj|d	||	 d |	  ||
 d |
 tjd
dd}tj	j
tj|||	 d |	 ||
 d |
 tjd
dd}|d| |d| |d|jji t|| t|| | jjdksJ d |_d |_d S )Nr   )FusedMoeWeightScaleSupportedrh   z,The output_size of gate's and up's weight = rk   rj   z"The input_size of down's weight = ri   r'   rl   Fr   
w13_weight	w2_weightw13_weight_scale_invw2_weight_scale_invrE   r   )rQ   r   r\   r   r9   rF   r	   r#   r+   r   r   ry   rz   r   onesr{   updateBLOCKvaluer   w13_input_scalew2_input_scale)r-   rJ   r   r   r   re   r~   r   r   r   r   r   r   w13_weight_scalew2_weight_scaler.   r.   r/   r     s   	

	
	
		



z!BlockInt8MoEMethod.create_weightsr%   r&   c                 C  s   d S r8   r.   r   r.   r.   r/   r   a  r6   z0BlockInt8MoEMethod.process_weights_after_loadingrK   moe_runner_configr   c                 C  s   || _ ttj|| _d S r8   )r   r
   r   TRITONrunner)r-   rJ   r   r.   r.   r/   create_moe_runnere  s   z$BlockInt8MoEMethod.create_moe_runnerdispatch_outputr   r   c              
   C  s6   t |j|jd|j|j|j|j| jjd}| j	
||S )NT)r   r   use_int8_w8a8	w13_scalew2_scale	a13_scalea2_scaleblock_shape)r   r   r   r   r   r   r   r\   r#   r   run)r-   rJ   r   
quant_infor.   r.   r/   r   k  s   zBlockInt8MoEMethod.applyNr   )
rJ   r   r   r=   r   r=   r   r=   re   rf   r   )rJ   rK   r   r   )rJ   rK   r   r   r%   r   )	rW   rX   rY   rZ   r0   r   r   r   r   r.   r.   r.   r/   rT      s    


^
rT   ).
__future__r   loggingtypingr   r   r   r   r   r9   torch.nnr   sglang.srt.distributedr	   sglang.srt.layers.moer
   r   r   'sglang.srt.layers.moe.moe_runner.tritonr   sglang.srt.layers.parameterr   r   *sglang.srt.layers.quantization.base_configr   r   r   r   )sglang.srt.layers.quantization.int8_utilsr   &sglang.srt.layers.quantization.unquantr   $sglang.srt.layers.quantization.utilsr   sglang.srt.utilsr   &sglang.srt.layers.moe.token_dispatcherr   r   r*   	getLoggerrW   r(   r   rS   rT   r.   r.   r.   r/   <module>   s*   
Sz