o
    پi7                     @  s  d dl mZ d dlZd dlmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlmZmZmZ d d	lmZ d d
lmZmZmZmZ d dlmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z% e% Z&e Z'e(e)Z*G dd deZ+G dd deZ,dS )    )annotationsN)AnyListOptional)Module)	Parameter)
LinearBase)ChannelQuantScaleParameterModelWeightParameter)LinearMethodBaseQuantizationConfigQuantizeMethodBase)is_fp8_fnuz)apply_fp8_linearcan_auto_enable_marlin_fp8cutlass_fp8_supportednormalize_e4m3fn_to_e4m3fnuz)apply_fp8_marlin_linearprepare_fp8_layer_for_marlin)UnquantizedLinearMethod)is_layer_skipped)get_bool_env_varis_cudac                      s|   e Zd ZdZd! fddZed"d
dZed#ddZed$ddZed%ddZ	ed&ddZ
d'ddZd(dd Z  ZS ))FBGEMMFp8ConfigzConfig class for FBGEMM Fp8.ignore_list	list[str]input_scale_ubfloatc                   sH   t    |r	|ng | _|| _d| _tr"td}t }|p|| _d S d S )NFSGLANG_FORCE_FP8_MARLIN)super__init__r   r   
use_marlin_is_cudar   r   )selfr   r   force_marlinauto_enable	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/fpgemm_fp8.pyr    *   s   
zFBGEMMFp8Config.__init__returnstrc                 C     dS )N
fbgemm_fp8r(   clsr(   r(   r)   get_name8      zFBGEMMFp8Config.get_namelist[torch.dtype]c                 C  s   t jt jgS N)torchbfloat16float16r.   r(   r(   r)   get_supported_act_dtypes<   s   z(FBGEMMFp8Config.get_supported_act_dtypesintc                 C  r,   )NP   r(   r.   r(   r(   r)   get_min_capability@   r1   z"FBGEMMFp8Config.get_min_capabilityc                 C     g S r3   r(   r.   r(   r(   r)   get_config_filenamesD   r1   z$FBGEMMFp8Config.get_config_filenamesconfigdict[str, Any]c                 C  s(   |  |dg}|  |dg}| ||dS )Nmodules_to_not_convertactivation_scale_ub)r   r   )get_from_keys)r/   r=   r   r   r(   r(   r)   from_configH   s   zFBGEMMFp8Config.from_configlayertorch.nn.ModuleprefixOptional[QuantizeMethodBase]c                 C  s.   t |trt|| j| jdrt S t| S d S )N)rE   ignored_layersfused_mapping)
isinstancer   r   r   packed_modules_mappingr   FBGEMMFp8LinearMethod)r#   rC   rE   r(   r(   r)   get_quant_methodN   s   
z FBGEMMFp8Config.get_quant_method	List[str]c                 C  r;   r3   r(   )r#   r(   r(   r)   get_scaled_act_names[   s   z$FBGEMMFp8Config.get_scaled_act_names)r   r   r   r   )r*   r+   )r*   r2   )r*   r8   )r*   r   )r=   r>   r*   r   )rC   rD   rE   r+   r*   rF   )r*   rM   )__name__
__module____qualname____doc__r    classmethodr0   r7   r:   r<   rB   rL   rN   __classcell__r(   r(   r&   r)   r   '   s    
r   c                   @  s8   e Zd ZdddZdddZdddZ	d d!ddZdS )"rK   quant_configr   c                 C  s   || _ t | _t | _d S r3   )rU   r4   get_default_dtype	out_dtyper   )r#   rU   r(   r(   r)   r    a   s   
zFBGEMMFp8LinearMethod.__init__rC   rD   input_size_per_partitionr8   output_partition_sizes	list[int]
input_sizeoutput_sizeparams_dtypetorch.dtypec                 K  s   | d}~~t|}	||_||_|	|_||_ttj|	|tj	ddd|d}
|
d|
 ttjt|dftjdd|d}ttjj|d d < |
d| tjjtj| jjtjdd	d
}||_d S )Nweight_loader)dtype   r   )data	input_dim
output_dimr_   weight)rb   rd   r_   weight_scaleFrequires_grad)getsumlogical_widthsrX   output_size_per_partition
orig_dtyper
   r4   emptyfloat8_e4m3fnregister_parameterr	   float32finfominnnr   tensorrU   r   )r#   rC   rX   rY   r[   r\   r]   extra_weight_attrsr_   rl   re   rf   r   r(   r(   r)   create_weightsh   s<   


z$FBGEMMFp8LinearMethod.create_weightsr   r*   Nonec                 C  s   t |jjdd|_t |jjdd|_|j}tr4t||jd d\}}}|d ur-t |dd|_t |dd|_t | dd|_| jj	rIt
| |`d S d S )NFrg   )re   rf   input_scale)r   rf   rb   re   _is_fp8_fnuzr   ry   trU   r!   r   r   )r#   rC   re   rf   ry   r(   r(   r)   process_weights_after_loading   s   z3FBGEMMFp8LinearMethod.process_weights_after_loadingNxtorch.TensorbiasOptional[torch.Tensor]c              
   C  sH   | j jrt||j|j|j|j|j|dS t||j|jd |j	|| j
ddS )N)inputre   rf   	workspacesize_nsize_kr   F)r   re   rf   ry   r   r   r   use_per_token_if_dynamic)rU   r!   r   re   rf   r   rl   rX   r   r   r   )r#   rC   r}   r   r(   r(   r)   apply   s(   
zFBGEMMFp8LinearMethod.apply)rU   r   )rC   rD   rX   r8   rY   rZ   r[   r8   r\   r8   r]   r^   )rC   r   r*   rx   r3   )rC   rD   r}   r~   r   r   r*   r~   )rO   rP   rQ   r    rw   r|   r   r(   r(   r(   r)   rK   _   s    


2rK   )-
__future__r   loggingtypingr   r   r   r4   torch.nnr   torch.nn.parameterr   sglang.srt.layers.linearr   sglang.srt.layers.parameterr	   r
   *sglang.srt.layers.quantization.base_configr   r   r   )sglang.srt.layers.quantization.fp8_kernelr   (sglang.srt.layers.quantization.fp8_utilsr   r   r   r   /sglang.srt.layers.quantization.marlin_utils_fp8r   r   &sglang.srt.layers.quantization.unquantr   $sglang.srt.layers.quantization.utilsr   sglang.srt.utilsr   r   r"   rz   	getLoggerrO   loggerr   rK   r(   r(   r(   r)   <module>   s(   
8