o
    پi"#                     @   s   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZ d dlmZmZmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ e Ze e Z!G dd deZ"G dd deZ#dS )    N)AnyDictListOptional)	Parameter)
LinearBase)ModelWeightParameterPerTensorScaleParameter)LinearMethodBaseQuantizationConfigQuantizeMethodBase)apply_petit_nvfp4_linearprepare_nvfp4_layer_for_petitverify_petit_nvfp4_supported)UnquantizedLinearMethod)is_layer_skipped)is_hipc                   @   s"  e Zd ZdZ				d$dedededee ddf
d	d
Ze	defddZ
e	deej fddZe	defddZe	dee fddZe	deeef dd fddZe	dee fddZe	deeef defddZdedefddZdejjdeded fd d!Zdee fd"d#ZdS )%PetitNvFp4ConfigzConfig class for Petit FP4.FNis_checkpoint_nvfp4_serializedkv_cache_quant_algo
group_sizeexclude_modulesreturnc                 C   s*   || _ |r
td || _|| _|| _d S )Nz]Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.)r   loggerwarningr   r   r   )selfr   r   r   r    r   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/petit.py__init__%   s   
zPetitNvFp4Config.__init__c                 C      dS )Npetit_nvfp4r   clsr   r   r   get_name6   s   zPetitNvFp4Config.get_namec                 C   s   t jt jgS N)torchbfloat16halfr!   r   r   r   get_supported_act_dtypes:   s   z)PetitNvFp4Config.get_supported_act_dtypesc                 C   r   )NZ   r   r!   r   r   r   get_min_capability>   s   z#PetitNvFp4Config.get_min_capabilityc                 C   s   dgS )Nzhf_quant_config.jsonr   r!   r   r   r   get_config_filenamesC   s   z%PetitNvFp4Config.get_config_filenamesconfigc                 C   s   |  |dg}|d }|dd }t|| d|v }|d }|s"d}|dd }|r0|r0|d usBtd| d	| d
|  td| ||||S )Nquantization
quant_algor   NVFP4r   autor   zgroup_size: z,kv_cache_quant_algo: z,exclude_modules: z`NVFP4 quantization requires group size and kv_cache_quant_algo specified in hf_quant_config.json)get_from_keysgetr   r   r   
ValueError)r"   r,   quant_configquant_methodr   r   r   r   r   r   r   from_configG   s4   
zPetitNvFp4Config.from_configc                 C   s   |  |}|r|  S d S r$   )is_petit_nvfp4_compatibler#   )r"   hf_quant_cfg
user_quantcan_convertr   r   r   override_quantization_methode   s   
z-PetitNvFp4Config.override_quantization_methodr4   c                 C   s   | dd }to|dkS )Nr5    modelopt)r2   lower_is_hip)r"   r4   r5   r   r   r   r7   l   s   z*PetitNvFp4Config.is_petit_nvfp4_compatibleprefixc                 C   s4   |D ]}| dd dd}t||r dS qdS )N.z\.*z.*TF)replacere	fullmatch)r   r@   r   pattern	regex_strr   r   r   is_layer_excludedq   s   z"PetitNvFp4Config.is_layer_excludedlayerr   c                 C   s6   t |trt|| js| || jrt S t| S d S r$   )
isinstancer   r   r   rH   r   PetitNvFp4LinearMethod)r   rI   r@   r   r   r   get_quant_methodx   s   
z!PetitNvFp4Config.get_quant_methodc                 C   s   g S r$   r   )r   r   r   r   get_scaled_act_names   s   z%PetitNvFp4Config.get_scaled_act_names)FNNN)__name__
__module____qualname____doc__boolstrintr   r   classmethodr#   r%   dtyper(   r*   r+   r   r   r6   r   r;   r7   listrH   nnModulerL   rM   r   r   r   r   r   "   sP    

r   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZdejjddfddZ	ddejjdejdeej dejfddZdS )rK   a8  Linear method for NVFP4.
    Supports loading NVFP4 checkpoints with the following structure:

    |Tensor Name           | datatype      |  shape      |
    |----------------------------------------------------|
    |input_scale           | torch.float32 | scalar      |
    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
    |weight_scale          | FP8-E4M3      | [X, Y]      |
    |weight_scale_2        | torch.float32 | scalar      |

    The weights are quantized per block of 16 elements.
    Args: quant_config: The ModelOpt quantization config.
    r4   c                 C   s
   || _ d S r$   )r4   )r   r4   r   r   r   r      s   
zPetitNvFp4LinearMethod.__init__rI   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 K   s  ~~| j js
tdt|}|d}	||_||_||_|d dkr&td| j jr-tj	n|}
t
tj||d tjddd|	d	}|d
| ttjt|tjd|	d}|d| ttjt|tjd|	d}|d| t
tj||| j j |
ddd|	d	}|d| d S )NzHNVFP4 quantization was selected,  dynamic quantization is not supported.weight_loader   r   z=Unsupported model when in features size is not multiple of 16   )rV      )data	input_dim
output_dimr_   weight)rc   r_   input_scaleweight_scale_2weight_scale)r4   r   r3   sumr2   logical_widthsrZ   output_size_per_partitionr%   float8_e4m3fnr   emptyuint8register_parameterr	   lenfloat32r   )r   rI   rZ   r[   r\   r]   r^   extra_weight_attrsrl   r_   weight_dtyperf   rg   rh   ri   r   r   r   create_weights   sd   


z%PetitNvFp4LinearMethod.create_weightsr   Nc                 C   sf   |j  tj}|j tj}t|dd|_ t|dd|_t|j |j dd|_t| |` d S )NF)requires_grad)	rg   maxtor%   rr   rh   r   alphar   )r   rI   input_scale_2rh   r   r   r   process_weights_after_loading   s   z4PetitNvFp4LinearMethod.process_weights_after_loadingxbiasc              	   C   s    t ||j|j|j|j|j|dS )N)inputrf   ri   rh   size_nsize_kr}   )r   rf   ri   rh   rl   rZ   )r   rI   r|   r}   r   r   r   apply   s   zPetitNvFp4LinearMethod.applyr$   )rN   rO   rP   rQ   r   r   r%   rX   rY   rT   r   rV   ru   r{   Tensorr   r   r   r   r   r   rK      s6    
JrK   )$loggingtypingr   r   r   r   regexrD   r%   torch.nn.parameterr   sglang.srt.layers.linearr   sglang.srt.layers.parameterr   r	   *sglang.srt.layers.quantization.base_configr
   r   r   *sglang.srt.layers.quantization.petit_utilsr   r   r   &sglang.srt.layers.quantization.unquantr   $sglang.srt.layers.quantization.utilsr   sglang.srt.utilsr   r?   	getLoggerrN   r   r   rK   r   r   r   r   <module>   s    
e