o
    
۾i+-                     @   s   d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlmZ d dlmZmZ d dlm Z  ee!Z"G dd deZ#G dd deZ$G dd deZ%dS )    )AnyN)	Parameter)init_logger)	Attention)
LinearBaseLinearMethodBaseUnquantizedLinearMethod)QuantizationMethods)QuantizationConfigQuantizeMethodBase)BaseKVCacheMethod)apply_petit_nvfp4_linearprepare_nvfp4_layer_for_petitverify_petit_nvfp4_supported)is_layer_skipped)ModelWeightParameterPerTensorScaleParameter)current_platformc                   @   sj  e Zd ZdZ				d,dededB dedB dee dB ddf
d	d
Zd-ddZ	e
defddZe
deej fddZe
defddZe
dee fddZe
deeef dd fddZe
dedB fddZe
deeef defddZdedee defddZd ejjdedd!fd"d#Zdee fd$d%Zdefd&d'Zdefd(d)Zdee fd*d+ZdS ).PetitNvFp4ConfigzConfig class for Petit FP4.FNis_checkpoint_nvfp4_serializedkv_cache_quant_algo
group_sizeexclude_modulesreturnc                 C   s2   |    || _|rtd || _|| _|| _d S )Nz]Detected nvfp4 checkpoint. Please note that the format is experimental and subject to change.)_check_hardware_supportr   loggerwarningr   r   r   )selfr   r   r   r    r   a/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/petit.py__init__*   s   
zPetitNvFp4Config.__init__c                 C   s   t  rtddS )z
        Verifies that the current hardware is supported by the Petit backend.
        This backend is specifically designed for AMD GPUs and is not
        supported on the CUDA platform.
        zThe 'petit' quantization backend is designed for AMD GPUs and is not supported on the CUDA platform. For NVIDIA GPUs, please use a different quantization method such as FP8, AWQ, or GPTQ.N)r   is_cuda
ValueErrorr   r   r   r   r   <   s
   z(PetitNvFp4Config._check_hardware_supportc                 C      dS )Npetit_nvfp4r   clsr   r   r   get_nameK   s   zPetitNvFp4Config.get_namec                 C   s   t jt jgS N)torchbfloat16halfr&   r   r   r   get_supported_act_dtypesO   s   z)PetitNvFp4Config.get_supported_act_dtypesc                 C   r$   )NZ   r   r&   r   r   r   get_min_capabilityS   s   z#PetitNvFp4Config.get_min_capabilityc                 C   s   dgS )Nzhf_quant_config.jsonr   r&   r   r   r   get_config_filenamesX   s   z%PetitNvFp4Config.get_config_filenamesconfigc                 C   s   |  |dg}|d}t|tr|std| }|d}t|ts)td|}t|| |dp6d}t|ts@td|}|d	g }	|	d u rOg }
nt|	tr`t	d
d |	D r`|	}
ntdd|v }| ||||
dS )Nquantization
quant_algoz7Missing or invalid 'quant_algo' in quantization config.r   z>Missing or invalid 'group_size' (int) in hf_quant_config.json.r   autoz3'kv_cache_quant_algo' must be a string if provided.r   c                 s   s    | ]}t |tV  qd S r)   )
isinstancestr).0xr   r   r   	<genexpr>v   s    

z/PetitNvFp4Config.from_config.<locals>.<genexpr>z3'exclude_modules' must be a list[str] (or omitted).NVFP4)r   r   r   r   )
get_from_keysgetr5   r6   r"   upperintr   listall)r'   r1   qcquant_method_rawquant_methodgroup_size_rawr   kv_cache_quant_algo_rawr   exclude_rawr   r   r   r   r   from_config\   s>   




zPetitNvFp4Config.from_configc                 C   sH   t  sd S |d|}|dp|dpd }|dv r"|  S d S )Nr2   r3   rC    )r:   MODELOPT_FP4MODELOPT)r   is_rocmr<   r=   r(   )r'   hf_quant_cfg
user_quantrA   algor   r   r   override_quantization_method   s   z-PetitNvFp4Config.override_quantization_methodquant_configc                 C   s0   | d|}| dp| dpd }|dkS )Nr2   r3   rC   rH   r:   )r<   r=   )r'   rP   rA   rN   r   r   r   is_petit_nvfp4_compatible   s   z*PetitNvFp4Config.is_petit_nvfp4_compatibleprefixc                 C   s4   |D ]}| dd dd}t||r dS qdS )N.z\.*z.*TF)replacere	fullmatch)r   rR   r   pattern	regex_strr   r   r   is_layer_excluded   s   z"PetitNvFp4Config.is_layer_excludedlayerzQuantizeMethodBase | Nonec                 C   sL   |   }t|trt||s| ||rt S t| S t|tr$t| S d S r)   )	require_exclude_modulesr5   r   r   rZ   r   PetitNvFp4LinearMethodr   PetitFp8KVCacheMethod)r   r[   rR   excluder   r   r   get_quant_method   s   

z!PetitNvFp4Config.get_quant_methodc                 C   s   g S r)   r   r#   r   r   r   get_scaled_act_names   s   z%PetitNvFp4Config.get_scaled_act_namesc                 C   s   | j d u rtd dS | j S )Nz/group_size not set; defaulting to 16 for NVFP4.   )r   r   r   r#   r   r   r   require_group_size   s   

z#PetitNvFp4Config.require_group_sizec                 C   s
   | j pdS )Nr4   )r   r#   r   r   r   require_kv_cache_quant_algo      
z,PetitNvFp4Config.require_kv_cache_quant_algoc                 C   s   t | jpg S r)   )r?   r   r#   r   r   r   r\      s   z(PetitNvFp4Config.require_exclude_modules)FNNN)r   N)__name__
__module____qualname____doc__boolr6   r>   r?   r    r   classmethodr	   r(   r*   dtyper-   r/   r0   dictr   rG   rO   rQ   rZ   nnModuler`   ra   rc   rd   r\   r   r   r   r   r   '   s\    


)
r   c                       s&   e Zd ZdZdef fddZ  ZS )r^   zI
    Supports loading kv-cache scaling factors from FP8 checkpoints.
    rP   c                    s   t  | d S r)   )superr    r   rP   	__class__r   r   r       s   zPetitFp8KVCacheMethod.__init__)rf   rg   rh   ri   r   r    __classcell__r   r   rr   r   r^      s    r^   c                   @   s   e Zd ZdZdefddZdejjde	de
e	 de	d	e	d
ejfddZdejjddfddZ	ddejjdejdejdB dejfddZdS )r]   a8  Linear method for NVFP4.
    Supports loading NVFP4 checkpoints with the following structure:

    |Tensor Name           | datatype      |  shape      |
    |----------------------------------------------------|
    |input_scale           | torch.float32 | scalar      |
    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
    |weight_scale          | FP8-E4M3      | [X, Y]      |
    |weight_scale_2        | torch.float32 | scalar      |

    The weights are quantized per block of 16 elements.
    Args: quant_config: The ModelOpt quantization config.
    rP   c                 C   s
   || _ d S r)   )rP   rq   r   r   r   r       re   zPetitNvFp4LinearMethod.__init__r[   input_size_per_partitionoutput_partition_sizes
input_sizeoutput_sizeparams_dtypec                 K   s  ~~| j js
tdt|}|d}	||_||_||_|d dkr&td| j jr-tj	n|}
t
tj||d tjddd|	d	}|d
| ttjt|tjd|	d}|d| ttjt|tjd|	d}|d| | j  }t
tj||| |
ddd|	d	}|d| d S )NzHNVFP4 quantization was selected,  dynamic quantization is not supported.weight_loaderrb   r   z=Unsupported model when in features size is not multiple of 16   )rl      )data	input_dim
output_dimrz   weight)r}   rz   input_scaleweight_scale_2weight_scale)rP   r   r"   sumr<   logical_widthsru   output_size_per_partitionr*   float8_e4m3fnr   emptyuint8register_parameterr   lenfloat32rc   )r   r[   ru   rv   rw   rx   ry   extra_weight_attrsr   rz   weight_dtyper   r   r   r   r   r   r   r   create_weights   sf   


z%PetitNvFp4LinearMethod.create_weightsr   Nc                 C   sf   |j  tj}|j tj}t|dd|_ t|dd|_t|j |j dd|_t| |` d S )NF)requires_grad)	r   maxtor*   r   r   r   alphar   )r   r[   input_scale_2r   r   r   r   process_weights_after_loading%  s   z4PetitNvFp4LinearMethod.process_weights_after_loadingr8   biasc              	   C   s    t ||j|j|j|j|j|dS )N)inputr   r   r   size_nsize_kr   )r   r   r   r   r   ru   )r   r[   r8   r   r   r   r   apply1  s   zPetitNvFp4LinearMethod.applyr)   )rf   rg   rh   ri   r   r    r*   rn   ro   r>   r?   rl   r   r   Tensorr   r   r   r   r   r]      s6    
Kr]   )&typingr   regexrV   r*   torch.nn.parameterr   vllm.loggerr   $vllm.model_executor.layers.attentionr   !vllm.model_executor.layers.linearr   r   r   'vllm.model_executor.layers.quantizationr	   3vllm.model_executor.layers.quantization.base_configr
   r   0vllm.model_executor.layers.quantization.kv_cacher   9vllm.model_executor.layers.quantization.utils.petit_utilsr   r   r   9vllm.model_executor.layers.quantization.utils.quant_utilsr   vllm.model_executor.parameterr   r   vllm.platformsr   rf   r   r   r^   r]   r   r   r   r   <module>   s&    	