o
    پip                     @   sJ   d dl Z d dlZd dlmZmZ d dlmZ e eZ	G dd deZ
dS )    N)QuantizationConfigQuantizeMethodBase)is_fp8_fnuzc                   @   sR   e Zd ZdZdefddZdejjfddZ	dejjdej
fd	d
ZdddZdS )BaseKVCacheMethoda  
    Quant method that adds `k_scale` and `v_scale` attributes to the
    Attention layer to support loading those scaling factors from checkpoints.
    The k/v_scale will be used to:
        - quantize k/v_cache entries before saving them to the cache
        - dequantize k/v_cache entries before fetching them from the cache

    :param quant_config: the appropriate QuantizationConfig
    quant_configc                 C   s
   || _ d S )N)r   )selfr    r   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/quantization/kv_cache.py__init__   s   
zBaseKVCacheMethod.__init__layerc                 C   s@   t jjt jdt jddd|_t jjt jdt jddd|_dS )zS
        Create "weight" (aka k_scale and v_scale) for an attention layer.
        g      )dtypeF)requires_gradN)torchnn	Parametertensorfloat32k_scalev_scaler   r   r   r   r	   create_weights   s   z BaseKVCacheMethod.create_weightsreturnc                 C   s   t | jj d)Nz.apply should not be called.)RuntimeError	__class____name__r   r   r   r	   apply,   s   zBaseKVCacheMethod.applyNc                 C   s   |j dkr&|jdkr&|j d }|jd }t r%|d9 }|d9 }n6|j dk r5|jdk r5d}d}n'|j dks<J t|j |j}|d }|d }t r\|d9 }|d9 }t|trft|tsjtd|j 	| |j	| ||_
||_d S )Ng        cpu   g      ?z7Only support per-tensor scaling factor for fp8 KV cache)r   r   totolistr   max
isinstancefloat
ValueErrorcopy_k_scale_floatv_scale_float)r   r   r   r   scale_to_duplicater   r   r	   process_weights_after_loading/   s2   
z/BaseKVCacheMethod.process_weights_after_loading)r   N)r   
__module____qualname____doc__r   r
   r   r   Moduler   Tensorr   r(   r   r   r   r	   r      s    
r   )loggingr   *sglang.srt.layers.quantization.base_configr   r   )sglang.srt.layers.quantization.fp8_kernelr   	getLoggerr   loggerr   r   r   r   r	   <module>   s   
