o
    ۷i                     @   s0   d Z ddlmZ ddlmZ G dd deZdS )z3FP8 quantization config for diffusion transformers.    )	Fp8Config   )DiffusionQuantizationConfigc                   @   sB   e Zd ZdZeZ			d	dedee dB dee dB fddZ	dS )
DiffusionFp8Configa`  FP8 quantization config optimized for diffusion transformers.

    Uses dynamic activation scaling (no calibration dataset needed) and
    online weight quantization from BF16/FP16 checkpoints.

    Device Compatibility:
        - Turing (SM 75+): Weight-only FP8 via Marlin kernel
        - Ada/Hopper (SM 89+): Full W8A8 FP8 with native hardware support

    The kernel selection is automatic based on GPU capability.

    Args:
        activation_scheme: Activation quantization scheme.
            - "dynamic": Per-token dynamic scaling (default, no calibration)
            - "static": Single per-tensor scale (requires calibration)
        weight_block_size: Block size for block-wise weight quantization.
            Format: [block_n, block_k]. If None, uses per-tensor scaling.
        ignored_layers: List of layer name patterns to skip quantization.
    dynamicNactivation_schemeweight_block_sizeignored_layersc                 C   s,   || _ || _|p	g | _td|||d| _d S )NF)is_checkpoint_fp8_serializedr   r   r	   )r   r   r	   r   _vllm_config)selfr   r   r	    r   Z/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm_omni/diffusion/quantization/fp8.py__init__"   s   
zDiffusionFp8Config.__init__)r   NN)
__name__
__module____qualname____doc__r   quant_config_clsstrlistintr   r   r   r   r   r   
   s    

r   N)r   +vllm.model_executor.layers.quantization.fp8r   baser   r   r   r   r   r   <module>   s   