o
    i7                    @   s@  d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	 ddl
mZ ddlmZmZmZmZmZmZ ddlmZ ddlZddlmZ ddlm  m  mZ ddlZddlmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( dd	l)m*Z* dd
l+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z< ddl=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZUmVZV ddlWmXZXmYZYmZZZm[Z[ ddl\m]Z]m^Z^m_Z_ ddl`maZambZbmcZcmdZdmeZe ddlfmgZgmhZh ddlimjZjmkZk ddllmmZm ddlnmoZompZpmqZqmrZr ddlsmtZtmuZu ddlmvZv ewexZyg dZzdd iZ{d!d" Z|e%eqj}ge!eqj}ge"eqj}eqj~giZe%d#e!d#e"d#iZ	$		%dd&eeed'f  d(dfd)d*Zd+d, Zdd-d.Zdd/d0Zdd1d2ejd3ee@ d4ee@ d5eeejjegef  fd6d7Zd8d9 Zd:d; Zd#d#d<d=d>Zedfd2ejjd?ed5eeejjegef  d@eejj fdAdBZdCejd(ejfdDdEZdCejd(ejfdFdGZdCejd(ejfdHdIZeG dJdK dKeZdddLdMeej dNeej fdOdPZeVedddLdQejjd?edMeej dNeej d(ejjf
dRdSZeG dTdU dUeZdVdW ZeVedQejjd?ed(ejjfdXdYZeG dZd[ d[eZeVedQejjd?ed(ejjfd\d]ZeG d^d_ d_eZd`da ZeVedbdcdQejjd?eddefdedfZdCejd(ejfdgdhZdCejd(ejfdidjZdCejdkejd(ejfdldmZdCejdkejd(ejejffdndoZeG dpdq dqeZdrds ZeVedbdcdQejjd?ed(ejjfdtduZeG dvdw dweZeVedbdcdQejjd?efdxdyZdzd{ ZeG d|d} d}eZd~d ZeVedbdcdQejjd?edded(ejjfddZ		ddCejde4dejdeej deej f
ddZeG dd deZdd ZeVedbdcdQejjd?eddefddZeG dd deZeVedQejjd?efddZdePdejdejd(dfddZeG dd deZdddLdMeej dNeej fddZeVedddLdQejjd?edMeej dNeej d(ejjf
ddZeG dd deZeZeeehZdQejjded?efddZded?efddZdQejded?efddZdQe1d(ejfddZejeeeeee*g dS )ap  
Quantization APIs

Generally these APIs can be applied directly to any model
with Linear modules to obtain quantized linear ops. The intended
usage involves applying torch.compile to the model afterwards
both because primitives were designed based on the fusions that
come along with it and because that is how we access the intended quantized
and mixed GEMM kernels
    N)OrderedDict)	dataclassfield)partial)AnyCallableListOptionalTupleUnion)AOBaseConfig)AffineQuantizedTensorCutlassSemiSparseLayoutFloat8LayoutInt4CPULayoutInt4XPULayoutPlainLayoutSemiSparseLayoutTensorCoreTiledLayoutto_affine_quantized_floatx!to_affine_quantized_floatx_staticto_affine_quantized_intx)Target)Layout)
e4m3_dtype
e5m2_dtype)Float8Linear)Float8MMConfigFP8Granularity_check_hardware_support!_granularity_is_a_1_128_w_128_128_normalize_granularity)(Float8StaticActivationFloat8WeightConfigGemliteUIntXWeightOnlyConfig%Int8DynamicActivationInt4WeightConfigUIntXWeightOnlyConfig)$LinearActivationWeightObservedTensor)AffineQuantizedObserverBase)KernelPreference)Float8PackingFormatFloat8TensorInt4ChooseQParamsAlgorithmInt4PackingFormatInt4PlainInt32TensorInt4PreshuffledTensor
Int4TensorInt4TilePackedTo4dTensor
Int8TensorIntxChooseQParamsAlgorithmIntxOpaqueTensorIntxPackingFormatIntxUnpackedToInt8TensorQuantizeTensorToFloat8KwargsQuantizeTensorToInt8KwargsSparse2x4CUTLASSFloat8Tensor)_QUANTIZE_CONFIG_HANDLER register_quantize_module_handler)_fp8_mm_compat_linear_extra_repr_quantization_typeget_block_size)is_MI300is_sm_at_least_89is_sm_at_least_90   )GranularityPerAxisPerGroupPerRow	PerTensor)LinearActivationQuantizedTensorto_linear_activation_quantized)Int4WeightOnlyQuantizerInt8DynActInt4WeightQuantizer) intx_quantization_aware_training)_DTYPE_TO_QVALUE_BOUNDSMappingTypeZeroPointDomainquantize_affine)	QuantizerTwoStepQuantizer)_get_per_token_block_size)swap_conv2d_1x1_to_linearrQ   rR   rJ   	autoquant_get_subclass_inserter	quantize_rL   rK   3Float8DynamicActivationFloat8SemiSparseWeightConfigModuleFqnToConfigrU   z
.autoquantc                 C   sB   | t v rdd l}t |  }||t}t|| S tdtd| )Nr   zmodule z has no attribute )_lazy_imports	importlibimport_module__package__getattrAttributeError__name__)namer[   module_pathmodule rd   R/home/ubuntu/.local/lib/python3.10/site-packages/torchao/quantization/quant_api.py__getattr__   s   
rf   F rd   
extra_args.returnc           
      C   s   || |dd r|dur| j |d || g|R  } | S t|  }|D ] \}}t|||| | d||}	|	|urE|	durEt| ||	 q%|durP| j |d | S )a'  
    Recursively replaces each child module in `model` with the result of `replacement_fn(child)`
    if `filter_fn(child)` returns `True`.

    Args:
        model (torch.nn.Module): The model containing modules to be replaced.
        replacement_fn (Callable[[torch.nn.Module], torch.nn.Module]): The function to replace matching modules.
        filter_fn (Callable[[torch.nn.Module], bool]): The filter function to determine which modules to replace.
        cur_fqn (str, optional): The current fully qualified name of the module being processed. Defaults to "".
        device (device, optional): Device to move the model to before applying `filter_fn`. Defaults to None.
        extra_args (Tuple[Any, ...], optional): optional extra args to pass to `replacement_fn`.

    Returns:
        None
    Ndevice.)tolistnamed_children)_replace_with_custom_fn_if_matches_filtersetattr)
modelreplacement_fn	filter_fncur_fqnrl   rh   named_children_listra   child	new_childrd   rd   re   rq      s*   rq   c                 G   sz   ddl m} ddlm} t| tjjo<t| do<t| j	| o<t| j	t
 o<t| j	t o<t| j	| o<t| tjjj S )Nr   )_AffineFakeQuantizedTensorrB   )AutoQuantizableLinearWeightweight)5torchao.quantization.qat.affine_fake_quantized_tensorrz   rU   r{   
isinstancetorchnnLinearhasattrr|   r   rH   moduleslinearNonDynamicallyQuantizableLinear)modargsrz   r{   rd   rd   re   
_is_linear   s   r   c                    s0    dd dd fdd}|S )a  
    Returns a function which inserts the given subclass into all linear modules
    in the model. The inserted module will have its weight set to the result of
    `cls(mod.weight, **kwargs)`. If parametrization is enabled then this will be done using
    torch.nn.utils.parametrize instead of directly setting the attribute on the module.

    Args:
        cls (torch.Tensor): The class to insert as a child module.
        kwargs (Any): Any additional arguments for the constructor.
    constructorsubclass_constructormethod
from_floatc                    sz   r(t jj j| jfi dd| _| j \}}t| dt |  | S t jjt | jfi dd| _| S )NFrequires_gradr|   )	r   r   	Parameterr   r|   __tensor_flatten__parametrizeregister_parametrizationr^   )lin_r   clsr   enable_parametrizationr   kwargsrd   re   insert_subclass  s   	z/_get_subclass_inserter.<locals>.insert_subclass)pop)r   r   r   r   rd   r   re   rV      s   rV   c                    sB   G dd dt jj  fdd}|du rdd }t| ||d dS )	zi
    Changes all conv2d 1x1 modules to equivalent linear modules so that they can then be quantized.
    c                       s$   e Zd Z fddZdd Z  ZS )z2swap_conv2d_1x1_to_linear.<locals>.PermuteSandwichc                    s   t    || _d S N)super__init__r   )selfr   	__class__rd   re   r   "  s   

z;swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich.__init__c                 W   s&   |  |d ddddddddS )Nr         rB   )r   permute)r   r   rd   rd   re   forward&  s   &z:swap_conv2d_1x1_to_linear.<locals>.PermuteSandwich.forward)r`   
__module____qualname__r   r   __classcell__rd   rd   r   re   PermuteSandwich!  s    r   c                    sR   | j dksJ tjj| j| j| jd u d}tj| j	dd|_| j|_ |S )NrB   rB   )biasrj   )
kernel_sizer   r   r   in_channelsout_channelsr   r   r|   squeeze)convr   r   rd   re   replace_conv2d_1x1)  s   z5swap_conv2d_1x1_to_linear.<locals>.replace_conv2d_1x1Nc                 W   s   t | tjjo| jdkS )Nr   )r~   r   r   Conv2dr   )r   r   rd   rd   re   <lambda>3  s    
z+swap_conv2d_1x1_to_linear.<locals>.<lambda>ru   )r   r   Modulerq   )rs   ru   r   rd   r   re   rT     s   	
rT   r   rs   input_observerweight_observerru   c                   s8   dt jf fdd}t| ||du rt dS | dS )a  
    Converts the weight of a linear module to a LinearActivationWeightObservedTensor.

    This function wraps the weight of the given linear module with a LinearActivationWeightObservedTensor,
    which enables observation of both input and weight tensors during forward passes.
    The wrapped weight is then re-wrapped as a nn.Parameter to maintain compatibility
    with PyTorch's module system.

    Example::

    ```
        import torch
        import torch.nn as nn
        from torchao.quantization import PerTensor
        from torchao.quantization.linear_observer_tensor import insert_observers_
        from torchao.quantization.observer import (
            AffineQuantizedMinMaxObserver,
            MappingType
        )

        # Create observers
        input_observer = AffineQuantizedMinMaxObserver(
            MappingType.SYMMETRIC,
            torch.float8_e4m3fn,
            granularity_type=PerTensor(),
            eps=torch.finfo(torch.float32).eps,
            scale_dtype=torch.float,
            zero_point_dtype=torch.int,
            zero_point_domain=ZeroPointDomain.NONE,
        )

        # Create a linear module
        linear_module = nn.Linear(10, 20)

        # Convert the linear module's weight to an observed tensor
        insert_observers_(linear_module, input_observer, weight_observer=None)

        # The linear_module can now be used as usual, with observers calculating statistics
        output = linear_module(torch.randn(10, 10))

        # Get the scale and zero point of the input observer
        scale, zero_point = linear_module.weight.input_observer.calculate_qparams()
    ```

    Args:
        model (nn.Module): The nn.Module to convert.
        input_observer (Optional[AffineQuantizedObserverBase]): Observer for input tensor.
        weight_observer (Optional[AffineQuantizedObserverBase]): Observer for weight tensor.
        filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): Filter function to select which modules to convert.
            If not provided, all linear modules will be converted. This function should take a module and its fully qualified name.

    Returns:
        nn.Linear: The modified linear module with its weight wrapped in a LinearActivationWeightObservedTensor.
    linear_modulec                    s&   t jtj| j d| jjd| _| S )Nr   r   r   )r   r   r&   r   r|   r   )r   r   rd   re   convert_to_linear_observerz  s   z5insert_observers_.<locals>.convert_to_linear_observerN)r   r   rq   r   )rs   r   r   ru   r   rd   r   re   insert_observers_<  s   >
r   c                 C   s,   d| j jd  d| j jd  dt| j  S )Nznum_embeddings=r   z, embedding_dim=rB   z	, weight=)r|   shaper=   r   rd   rd   re   _embedding_extra_repr  s   ,r   c                 C   sH   g }| }t |dkr|| || dtt| |  d|S )Nr   =z, )lenappendr=   r^   join)r   original_extra_reprparameter_namemodule_torchao_extra_reproriginal_extra_repr_strrd   rd   re   _module_extra_repr  s   

r   )allow_requires_gradpropagate_biasc                   s    fdd}|S )zHelper function to apply the constructor that quantizes the weight Tensor (with additional kwargs)
    to the weight of linear module
    c                    sP    o| j j}dkr| jd< tjj| j fi |d| _ tt| | _	| S )NTr   r   )
r|   r   r   r   r   r   types
MethodTyper<   
extra_repr)r   r   r   r   r   r   rd   re   r     s   
z6_get_linear_subclass_inserter.<locals>.insert_subclassrd   )r   r   r   r   r   rd   r   re   _get_linear_subclass_inserter  s   
r   configrl   c                 C   s  t jd t|tri|durtdt|  }| D ]I\}}t	||s5t
|||s5d|jv rft|rft|||}|durE|j|d}||urf|dkrf|dd }||d}	||	 }
t|
|| qdS t|tr|du rttn|}tt| }t| ||||fd	 dS td
)a  Convert the weight of linear modules in the model with `config`, model is modified inplace

    Args:
        model (torch.nn.Module): input model
        config (AOBaseConfig): a workflow configuration object.
        filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): function that takes a nn.Module instance and fully qualified name of the module, returns True if we want to run `config` on
        the weight of the module
        device (device, optional): Device to move module to before applying `filter_fn`. This can be set to `"cuda"` to speed up quantization. The final model will be on the specified `device`.
            Defaults to None (do not change device).

    Example::

        import torch
        import torch.nn as nn
        from torchao import quantize_

        # quantize with some predefined `config` method that corresponds to
        # optimized execution paths or kernels (e.g. int4 tinygemm kernel)
        # also customizable with arguments
        # currently options are
        # Int8DynamicActivationInt8WeightConfig (optimized with int8 mm op and torch.compile)
        # Int4WeightOnlyConfig (optimized with int4 tinygemm kernel and torch.compile)
        # Int8WeightOnlyConfig (optimized with int8 mm op and torch.compile
        from torchao.quantization.quant_api import Int4WeightOnlyConfig

        m = nn.Sequential(nn.Linear(32, 1024), nn.Linear(1024, 32))
        quantize_(m, Int4WeightOnlyConfig(group_size=32))

    ztorchao.quantization.quantize_NzuCustom filter_fn and FqnToConfig were both specified. Only filter_fn=None is supported when FqnToConfig is specified._defaultrk   rg   rm   rj   )rl   rh   zPassing a generic Callable to `quantize_` is no longer recommended and will be deprecated at a later release. Please see https://github.com/pytorch/ao/issues/1690 for instructions on how to pass in workflow configuration instead.)r   _C_log_api_usage_oncer~   FqnToConfig
ValueErrordictnamed_modulesitemsfqn_matches_fqn_config _module_param_matches_fqn_configfqn_to_configr   _fqn_to_config_handlerrn   splitremovesuffixrr   r   r9   typerq   AssertionError)rs   r   ru   rl   r   
module_fqnrc   replacement
child_name
parent_fqnparent_modulehandlerrd   rd   re   rW     sH   #



rW   xc              	   C   s@   t j}tj}tj}ttjj}tj}t| |t| ||||dS )zGThis is defined here instead of local function to support serialization)epsscale_dtypezero_point_dtype)	rN   
ASYMMETRICr   int8float32finfor   r   rS   )r   mapping_typetarget_dtyper   r   r   rd   rd   re   _int8_asymm_per_token_quant  s   r   c           	      C   sP   t j}tj}tj}ttjj}tj}d}d}t| |t	| ||||||d	}|S )Nr      )	quant_min	quant_maxr   r   r   )
rN   r   r   uint8r   r   r   int32r   rS   )	r   r   r   r   r   r   r   r   outrd   rd   re   _uint8_asymm_per_token_quant  s&   r   c              
   C   s6   t j}tj}d}d}d}t| |t| ||||tjdS Nh㈵>   r   r   r   r   )rN   	SYMMETRICr   r   r   rS   r   r   r   r   r   r   r   rd   rd   re   _int8_symm_per_token_quant%  s   r   c                   @   s   e Zd ZU dZejZejed< e	dZ
eed< ejZeed< dZeej ed< ejZeed< ejZeed	< ejZeed
< dZeed< dd ZdS )%Int8DynamicActivationIntxWeightConfiga|  
    Configuration for dynamically quantizing activations to torch.int8 and weights to torch.intx, with 1 <= x <= 8.
    More specifically, activations are dynamically quantized to 8-bits at a per-token granularity with scales/zeros.
    Weights are quantized with scales/zeros in a groupwise or channelwise manner using the number of bits specified by weight_dtype.

    This layout is identical to Int8DynamicActivationInt4WeightConfig when weight_dtype is torch.int4 and other args
    are the same.  However, this layout is more general and supports other weight dtypes.

    args:
        `weight_dtype`: The dtype to use for weight quantization.  Must be torch.intx, where 1 <= x <= 8.
       ` weight_granularity`: The granularity to use for weight quantization.  Must be PerGroup or PerAxis(axis=0).
        `weight_mapping_type`: The type of mapping to use for the weight quantization.
            Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.  MappingType.SYMMETRIC requires ZeroPointDomain.NONE
        `weight_scale_dtype`: The dtype to use for the weight scale.
        `act_mapping_type`: The type of mapping to use for the activation quantization.
            Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
        `intx_packing_format`: The format to use for the packed weight tensor (version 2 only).
            - unpacked_to_int8: this format is the default and is intended for export applications like ExecuTorch.
            - opaque_torchao_auto: this format is optimized for CPU performance.
        `intx_choose_qparams_algorithm`: The algorithm to use for choosing the quantization parameters.
        `version`: version of the config to use, only subset of above args are valid based on version, see note for more details.

    Example:

    .. literalinclude:: ../../examples/inference/int8_dynamic_activation_intx_weight.py
       :language: python
    weight_dtype    weight_granularityweight_mapping_typeNweight_scale_dtypeact_mapping_typeintx_packing_formatintx_choose_qparams_algorithmr   versionc                 C   s   t jd | jdd tddD v sJ d| j t| jttfs+J d| j t| jtr@| jj	dks@J d	| jj	 | j
tjtjtjfv sSJ d
| j
 | jtjtjfv sdJ d| j d S )Nz:torchao.quantization.Int8DynamicActivationIntxWeightConfigc                 S      g | ]
}t td | qS intr^   r   .0brd   rd   re   
<listcomp>g      zGInt8DynamicActivationIntxWeightConfig.__post_init__.<locals>.<listcomp>rB   	   <weight_dtype must be torch.intx, where 1 <= x <= 8, but got z8weight_granularity must be PerAxis or PerGroup, but got r   zaxis must be 0, but got z~weight_mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC or MappingType.SYMMETRIC_NO_CLIPPING_ERR, but got zRact_mapping_type must be MappingType.ASYMMETRIC or MappingType.SYMMETRIC, but got )r   r   r   r  ranger~   r  rD   rE   axisr  rN   r   r   SYMMETRIC_NO_CLIPPING_ERRr  r   rd   rd   re   __post_init__c  s4   



z3Int8DynamicActivationIntxWeightConfig.__post_init__)r`   r   r   __doc__r   r   r  dtype__annotations__rE   r  rC   rN   r   r  r  r	   r   r  r4   UNPACKED_TO_INT8r  r2   AFFINEr	  r
  r  r  rd   rd   rd   re   r  8  s   
 
r  custom_scalecustom_zero_pointr   r!  c             
   C   sp  |j }|j}|j}|j}|j}	|j}
|j}|  dks$J d|   t|t	r-|j
}nt|trE|jdks?J d|j | jd }ntd| d|f}|jdksWJ |	tjks^J tjtjtjtjg}|
tjksx|
|v sxJ d|
 |d ur|jtjkr|tj}tj| |||d	|||d
}|d ur|| jkrt|| | |}|
|v rt j!|||
d}d }||fS )Nr   zFInt8DynamicActivationIntxWeightConfig only works for 2-d Tensor, got: r   %axis must be 0 with PerAxis, but got rj   z4weight_granularity must be PerGroup or PerAxis, got rB   Unsupported packing format: int8_asym_per_token)r   activation_quantizationr	  r   r!  )r   r  )"r  r  r  r  r  r  r	  dimr~   rE   
group_sizerD   r  r   r   r
  rN   r   r4   OPAQUE_ATEN_KLEIDIAIOPAQUE_TORCHAO_AUTOOPAQUE_TORCHAO_KLEIDIAIOPAQUE_TORCHAO_LOWBITr  r  r   r   rn   r   r5   from_hp+_adjust_scale_dtype_in_intx_unpacked_tensorr3   !from_intx_unpacked_to_int8_tensor)r|   r   r   r   r!  r  r  r  r  r  r  r	  r'  
block_sizeopaque_formats
new_weightnew_biasrd   rd   re   4_int8_dynamic_activation_intx_weight_quantize_tensor  sn   





r3  rc   c                C   sX   t | j| j|||d\}}tjj|dd| _|d u rd | _t| tjr*t	t
| | _| S )Nr  Fr   )r3  r|   r   r   r   r   r~   r   r   r   r<   r   )rc   r   r   r!  r1  r2  rd   rd   re   ._int8_dynamic_activation_intx_weight_transform  s   
r4  c                   @   sZ   e Zd ZU dZdZeed< dZeed< e	j
Ze	ed< ejZeed< dZeed	< d
d ZdS )Int4WeightOnlyConfiga]  
    Configuration for int4 weight only quantization, only groupwise quantization is supported
    right now, and we support version 1 and version 2, that are implemented differently although with
    same support. In version 2, different target are mainly distinguished by `packing_format` arg, and in version 1, mainly by `layout`.

    Args:
        `group_size`: parameter for quantization, controls the granularity of quantization, smaller
         size is more fine grained, choices are [256, 128, 64, 32], used in both version 1 and 2
        `int4_packing_format`: the packing format for int4 tensor, used in version 2 only
         `int4_choose_qparams_algorithm`: variants of choose qparams algorithm to use for int4,
         currently support TINYGEMM ("tinygemm") and HQQ ("hqq"), used in version 2 only
        `set_inductor_config`: if True, adjusts `torchinductor` settings to recommended values. used in both version 1 and 2
        `version`: version of the config to use, default is 2

    Example:

    .. literalinclude:: ../../examples/inference/int4_weight_only.py
       :language: python
       r'  Tset_inductor_configint4_packing_formatint4_choose_qparams_algorithmr   r
  c                 C      t jd d S )Nz)torchao.quantization.Int4WeightOnlyConfigr   r   r   r   rd   rd   re   r        z"Int4WeightOnlyConfig.__post_init__N)r`   r   r   r  r'  r  r  r7  boolr,   PLAINr8  r+   TINYGEMMr9  r
  r  rd   rd   rd   re   r5    s   
 
r5  c                 C   s  |j }|j}|j}| jd | dkr td| j d|  | S tdd t| jd D |g }|j	dks8J t
|}|tjkrN|tjksNJ d	| d
|tjkr^tj| |tjd}|S |tjkrkt| |}|S |tjkrxt| |}|S |tjkrtj| ||d}|S td| )Nrj   r   zZSkipping quantizing weight with int4 weight only quantization because the shape of weight z# is not compatible with group_size c                 S      g | ]}d qS rB   rd   r  r   rd   rd   re   r        z5_int4_weight_only_quantize_tensor.<locals>.<listcomp>rB   r   zBInt4ChooseQParamsAlgorithm.HQQ is not supported by packing format zF, it's only supported by Int4PackingFormat.TILE_PACKED_TO_4D currentlyactivation_dtype)r9  z!Unsupported int4 packing format: )r'  r9  r8  r   loggerinfotupler  ndimr
  ro   r+   HQQr,   TILE_PACKED_TO_4DPRESHUFFLEDr.   r,  r   bfloat16r>  r/   PLAIN_INT32r-   r0   r   )r|   r   r'  r9  r8  r/  r1  rd   rd   re   !_int4_weight_only_quantize_tensor  sT   "





rO  c                 C   sT   |j r	tjj  t| dsJ dt| j|}tj	j
|dd| _tt| | _| S )Nr|   gapplying int8 weight only quant requires module to have weight attribute but {module} does not have oneFr   )r7  torchaoquantizationutils"recommended_inductor_config_setterr   rO  r|   r   r   r   r   r   r<   r   )rc   r   r1  rd   rd   re   _int4_weight_only_transform=  s   rU  c                   @   s   e Zd ZU dZdZeed< dS )'Float8DynamicActivationInt4WeightConfiga  Configuration for apply float8 dynamic per row quantization and int4
    per group weight quantization to linear
    (only group_size 128 is supported right now since underlying kernel used only supports 128
    and above and no benefits of making it bigger)

    Args:
        `int4_packing_format`: how the weight is packed, only preshuffled is supported

    Example:

    .. literalinclude:: ../../examples/inference/float8_dynamic_activation_int4_weight.py
       :language: python
    preshuffledr8  N)r`   r   r   r  r8  r,   r  rd   rd   rd   re   rV  N  s   
 rV  c                 C   s   t | ds	J d|j}|dksJ d| | j}d}tdd t|jd D |g }tj| j|tj	d	}tj
j|d
d| _tt| | _| S )Nr|   rP  rW  z?only preshuffled int4_packing_format supported right now, got: r6  c                 S   r@  rA  rd   rB  rd   rd   re   r  p  rC  zD_float8_dynamic_activation_int4_weight_transform.<locals>.<listcomp>rB   rD  Fr   )r   r8  r|   rH  r  rI  r.   r,  r   float8_e4m3fnr   r   r   r   r<   r   )rc   r   r8  r|   r'  r/  r1  rd   rd   re   0_float8_dynamic_activation_int4_weight_transforma  s$   
"rY  c                   @   sT   e Zd ZU dZdZee ed< e Z	ee
 ed< dZeed< dZeed< d	d
 ZdS )Int8WeightOnlyConfiga  
    Configuration for applying int8 weight-only symmetric per-channel quantization to linear layers.

    Args:
        group_size (version 1) - Controls the granularity of quantization.
        If None, applies per-channel quantization. Otherwise, applies per-group quantization with the specified group size.
        granularity (version 2) - Quantization granularity.
            PerRow() for per-channel quantization, PerTensor() for per-tensor quantization.
        set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
            for better performance with this quantization scheme.

    Example:

    .. literalinclude:: ../../examples/inference/int8_weight_only.py
       :language: python
    Nr'  granularityTr7  rB   r
  c                 C   s8   t jd | jdkr| jd u sJ d| j d S d S )Nz)torchao.quantization.Int8WeightOnlyConfigr   z1Only support version 2 with group_size=None, got )r   r   r   r
  r'  r   rd   rd   re   r    s   

z"Int8WeightOnlyConfig.__post_init__)r`   r   r   r  r'  r	   r  r  rF   r[  rC   r7  r=  r
  r  rd   rd   rd   re   rZ  {  s   
 rZ  c           	      C   s   |j dkrDtd tj}tj}ttjj	}tj
}|j}|d u r&| jd }tdd t|  d D |g }t| |||||d}|S |j dksQJ d|j  tj| |jd	}|S )
NrB   zConfig Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more detailsrj   c                 S   r@  rA  rd   )r  r   rd   rd   re   r    rC  z5_int8_weight_only_quantize_tensor.<locals>.<listcomp>)r   r   r   Unexpected version: )r[  )r
  warningswarnrN   r   r   r   r   r   r   int64r'  r   rH  r  r&  r   r1   r,  r[  )	r|   r   r   r   r   r   r'  r/  r1  rd   rd   re   !_int8_weight_only_quantize_tensor  s0   

$r`  r|   r   r   c                C   sj   |j r	tjj  t| |sJ dtt| ||}t| |t	j
j|dd ttt| j|d| | _| S )Nzqapplying int8 weight only quant requires module to have {parameter_name} attribute but {module} does not have oneFr   r   r   )r7  rQ  rR  rS  rT  r   r`  r^   rr   r   r   r   r   r   r   r   r   rc   r   r   quantized_tensorrd   rd   re   _int8_weight_only_transform  s,   
re  c              
   C   sJ   t j}tj}d}d}d}t| |t| ||||| jtjkr!tjdS d dS r   )	rN   r   r   r   r   rS   r  float16r   r   rd   rd   re   (_int8_symm_per_token_reduced_range_quant  s"   rg  c              
   C   s\   t j}tj}d}d}d}| jd dkr| S t| |t| ||||| jtjkr*tj	dS d dS )Nr   r   r   rB   r   )
rN   r   r   r   r   r   rS   r  rf  r   r   rd   rd   re   4_int8_symm_per_token_reduced_range_quant_noop_decode  s&   rh  r   c                 C   s   t | t| tj|td ddS )N	mm_configr/  r   r   _layout)r   rS   r   r   r   r   r   rd   rd   re   _float8_cutlass_quant   s   rn  c                 C   s   t | t| tj|t dS )Nrk  )r   rS   r   r   r   rm  rd   rd   re   _float8_cutlass_quant_sparse  s   ro  c                   @   sp   e Zd ZU dZe Zee ed< e	j
Zee	 ed< dZeed< e Zeed< dZeed< d	Zeed
< dd ZdS )%Int8DynamicActivationInt8WeightConfigae  
    Configuration for applying int8 dynamic symmetric per-token activation and int8 per-channel weight
    quantization to linear layers.

    Args:
        layout: Optional[Layout] = PlainLayout() - Tensor layout for the quantized weights. Controls how the
            quantized data is stored and accessed.
        act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC - Mapping type for activation quantization.
            SYMMETRIC uses symmetric quantization around zero.
        weight_only_decode: bool = False - If True, only quantizes weights during forward pass and keeps activations
            in original precision during decode operations.
        set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
            for better performance with this quantization scheme.
        version (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Int8Tensor

    Example:

    .. literalinclude:: ../../examples/inference/int8_dynamic_activation_int8_weight.py
       :language: python
    layoutr  Fweight_only_decoder[  Tr7  rB   r
  c                 C   r:  )Nz:torchao.quantization.Int8DynamicActivationInt8WeightConfigr;  r   rd   rd   re   r  8     z3Int8DynamicActivationInt8WeightConfig.__post_init__N)r`   r   r   r  r   rq  r	   r   r  rN   r   r  rr  r=  rF   r[  rC   r7  r
  r  r  rd   rd   rd   re   rp    s   
 rp  c              
   C   s6  |j dkrb|j}|j}|j}| jd }|dkr%td| j d|  | S tj}t	j
}dd }tj}	ttjj}
tj}|rAt}n
|tjkrIt}nt}|| }t| |||	|
|||d}t||}|S |jt t hv soJ d	|j}|j}|jtjksJ d
|j dksJ d|j  tj| |t||jdd}|S )NrB   rj      zKSkipping applying Int8DynamicActivationInt8WeightConfig to weight of shape z  because `in_feature` is <= 16: c                 S   s*   t dd t|  d D | jd g S )Nc                 S   r@  rA  rd   rB  rd   rd   re   r  R  rC  zg_int8_dynamic_activation_int8_weight_quantize_tensor.<locals>.get_weight_block_size.<locals>.<listcomp>rB   rj   )rH  r  r&  r   )r   rd   rd   re   get_weight_block_sizeQ  s   *zS_int8_dynamic_activation_int8_weight_quantize_tensor.<locals>.get_weight_block_size)r   r   rl  zero_point_domainz'Only PerRow and PerTensor are supportedz0asymmetric dynamic quant not supported currentlyr   r\  r[  r   )r[  act_quant_kwargs)r
  rq  r  rr  r   rF  rG  rN   r   rO   NONEr   r   r   r   r   r_  rh  rg  r   r   rI   r[  rF   rG   r1   r,  r7   )r|   r   rq  r  rr  in_featuresr   weight_zero_point_domainru  r   r   r   input_quant_funcr/  r1  quantized_weightr  act_granularityrd   rd   re   4_int8_dynamic_activation_int8_weight_quantize_tensor>  sl   





	r  c                C   s`   |j r	tjj  t| dsJ dt| j|}tj	j
|dd| _ttt| j|d| | _| S )Nr|   zyapplying int8 dynamic activation int8 weight quant requires module to have weight attributebut {module} does not have oneFr   rb  )r7  rQ  rR  rS  rT  r   r  r|   r   r   r   r   r   r   r   r   )rc   r   r   r1  rd   rd   re   ._int8_dynamic_activation_int8_weight_transform  s$   r  c                   @   sr   e Zd ZU dZdZeej ed< e	 Z
eed< ejZee ed< dZeed< dZeed	< d
d ZdefddZdS )$Int8StaticActivationInt8WeightConfigaM  
    Configuration for applying int8 static symmetric quantization to both activation and weight

    Args:
        act_quant_scale (torch.Tensor): The scale tensor for activation quantization.
        granularity (Granularity): The granularity of quantization. PerRow() and PerTensor() are supported currently
        act_mapping_type (MappingType): The mapping type for activation quantization. only SYMMETRIC is supported currently
        set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
        version (int): the version of the config
    Nact_quant_scaler[  r  Tr7  rB   r
  c                 C   s@   t jd t| jtr| jjdkrtd| jj dd S d S )Nz9torchao.quantization.Int8StaticActivationInt8WeightConfigrj   znInt8StaticActivationInt8WeightConfig only supports PerRow(dim=-1) for activation quantization, got PerRow(dim=zS). Per-feature activation quantization is not supported due to slicing limitations.)r   r   r   r~   r[  rF   r&  r   r   rd   rd   re   r    s   z2Int8StaticActivationInt8WeightConfig.__post_init__ri   c                 C   s   t | j| jdS )zGet the activation quantization kwargs for static quantization.

        Returns:
            QuantizeTensorToInt8Kwargs with the configured granularity and mapping type.
        rw  )r7   r[  r  r   rd   rd   re   get_act_quant_kwargs  s   z9Int8StaticActivationInt8WeightConfig.get_act_quant_kwargs)r`   r   r   r  r  r	   r   Tensorr  rF   r[  rC   rN   r   r  r7  r=  r
  r  r  r7   r  rd   rd   rd   re   r    s   
 r  c                C   s   |j t t hv sJ d|jtjksJ dt| |s$J d| d|jr-tj	j
  |j }|j }tjt| ||t||jd|j d}t| |tjj|dd ttt| j|d	| | _| S )
Nz0Only PerRow and PerTensor is supported currentlyz/asymmetric static quant not supported currentlyz#Expected module to have attribute `z` but not foundrw  )r[  rx  r  Fr   rb  )r[  rF   rG   r  rN   r   r   r7  rQ  rR  rS  rT  r1   r,  r^   r7   r  detachrr   r   r   r   r   r   r   r   r   )rc   r   r   activation_granularityr  rd  rd   rd   re   -_int8_static_activation_int8_weight_transform  sH   

r  c                   C   s   t d tt dS )z
    Applies int8 dnynamic symmetric per-token activation and int8 per-channel weight
    quantization + 2:4 sparsity to linear layers.
    a  int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in Int8DynamicActivationInt8WeightConfig instead.

    from torchao.dtypes import SemiSparseLayout
    Int8DynamicActivationInt8WeightConfig(layout=SemiSparseLayout())rq  )r]  r^  rp  r   rd   rd   rd   re   /int8_dynamic_activation_int8_semi_sparse_weight  s   r  c                   @   s@   e Zd ZU dZeZejed< dZ	e
ed< dZeed< dd Zd	S )
Float8WeightOnlyConfiga  
    Configuration for applying float8 weight-only symmetric per-channel quantization to linear layers.

    Args:
        weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
        set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
        version (int): the version of the config, version 1 is deprecated, version 2 is using Float8Tensor (default)

    Note:
        The actual matmul will be computed in original precision of the weight tensor.

    Example:

    .. literalinclude:: ../../examples/inference/float8_weight_only.py
       :language: python
    r  Tr7  r   r
  c                 C   r:  )Nz+torchao.quantization.Float8WeightOnlyConfigr;  r   rd   rd   re   r  '  r<  z$Float8WeightOnlyConfig.__post_init__N)r`   r   r   r  r   r  r   r  r  r7  r=  r
  r  r  rd   rd   rd   re   r    s   
 r  c                 C   s6   |j dksJ d|j  |j}tj| |t d}|S )Nr   r\  )float8_dtyper[  )r
  r  r*   r,  rF   )r|   r   r  r1  rd   rd   re    _float8_weight_only_quant_tensor+  s   r  c                C   s|   |j r	tjj  t| |sJ dt| trt| } t	t
| ||}t| |tjj|dd ttt| j|d| | _| S )Nzsapplying float8 weight only quant requires module to have {parameter_name} attribute but {module} does not have oneFr   rb  )r7  rQ  rR  rS  rT  r   r~   r   _unwrap_float8_linearr  r^   rr   r   r   r   r   r   r   r   r   rc  rd   rd   re   _float8_weight_only_transform4  s0   

r  r  rE  scale
zero_pointc                 C   s   |du sJ dt |tr| jtjksJ dt| j|}|du r0t| ||tjt	ddd}|S t |t
s9J dt| |||t	ddd}|S )zThis function is used to quantize the input activation tensor for an aqt_float variant. If scale
    is not provided it will be dynamically calculate the scales otherwise it will use the provided scale.
    Nz8Zero point is not supported for dynamic FP8 quantizationzFPerRow quantization only works for bfloat16 precision input activationri  )input_floatr/  r   r   rl  z7Static quantization only supports PerTensor granularity)r  r/  r  r   rl  )r~   rF   r  r   rM  r>   r   r   r   r   rG   r   )r   r  rE  r  r  r/  
activationrd   rd   re    _input_activation_quant_func_fp8Z  s8   


r  c                   @   s   e Zd ZU dZeZejed< eZ	ejed< dZ
eeeee f  ed< ejZee ed< dZee ed< dZee ed< dZee ed	< ejZeed
< dZeed< dZeed< dd ZdS ))Float8DynamicActivationFloat8WeightConfiga  
    Configuration for applying float8 dynamic symmetric quantization to both activations and weights of linear layers.

    Args:
        activation_dtype (torch.dtype): The target data type for activation quantization. Default is torch.float8_e4m3fn.
        weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
        granularity (Optional[Union[FP8Granularity, List[FP8Granularity]]]):
            The granularity for quantization. Can be either a single granularity (applied to both
            activations and weights) or a tuple of two granularities (one for activations, one for weights).
            If None, defaults to PerTensor for both. Currently both quantizations need to be the same type. And
            only PerTensor and PerRow are supported.
        mm_config (Float8MMConfig): Configuration for the matrix multiplication. Default uses fast accumulation.
        activation_value_lb (Optional[float]): the lower bound for activation value for calculating scale
        activation_value_ub (Optional[float]): the upper bound for activation value for calculating scale
        kernel_preference (KernelPreference): kernel preference for ops like matmul, grouped matmul etc. by defalut (KernelPreference.AUTO) it will be chosen for user based on hardware or other information, this only needs to be set in weight
        set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
        version (int): the version of the config, version 1 is deprecated, version 2 is using Float8Tensor (default)

    Example:

    .. literalinclude:: ../../examples/inference/float8_dynamic_activation_float8_weight.py
       :language: python
    rE  r  Nr[  packing_formatrj  activation_value_lbactivation_value_ubkernel_preferenceTr7  r   r
  c                 C   s   t jd t| j\}}||g| _d}t| jr1| jtjtj	fv s&J d| j
dks/J dd}t j r8d}| jd u rEt|d| _d S d S )Nz>torchao.quantization.Float8DynamicActivationFloat8WeightConfigTunimplementedr   F)use_fast_accum)r   r   r   r!   r[  r    r  r(   AUTOTORCHr
  xpuis_availablerj  r   )r   r  r  default_use_fast_accumrd   rd   re   r    s,   



z7Float8DynamicActivationFloat8WeightConfig.__post_init__)r`   r   r   r  r   rE  r   r  r  r  r[  r	   r   r   r   r)   r>  r  rj  r   r  floatr  r(   r  r  r7  r=  r
  r  r  rd   rd   rd   re   r    s   
 r  c                 C   sL  |j }|j}|j}|j}|j}|j}|j}|j}	t| |\}
}| 	 dv rIt
|
tr0t
|ts4J d| jd d dksF| jd d dkrH| S nt| sO| S |jdks\J d|j |	tjkrpt
|trp| jtjkspJ dt||
|||d	}|	tjkrtj| |||||d
}|S |	tjkrt
|tsJ dtj| |||d}|S d S )N)      zH4D/5D tensor only supports per tensor activation and weight quantizationr   rt  rB   r   r\  zBPerRow quantization only works for bfloat16 precision input weight)hp_value_lbhp_value_ubr  )r  r[  rj  r  rx  z8Sparse packing format only supports per-row quantization)r  r[  rx  )rE  r  r[  rj  r  r  r  r  r   r&  r~   rG   r   r;   r
  r)   r>  rF   r  r   rM  r6   r*   r,  SPARSE_CUTLASSr8   )r|   r   rE  r  r[  rj  r  r  r  r  r  r  rx  r}  rd   rd   re   8_float8_dynamic_activation_float8_weight_quantize_tensor  sr   $

r  c                C   s   t j rt st sJ d|jrtjj	  t
| |s+J d| dd|  d t| tr4t| } tt| ||}t| |t jj|dd ttt| j|d| | _| S )	NzPFloat8 dynamic activation quantization is only supported on CUDA>=8.9 and MI300+zKapplying float8 dynamic activation quant requires module to have parameter z
 attributez but z does not have oneFr   rb  )r   cudar  r@   r?   r7  rQ  rR  rS  rT  r   r~   r   r  r  r^   rr   r   r   r   r   r   r   r   rc  rd   rd   re   2_float8_dynamic_activation_float8_weight_transform	  s<   




r  c                   @   sD   e Zd ZU dZe Zeed< eZ	e
jed< eZe
jed< dd ZdS )rX   a  
    Applies float8 dynamic quantization to activations and float8 quantization followed by compression to sparse semi-structured tensor to weights of linear layers.

    Args:
        `layout`: layout type for quantized weight tensor, only supports `CutlassSemiSparseLayout` at the moment.
        `activation_dtype`: data type for quantized activation tensor.
        `weight_dtype`: data type for quantized weight tensor.
    rq  rE  r  c                 C   r:  )NzHtorchao.quantization.Float8DynamicActivationFloat8SemiSparseWeightConfigr;  r   rd   rd   re   r  ?  rs  zAFloat8DynamicActivationFloat8SemiSparseWeightConfig.__post_init__N)r`   r   r   r  r   rq  r   r  r   rE  r   r  r   r  r  rd   rd   rd   re   rX   0  s   
 	rX   c                 C   s   t d t sJ dt| trt| } | j}|j}|j}|j	}t|t
s.td| dt||}t|td|id}tjj|dd| _tt| | _| S )	NaW  Config Deprecation: Float8DynamicActivationFloat8SemiSparseWeightConfig is deprecated and will no longer be supported in a future release. Please use Float8DynamicActivationFloat8WeightConfig with packing_format=Float8PackingFormat.SPARSE_CUTLASS and granularity=PerRow() instead. See https://github.com/pytorch/ao/issues/3594 for more detailsz2Float8 quantization is only supported on CUDA>=9.0z;Only CutlassSemiSparseLayout layout is supported. Received rm   r   )quant_kwargsFr   )r]  r^  rA   r~   r   r  r|   r  rE  rq  r   NotImplementedErrorro  rI   rn  r   r   r   r   r   r<   r   )rc   r   r|   r  rE  rq  rd   rd   re   >_float8_dynamic_activation_float8_semi_sparse_weight_transformE  s.   



r  intx_unpacked_tensor	hp_tensorr   c              	   C   sN   t | tsJ | j|| _t| j \}}t|| j| j| jt	j
||d| _dS )ad  
    Adjusts the scale_dtype on IntxUnpackedToInt8Tensor.
    Updating the scale dtype requires updating the qdata because qdata is calculated after the scale.
    This is used in IntxWeightOnlyConfig and Int8DynamicActivationIntxWeightConfig to make
    version=2 and version=1 numerically equivalent when the scale_dtype differs from the input dtype
    )output_dtyper   r   N)r~   r5   r  rn   rM   r   rP   r/  r  r   r   qdata)r  r  r   qminqmaxrd   rd   re   r-  i  s   r-  c                   @   s   e Zd ZU dZejZejed< e	dZ
eed< ejZeed< dZeej ed< ejZeed< ejZeed	< d
Zeed< dd ZdS )IntxWeightOnlyConfigaE  
    Configuration for quantizing weights to torch.intx, with 1 <= x <= 8.
    Weights are quantized with scales/zeros in a groupwise or channelwise
    manner using the number of bits specified by weight_dtype.
    args:
        `weight_dtype`: The dtype to use for weight quantization.  Must be torch.intx, where 1 <= x <= 8.
        `granularity`: The granularity to use for weight quantization.  Must be PerGroup or PerAxis(0).
        `mapping_type`: The type of mapping to use for the weight quantization.
            Must be one of MappingType.ASYMMETRIC or MappingType.SYMMETRIC.
        `scale_dtype`: The dtype to use for the weight scale.
        `intx_packing_format`: The format to use for the packed weight tensor (version 2 only).
        `intx_choose_qparams_algorithm`: The algorithm to use for choosing the quantization parameters.
        `version`: version of the config to use, only subset of above args are valid based on version, see note for more details.

    Example:

    .. literalinclude:: ../../examples/inference/intx_weight_only.py
       :language: python
    r  r   r[  r   Nr   r  r	  r   r
  c                 C   s   t jd | jdd tddD v sJ d| j t| jttfs+J d| j t| jtr@| jj	dks@J d	| jj	 | j
tjtjtjfv sSJ d
| j
 d S )Nz)torchao.quantization.IntxWeightOnlyConfigc                 S   r  r  r  r  rd   rd   re   r    r  z6IntxWeightOnlyConfig.__post_init__.<locals>.<listcomp>rB   r  r  z1granularity must be PerAxis or PerGroup, but got r   r"  zvmapping_type must be MappingType.ASYMMETRIC, MappingType.SYMMETRIC, or MappingType.SYMMETRIC_NO_CLIPPING_ERR, but got )r   r   r   r  r  r~   r[  rD   rE   r  r   rN   r   r   r  r   rd   rd   re   r    s$   


z"IntxWeightOnlyConfig.__post_init__)r`   r   r   r  r   r   r  r  r  rD   r[  rC   rN   r   r   r   r	   r4   r  r  r2   r  r	  r
  r  r  rd   rd   rd   re   r    s   
 
r  c             	   C   s^  |j }|j}|j}|j}|j}|j}	|  dkrd}
n|  dkr$d}
n	td|   t|t	r6|j
}nt|trN|jdksHJ d|j | j|
 }ntd| |  dkr`d|f}n|  dkshJ d|ddf}|jdksuJ |jtjkr|d ur|jtjkr|tj}tj| ||||||	d	}|d ur|| jkrt|| | |S td
| )Nr   rj   r  rB   z>IntxWeightOnlyConfig only works for 2-d and 4-d Tensors, got: r   r"  z-granularity must be PerGroup or PerAxis, got )r   r   r!  r	  r#  )r  r[  r   r   r  r	  r&  r   r~   rE   r'  rD   r  r   r
  r4   r  r  r   r   rn   r   r5   r,  r-  )r|   r   r   r!  r  r[  r   r   r  r	  	input_dimr'  r/  r1  rd   rd   re   !_intx_weight_only_quantize_tensor  sT   



	r  c                C   sr   t | ds	J dt| j|||d}tjj|dd| _t| tjr*t	t
| | _| S t| tjr7t	t| | _| S )Nr|   zgapplying intx weight only quant requires module to have weight attribute but {module} does not have oner  Fr   )r   r  r|   r   r   r   r~   r   r   r   r<   r   	Embeddingr   )rc   r   r   r!  r1  rd   rd   re   _intx_weight_only_transform  s    r  c                   @   sj   e Zd ZU dZeedZeee	e
 f ed< eedZeee	e
 f ed< dZeed< dd Zd	d
 ZdS )r   a  Configuration class for applying different quantization configs to modules or parameters based on their fully qualified names (FQNs).

    Args:
        `fqn_to_config`: typing.OrderedDict[str, Optional[AOBaseConfig]]: an
         ordered dictionary from
             (1). fully qualified name (fqn) of module or parameter
             (2). regex of fully qualified name (in python `re` module regex format), should
                  start with prefix "re:" or
             (3). "_default"
         to the config that we want to apply to the module/param or None

         Config key ordered by precedence:
           * fully qualified parameter name, e.g. `language.layers.0.q_proj.weight`
           * fully qualified module name, e.g. `language.layers.0.q_proj`
           * regex for parameter names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj.weight`.
             The first regex that matches will be applied.
           * regex for module names, must start with `re:`, e.g. `re:language\.layers\..+\.q_proj`,
             whichever regex fully matches the module fqn first will be applied
             (order of keys for dictionary are kept consistent since we are using OrderedDict)
           * "_default", fallback if no match for all previous keys
             (Note, when using `_default`, the config is applied to all modules, to apply
              it to only a subset of modules, e.g. with some types, it's better to filter
              the modules that we don't want to quantize before hand and configure them to
              None, e.g. `{"re:.+norm.+": None, "_default": linear_config}`) "_default" is not supported when filter_fn is not specified.
        `module_fqn_to_config`: typing.OrderedDict[str, Optional[AOBaseConfig]]: To maintain BC with ModuleFqnToConfig, to be deprecated later
        `version`: int: Version of config to use.

    Note:
        - The order of patterns in the OrderedDict may matter as only the first matching pattern is applied
        - "_default" is ignored for parameter replacement.
    )default_factoryr   module_fqn_to_configrB   r
  c                 C   s   t jd t| jdkrt| jdkr| j| jkrtdt| jdkr0t| jdkr0| j| _t| jdkrBt| jdkrB| j| _d| jv rNtd d S d S )Nz torchao.quantization.FqnToConfigr   zP`fqn_to_config` and `module_fqn_to_config` are both specified and are not equal!r   zConfig Deprecation: _default is deprecated and will no longer be supported in a future release. Please see https://github.com/pytorch/ao/issues/3229 for more details.)	r   r   r   r   r   r  r   r]  r^  r   rd   rd   re   r  <  s    
zFqnToConfig.__post_init__c                 C   s$   d dgdd | j D dS )N
zFqnToConfig({c                 s   s&    | ]\}}d | d| dV  qdS )z  'z':
    ,Nrd   )r  keyvaluerd   rd   re   	<genexpr>Y  s
    
z&FqnToConfig.__str__.<locals>.<genexpr>z}))r   r   r   r   rd   rd   re   __str__U  s   zFqnToConfig.__str__N)r`   r   r   r  r   r   r   OrderedDictTypestrr	   r   r  r  r
  r  r  r  rd   rd   rd   re   r     s   
  r   fqnc                 C   s6  d}g }t t|  D ]$\}\}}|t| v r0t|dkr%| d| n|}|||||f qt|D ]9\}}}}||jv rnd}|j| }	|	du rQ|| q5tt	|	 }
t	|	t
v re|
| |	|d} q5tt	|	 dq5|s||jv r|j| }	|	durtt	|	 }
|
| |	S | S |D ]C\}}}}|jD ]9}|drt|d	d |rd}|j| }	|	durtt	|	 }
t	|	t
v r|
| |	|d} qtt	|	 dqq|s|jD ]'}|drt|d	d |r|j| }	|	durtt	|	 }
|
| |	  S q|s|jd
d}	|	durtt	|	 }
|
| |	S | S )aw  This function expects a module that either is specified in FqnToConfig or has a parameter that is specified in FqnToConfig.

    Args:
        module (torch.nn.Module): The module to be processed.
        fqn (str): The fully qualified name of the module containing the parameters.
        config (FqnToConfig): Configuration object containing regex patterns / fqn mapped
            to quantization configurations.

    Returns:
        torch.nn.Module: The modified module with quantized parameters.

    Raises:
        NotImplementedError: If the quantization configuration is not yet supported for parameter quantization.
    Fr   rm   TNra  zs does not yet support parameter quantization! Please see https://github.com/pytorch/ao/issues/3252 for more detailsre:r   r   )	enumeratero   named_parametersdirr   r   r   r   r9   r   +CUSTOM_PARAM_QUANTIZATION_SUPPORTED_CONFIGSr  
startswithre	fullmatchget)rc   r  r   parameter_config_foundtop_level_paramsir   paramparameter_fqncr   patternrd   rd   re   r   o  sn   









r   c                 C   s\   | |j v r| drJ d|  ddS |j D ]}|dr+t|dd | r+ dS qdS )ae  Check if a given fqn matches the exact fqn or regex pattern specified in FqnToConfig.

    Args:
        fqn (str): The fully qualified name of the module.
        config (FqnToConfig): Configuration object containing regex patterns or raw FQNs for quantization.

    Returns:
        bool: True if the fqn is specified in FqnToConfig. False otherwise.
    r  zError: Exact match but regex z specified.Tr   NF)r   r  r  r  )r  r   !maybe_module_or_param_fqn_patternrd   rd   re   r     s   


r   c                 C   sP   |   D ]!\}}|t| v r%t|dkr| d| n|}t||r% dS qdS )a  Check if a given module contains top-level parameters that match the exact fqn or regex pattern specified in FqnToConfig.

    Args:
        module (nn.Module): The module to be checked.
        fqn (str): The fully qualified name of the module.
        config (FqnToConfig): Configuration object containing regex patterns or raw FQNs for quantization.

    Returns:
        bool: True if the module contains top-level parameters that match the fqn or regex pattern specified in FqnTo
    r   rm   TF)r  r  r   r   )rc   r  r   ra   r  r  rd   rd   re   r     s   
r   c                 C   sN   t d t| j| j}W d   n1 sw   Y  | j|_| j|_|S )aY  
    Unwrap a torchao Float8Linear by returning a nn.Linear with the same weights and bias.

    Torchao inference quantization techniques are generally only applicable to nn.Linear
    layers, so this helper is useful for unwrapping models trained with torchao float8 training,
    which replaces nn.Linear layers with Float8Linear layers.
    metaN)r   rl   r   r   rz  out_featuresr|   r   )rc   
new_modulerd   rd   re   r    s   r  )rg   Nrd   )Fr   )NN)r  loggingr  r   r]  collectionsr   dataclassesr   r   	functoolsr   typingr   r   r   r	   r
   r   r  r   torch.nnr   torch.nn.utils.parametrizerS  r   rQ  torchao.core.configr   torchao.dtypesr   r   r   r   r   r   r   r   r   r   r   Mtorchao.dtypes.uintx.packed_linear_int8_dynamic_activation_intx_weight_layoutr   torchao.dtypes.utilsr   torchao.float8.configr   r   torchao.float8.float8_linearr   torchao.float8.inferencer   r   r   r    r!   (torchao.prototype.quantization.quant_apir"   r#   r$   r%   =torchao.quantization.linear_activation_weight_observed_tensorr&   torchao.quantization.observerr'   %torchao.quantization.quantize_.commonr(   (torchao.quantization.quantize_.workflowsr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   %torchao.quantization.transform_moduler9   r:   torchao.quantization.utilsr;   r<   r=   r>   torchao.utilsr?   r@   rA   r[  rC   rD   rE   rF   rG   "linear_activation_quantized_tensorrH   rI   linear_quant_modulesrJ   rK   qatrL   quant_primitivesrM   rN   rO   rP   unifiedrQ   rR   rS   	getLoggerr`   rF  __all__rZ   rf   FLOATINTLAYOUT_TO_ZERO_POINT_DOMAINLAYOUT_TO_PRESERVE_ZEROSrq   r   rV   rT   r   r  r=  r   r   r   r   DevicerW   r  r   r   r   r  r3  r4  r5  rO  rU  rV  rY  rZ  r`  re  rg  rh  r  rn  ro  rp  r  r  r  r  r  r  r  r  r  r  r  r  rX   r  r-  r  r  r  r   rY   r  r   r   r   r   r  serializationadd_safe_globalsrd   rd   rd   re   <module>   sZ   4
H

.

"%
Q

KL
J"9 



#I+0	)
)=H&#
8
>Q
\

