o
    i)_                     @   s  U d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZ d dlmZmZ ddlmZmZmZmZmZm Z m!Z! dd	l"m#Z# g d
Z$e %e&Z'dd Z(da)ee* e+d< dd Z,dd Z-i Z.eee* eee* eee* e/f f f e+d< G dd deZ0G dd dZ1dRddZ2dej3dee/ fddZ4dd Z5dd  Z6ej7fd!d"Z8d#d$ Z9ej7fd%d&Z:ej7fd'd(Z;d)d*ej<ej=d+dfd,d-Z>ej<fd.d/Z?d0d1 Z@d)d*ej=fd2d3ZAd)d*ej=fd4d5ZBd)d*ej<ej=d+fd6d7ZC	)	*dSd8d9ZDd)d*ej7ejEdfd:d;ZFd)d*ej7ejEfd<d=ZGej7ej7dfd>ej3d?ejHd@ejHdAeeI dej3f
dBdCZJdDdE ZKdFee/dGf dHedee/dGf fdIdJZLdKej3fdLdMZMdNdO ZNdKej3deOfdPdQZPdS )T    N)DictListOptionalTuple)TorchDispatchMode)int_scaled_matmul)MappingTypeZeroPointDomain)_choose_qparams_affine_dont_preserve_zero_choose_qparams_affine_tinygemm _dequantize_affine_no_zero_point_dequantize_affine_tinygemm_quantize_affine_no_zero_point_quantize_affine_tinygemmchoose_qparams_affinedequantize_affinequantize_affine)check_cpu_versioncheck_xpu_version   )GranularityPerAxisPerBlockPerGroupPerRow	PerTensorPerToken)LinearActivationQuantizedTensor)compute_error%_quantize_activation_per_token_absmax$_quant_int8_dynamic_per_token_linear dynamically_quantize_per_channeldequantize_per_tensordequantize_per_channelget_groupwise_affine_qparamspack_tinygemm_scales_and_zeros unpack_tinygemm_scales_and_zeros-groupwise_affine_quantize_tensor_from_qparams/groupwise_affine_dequantize_tensor_from_qparams groupwise_affine_quantize_tensor"groupwise_affine_dequantize_tensorper_token_dynamic_quantget_group_qparams_symmetric"recommended_inductor_config_setterc                 C   s.   t j| }t j| | }dt ||  S )N   )torchlinalgnormlog10)xyPsPn r7   N/home/ubuntu/.local/lib/python3.10/site-packages/torchao/quantization/utils.pyr   E   s   r   _cur_fqnc                    s    fdd}|S )Nc                    s    a d S N)r9   )moduleinputfqnr7   r8   forward_hookQ   s   z'_get_logging_hook.<locals>.forward_hookr7   )r>   r?   r7   r=   r8   _get_logging_hookP   s   r@   c                 C   s$   |   D ]\}}|t| qd S r:   )named_modulesregister_forward_pre_hookr@   )modelnamemodr7   r7   r8   _apply_logging_hookX   s   rF   _fqn_to_op_to_shape_to_countc                   @   s   e Zd ZdddZdS )LoggingTensorModer7   Nc           	      C   s   |d u ri }||i |}|j  d|j }d}|D ]}t|tjr-|tt|jd 7 }q|dkr8|d d }tt	vr@i t	t< |t	t vrLi t	t |< |t	t | vr\dt	t | |< t	t | |  d7  < |S )N. z, r   r   )

__module____name__
isinstancer/   Tensorstrlistshaper9   rG   )	selffunctypesargskwargsrsop_name	shape_strargr7   r7   r8   __torch_dispatch__d   s&   z$LoggingTensorMode.__torch_dispatch__)r7   N)rM   rL   __qualname__r\   r7   r7   r7   r8   rH   c   s    rH   c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )_MultiInputc                 C   s   t || _d S r:   )rQ   values)rS   inputsr7   r7   r8   __init__}      z_MultiInput.__init__c                 C   s   | j | | S r:   )r_   append)rS   r<   r7   r7   r8   	add_input   s   z_MultiInput.add_inputc                 C   s   t | j| S r:   )r^   r_   )rS   slicer7   r7   r8   __getitem__   rb   z_MultiInput.__getitem__c                 C      dd | j D | _ d S )Nc                 S   $   g | ]}t |tjr| n|qS r7   )rN   r/   rO   cuda.0valr7   r7   r8   
<listcomp>       z$_MultiInput.cuda.<locals>.<listcomp>r_   rS   r7   r7   r8   ri         z_MultiInput.cudac                 C   rg   )Nc                 S   rh   r7   )rN   r/   rO   xpurj   r7   r7   r8   rm      rn   z#_MultiInput.xpu.<locals>.<listcomp>ro   rp   r7   r7   r8   rr      rq   z_MultiInput.xpuN)rM   rL   r]   ra   rd   rf   ri   rr   r7   r7   r7   r8   r^   |   s    r^   c                 C   sl   |d ur| j |krtd| d| d| j  d|d ur2|  |kr4td| d| d|   dd S d S )NzExpected Tensor argument z to have dtype z
, but got z	 instead.z to have size )dtype
ValueErrorsize)
tensor_argarg_namers   ru   r7   r7   r8   _guard_dtype_size   s   rx   r3   returnc                 C   s:   g }t t| jd D ]}|d q|| jd  |S )Nr   )rangelenrR   rc   )r3   
block_size_r7   r7   r8   _get_per_token_block_size   s
   r   c              
   C   s   t j}t| j}tt|d D ]}d||< qtj}d}d}d}| jtj	kr)tj
nd }t| |||||||d\}	}
t| ||	|
|||}||	fS )Nr   gh㈵>i   )scale_dtype)r   	SYMMETRICrQ   rR   r{   r|   r/   int8rs   float16float32r   r   )tmapping_typer}   irs   eps	quant_min	quant_maxr   scale
zero_point	quantizedr7   r7   r8   r      s.   


r   c                 C   s0   t | \}}t|||||}|dur|| }|S )ze
    like F.linear, but with int8 dynamic quantization of activation,
    and a quantized weight
    N)r   _quant_int8_per_token_matmul)r3   w_vals_int8_tw_scalesbias	out_dtypex_vals_int8x_scalesmm_outr7   r7   r8   r       s   
r    c                 C   s   | j tjksJ d| j  d|j tjksJ d|j  d|j tjtjfv s/J d|j  | d| jd }t|||dd}|| jg | jdd |jd R  }||}|S )a  
    Quantized matmul of int8 operands that accumulates to int32 and returns
    output_dtype. For now, this is written for approximate numerical
    Assumes that activation and weight quantization are symmetric,
    i.e. act_zp and w_zp is 0.
    Assumes that weight quantization is per-channel.

    see
    https://github.com/google/gemmlowp/blob/master/doc/quantization.md
    for an overview of quantized matmul compute

    in scalar form, assuming output_dtype is fp32 and zw == 0:

      Y_i_j_fp32 = sx * sw dot(X_i, W_j)
    zx dtype z not yet supportedzw dtype z?x_scales needs to be a torch.float32 or torch.bfloat16 but got rz   r   N)	rs   r/   r   floatbfloat16reshaperR   r   to)r   r   r   r   output_dtypetmpy_dot_scaledr4   r7   r7   r8   r      s*   


r   c              
   C   sv   |   dks
J dttjj}d| jd f}tj}tj}t	| |||||||d\}}	t
| |||	|||}
|
||	fS )z
    assumes symmetric quantization
    assumes axis == 0
    assumes dense memory format
    TODO(future): relax ^ as needed
       only support 2d Tensorsr   )target_dtyper   r   r   zero_point_dtype)dimr/   finfor   r   rR   int64r   r   r   r   )r3   r   r   r   r   r}   r   r   r   r   quantr7   r7   r8   r!     s&   


r!   c                 C   sB   | j }| j}| dksJ d|  t| |||||d}|S )Nr   zscale size: r   )rR   rs   numelr   )int_reprr   r   r   r}   input_dtypedequantizedr7   r7   r8   r"   9  s   r"   c                 C   sP   |   dks
J d|  } | jd df}| j}t| |||||d}| }|S )Nr   r   r   r   r   )r   r   rR   rs   r   )r   scaleszero_pointsr   r}   r   r   r7   r7   r8   r#   D  s   r#         Fc                 C   sV  || j d kr| j d }|dksJ | j d | dksJ |  dks%J |dks0J d| tj}tj}d|f}	d}
d| d }|d u rHd}|}|tjkrQ|ntj}|tjkrk|skt	| ||	||
||||d	\}}n&|tjkr|st
| ||	||
||||d	\}}nt| ||	||
||||d	\}}|j|d	| j d d|j|d	| j d dfS )
Nrz   r   r   r      z-only n_bit smaller than 8 is supported, got: gư>)r   r   rs   )rR   r   r   
ASYMMETRICr/   int32r	   INTFLOATr   r
   r   r   r   )wn_bit	groupsizers   zero_point_domainpreserve_zeror   r   r   r}   r   r   r   r   r   r   r7   r7   r8   r$   U  sp   	

r$   c                 C   sR   t | d|| d t |d|d |  }t| d|dg|dd S )Nr   )rs   ru   zerosr   rz   rK   )rx   ru   r   r/   cat	unsqueeze	transpose
contiguous)r   r   rs   r   r7   r7   r8   r%     s   r%   c                 C   s(   | j d dks	J t| ddddS )Nrz   r   r   rK   r   )rR   r/   splitr   )scales_and_zerosr7   r7   r8   r&     s   r&   c                 C   sh  |dksJ || j d kr|j d dkr| j d }| j d | dks$J |  dks,J d|f}tj}d}d| d }	|tjkrCt}
n|tjkrKt}
nttj	krSt
}
ntd| |
| ||||||	}| j d dkrt|jst|js|d d d d df d> |d d dd df B tj}t|jr|d d dd df d> |d d d d df B tj}|S )Nr   rz   r   r   z Unrecognized zero point domain: r   )rR   r   r/   r   r	   r   r   r   r   NONEr   rt   r   devicer   r   uint8)r   r   r   r   r   r   r}   r   r   r   _quantize_affineint_datar7   r7   r8   r'     sB   



	
8
8r'   c              
   C   s  |dksJ |   dksJ | jtjks| jd dkrut| jsu| tj}|d? }|d@ }tj	| jd | jd d ftj| jd}	t
| js^||	d d d d df< ||	d d dd df< n||	d d d d df< ||	d d dd df< n| }	||	jd kr|jd dkr|	jd }|	jd | dksJ d|f}
tj}d}d| d }|tjkrt}n
|tjkrt}nt}||	|
||||||jdS )	Nr   r   rz   r      r   )rs   r   r   )r   rs   r/   r   rR   r   r   r   r   r   r   r	   r   r   r   r   r   )w_int4x8r   r   r   r   r   data	high_bitslow_bitsw_int32r}   r   r   r   _dequantize_affiner7   r7   r8   r(     sR   



r(   c           
      C   s@   t | |||||d\}}t| |||||d}t|||}	||	fS )N)r   r   )r   )r$   r'   r%   )
r   r   r   rs   r   r   r   r   r   r   r7   r7   r8   r)     s   
r)   c                 C   s   t |\}}t| ||||S r:   )r&   r(   )r   r   r   r   r   r   r7   r7   r8   r*   -  s   
r*   c                 C   s  || j d kr| j d }|dksJ | j d | dksJ |  dks%J |dks0J d| d|f}|d u r?t| jj}i }d|d< tddD ]}d|d   d|d  d f||< qJ|| \}	}
t| ||tj|	|
|||d		\}}|	| j d d|	| j d dfS )
Nrz   r   r   r   r   zunsupported n_bit: )rz   r   	   )r   r   r   r   r   r   )
rR   r   r/   r   rs   r   r{   r   r   r   )r   r   r   	precisionr   r   r}   rangesr   r   r   r   r   r7   r7   r8   r,   :  s4   	
$
$r,   c                 C   sb   t | ||||\}}d}d|d  d }d|d   }ddlm}	 |	| ||||tj|}
|
||fS )Nr   r   r   r   )8_quantized_decomposed_quantize_per_channel_group_wrapper)r,   torchao._executorch_opsr   r/   r   )r   r   
group_sizer   r   r   r   max_intmin_intr   w_int8r7   r7   r8   group_quantize_tensor_symmetrica  s   

r   r<   r   r   r   c                 C   sp   t j}t| }d}d}tj}| j}	t| ||||||||d	\}
}t| ||
||||}t|||
|||||	d}|S )Nir   )r   r   r   r   )	r   r   r   r/   r   rs   r   r   r   )r<   r   r   r   r   r}   r   r   quant_dtyper   r   r   qdqr7   r7   r8   r+   {  sJ   
	
r+   c                   C   sB   dt jj_dt jj_dt jj_dt jj_dt jjj_t 	d dS )a  
    Set inductor config to use the following optimizations which have been showed to improve performance for quantized models:
        coordinate_descent_tuning = True
        coordinate_descent_check_all_directions = True
        force_fuse_int_mm_with_mul = True
        fx_graph_cache = True
        triton.unique_kernel_names = True
        torch.set_float32_matmul_precision("high")
    ThighN)
r/   	_inductorconfigcoordinate_descent_tuning'coordinate_descent_check_all_directionsforce_fuse_int_mm_with_mulfx_graph_cachetritonunique_kernel_namesset_float32_matmul_precisionr7   r7   r7   r8   r-     s   




r-   input_shape.granularityc                 C   s  t |tr| S t |trt| }d||j< t|S t |trx|j}t|t| k rGt|}t|t| k rC|	dd t|t| k s5t|}t|t| ksYJ d| d|  t
t|D ]}| | ||  dksuJ d|  d| q_|S t |trdt| d  | d f S t |trdgt|  }| |j ||j< t|S t |tr| d |j dksJ d	| d  d
|j dt| d  |jf S td| )zGet the block size based on the input shape and granularity type.
    Args:
        input_shape: The input tensor shape possibly more than 2 dimensions
        granularity: The granularity type of the quantization
    r   r   zBlock size z8 must have the same number of dimensions as input shape zNot all shapes in input shape z are divisible by block size )r   rz   zLast dimension of input z  is not divisible by group size zUnsupported Granularity: )rN   r   r   rQ   axistupler   r}   r|   insertr{   r   r   r   r   r   rt   )r   r   r}   block_size_listr   r7   r7   r8   get_block_size  sD   






r   weightc                 C   s   ddl m} t| |r| jj d|   dS t| tr-| jj d| j dt| j dS t	| dr>| jj d|   dS t
| tju sLt| tjjrSdt
|  S d	t
|  S )
Nr   )AffineQuantizedTensor()z(activation=	, weight=_quantization_typezTensor: znot recognized: )torchao.dtypesr   rN   	__class__rM   r   r   input_quant_funcoriginal_weight_tensorhasattrtyper/   rO   nn	Parameter)r   r   r7   r7   r8   r     s   

"
r   c                 C   s,   d| j jd  d| j jd  dt| j  S )Nzin_features=r   z, out_features=r   r   )r   rR   r   rp   r7   r7   r8   _linear_extra_repr  s   ,r   c              	   C   sr   |   dv sJ d|    d| jdd \}}|d dko$|d dk}|s7td| j d	| d
| d |S )z
    Check if a weight tensor meets float8 quantization requirements.

    Args:
        weight (torch.Tensor): The weight tensor to check

    Returns:
        bool: True if the tensor can be quantized to float8, False otherwise
    )r      z6float8 quantization only works for 2/3-D tensors, got zD tensorrK   N   r   z+Skipping float8 quantization: weight shape z: is not compatible with _scaled_mm. Both input dimension (z) and output dimension (z) must be multiples of 16. )r   rR   loggerinfo)r   out_dimin_dimis_compatibler7   r7   r8   _fp8_mm_compat  s   

r  )NN)r   r   )Qloggingtypingr   r   r   r   r/   torch.utils._python_dispatchr   torchao.kernelr   %torchao.quantization.quant_primitivesr   r	   r
   r   r   r   r   r   r   r   r   torchao.utilsr   r   r   r   r   r   r   r   r   r   "linear_activation_quantized_tensorr   __all__	getLoggerrM   r   r   r9   rP   __annotations__r@   rF   rG   intrH   r^   rx   rO   r   r   r    r   r   r!   r"   r#   r   r   r$   r%   r&   r'   r(   r)   r*   r   r,   r   rs   r   r+   r-   r   r   r   boolr  r7   r7   r7   r8   <module>   s   
4$	
"
!
< 
F
0
;


)

.


/