import logging
import sys
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union

import torch
import torch.nn as nn
from torch.ao.quantization.fx._decomposed import quantize_per_channel_group

from torchao.core.config import AOBaseConfig
from torchao.quantization.quant_primitives import (
    _choose_qparams_and_quantize_affine_hqq,
)
from torchao.quantization.transform_module import register_quantize_module_handler

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


class UIntxChooseQParamsAlgorithm(str, Enum):
    """Variant of quantization algorithm to calculate scale and zero_point for UIntx quantization."""

    # Simple min/max scaling over each quantization group.
    MIN_MAX = "min_max"
    # Half-Quadratic Quantization.
    HQQ = "hqq"


def _quantize(
    vals: torch.Tensor,
    group_size: int,
    nbit: int,
    has_weight_zeros: bool,
    signed: bool = True,
):
    assert nbit >= 1 and nbit <= 8
    if signed:
        qmin = -(1 << (nbit - 1))
        qmax = (1 << (nbit - 1)) - 1
    else:
        qmin = 0
        qmax = (1 << nbit) - 1

    # Compute per-group min/max over contiguous groups of size group_size.
    n, k = vals.shape
    vals = vals.reshape(-1, group_size)
    vmins, _ = torch.min(vals, axis=-1)
    vmaxs, _ = torch.max(vals, axis=-1)
    group_scales = (vmaxs - vmins) / (qmax - qmin)

    if not has_weight_zeros:
        group_zeros = torch.zeros_like(group_scales)
    else:
        group_zeros = qmin - torch.round(vmins / group_scales)

    vals = vals.reshape(n, k)
    group_scales = group_scales.reshape(n, -1)
    group_zeros = group_zeros.reshape(n, -1)

    group_qvals = quantize_per_channel_group(
        input=vals,
        scales=group_scales,
        zero_points=group_zeros,
        quant_min=qmin,
        quant_max=qmax,
        dtype=torch.int8 if signed else torch.uint8,
        group_size=group_size,
    )

    if not has_weight_zeros:
        group_zeros = None

    return group_qvals, group_scales, group_zeros
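

# Illustrative sketch (added for clarity, not part of the original module): how
# `_quantize` groups a [n, k] weight matrix. The row length k is assumed to be a
# multiple of group_size.
#
#   >>> w = torch.randn(8, 64)
#   >>> qvals, scales, zeros = _quantize(
#   ...     w, group_size=32, nbit=4, has_weight_zeros=True, signed=False
#   ... )
#   >>> qvals.shape, qvals.dtype   # (torch.Size([8, 64]), torch.uint8)
#   >>> scales.shape, zeros.shape  # one (scale, zero) per 32-wide group: [8, 2] each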


class UIntxWeightOnlyQuantizedLinear(nn.Module):
    def __init__(
        self,
        pack_weight_op,
        linear_op,
        bias: Optional[torch.Tensor] = None,
    ):
        super().__init__()
        self._pack_weights_op = pack_weight_op
        self._linear_op = linear_op
        if bias is not None:
            self.bias = nn.Parameter(bias, requires_grad=False)
        else:
            self.register_parameter("bias", None)

    def quantize_and_pack_weights(
        self,
        weights,
        nbit,
        group_size,
        uintx_choose_qparams_algorithm: UIntxChooseQParamsAlgorithm = (
            UIntxChooseQParamsAlgorithm.MIN_MAX
        ),
    ):
        self.nbit = nbit
        self.group_size = group_size

        if uintx_choose_qparams_algorithm == UIntxChooseQParamsAlgorithm.HQQ:
            (
                weight_qvals,
                weight_scales,
                weight_zeros,
                _,
            ) = _choose_qparams_and_quantize_affine_hqq(
                weights,
                nbits=nbit,
                group_size=group_size,
                axis=1,
                compute_dtype=weights.dtype,
                device=weights.device,
                verbose=False,
                raw_output=True,
            )
            weight_qvals = weight_qvals.to(torch.uint8)
            # Convert integer zero-points into the float zero offsets expected
            # by the lowbit kernels.
            weight_zeros = -weight_zeros * weight_scales
        elif uintx_choose_qparams_algorithm == UIntxChooseQParamsAlgorithm.MIN_MAX:
            weight_qvals, weight_scales, weight_zeros = _quantize(
                weights, self.group_size, self.nbit, has_weight_zeros=True, signed=False
            )
            weight_zeros = -weight_zeros * weight_scales
        else:
            raise ValueError(
                f"Unsupported uintx_choose_qparams_algorithm: "
                f"{uintx_choose_qparams_algorithm}"
            )

        self.weight_scales = nn.Parameter(weight_scales, requires_grad=False)
        self.weight_zeros = nn.Parameter(weight_zeros, requires_grad=False)
        # Packing runs on CPU; move the packed buffer back to the weights' device.
        packed_weights = self._pack_weights_op(weight_qvals.cpu()).to(weights.device)
        self.packed_weights = nn.Parameter(packed_weights, requires_grad=False)

    def forward(self, x):
        assert x.dim() >= 2
        if x.dim() == 2:
            output = self._linear_op(
                x,
                self.packed_weights,
                self.group_size,
                self.weight_scales,
                self.weight_zeros,
            )
            if self.bias is not None:
                output = output + self.bias
            return output

        # Flatten leading dims, run the 2D kernel, then restore the lead shape.
        lead_shape = x.shape[:-1]
        k = x.shape[-1]
        n = self.weight_scales.shape[0]
        output = self._linear_op(
            x.reshape(-1, k),
            self.packed_weights,
            self.group_size,
            self.weight_scales,
            self.weight_zeros,
        ).reshape(*lead_shape, n)
        if self.bias is not None:
            output = output + self.bias
        return output


def _replace_linear_with_quantized_linear_mps(module: nn.Module, kwargs={}):
    group_size = kwargs["group_size"]
    nbit = kwargs["nbit"]

    assert not isinstance(module, nn.Linear)
    assert nbit >= 1 and nbit <= 7

    for name, child in module.named_children():
        if not isinstance(child, nn.Linear):
            _replace_linear_with_quantized_linear_mps(child, kwargs)
        else:
            if not child.weight.is_contiguous():
                raise ValueError(
                    f"UIntxWeightOnlyQuantizedLinear requires contiguous weights "
                    f"for layer '{name}'. Please call .contiguous() on the weight "
                    f"tensor before quantization."
                )
            qlinear = UIntxWeightOnlyQuantizedLinear(
                pack_weight_op=getattr(torch.ops.torchao, f"_pack_weight_{nbit}bit"),
                linear_op=getattr(
                    torch.ops.torchao, f"_linear_fp_act_{nbit}bit_weight"
                ),
                bias=child.bias,
            )
            setattr(module, name, qlinear)
            qlinear.quantize_and_pack_weights(child.weight, nbit, group_size)


class UIntxWeightOnlyLinearQuantizer:
    def __init__(
        self,
        device: Optional[str] = None,
        precision: Optional[torch.dtype] = None,
        bitwidth: Optional[int] = None,
        groupsize: Optional[int] = None,
    ):
        if device and device != "mps":
            raise NotImplementedError(
                "Only device=mps is currently supported in UIntxWeightOnlyLinearQuantizer"
            )
        self.device = device

        if precision and precision not in [
            torch.float32,
            torch.float16,
            torch.bfloat16,
        ]:
            raise ValueError(
                "Only precisions float32, float16 & bfloat16 are supported in UIntxWeightOnlyLinearQuantizer"
            )
        self.precision = precision

        if bitwidth is None:
            bitwidth = 4
            logger.warning(f"bitwidth not specified, defaulting to {bitwidth}.")
        if bitwidth not in range(1, 8):
            raise ValueError(
                "Only bitwidths 1 to 7 are supported in UIntxWeightOnlyLinearQuantizer"
            )
        self.bitwidth = bitwidth

        if groupsize is None:
            groupsize = 128
            logger.warning(f"groupsize not specified, defaulting to {groupsize}.")
        if groupsize not in [32, 64, 128, 256]:
            raise ValueError(
                "Only groupsizes 32, 64, 128 & 256 are supported in UIntxWeightOnlyLinearQuantizer"
            )
        self.groupsize = groupsize

    def quantize(self, model: nn.Module) -> nn.Module:
        if self.device:
            model = model.to(self.device)
        if self.precision:
            model = model.to(self.precision)
        _replace_linear_with_quantized_linear_mps(
            model,
            kwargs={
                "group_size": self.groupsize,
                "nbit": self.bitwidth,
            },
        )
        return model


@dataclass
class UIntxWeightOnlyConfig(AOBaseConfig):
    """
    Configuration for applying uintx weight-only asymmetric per-group quantization
    to linear layers for MPS devices.

    Args:
        bitwidth (int): Number of bits for quantization, must be between 1 and 7 inclusive.
            Default is 4.
        group_size (int): Group size for quantization. Must be one of [32, 64, 128, 256].
            Default is 128.
        uintx_choose_qparams_algorithm (Union[UIntxChooseQParamsAlgorithm, str]): Algorithm for
            choosing quantization parameters. Options:
            - "min_max" (default): Simple min-max scaling
            - "hqq": Half-Quadratic Quantization for better accuracy
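
    Example (illustrative sketch, not from the original docstring; assumes the
    torchao MPS lowbit kernels are built and `model` is an existing nn.Module):

        from torchao.quantization import quantize_

        quantize_(model, UIntxWeightOnlyConfig(bitwidth=4, group_size=128))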
    """

    bitwidth: int = 4
    group_size: int = 128
    uintx_choose_qparams_algorithm: Union[UIntxChooseQParamsAlgorithm, str] = (
        UIntxChooseQParamsAlgorithm.MIN_MAX
    )

    def __post_init__(self):
        if self.bitwidth not in range(1, 8):
            raise ValueError(
                f"bitwidth must be between 1 and 7 inclusive, got {self.bitwidth}"
            )
        if self.group_size not in [32, 64, 128, 256]:
            raise ValueError(
                f"group_size must be one of [32, 64, 128, 256], got {self.group_size}"
            )
        if isinstance(self.uintx_choose_qparams_algorithm, str):
            self.uintx_choose_qparams_algorithm = UIntxChooseQParamsAlgorithm(
                self.uintx_choose_qparams_algorithm
            )


@register_quantize_module_handler(UIntxWeightOnlyConfig)
def _uintx_weight_only_mps_transform(
    module: torch.nn.Module, config: UIntxWeightOnlyConfig
) -> torch.nn.Module:
    bitwidth = config.bitwidth
    group_size = config.group_size
    uintx_choose_qparams_algorithm = config.uintx_choose_qparams_algorithm

    if not module.weight.is_contiguous():
        raise ValueError(
            "UIntxWeightOnlyQuantizedLinear requires contiguous weights. "
            "Please call .contiguous() on the weight tensor before quantization."
        )

    qlinear = UIntxWeightOnlyQuantizedLinear(
        pack_weight_op=getattr(torch.ops.torchao, f"_pack_weight_{bitwidth}bit"),
        linear_op=getattr(torch.ops.torchao, f"_linear_fp_act_{bitwidth}bit_weight"),
        bias=module.bias,
    )
    qlinear.quantize_and_pack_weights(
        module.weight, bitwidth, group_size, uintx_choose_qparams_algorithm
    )
    return qlinear