o
    i                     @   sL  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZmZmZ d dlmZ d dlm Z  d dl!m"Z" zd dl#m$Z$ W n   e j%Z$Y e
j&'ddZ(e"e(Z)dgZ*dd Z+dd Z,dd Z-dd Z.dd Z/e)de
j0de1de
j0d e
j0d!e
j0d"e	e
j0 d#ee2 d$e
j0fd%d&Z3d'd( Z4d$e5fd)d*Z6d+efd,d-Z7d.d/ Z8d0d1 Z9d2d3 Z:d4d5 Z;d6d7 Z<d8d9 Z=d:d; Z>d<d= Z?d>d? Z@d@dA ZAdBdC ZBdDdE ZCdFdG ZDdHdI ZEdJdK ZFdLdM ZGdNe
jHjfdOdPZIeG dQdR dRZJdSed$efdTdZKdS )U    N)	dataclass)partial)AnyCallableListOptional)	out_dtype)quantized_decomposed_lib)GraphModule)InternalMatch)ReplacedPatternsreplace_pattern_with_filters)WrapperModule)"_get_aten_graph_module_for_pattern,_replace_literals_with_existing_placeholders'_replace_literals_with_new_placeholders"remove_tensor_overload_for_qdq_ops)MappingType)_get_per_token_block_size)_register_custom_op)$_disable_aten_to_metadata_assertionstorchaoFRAGMENT reference_representation_rewritec                 C   sf   t jj| ||||t j}t jj|||||	t j}t jjj|||
}t jj|||||t j}|S N)	torchopsquantized_decomposeddequantize_per_tensorint8atenlineardefaultquantize_per_tensor)x_i8x_scalex_zero_pointx_quant_minx_quant_max	weight_i8weight_scaleweight_zero_pointweight_quant_minweight_quant_max	bias_fp32	out_scaleout_zero_pointout_quant_minout_quant_maxx_fp32weight_fp32out_fp32out_i8 r7   n/home/ubuntu/.local/lib/python3.10/site-packages/torchao/quantization/pt2e/reference_representation_rewrite.py_qdq_quantized_linear,   s    r9   c                 C   s   t jj| ||} t jj|||	}| t j}|t j}tt jjjjt j	|| || d }|| }tt jjj
jt j	|
|}|| }tt jjjjt j	||| | | }t jj|||t j}|S r   )r   r   r    clamptoint16r   r!   r"   int32divTensormulr   )r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   x_i16
weight_i16acc_i32
bias_scalebias_i32r6   r7   r7   r8   _reference_quantized_linearO   s2   
	

	rF   c
                 C   s   t jj| |||t j\}
}t jj| |
|||t j}t jj||
|||t j} t jj|||||t j}t jjj	| ||	}|S r   )
r   r   r   choose_qparamsr   r#   r   r    r!   r"   )r3   r'   r(   x_epsr)   r*   r+   r,   r-   r.   r%   r&   r$   r4   r5   r7   r7   r8   _qdq_dynamic_quantized_linear   s&   rI   c
                 C   s   t jj| |||t j\}
}| |
 } t | } | jt jd}|| }t |||}|jt jd}t jj	|||}|t j
}|t j
}tt jj	jjt j|| || d }|
| }tt jj	jjt j|	|}|| }||
|  }|S Ndtype)r   r   r   rG   r   roundr;   r=   r:   r    r<   r   r!   r"   r>   r?   )r3   r'   r(   rH   r)   r*   r+   r,   r-   r.   r%   r&   x_i32r$   rA   rB   rC   rD   rE   r5   r7   r7   r8   #_reference_dynamic_quantized_linear   s0   

rO   c                 C   s   t j}t| }d}	d}
tjj| |j|tj|	|
|tj	tj
	\}}tjj| |||tj|	|
}tjj||||tj|	|
tj	} |dksFJ d|jd | dksSJ d| dks]J dd|f}tjj||||tjd	d
}tjjj| ||}|S )N   r   zGroup size must be positive   z&Weight must be divisible by group_size   zWeight must be 2D tensor   )r   
ASYMMETRICr   r   r   r   choose_qparams_affinenamer   float32r=   quantize_affinedequantize_affineshapedimr    r!   r"   )r3   rH   	weight_i4r*   r+   r.   
group_sizex_mapping_typeper_token_block_sizer'   r(   r%   r&   r$   
block_sizer4   r5   r7   r7   r8   ,_qdq_dynamic_quantized_linear_4bit_groupwise   sf   
	
rc   r3   rH   r^   r*   r+   r.   r_   returnc           #      C   s  |d }t j}t| }d}	d}
tjj| |j|tj|	|
|tj	tj
	\}}tjj| |||tj|	|
}|j\}}|| }|tjtj	}|j}|jd }|d|}|jd }||||}||||}|tj
}|tj
}tj||tj	| jd}t|}||d< |jdkr|d}t|D ][}|dd|ddf }|dd|ddf }|jdd}|dd|f }ttjjjjtj
||d}||tj	|dd  }|tj	dd|dd |dd } ||  }q|dd}!||! }"|dur|"| }"|"|S )	a  
    Reference implementation for dynamically quantized linear 4-bit groupwise operation.
    This implementation emulates actual numerics of on-device integer compute.

    Args:
        x_fp32: Input activation tensor in fp32
        x_eps: Epsilon for quantization parameter computation
        weight_i4: 4-bit quantized weight (stored as int8 with values in [-8, 7])
        weight_scale: Groupwise scales for weight dequantization
        weight_zero_point: Groupwise zero points for weight (unused for symmetric)
        bias_fp32: Optional bias tensor in fp32
        group_size: Size of each group for groupwise quantization

    Returns:
        Output tensor in fp32
    rR   rP   rQ   r   )rL   deviceN)r]   )r   rV   r   r   r   r   rW   rX   r   rY   r=   rZ   r\   r;   bfloat16viewzerosrf   listndim	unsqueezerangesumr   r    r!   r"   )#r3   rH   r^   r*   r+   r.   r_   r`   ra   r'   r(   r%   r&   r$   out_featuresin_features
num_groupsx_orig_shapek_dim
batch_sizex_i8_groupedweight_i4_groupedx_i32_groupedweight_i32_groupedacc_fp32	out_shape	group_idxx_groupweight_groupweight_group_col_sumweight_scale_group	group_accweights_col_sum_adjustedx_scale_multiplierr5   r7   r7   r8   _reference_dqlinear_int4  s   










r   c              
   C   s   t jj| |||||d|fS )z
    Reference implementation for dynamically quantized linear 4-bit groupwise operation.
    This function now delegates to the custom op implementation.
    rR   )r   r   r   reference_dqlinear_int4)r3   rH   r^   r*   r+   r.   r_   r7   r7   r8   2_reference_dynamic_quantized_linear_4bit_groupwise  s   r   c                 C   s   d}d}| j  D ]Q}t|tjjr5|jdkr5|jtjj	j
jkr5|j}t|dkr5|d dko4|d dk}t|tjjrZ|jdkrZ|jtjj	jjkrZ|j}t|dkrZ|d tjk}q	|o^|S )NFcall_functionrU      rT         )	nodes_mapvalues
isinstancer   fxNodeoptargetr   r   r[   r"   argslenrZ   r   )matchoriginal_graphpattern_graphweight_is_int4act_quant_is_int8noder   r7   r7   r8   6_filter_fn_for_dynamic_quantized_linear_4bit_groupwise  s&   

r   replacement_patternc                 C   s   ddl m}m} d}d}| j D ]\}}|jtjjj	j
kr"|} nqt| jdkr/| jd }|durJ|durL|j |_||d|jg|jd< dS dS dS )z
    Port metadata for dynamically quantized linear 4-bit groupwise operation.
    It custom_op node's metadata with corresponding linear node's metadata.
    r   )
NodeSourceNodeSourceActionNre   #ReplaceInt4DynamicQuantWithCustomOp	from_node)torch.fx.tracebackr   r   r   itemsr   r   r   r    r!   r"   r   replacementsmetacopyREPLACE)r   r   r   linear_nodeint4_custom_op_node_g_nr7   r7   r8   :_port_metadata_for_dynamic_quantized_linear_4bit_groupwise  s&   
r   c                 C   s   ddg}ddg}ddg}d}ddg}d}t jj| ||||t j}t jj|||||	t j}t jjj|||
||||||	}t jj|||||t j}|S NrR   r   F)	r   r   r   r   r   r    convolutionr"   r#   )r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   stridepaddingdilation
transposedoutput_paddinggroupsr3   r4   r5   r6   r7   r7   r8   _qdq_quantized_conv2d  s@   
r   c                 C   s  ddg}ddg}ddg}d}ddg}d}t jj| ||} t jj|||	}| t j}|t j}tt jjjjt j	|| || d ||||||}|| }tt jjj
jt j	|
|}|d}|d}|| }tt jjjjt j	||| | | }t jj|||t j}|S )NrR   r   Fre   )r   r   r    r:   r;   r<   r   r   r"   r=   r>   r?   rl   r@   r   )r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r   r   r   r   r   r   rA   rB   rC   rD   rE   r6   r7   r7   r8   _reference_quantized_conv2d  sN   




	r   c
                 C   sh   t jj| ||||	t j}
t jj|||||	t j}|
| }t jj|}t jj|||||	t j}|S r   )r   r   r   r   r   r    relur#   r$   r%   r&   y_i8y_scaley_zero_pointr/   r0   	quant_min	quant_maxr3   y_fp32r5   r6   r7   r7   r8   _qdq_quantized_add_reluk  s   r   c
                 C   s   |  tj}
| tj}ttjjjjtj|
| || }
ttjjjjtj|| || }|
| | }tjj|||	 tj	}|S )z
    See comments for `_reference_quantized_add` for more information on
    how to derive the formula for out_i8 based on x_i8 and y_i8
    )
r;   r   r=   r   r   r    r@   r?   r:   r   r$   r%   r&   r   r   r   r/   r0   r   r   rN   y_i32out_i32r6   r7   r7   r8   _reference_quantized_add_relu  s"   

r   c
                 C   sZ   t jj| ||||	t j}
t jj|||||	t j}|
| }t jj|||||	t j}|S r   )r   r   r   r   r   r#   r   r7   r7   r8   _qdq_quantized_add  s   r   c
                 C   s   |  tj}
| tj}t|| |
|   tj}
t|| ||   tj}|
| | }d}d}	tjj|||	 tj}|S )a  
        # How to Derive the formula for out_i8 based on x_i8 and y_i8
        # (since quantized add takes x_i8, y_i8 and their quantization parameters, and produce an out_i8)

        # out_i8 is quantized output, we can write down the formula for it first:
    out_i8 = out_f32 / out_scale + out_zero_point           (1)

        # then out_fp32 is computed from x_f32 + y_f32, and the x_fp32 and y_fp32 are the dequantized x_i8 and y_i8
        out_f32 = x_f32 + y_f32           (2)
        x_fp32 = (x_i8 - x_zero_point) * x_scale         (3)
        y_fp32 = (y_i8 - y_zero_point) * y_scale         (4)

        # applying the above fomula to the out_i8 equation we can get the following:
        out_i8 = out_fp32 / out_scale + out_zero_point             # (1)
           = (x_f32 + y_f32) / out_scale + out_zero_point      # applying (2) to substitute out_fp32 with x_fp32 + y_fp32
           = ((x_i8 - x_zero_point) * x_scale + (y_i8 - y_zero_point) * y_scale) / out_scale + out_zero_point  # apply (3) and (4)
    rP   rQ   )r;   r   r=   rM   r   r    r:   r   r   r7   r7   r8   _reference_quantized_add  s   r   c	                 C   sj   d}	d}
d}d}d}t jj| ||||t j}t jjj||	|
|||\}}t jj|||||t j}|S r   )	r   r   r   r   r   r    max_pool2d_with_indicesr"   r#   )r$   r%   r&   r'   r(   r/   r0   r1   r2   kernel_sizer   r   r   	ceil_moder3   r5   r   r6   r7   r7   r8   _qdq_quantized_max_pool2d  s   
r   c	                 C   s~   d}	d}
d}d}d}t | ||} | t j}t jjj|| |	|
|||\}}|||  | }t |||}|t j}|S r   )	r   r:   r;   r=   r   r    r   r"   r   )r$   r%   r&   r'   r(   r/   r0   r1   r2   r   r   r   r   r   rN   r   r   r5   r6   r7   r7   r8   _reference_quantized_max_pool2d  s   
r   c                 C      t jj| ||||t j}|S r   )r   r   r   r#   r   r3   scale
zero_pointr   r   xr7   r7   r8   _quantize_per_tensor_int8$     r   c                 C   sH   | | }t |}|jt jd}|| }t |||}|jt jd}|S rJ   )r   rM   r;   r=   r:   r   r   r7   r7   r8   #_reference_quantize_per_tensor_int8+  s   
r   c                 C   r   r   )r   r   r   r   r   )r$   r   r   r   r   r3   r7   r7   r8   _dequantize_per_tensor_int8;  r   r   c                 C   s0   t jj| ||} | t j| | jt jdS rJ   )r   r   r    r:   r;   rY   )r$   r   r   r   r   r7   r7   r8   %_reference_dequantize_per_tensor_int8B  s   r   c              	   C       t jj| |||||t j}|S r   )r   r   r   quantize_per_channelr   )r3   scaleszero_pointsch_axisr   r   r6   r7   r7   r8   _quantize_per_channel_int8O  s   r   c                 C   sP   t | |d} t jjt | | t j| ||}t ||d}|t jS Nre   )	r   	transposer   r    r:   rM   r;   r=   r   )r3   r   r   r   r   r   r   r7   r7   r8   $_reference_quantize_per_channel_int8X  s   r   c              	   C   r   r   )r   r   r   dequantize_per_channelr   )r$   r   r   r   r   r   r5   r7   r7   r8   _dequantize_per_channel_int8c  s   r   c                 C   sR   t jj| ||} t | |d} | t j}|| t j| }t ||d}|S r   )r   r   r    r:   r   r;   r=   float)r$   r   r   r   r   r   rN   r5   r7   r7   r8   &_reference_dequantize_per_channel_int8m  s   r   gmc                 C   s   t | dgdddddS )Nre      r   r   )rR   rP   rQ   )exclude_literalsliteral_to_ph_idx)r   )r   r7   r7   r8   '_replace_ph_qdq_per_channel_replacement{  s   r   c                   @   s   e Zd ZU dZeedf ed< eed< eed< dZe	ee
ge
f  ed< dZe	ee
ge
f  ed< dZe	eed	ejjejjgef   ed
< dZeed< dZe	edgdf  ed< dS )_RewriteInfozData needed for rewrite, this includes example inputs, pattern and replacement functions
    and post transformation functions for the exported pattern and replacement GraphModule
    .example_inputspatternreplacementNpattern_post_transreplacement_post_transr   	filter_fnFignore_literalsr   port_metadata_fn)__name__
__module____qualname____doc__tupler   __annotations__r   r   r   r
   r   r   rj   r   r   Graphboolr   r   r7   r7   r7   r8   r     s   
 r   modelc                 C   s   t jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdt jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdt jdt jdt jdt jdt jdt jdt jdgt jdt jdgt jdf}t jdt jdddt t j	j
t jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdt jdt jdf
}t jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdt jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdt jdt jdt jdt jdt jdt jdt jdgt jdt jdgt jdf}t jdddt jdt jdt jdt jdt jdt jdddt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdgt jdt jdgt jdf
}t jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdt jdt jdt jdt jdt jdgt jdt jdgt jdf	}t jdd	d	d	t jdt jdt jdt jdt jdt jdgt jdt jdgt jdf}t jdddt jdt jdt jdt jdt jdt jdgt jdt jdgt jdf}t jdd	d	d	t jdt jd	t jdt jd	t jddddf}t jdddt jdt jd	t jdt jd	t jddddf}	t jd
t jdt t j	j
t jdddt jdt jddt jdt jddt jdt jdt jddf}
t jdt jdt t j	j
t jdddt jdt jddt jdt jddt jdt jdt jddf}t|ttttttddddt t j	j
d	idttddddt t j	j
d	idt|
ttttttt t j	j
dddidttt t j	j
dddidtgdtdt|ttttttt t j	j
dddidttt t j	j
dddidtgdtdt|ttttttt|ttttttdgdttdgdt|ttttt|ttttt|ttttttt|tt tt!t|tt"tt#t|tt$tt%t&t&t|	tt'tt(t&t&g}t)|  t* ` |D ]T}|j+}|j,}|j-}|j.}|j/}t0||}t)| t0||}t)| |r||}|r||}|1  |1  t2| |||j3|j4d}|j5r|D ]}|5| qqW d    | S 1 sw   Y  | S )NrP   rQ   )rS   r   rK   rR   )r   r   i)rR   r   r   r   r   )rR       rT   rU   )   r   r   r   )rR   rR   r   rS   )r   )rR   r   r   T)r   r   r   re   )r   )match_filtersr   )6r   randintr   randnr   ri   inttensorfinforY   epsr   r   rI   rO   r   r   rc   r   r   r   r9   rF   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	recompiler   r   r   r   )r    _QUANTIZED_LINEAR_EXAMPLE_INPUTS(_DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS _QUANTIZED_CONV2d_EXAMPLE_INPUTS)_QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS$_QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS(_QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS*_DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS)_QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS+_DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS9_DYNAMIC_QUANTIZED_LINEAR_4BIT_GROUPWISE_EXAMPLE_INPUTS_19_DYNAMIC_QUANTIZED_LINEAR_4BIT_GROUPWISE_EXAMPLE_INPUTS_2_REWRITE_INFO_LISTrewrite_infor   r   r   r   r   matchesmr7   r7   r8   r     s  		

s

)L
contextlibdataclassesr   	functoolsr   typingr   r   r   r   r   !torch._higher_order_ops.out_dtyper   $torch.ao.quantization.fx._decomposedr	   torch.fxr
   6torch.fx.passes.utils.matcher_with_name_node_map_utilsr   torch.fx.subgraph_rewriterr   r   &torchao.quantization.pt2e.export_utilsr   torchao.quantization.pt2e.utilsr   r   r   r   %torchao.quantization.quant_primitivesr   torchao.quantization.utilsr   torchao.utilsr   torch._export.utilsr   nullcontextlibraryLibrary	quant_libregister_custom_op__all__r9   rF   rI   rO   rc   r?   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r7   r7   r7   r8   <module>   s   #5!/A 

3R%)	
