o
    	۷i|u                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ ddlmZ e	e
ZG dd dejZG dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd dejZd$ddZd$ddZd$ddZG dd deZG dd deZG dd deZd%d d!ZG d"d# d#eZdS )&    N)nn)Function   )loggingc                       s>   e Zd ZdZ									d fdd	Zdd	d
Z  ZS )QuantEmbeddinga  
    Quantized version of `torch.nn.Embedding`. Adds quantization-specific arguments on top of `torch.nn.Embedding`.

    Args:
        weight_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the quantized weight.
        momentum (`float`, *optional*, defaults to `0.95`):
            Momentum for updating the activation quantization range.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    N       @F   ffffff?c                    s   t    || _|| _|| _|| _|| _|| _|| _t	
t||g| _| dtd | dt| j |	| _|
| _|| _d| _tj| _d S )Nweight_scaling_factor   weight_integerF)super__init__num_dimpadding_idxmax_norm	norm_typescale_grad_by_freqsparser   	Parametertorchzerosweightregister_buffer
zeros_like
weight_bitmomentum
quant_modepercentile_modeSymmetricQuantFunctionapplyweight_function)selfnum_embeddingsembedding_dimr   r   r   r   r   _weightr   r   r   	__class__ ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/ibert/quant_modules.pyr   ,   s    
zQuantEmbedding.__init__c           	   	   C   s   | j stj|| j| j| j| j| j| j	d fS | j}|j
 }| d}| d}t| j||d| _| | j| j| j| j| _tj|| j| j| j| j| j| j	}|| j | jfS )Nr   F)r   r   
functional	embeddingr   r   r   r   r   r   datadetachminexpandmax$symmetric_linear_quantization_paramsr   r
   r"   r   r   )	r#   x	positionsincremental_stateww_transformw_minw_maxemb_intr)   r)   r*   forwardM   s<   	
	zQuantEmbedding.forward)	NNr   FFNr   r	   FNN)__name__
__module____qualname____doc__r   r;   __classcell__r)   r)   r'   r*   r      s    !r   c                       s>   e Zd ZdZd fdd	Zdd Z					dd	d
Z  ZS )QuantActap  
    Quantizes the given activation.

    Args:
        activation_bit (`int`):
            Bitwidth for the quantized activation.
        act_range_momentum (`float`, *optional*, defaults to `0.95`):
            Momentum for updating the activation quantization range.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether to or not use channel-wise quantization.
        channel_len (`int`, *optional*):
            Specify the channel length when set the *per_channel* True.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    r	   FNc                    s   t    || _|| _|| _|| _d| _tj| _	| jsF| 
dtd | 
dtd | 
dtd |  jd8  _|  jd7  _d S td)NFx_minr   x_maxact_scaling_factorgh㈵>;per-channel mode is not currently supported for activation.)r   r   activation_bitact_range_momentumr   per_channel
percentiler    r!   act_functionr   r   r   rC   rD   NotImplementedError)r#   rG   rH   rI   channel_lenr   r'   r)   r*   r      s   
zQuantAct.__init__c              
   C   s:   | j j d| j d| j d| j dd| j dd
S )Nz(activation_bit=z, quant_mode: z, Act_min: z.2fz, Act_max: ))r(   r=   rG   r   rC   itemrD   )r#   r)   r)   r*   __repr__   s   

zQuantAct.__repr__c                 C   s  |d u r|n|| }| j r| jrJ d| jrJ d|j }|j }	|	  dkr5|  dks9J d| j dkrT| j	 dk rT| j| | _| j	|	 | _	n2| j
dkrjt| j|| _t| j	|	| _	n| j| j
 |d| j
   | _| j	| j
 |	d| j
   | _	| js|d fS |d u r| jn|}|d u r| j	n|}	t| j||	| jd	| _|d u r| || j| j| j}
nt||| j| j||}
| jd}|
| | jfS )
Nz:percentile mode is not currently supported for activation.rF   r   z5NaN detected when computing min/max of the activationg&|g&|>r   )rI   )trainingrJ   rI   r-   r/   r1   isnansumrC   rD   rH   r   r   r2   rG   rE   rK   FixedPointMulr!   view)r#   r3   pre_act_scaling_factoridentityidentity_scaling_factorspecified_minspecified_maxx_actrC   rD   quant_act_intcorrect_output_scaler)   r)   r*   r;      sH   	

"
	zQuantAct.forward)r	   FNF)NNNNNr=   r>   r?   r@   r   rP   r;   rA   r)   r)   r'   r*   rB   r   s    
rB   c                       s:   e Zd ZdZ	d fdd	Z fdd	ZdddZ  ZS )QuantLineara8  
    Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.

    Args:
        weight_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the quantized weight.
        bias_bit (`int`, *optional*, defaults to `32`):
            Bitwidth for the quantized bias.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether or not to use channel-wise quantization.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
    Tr       Fc                    s   t    || _|| _tt||g| _| 	dt
| j | 	dt| j |r?tt|| _| 	dt
| j || _|| _|| _|| _|| _d| _tj| _d S )Nr   fc_scaling_factorbias_integerF)r   r   in_featuresout_featuresr   r   r   r   r   r   r   biasr   r   rI   bias_bitr   r    r!   r"   )r#   rd   re   rf   r   rg   rI   r   r'   r)   r*   r      s    
zQuantLinear.__init__c                    s*   t   }d| d| j d| j d}|S )N(z weight_bit=z, quant_mode=rN   )r   rP   r   r   )r#   sr'   r)   r*   rP     s   
zQuantLinear.__repr__Nc           
      C   s   | j stjj|| j| jdd fS |d ur|jdksJ d| j}|j }| j	r=t
j|dd d\}}t
j|dd d\}}n| d}| d}t| j||| j	| _| | j| j| j| j| _| j| }| jd urw| | j| jd|| _|dd}|| }	tjj|	| j| jd| |fS )N)r   rf   )r   zInput activation to the QuantLinear layer should be globally (non-channel-wise) quantized. Please add a QuantAct layer with `per_channel = True` before this QuantAct layerr   )r   outFrQ   )r   r   r+   linearr   rf   shaper-   r.   rI   r   r/   r1   r0   r2   r   rb   r"   r   r   rg   rc   rV   )
r#   r3   prev_act_scaling_factorr6   r7   r8   _r9   bias_scaling_factorx_intr)   r)   r*   r;     s0   


zQuantLinear.forward)Tr   ra   FFNr_   r)   r)   r'   r*   r`      s    r`   c                       s4   e Zd ZdZd fdd	Zdd Zdd	d
Z  ZS )IntGELUa}  
    Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.

    Args:
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "gelu" or "nonlinear" is given.
    Tnonec                    sj   t    || _|dv rtd d| _| jst | _d| _d| _	g d| _
| j
d  | j
d   < d S )	N)	nonlineargeluzForce dequantize geluFg-?   )g]m{ҿgMr      r   )r   r   r   loggerinfor   GELUactivation_fnkconstcoeff)r#   r   force_dequantr'   r)   r*   r   7  s   



zIntGELU.__init__c                 C   s   t | jd | }t | jd |d  }t |}t t || }||| d |  }|d | jd  }t|d| j  }|d| j  }||fS Nr   rw   r   )	r   floorr~   signr/   abs	floor_ster!   r}   )r#   rp   scaling_factorb_intc_intr   abs_inty_intr)   r)   r*   int_erfG  s   
zIntGELU.int_erfNc                 C   s^   | j s
| |d fS || }| ||| j \}}d| }|||  }|| d }|| |fS )N      ?rw   )r   r{   r   r|   )r#   r3   r   rp   sigmoid_intsigmoid_scaling_factor	shift_intr)   r)   r*   r;   V  s   zIntGELU.forward)Trs   rq   )r=   r>   r?   r@   r   r   r;   rA   r)   r)   r'   r*   rr   ,  s
    
rr   c                       s:   e Zd ZdZd fdd	Zdd Zdd	 Zd
d Z  ZS )
IntSoftmaxa  
    Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of `torch.nn.Softmax`.

    Args:
        output_bit (`int`):
            Bitwidth for the layer output activation.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "softmax" or "nonlinear" is given.
    Frs   c                    s   t    || _d| _|| _|dv rtd d| _td| jd| _d| _	d| _
g d	| _| jd
  | jd   < | jd  | jd   < d S )Nra   )rt   softmaxzForce dequantize softmaxF   r   gvq-   )gN$?g'|:?r   r   r   rw   )r   r   
output_bitmax_bitr   rx   ry   rB   actx0r}   coef)r#   r   r   r   r'   r)   r*   r   r  s   


zIntSoftmax.__init__c                 C   s~   t   t | jd | }t | jd |d  }W d    n1 s%w   Y  || | | }| jd |d  }||fS r   )r   no_gradr   r   )r#   rp   r   r   r   zr)   r)   r*   int_polynomial  s   
zIntSoftmax.int_polynomialc                 C   s   t   t | j| }W d    n1 sw   Y  t || j| }t|| }|||  }| ||\}}t j	t|d| j|   dd}|d| j  }||fS )Nrw   r   r/   )
r   r   r   r   r1   r}   r   r!   r   clamp)r#   rp   r   x0_intqrexp_intexp_scaling_factorr)   r)   r*   int_exp  s   
"zIntSoftmax.int_expc                 C   s   | j stjj|ddd fS || }|jddd\}}|| }| ||\}}| ||\}}|| }|jddd}	t	d| j
 |	 }
t	||
 d| j
| j   }dd| j  }|| |fS )NrQ   r   T)r   keepdimrw   r   )r   r   r+   r   r1   r   r   rT   r   r!   r   r   )r#   r3   r   rp   	x_int_maxrn   r   r   expexp_int_sumfactorr)   r)   r*   r;     s   zIntSoftmax.forward)Frs   )	r=   r>   r?   r@   r   r   r   r;   rA   r)   r)   r'   r*   r   e  s    r   c                       s<   e Zd ZdZd fdd	Zdd Zd	d
 ZdddZ  ZS )IntLayerNorma  
    Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of `torch.nn.LayerNorm`.

    Args:
        output_bit (`int`, *optional*, defaults to `8`):
            Bitwidth for the layer output activation.
        quant_mode (`bool`, *optional*, defaults to `False`):
            Whether or not the layer is quantized.
        force_dequant (`str`, *optional*, defaults to `"none"`):
            Force dequantize the layer if either "layernorm" or "nonlinear" is given.
    r   Frs   c                    s   t    || _|| _tt|| _tt|| _	|| _
|dv r,td d| _
| dtd || _d| _d | _t| j| j
d| _d S )N)rt   	layernormzForce dequantize layernormFshiftr   ra   r   )r   r   normalized_shapeepsr   r   r   r   r   rf   r   rx   ry   r   r   r   dim_sqrtrB   
activation)r#   r   r   r   r   r   r'   r)   r*   r     s   

zIntLayerNorm.__init__c                 C   s   t  A |d }t j|ddd}t t |d| j    }| j}t | j|| _t	
dt| dt| j  W d    d S 1 sHw   Y  d S )Nrw   Taxisr   zDynamic shift adjustment: z -> )r   r   rT   log2sqrtr   ceilr1   r   rx   ry   int)r#   r   y_sq_intvar_intr   	shift_oldr)   r)   r*   	set_shift  s   
"""zIntLayerNorm.set_shiftc                 C   s:   |  | t|d| j  }|d }tj|ddd}|S )z
        This fallback function is called when overflow is detected during training time, and adjusts the `self.shift`
        to avoid overflow in the subsequent runs.
        rw   Tr   )r   r   r!   r   r   rT   )r#   r   y_int_shiftedr   r   r)   r)   r*   overflow_fallback  s
   
zIntLayerNorm.overflow_fallbackNc                 C   s  | j s.|jddd}|| }tj|d ddd}|t| j|  }|| j | j }|d fS | jd u rHtj|j	d tj
d}t||j| _|| }t|jddd}|| }	t|	d| j  }
|
d }tj|ddd}| jr| d| j kr| |	}| d| j d k sJ dtt|d| j  }td| }t|	| d }	| jd }| jj | jj  }t|| }|	| }	|| j }|	| }||fS )	Nrw   Tr   )dtypeg?zfError detected in overflow handling: `var_int` exceeds `self.max_bit` (the maximum possible bit width)l        i   @)r   meanr   r   r   r   rf   r   tensorrl   floattodevice	round_ster!   r   r   rT   rR   r1   r   r   r-   r.   )r#   r3   r   r   yvarnrp   mean_intr   r   r   r   std_intr   rf   bias_intr)   r)   r*   r;     s@   



zIntLayerNorm.forward)r   Frs   rq   )	r=   r>   r?   r@   r   r   r   r;   rA   r)   r)   r'   r*   r     s    	r   Fc           	      C   s   | j d }t|d|d   }t|| d }tj| |dj}|dkr(|d }n
tj|  |dj }|s<| }| }||fS )a  
    Calculate the percentile max and min values in a given tensor

    Args:
        input (`torch.Tensor`):
            The target tensor to calculate percentile max and min.
        lower_percentile (`float`):
            If 0.1, means we return the value of the smallest 0.1% value in the tensor as percentile min.
        upper_percentile (`float`):
            If 99.9, means we return the value of the largest 0.1% value in the tensor as percentile max.
        output_tensor (`bool`, *optional*, defaults to `False`):
            If True, this function returns tensors, otherwise it returns values.

    Returns:
        `Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of *input*
    r   r   g{Gz?)r|   )rl   roundr   kthvaluevaluesrO   )	inputlower_percentileupper_percentileoutput_tensorinput_lengthlower_indexupper_indexupper_boundlower_boundr)   r)   r*   get_percentile_min_max  s   

r   c                 C   s   t | jdkr|dddd}|dddd}nt | jdkr,|dd}|dd}n
|d}|d}|rF| d| |  | S td| |  | S )a?  
    Quantize single-precision input tensor to integers with the given scaling factor and zeropoint.

    Args:
        input (`torch.Tensor`):
            Single-precision input tensor to be quantized.
        scale (`torch.Tensor`):
            Scaling factor for quantization.
        zero_pint (`torch.Tensor`):
            Shift for quantization.
        inplace (`bool`, *optional*, defaults to `False`):
            Whether to compute inplace or not.

    Returns:
        `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
       rQ   r   rw   r   )lenrl   rV   mul_add_round_r   r   )r   scale
zero_pointinplacer)   r)   r*   linear_quantize5  s   

r   c                 C   s   t  K d| d  d }|r-t jt j| | gdddd\}}t j|dd| }nt| | }t j|dd| }W d   |S W d   |S 1 sRw   Y  |S )a/  
    Compute the scaling factor with the given quantization range for symmetric quantization.

    Args:
        saturation_min (`torch.Tensor`):
            Lower bound for quantization range.
        saturation_max (`torch.Tensor`):
            Upper bound for quantization range.
        per_channel (`bool`, *optional*, defaults to `False`):
            Whether to or not use channel-wise quantization.

    Returns:
        `torch.Tensor`: Scaling factor that linearly quantizes the given range between *saturation_min* and
        *saturation_max*.
    rw   r   r   g:0yE>r   N)r   r   r1   stackr   r   )num_bitssaturation_minsaturation_maxrI   r   r   rn   r)   r)   r*   r2   X  s   
(

r2   c                   @   (   e Zd ZdZedd Zedd ZdS )r    zw
    Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth.
    c                 C   sN   t jd|jd}d|d  d }t|||dd}t || |d }|| _|S )a6  
        Args:
            x (`torch.Tensor`):
                Floating point tensor to be quantized.
            k (`int`):
                Quantization bitwidth.
            percentile_mode (`bool`):
                Whether or not to use percentile calibration.
            scale (`torch.Tensor`):
                Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
                requires pre-calculated scaling factor.

        Returns:
            `torch.Tensor`: Symmetric-quantized value of *input*.
        g        )r   rw   r   F)r   )r   r   r   r   r   r   )ctxr3   r|   r   r   r   r   new_quant_xr)   r)   r*   r;   }  s   zSymmetricQuantFunction.forwardc                 C   sb   | j }t|jdkr|dddd}nt|jdkr!|dd}n|d}| | d d d d fS )Nr   rQ   r   rw   )r   r   rl   rV   clone)r   grad_outputr   r)   r)   r*   backward  s   
zSymmetricQuantFunction.backwardNr=   r>   r?   r@   staticmethodr;   r   r)   r)   r)   r*   r    x  s    
r    c                   @   r   )r   z;
    Straight-through Estimator(STE) for torch.floor()
    c                 C   
   t |S rq   )r   r   r   r3   r)   r)   r*   r;        
zfloor_ste.forwardc                 C      |  S rq   r   r   r   r)   r)   r*   r        zfloor_ste.backwardNr   r)   r)   r)   r*   r         
r   c                   @   r   )r   z;
    Straight-through Estimator(STE) for torch.round()
    c                 C   r   rq   )r   r   r   r)   r)   r*   r;     r   zround_ste.forwardc                 C   r   rq   r   r   r)   r)   r*   r     r   zround_ste.backwardNr   r)   r)   r)   r*   r     r   r      c                 C   s   |   }| d} t|   \}}g }|D ]}tt|d|  j	tdtj
d}|| qt|}t|| }t|| j|t|| j|fS )z
    Decompose the scaling factor into mantissa and twos exponent.

    Args:
        scaling_factor (`torch.Tensor`):
            Target scaling factor to decompose.

    Returns:
        ``Tuple(torch.Tensor, torch.Tensor)`: mantisa and exponent
    rQ   rw   1)rounding)sizerV   npfrexpcpunumpyr   decimalDecimalquantizeROUND_HALF_UPappendarrayr   r   
from_numpyr   r   )inputsr   shape_of_inputoutput_moutput_etmp_mmint_m_shiftedr)   r)   r*   batch_frexp  s   
"
r
  c                   @   s.   e Zd ZdZe		dddZedd ZdS )rU   aQ  
    Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.

    Args:
        pre_act (`torch.Tensor`):
            Input tensor.
        pre_act_scaling_factor (`torch.Tensor`):
            Scaling factor of the input tensor *pre_act*.
        bit_num (`int`):
            Quantization bitwidth.
        z_scaling_factor (`torch.Tensor`):
            Scaling factor of the output tensor.
        identity (`torch.Tensor`, *optional*):
            Identity tensor, if exists.
        identity_scaling_factor (`torch.Tensor`, *optional*):
            Scaling factor of the identity tensor *identity*, if exists.

    Returns:
        `torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and
        *identity*), whose scale is rescaled to *z_scaling_factor*.
    Nc                 C   s  t |jdkrdd }ndd }|| _d|d  d }t  ||}|d ur,||}|| _t|| }	|tj}
|tj	tj}|
| }||}t
|\}}|	tj|tj }t|d|  }|d urt|| }|tj}
|tj	tj}|
| }||}t
|\}}|tj|tj }t|d|  }|| }t|tj	| d |W  d    S 1 sw   Y  d S )Nr   c                 S   s   | S rq   r)   r3   r)   r)   r*   <lambda>  s    z'FixedPointMul.forward.<locals>.<lambda>c                 S   s   |  dddS )Nr   rQ   )rV   r  r)   r)   r*   r    s    rw   r   r   )r   rl   rX   r   r   z_scaling_factorr   typedoubler   r
  r   )r   pre_actrW   bit_numr  rX   rY   reshaper   z_int_A_B	new_scaler  eoutputwx_intm1e1output1r)   r)   r*   r;     s<   


$zFixedPointMul.forwardc                 C   s8   d }| j d ur| | j }| | j d d d d |d fS rq   )rX   r   r  )r   r   identity_gradr)   r)   r*   r   /  s   
zFixedPointMul.backwardr<   r   r)   r)   r)   r*   rU     s    4rU   )F)r   )r   r   r   r   r   torch.autogradr   utilsr   
get_loggerr=   rx   Moduler   rB   r`   rr   r   r   r   r   r2   r    r   r   r
  rU   r)   r)   r)   r*   <module>   s*   
SjP9G
e
$
# -
"