o
    3wi!7                     @   s  d dl mZ d dlmZ d dlmZmZ d dlZd dlmZ d dl	Z	d dl
mZ d dlmZ 	 G dd dZed	ed
dee	jge	jf deeef fddZee	jdr[e	jjZne	jjZed	ed
de	jde	jde	jfddZeG dd dZG dd de	jjZG dd de	jjZ 				d&de	jde	jdee	j dee d ee	j f
d!d"Z!		d'de	jde	jd#ej"dee	j d ee	j f
d$d%Z#dS )(    )	dataclass)prod)CallableOptionalN)warn)
deprecatedc                   @   s<   e Zd ZdZdd Zdd Zedd Zdd	 Zd
d Z	dS )GlobalOutlierPoolerNc                 C      t d)NzCall get_instance() instead)RuntimeErrorself r   ]/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py__init__   s   zGlobalOutlierPooler.__init__c                 C   s   t  | _d | _d S N)setoutliers	model_dimr   r   r   r   
initialize   s   
zGlobalOutlierPooler.initializec                 C   s&   | j d u r| | | _ | j   | j S r   )	_instance__new__r   )clsr   r   r   get_instance!   s   

z GlobalOutlierPooler.get_instancec                 C   s2   | j d u r|| _ || j krd S | j|  d S r   )r   r   updatetolist)r   outlier_idxfeature_dimr   r   r   add_outliers(   s
   

z GlobalOutlierPooler.add_outliersc                 C   s   t t| jt jS r   )torchTensorlistr   toint64r   r   r   r   get_current_outlier_idx0   s   z+GlobalOutlierPooler.get_current_outlier_idx)
__name__
__module____qualname__r   r   r   classmethodr   r   r#   r   r   r   r   r      s    
r   zDThis function is deprecated and will be removed in a future release.)categorytransform_tile	tile_sizec                 C   s   |\}}d||   k rdk sJ  J t j|| t jd||}t |}tdD ]F}t j|d| ddd }|d t j	 }t 
| d |ksRJ d	| |}	|	|jd }
||
d|  7 }|| d| k rq |S q+|S )
a  
    Compute a permutation of indices that invert the specified (tiled) matrix transformation

    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
    :returns: indices
    r   l            dtype      trunc)rounding_mode   zint overflow)r   aranger"   view
zeros_likerangedivr!   int8
contiguousallintr,   )r)   r*   d1d2tile_indicespermuted_tile_indicesiith_dim_indicessample_tile_ipermuted_tile_iith_permuted_indicesr   r   r   get_inverse_transform_indices4   s     
rD   is_compilingpermuted_tensorr=   returnc                 C   s   | j |j \}}\}}|| ||   krdks!J d J d| d|  }t|}||| < ||||| || }|dddd}||| S )a  
    Undo a tiled permutation such as turing or ampere layout

    :param permuted_tensor: torch tensor in a permuted layout
    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
    :return: contiguous row-major tensor
    r   z+tensor must contain a whole number of tiles         )	shapereshapenumeltr   
empty_likeflattenpermuter8   )rF   r=   rowscols	tile_rows	tile_colstensoroutputsr   r   r   undo_layout^   s   ,
rY   c                   @   s
  e Zd ZU dZeej ed< dZe	ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed	< dZeej ed
< dZeej ed< dZeej ed< dZee ed< dZdZdZeej ed< dZdZdZdZdd Zedd ZdS )MatmulLtStateN_tile_indicesFforce_no_igemmltCBCxBSBSCBCxBtSBtCBtsubBoutlier_pool        idxTrowc                 C   s.   d | _ d | _d | _d | _d | _d | _d | _d S r   )r]   r^   r_   r`   ra   rb   rc   r   r   r   r   reset_grads   s   
zMatmulLtState.reset_gradsc                 C   r	   )Nz$tile_indices is no longer supported.)
ValueErrorr   r   r   r   r=      s   zMatmulLtState.tile_indices)r$   r%   r&   r[   r   r   r   __annotations__r\   boolr]   r^   r_   r`   ra   rb   rc   rd   re   r   has_accumulated_gradients	thresholdrg   is_traininghas_fp16_weightsuse_poolformatBri   propertyr=   r   r   r   r   rZ   t   s,   
 
rZ   c                   @   sr   e Zd Ze			ddejjjdejdejde	ej de	ej de	e
 fdd	Zedejjjd
ejfddZdS )MatMul8bitLtNctxABoutbiasstatec              	   C   s  |pt  }d| _t|jdkrQd| _|| _|| _|| _|jd |jd kr;tj|jd d |jdd   |j	|j
dS tj|jd d |jd d  |j	|j
dS |j}|j	tjkrgt sgtd|j	 d t|jd	krw|d|jd }| jd rtj|tj|jd
\}}}	}
}ntj|tj|jd
\}}	}d  }}
d}|js|jd u rt|dd d u}|  o|jd |dk}|r| }|jr|r|jd u s|jd u r|  t|tj\|_|_}|jdkr||_ tj!j"#|||j|	|j||\}}ntj!j"j$j%||j|	|j||j	d}d }|| _&|| _'|j	| _(|d u r(d n|j	| _)t*| jd d rC|||f| _+|
|j f| _,ng d| _+d| _,| -d d  g |d d |jjd R }t|d	krl||S |S )NFr   TrH   rK   r,   devicez'MatMul8bitLt: inputs will be cast from z to float16 during quantizationrI   )rn   gradrf   )ry   r,   rJ   NNNNN).rZ   is_emptyr   rL   rv   rw   ry   r   emptyr,   r|   float16_is_compilingwarningsr   lenrM   needs_input_gradFint8_double_quantr!   rn   int8_vectorwise_quantrp   r]   getattris_contiguousstrider8   ro   r`   ri   rg   opsbitsandbytesint8_mixed_scaled_mmint8_scaled_mmdefaultrz   
grad_shapedtype_A
dtype_biasanytensorstensor_statessave_for_backward)ru   rv   rw   rx   ry   rz   input_shapeCACAtSCASCAtoutlier_colshas_gradis_transposed_outputsubAoutput_shaper   r   r   forward   sr   
	,,
$



zMatMul8bitLt.forwardgrad_outputc                 C   s  | j r| jd u r
d nt| j}t| jt| jd |d fS | j\}}}}}| j\}}}	| j\}
}| j	}d  } }}|rE|j
d| jd}t|jdkrW|d|jd  }|rt|tj\}}}}}tjjjj|  | ||
tjd}|jdkr|d ur| dkr|d d |f  t| |7  < |r|jd ur|jj| jdd|j d!d	}t|| j|"| j#}nt$d
||d |d fS )Nr   r+   rI   rH   rf   T)copyrK   g@ ?z)State must contain CB matrix for backward)%r   ry   r   r4   rv   rw   r   r   r   rz   sumr   r   rL   rM   r8   r   r   r!   r   r   r   r   r   rO   rn   rN   matmulr]   r   mul_r`   	unsqueezemulr3   r   	Exception)ru   r   	bias_grad	req_gradA	req_gradBr   req_gradBiasr   r   rv   r   rg   rz   grad_Agrad_B	grad_biasCgradSCgradtr]   r   r   r   backward  s:   


$
&zMatMul8bitLt.backwardr~   )r$   r%   r&   staticmethodr   autogradfunctionFunctionCtxr   r   rZ   r   r   r   r   r   r   rt      s(    b rt   c                   @   s2   e Zd Zeddeej fddZedd ZdS )
MatMul4BitNquant_statec                 C   s  d| _ t|jdkrLd| _ || _|| _|| _|j}|jd |d kr7tj|jd d |dd   |j|j	dS tj|jd d |d d  |j|j	dS tj
j|t|||j |}|| _|j|j|d u rld n|j| _| _| _t| jd d rd |f| _|S d| _|S )	NFr   TrH   rK   r{   rJ   r   )r   r   rL   rv   rw   ry   r   r   r,   r|   nn
functionallinearr   dequantize_4bitr!   rO   rz   r   dtype_Br   r   r   r   )ru   rv   rw   rx   ry   r   B_shaper   r   r   r   r   1  s$   **&(
zMatMul4Bit.forwardc           
      C   s   | j r| jd u r
d nt| j}t| jt| jd |d fS | j\}}}}}| j\}}d\}}}	|r;|jd| j	d}	|rNt
|t|| j|j }||d |	d fS )Nr~   r   r+   )r   ry   r   r4   rv   rw   r   r   r   r   r   r   r   rz   r!   r,   rO   )
ru   r   r   r   r   r   rw   r   r   r   r   r   r   r   O  s   

"zMatMul4Bit.backwardr~   )	r$   r%   r&   r   r   r   
QuantStater   r   r   r   r   r   r   -  s
    r   rf   rv   rw   rx   rz   ry   c                 C   s*   |pt  }|dkr||_t| ||||S )Nrf   )rZ   rn   rt   apply)rv   rw   rx   rz   rn   ry   r   r   r   r   f  s   
r   r   c                 C   s   |d usJ |   | jd krH| jdkrH| jd |j dkr3td|j d| j  t| ||||S tj| |	 ||d}|d urF||7 }|S t| ||||S )NrH   Fr   z4Some matrices hidden dimension is not a multiple of z^ and efficient inference kernels are not supported for these (slow). Matrix input size found: )rz   )
rN   rL   requires_grad	blocksizer   r   r   r   	gemv_4bitrO   )rv   rw   r   rx   ry   r   r   r   matmul_4bitt  s   r   )NNrf   Nr   )$dataclassesr   mathr   typingr   r   r   r   r   typing_extensionsr   bitsandbytes.functionalr   r   r   FutureWarningr   tupler:   rD   hasattrcompilerrE   r   _dynamo
LongTensorrY   rZ   r   Functionrt   r   r   r   r   r   r   r   r   <module>   sx    
 
( <
