o
    ‚o™i$O  ã                   @   sÞ  d dl mZ d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
Z
d dlmZ d dlmZ 	 G dd„ dƒZd	ee
jge
jf d
eeef fdd„Zde
jde
jde
jfdd„ZededG dd„ de
jjƒƒZejZejZejZededde
jdefdd„ƒZ ededdd„ ƒZ!ededdd„ ƒZ"eG dd„ dƒƒZ#G d d!„ d!e
jjƒZ$G d"d#„ d#e
jjƒZ%			$	d/d%e
jd&e
jd'ee
j d(ee# d)ee
j f
d*d+„Z&		d0d%e
jd&e
jd,ej'd'ee
j d)ee
j f
d-d.„Z(dS )1é    )Ú	dataclass)Úprod)ÚCallableÚOptionalÚTupleN)Úwarn)Ú
deprecatedc                   @   s<   e Zd ZdZdd„ Zdd„ Zedd„ ƒZdd	„ Zd
d„ Z	dS )ÚGlobalOutlierPoolerNc                 C   s   t dƒ‚)NzCall get_instance() instead)ÚRuntimeError©Úself© r   úT/home/ubuntu/.local/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.pyÚ__init__   s   zGlobalOutlierPooler.__init__c                 C   s   t ƒ | _d | _d S ©N)ÚsetÚoutliersÚ	model_dimr   r   r   r   Ú
initialize   s   
zGlobalOutlierPooler.initializec                 C   s&   | j d u r|  | ¡| _ | j  ¡  | j S r   )Ú	_instanceÚ__new__r   )Úclsr   r   r   Úget_instance!   s   

z GlobalOutlierPooler.get_instancec                 C   s2   | j d u r|| _ || j krd S | j | ¡ ¡ d S r   )r   r   ÚupdateÚtolist)r   Úoutlier_idxÚfeature_dimr   r   r   Úadd_outliers(   s
   

z GlobalOutlierPooler.add_outliersc                 C   s   t  t| jƒ¡ t j¡S r   )ÚtorchÚTensorÚlistr   ÚtoÚint64r   r   r   r   Úget_current_outlier_idx0   s   z+GlobalOutlierPooler.get_current_outlier_idx)
Ú__name__Ú
__module__Ú__qualname__r   r   r   Úclassmethodr   r   r#   r   r   r   r   r	      s    
r	   Útransform_tileÚ	tile_sizec                 C   sè   |\}}d||   k rdk sJ ‚ J ‚t j|| t jd ||¡}t  |¡}tdƒD ]F}t j|d| ddd }|d  t j¡ 	¡ }t  
| ¡ d |k¡sRJ d	ƒ‚| |ƒ}	|	 |j¡d }
||
d|  7 }|| d| k rq |S q+|S )
aþ  
    Compute a permutation of indices that invert the specified (tiled) matrix transformation

    :param transform_tile: a function that applies forward transform to a tensor of shape [dim1, dim2]
    :param tile_size: higher-level tile dimensions, i.e. (8, 32) for Turing and (32, 32) for Ampere
    :note: we assume that tile_transform applies to a cpu-based int8 tensor of shape tile_size
    :example: transform_tile function for the turing layout (bitsandbytes.functional as F)
    :returns: indices
    r   l            ©Údtypeé   é   Útrunc)Úrounding_modeé€   zint overflow)r   Úaranger"   ÚviewÚ
zeros_likeÚrangeÚdivr!   Úint8Ú
contiguousÚallÚintr+   )r(   r)   Úd1Úd2Útile_indicesÚpermuted_tile_indicesÚiÚith_dim_indicesÚsample_tile_iÚpermuted_tile_iÚith_permuted_indicesr   r   r   Úget_inverse_transform_indices4   s     
þrC   Úpermuted_tensorr<   Úreturnc                 C   s¤   | j |j \}}\}}|| ||   krdks!J dƒ‚ J dƒ‚|  d| ¡ ¡ ¡ }t |¡}||| ¡ < | |||| || ¡}| dddd¡}| ||¡ ¡ S )a  
    Undo a tiled permutation such as turing or ampere layout

    :param permuted_tensor: torch tensor in a permuted layout
    :param tile_indices: reverse transformation indices, from get_inverse_transform_indices
    :return: contiguous row-major tensor
    r   z+tensor must contain a whole number of tileséÿÿÿÿé   é   é   )	ÚshapeÚreshapeÚnumelÚtr   Ú
empty_likeÚflattenÚpermuter7   )rD   r<   ÚrowsÚcolsÚ	tile_rowsÚ	tile_colsÚtensorÚoutputsr   r   r   Úundo_layoutS   s   ,
rW   zbMatMul8bit is deprecated and will be removed in a future release. Please use MatMul8bitLt instead.)Úcategoryc                   @   s&   e Zd Zeddd„ƒZedd„ ƒZdS )Ú
MatMul8bitNÚvectorc                 C   sÚ   |d u rg d¢}|d dkr)t  ¡  t  ||¡}W d   ƒ n1 s#w   Y  n0t|jƒdkr3d}nd}tj|d|d\}}	tj|||d\}
}t ||
¡}t ||	||j	|¡}|j
s_|j
re|  ||¡ || _|| _|S )N)r,   r,   r,   r   r,   rH   rI   rF   ©ÚdimÚ
quant_type)r   Úno_gradÚmatmulÚlenrJ   ÚFÚvectorwise_quantÚigemmÚvectorwise_mm_dequantr+   Úrequires_gradÚsave_for_backwardr]   Ú	precision)ÚctxÚAÚBÚoutr]   rg   Úoutputr\   ÚqAÚSAÚqBÚSBÚioutr   r   r   Úforwardj   s&   
ÿ€zMatMul8bit.forwardc                 C   sœ  | j \}}| j}| j}d  }}|jrÐt|jƒdkr"ddg}g d¢}	ndg}ddg}	|d dkrMt ¡  t | 	|	¡|¡}W d   ƒ n1 sGw   Y  nƒt|jƒdkr¦t|jƒdkr¦| 
¡ }| ¡ sg| 
¡  tj| d|jd ¡d|d\}
}| ¡ s€| 
¡ }tj| d|jd ¡d|d\}}t | ¡ |
¡}t || ¡ ||j|¡}n*tj|||d\}
}tj|||d\}}t | 	|	¡|
¡}t || 	|	¡||j|¡}|jrGt|jƒdkrßdg}ndg}t|jƒdkrðg d¢}	|}nddg}	dg}|d dkrt ¡  t || 	|	¡¡}W d   ƒ n	1 sw   Y  n*tj|||d\}
}tj|||d\}}t |
| 	|	¡¡}t ||| 	|	¡|j|¡}||d d d fS )	NrG   r   rI   )r   rH   rI   r,   rH   rF   r[   )Úsaved_tensorsr]   rg   re   r`   rJ   r   r^   r_   rP   r7   Úis_contiguousra   rb   r2   rc   rM   rd   r+   )rh   Úgrad_outputri   rj   r]   rg   Úgrad_AÚgrad_BÚdimsÚpermute_dimÚqgrad_outputÚS1rm   ÚS2Úigrad_BÚdim_Bro   ÚS3Úigrad_Ar   r   r   Úbackwardƒ   s€   


ÿ€
ý"û
ÿ€ûzMatMul8bit.backward)NrZ   N)r$   r%   r&   Ústaticmethodrr   r   r   r   r   r   rY   e   s
    rY   zDThis function is deprecated and will be removed in a future release.Údevicec                    sF   t jj| ddk rdS t jj| d‰ d}t‡ fdd„|D ƒƒr!dS dS )z7check if this device supports the optimized int8 kernel)rƒ   )é   é   F)zGTX 1630zGTX 1650zGTX 1660c                 3   s    | ]}|ˆ v V  qd S r   r   )Ú.0Ú
model_name©Údevice_namer   r   Ú	<genexpr>ß   s   € z#supports_igemmlt.<locals>.<genexpr>T)r   ÚcudaÚget_device_capabilityÚget_device_nameÚany)rƒ   Únvidia16_modelsr   rˆ   r   Úsupports_igemmltØ   s   r   c                 C   s&   | dv sJ d| › ƒ‚| dkrdS dS )N)Ú
col_turingÚ
col_amperez9please find this assert and manually enter tile size for r‘   )r,   é    )r“   r“   r   )Úformatr   r   r   Ú_get_tile_sizeä   s   
ýr•   c                    sN   ‡ ‡fdd„}t  ¡  t|tˆƒƒ ˆ ¡W  d   ƒ S 1 s w   Y  d S )Nc                    s"   t j|  ˆ ¡dˆdd  | j¡S )NÚrow)Ú
from_orderÚto_orderr   )ra   Ú	transformr!   rƒ   )Úx©rƒ   r”   r   r   Ú<lambda>ï   s   " zget_tile_inds.<locals>.<lambda>)r   r^   rC   r•   r!   )r”   rƒ   r™   r   r›   r   Úget_tile_indsí   s   
$ÿr   c                   @   s
  e Zd ZU dZeej ed< dZe	ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< dZeej ed	< dZeej ed
< dZeej ed< dZeej ed< dZee ed< dZdZdZeej ed< dZdZdZdZdd„ Zedd„ ƒZdS )ÚMatmulLtStateNÚ_tile_indicesFÚforce_no_igemmltÚCBÚCxBrp   ÚSCBÚCxBtÚSBtÚCBtÚsubBÚoutlier_poolç        ÚidxTr–   c                 C   s.   d | _ d | _d | _d | _d | _d | _d | _d S r   )r¡   r¢   rp   r£   r¤   r¥   r¦   r   r   r   r   Úreset_grads  s   
zMatmulLtState.reset_gradsc                 C   s"   | j d u rt| j| jjƒ| _ | j S r   )rŸ   r   ÚformatBr¢   rƒ   r   r   r   r   r<     s   
zMatmulLtState.tile_indices)r$   r%   r&   rŸ   r   r   r   Ú__annotations__r    Úboolr¡   r¢   rp   r£   r¤   r¥   r¦   r§   r¨   r	   Úhas_accumulated_gradientsÚ	thresholdrª   Úis_trainingÚhas_fp16_weightsÚuse_poolr¬   r«   Úpropertyr<   r   r   r   r   rž   ô   s,   
 
rž   c                   @   sr   e Zd Ze			ddejjjdejdejde	ej de	ej de	e
 fdd	„ƒZedejjjd
ejfdd„ƒZdS )ÚMatMul8bitLtNrh   ri   rj   rk   ÚbiasÚstatec                 C   sÀ  |pt ƒ }d| _t|jƒdkrQd| _|| _|| _|| _|jd |jd kr;tj|jd d… |jdd …  |j	|j
dS tj|jd d… |jd d…  |j	|j
dS |j}|j	tjkrdt d|j	› d¡ t|jƒd	krt| d|jd ¡}| jd r‹tj| tj¡|jd
\}}}	}
}ntj| tj¡|jd
\}}	}d  }}
d}|js¨|jd u ræt|dd ƒd u}| ¡  o¾|jd | d¡k}|rÅ| ¡ }|jrÊ|rÔ|jd u sÔ|jd u ræ| ¡  t | tj¡¡\|_|_}|jdkr<|d ur<| ¡ r<||_ |d urd|d d …|j f< |d d …|j f  ¡ }|jr#|d d …|j f  !¡ |_"n|jd d …|j f }| !¡ |j d  |j	¡|_"nd }t #||j¡}|d u sQ|j	tjkr`tj$||	|j|d |j	¡}ntj$||	|jd d |j	¡ %|¡}|d urƒ|j"d urƒ| &||j"¡}|| _'|| _(|j	|j	|d u r”d n|j	| _)| _*| _+t,| jd d… ƒrµ|||f| _-|
|j f| _.ng d¢| _-d| _.|  /d d ¡ g |d d… ¢|jjd ‘R }t|ƒd	krÞ| |¡S |S )NFr   TrF   rI   ©r+   rƒ   z'MatMul8bitLt: inputs will be cast from z to float16 during quantizationrG   )r°   Úgradr©   g   €@ €?)r¶   rH   ©NNN©NN)0rž   Úis_emptyr   rJ   ri   rj   r¶   r   Úemptyr+   rƒ   Úfloat16Úwarningsr   r`   rK   Úneeds_input_gradra   Úint8_double_quantr!   r°   Úint8_vectorwise_quantr²   r¡   Úgetattrrt   Ústrider7   r±   r£   r«   rL   rª   rM   r§   Úint8_linear_matmulÚint8_mm_dequantÚadd_Úaddmmr·   Ú
grad_shapeÚdtype_AÚdtype_BÚ
dtype_biasrŽ   ÚtensorsÚtensor_statesrf   )rh   ri   rj   rk   r¶   r·   Úinput_shapeÚCAÚCAtÚSCAÚSCAtÚoutlier_colsÚhas_gradÚis_transposedÚ_ÚsubAr   Úout32rl   Úoutput_shaper   r   r   rr      sr   
	,,
$ 
"*

zMatMul8bitLt.forwardru   c                 C   s’  | j r| jd u r
d nt | j¡}t | j¡t | j¡d |d fS | j\}}}}}| j\}}}	| j\}
}| j	}d  } }}|rE|j
d| jd}t|jƒdkrW| d|jd ¡ ¡ }|r•t | tj¡¡\}}}}}t | ¡  ¡ | ¡ ¡}t |||
¡}|jdkr•|d ur•|d d …|f  t | ¡ |¡7  < |rÂ|jd ur¾|jj| jdd |j d¡ d	¡¡}t | | j¡|¡ | j ¡}nt!d
ƒ‚||d |d fS )Nr   r*   rG   rF   r©   T)ÚcopyrI   g@ €?z)State must contain CB matrix for backward)"r¼   r¶   r   r3   ri   rj   rÀ   rÍ   rÎ   r·   ÚsumrÌ   r`   rJ   rK   r7   ra   rÁ   r!   r¾   rÅ   rM   rÆ   r°   r_   r¡   rÊ   Úmul_r£   Ú	unsqueezeÚmulr2   rÉ   Ú	Exception)rh   ru   Ú	bias_gradÚ	req_gradAÚ	req_gradBr×   Úreq_gradBiasrÑ   rØ   ri   rÓ   rª   r·   rv   rw   Ú	grad_biasÚCgradÚSCgradtÚgradB32r¡   r   r   r   r   “  s0   
$
&zMatMul8bitLt.backwardrº   )r$   r%   r&   r‚   r   ÚautogradÚfunctionÚFunctionCtxr   r   rž   rr   r   r   r   r   r   rµ     s(    úÿþýüûúr rµ   c                   @   s2   e Zd Zeddeej fdd„ƒZedd„ ƒZdS )Ú
MatMul4BitNÚquant_statec                 C   s  d| _ t|jƒdkrLd| _ || _|| _|| _|j}|jd |d kr7tj|jd d… |dd …  |j|j	dS tj|jd d… |d d…  |j|j	dS tj
j |t ||¡ |j¡ ¡ |¡}|| _|j|j|d u rld n|j| _| _| _t| jd d… ƒr†d |f| _|S d| _|S )	NFr   TrF   rI   r¸   rH   r»   )r¼   r   rJ   ri   rj   r¶   r   r½   r+   rƒ   ÚnnÚ
functionalÚlinearra   Údequantize_4bitr!   rM   r·   rÊ   rË   rÌ   rŽ   rÀ   rÍ   )rh   ri   rj   rk   r¶   rí   ÚB_shaperl   r   r   r   rr   ½  s$   **&(
þzMatMul4Bit.forwardc           
      C   sª   | j r| jd u r
d nt | j¡}t | j¡t | j¡d |d fS | j\}}}}}| j\}}d\}}}	|r;|jd| j	d}	|rNt 
|t || j¡ |j¡ ¡ ¡}||d |	d fS )Nrº   r   r*   )r¼   r¶   r   r3   ri   rj   rÀ   rÍ   rÜ   rÌ   r_   ra   rñ   r·   r!   r+   rM   )
rh   ru   rá   râ   r×   rä   rj   rv   rw   rå   r   r   r   r   Û  s   

"zMatMul4Bit.backwardrº   )	r$   r%   r&   r‚   r   ra   Ú
QuantStaterr   r   r   r   r   r   rì   ¹  s
    rì   r©   ri   rj   rk   r·   r¶   c                 C   s*   |pt ƒ }|dkr||_t | ||||¡S )Nr©   )rž   r°   rµ   Úapply)ri   rj   rk   r·   r°   r¶   r   r   r   r_   ò  s   
r_   rí   c                 C   s¢   |d usJ ‚|   ¡ | jd krH| jdkrH| jd |j dkr3td|j› d| j› ƒ t | ||||¡S tj| | 	¡ ||d}|d urF||7 }|S t | ||||¡S )NrF   Fr   z4Some matrices hidden dimension is not a multiple of z^ and efficient inference kernels are not supported for these (slow). Matrix input size found: )r·   )
rL   rJ   re   Ú	blocksizer   rì   rô   ra   Ú	gemv_4bitrM   )ri   rj   rí   rk   r¶   r   r   r   Úmatmul_4bit   s   ÿr÷   )NNr©   Nr»   ))Údataclassesr   Úmathr   Útypingr   r   r   r¿   r   r   Útyping_extensionsr   Úbitsandbytes.functionalrï   ra   r	   r   r9   rC   Ú
LongTensorrW   ÚFutureWarningré   ÚFunctionrY   rô   Ú	mm_cublasÚ
bmm_cublasÚmatmul_cublasrƒ   r®   r   r•   r   rž   rµ   rì   r_   ró   r÷   r   r   r   r   Ú<module>   s~    ÿ

þþj




* <úÿþýü
úûÿþýüû