o
    oiŤ                 ,   @   s  d dl Zd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ ddlmZ i Zerejr	 ejejejfejejfejejfej ej!ej"fej#ej$fejejejfej%ej&ej'fd	Z(ej)ej*fej+ej,fej-ej.fej/ej0fej)ej*fej+ej,fd
Z1ej2ej3ej4fej5ej6ej7fej8ej9ej:fej;ej<ej=fej>ej?ej@fejAejBejCfdZDG dd dZEG dd dZFG dd dZGi ZHdeHejI< deHejJ< deHejK< deHejL< deHejM< ejNdd dZOejPQ dkrdejfddZRnd dlSZSdejfddZRejIeOdddZTdddZUdd!d"ZVdd#d$ZWed%eXd&dd'd(ZYed%eXd&dd)d*ZZdd,d-Z[dd/d0Z\dd2d3Z]d d5d6Z^dd7d8Z_ed9eXd&d:d; Z`d<ee	ej  fd=d>Zaed?eXd&d@edAejPjbfdBdCZcd@edAejdfdDdEZedFe	e dAe	ejd fdGdHZfed?eXd&dIdJ Zged?eXd&dKdL ZhedMeXd&ddNdOZiedMeXd&ddQdRZjedMeXd&	P				ddSdTZk		U	VddFedWe	ej dXeldAefdYdZZmG d[d\ d\Zn				]	ddFejd^e	ej d_e	ej dWe	ej dAe
ejenf f
d`daZo					]	ddFejdbe	en d_e	ej d^e	ej dWe	ej dcepdAejfdddeZqddgdhZrdddfdejLfdFejd_e	ej dWe	ej fdidjZsdddfdejLfdFejd_e	ej dWe	ej fdkdlZtdddfddmejLfdFejd_e	ej dWe	ej dAe
ejenf fdndoZu				fddFejdbe	en d_e	ej dWe	ej dcepdAejfdpdqZv				fddFejdbe	en d_e	ej dWe	ej dcepdAejfdrdsZw				f	md	dFejdbe	en d_e	ej dWe	ej dcepdAejfdtduZxed?eXd&		d
dFed^e	ej dWe	ej dAe
ee
eef f fdvdwZyed?eXd&				ddFedxe	e
eef  d_e	ej d^e	ej dWe	ej dAefdydzZzed?eXd&ddFed^edWe	ej dAefd{d|Z{ed?eXd&ddFed^edWe	ej dAefd}d~Z|									dde}dedededeldeldepdelde	ej deldeldeldeldelde	ej deldAdf"ddZ~edeXd&				dde}dededede	ej deldeldeldepdeldede	ej dede	ej dede	ej deldelde	ej deldAdf*ddZ			dde}dededede	ej deldeldeldeldeldepdeldede	ej dede	ej deldeldAdf&ddZed?eXd&ddededepdepfddZdedededefddZejMfddZ				ddFededWe	ej fddZ			ddFededWe	ej fddZ			ddFededWe	ej fddZedeXd&ddejfdFejdejde
eje}f de
eje}f dWe	ej de	e
eje}f  fddZdejfdFejdejdWe	ej fddZ		d
dFejdejdejdWe	ej de	ej f
ddZedeXd&				ddFejdbe	e
eje}f  dejdejdWe	ej de	ej fddÄZ				ddFejde	ej de	ej de	ej dAe
ejeje	ej f f
ddƄZddFejfddȄZG ddʄ dʃZG dd̄ d̃ZG dd΄ d΃ZddЄ Zdd҄ ZejfddԄZedeXd&					ddFejde	ej de	ej de	ej de	ej dAe
ejejejeje	e f fddلZ					ddFejde	ej de	ej de	ej de	ej f
ddۄZdFejdejfddބZddFejfddZedMeXd&dddZ	ddeeejf dejdWe	ej fddZd
ddZdZedeXd&dddZedeXd&dddZedeXd&ejdfddZed?eXd&ejfddZed?eXd&dd Zed?eXd&dd ZdS (      N)prod)AnyDictIterableOptionalTupleUnion)Tensor)
deprecated)pack_dict_to_tensorunpack_tensor_to_dict   )lib)adammomentumrmsproplionadagradlambademamix)r   r   r   r   r   lars)r   r   r   r   r   r   c                   @   s6   e Zd ZdZdd Zdd Zedd Zdd	d
ZdS )GlobalPageManagerNc                 C      t dNzCall get_instance() insteadRuntimeErrorself r   K/home/ubuntu/.local/lib/python3.10/site-packages/bitsandbytes/functional.py__init__z      zGlobalPageManager.__init__c                 C   s
   g | _ d S N)paged_tensorsr   r   r   r   
initialize}      
zGlobalPageManager.initializec                 C   &   | j d u r| | | _ | j   | j S r"   	_instance__new__r$   clsr   r   r   get_instance      

zGlobalPageManager.get_instanceFc                 C   s$   | j d d d D ]}t|| qd S )N)r#   prefetch_tensor)r   to_cputr   r   r   prefetch_all   s   zGlobalPageManager.prefetch_allF)	__name__
__module____qualname__r(   r    r$   classmethodr,   r2   r   r   r   r   r   w   s    
r   c                   @   s4   e Zd ZdZdd Zdd Zedd Zdd	 ZdS )
CUBLAS_ContextNc                 C   r   r   r   r   r   r   r   r       r!   zCUBLAS_Context.__init__c                 C   s
   i | _ d S r"   )contextr   r   r   r   r$      r%   zCUBLAS_Context.initializec                 C   r&   r"   r'   r*   r   r   r   r,      r-   zCUBLAS_Context.get_instancec                 C   sP   |j | jvr"tj }tj| tt	 | j|j < tj| | j|j  S r"   )
indexr9   torchcudacurrent_device
set_devicectc_void_pr   get_context)r   deviceprev_devicer   r   r   rA      s   
zCUBLAS_Context.get_context)	r4   r5   r6   r(   r    r$   r7   r,   rA   r   r   r   r   r8      s    
r8   c                   @   s,   e Zd ZdZdd Zdd Zedd ZdS )Cusparse_ContextNc                 C   r   r   r   r   r   r   r   r       r!   zCusparse_Context.__init__c                 C   s   t t | _d S r"   )r?   r@   r   get_cusparser9   r   r   r   r   r$      s   zCusparse_Context.initializec                 C   r&   r"   r'   r*   r   r   r   r,      r-   zCusparse_Context.get_instance)r4   r5   r6   r(   r    r$   r7   r,   r   r   r   r   rD      s    rD         r<   )r:   ac                 C   s   t j| S r"   )r;   r<   	device_ofrH   r   r   r   _cuda_device_of   s   rK   c                 C   s   t  S r"   )
contextlibnullcontextrJ   r   r   r   rK      r!   dtyperB   c                 G   sp   t |  t| }tt|}t|ttj}t	j
j||d}tj|| t|d|}d|_|j|_|S )Nshape)rO   countT)dtype2bytesr   r   cget_managed_ptrr?   c_size_tcastPOINTERc_intnp	ctypeslibas_arrayr;   
frombufferviewis_pagedr:   page_deviceid)rO   rB   rQ   	num_bytescuda_ptrc_ptr	new_arrayoutr   r   r   	get_paged   s   re   Fc                 C   sR   | j sJ d|rd}n| j}t| j |   }tt| t	|t
| d S )Nz%Only paged tensors can be prefetched!r.   )r^   r_   rS   rO   numelr   	cprefetchget_ptrr?   rU   c_int32)Ar0   deviceidr`   r   r   r   r/      s   "r/   Tc                 C   s   d }|j tjkrttd|  dd }t|}n|j tjkr-ttd|  dd }t|}|d u r8t	d|  t|dd}|rN|rNt
| |d urNt
| |t|t||t|  |jsd|jrktj  d S d S )Nc_fp32_uint8zFunction not implemented: 
is_managedF)rO   r;   float32getattrr   r?   c_floatuint8c_uint8NotImplementedErrorr/   rh   c_int64rf   r^   r<   synchronize)	func_namerj   Bvalueprefetchfunccvaluero   r   r   r   elementwise_func   s$   
 r~   c                 C   s   t d| d | d S )Nfillr~   )rj   rz   rB   r{   r   r   r   r     s   r   z-Function will be removed in a future release.)categoryc                 C   s   t d| d d d S )Naranger   r   )rj   rB   r   r   r   r        r   c                 C   s   t d| |d d S )N_mulr   r   )rj   ry   rB   r   r   r   r     r   r      c                 C   s   | rdnd}d| }|s|dk r| sd| nd| d }t |d|}d|  }|dkr/|S | d }t |d |  dg|  ||d    S )	N              rG   r   r         ?   r   )r;   linspacerf   r	   tolist)signed
total_bitsadd_zerosigntotal_valuesvaluesgaplr   r   r   create_linear_map  s   0r   +ew?c           	   
   C   s  zddl m} W n ty } ztd|d }~ww |rC|t| ddd d  }dgd }|t| ddd d   }n&|t| ddd d  }dgd	 }|t| ddd d   }|| | }t|}| j	}||
  }| d
ksJ |S )Nr   )normzZScipy is required for `create_normal_map`. Install `bitsandbytes` with the `[test]` extra.g      ?	   r.      r      r   )scipy.statsr   ImportErrorppfr;   r   r   r	   sortr   maxrf   )	offsetuse_extra_valuer   iev1v2v3vr   r   r   r   create_normal_map#  s.    
$ 
"

r      c                 C   s  |}|}| rdnd}|| || ksJ g }g }t td||   d||  dD ]\}	}
|d|
  q)g }ttjddg|d}d|d  }td| D ]I}|D ]D}|dkrZdnd}t t|D ]\}	}||d|	d    7 }qb|dkr~|d|   }n|d|| d    }|| | r||  qRqNt|d| ksJ |  |dk rdt| }t|D ]}	|d q|  t	|}||
  }|S )Nr   r   rG   )repeatr   r   )	enumeraterangeappendlist	itertoolsproductlenr   r;   r	   r   )r   exponent_bitsprecision_bitsr   ephas_signevaluespvaluesivalr   lstbiasevaluebit_patternrz   pvalr   coder   r   r   create_fp8_map@  sD   *

r      c                 C   s  g }|| rdnd }d||  d }t |D ]U}t| r%d|| |  d nd|| | d  d }tjdd|tjd}|dd |dd  d }	|d|d  |  |	  7 }| rk|d|d  |   |	  7 }q|d	krtjdd|d tjd}|dd |dd  d }	|d|d  |  |	  7 }| r|d|d  |   |	  7 }|d	 |d
 t|d| ksJ dt| }
t |
D ]}|d	 q|  tj	|tjdS )a+  
    Creates the dynamic quantiztion map.

    The dynamic data type is made up of a dynamic exponent and
    fraction. As the exponent increase from 0 to -7 the number
    of bits available for the fraction shrinks.

    This is a generalization of the dynamic type where a certain
    number of the bits and be reserved for the linear quantization
    region (the fraction). n determines the maximum number of
    exponent bits.

    For more details see
    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
    r   rG   g?rO   Nr.          @
   r   r   r   )
r   intr;   r   rp   r   r   r   r   tensor)r   max_exponent_bitsr   datanon_sign_bitsadditional_itemsr   fraction_items
boundariesmeansr   r   r   r   create_dynamic_mapk  s:     

r   c                 C   sn   t | d| d d}| }|d dt| }t|D ]}|d q|  t|}||   }|S )NrG   r   )num_quantilesr   r   )	estimate_quantilesr   r   r   r   r   r	   absr   )rj   r   qr   r   r   r   r   create_quantile_map  s   
r   zDThis function is deprecated and will be removed in a future version.c                   C   s   dS )Nrowr   r   r   r   r   get_special_format_str  s   r   tensorsc                 C   s   d}t  }| D ]}|durt|dds||jM }||jj q|s.tddd | D  t|dkr@td	d
d | D  |S )ap  Verifies that the input tensors are all on the same device.

    An input tensor may also be marked as `paged`, in which case the device placement is ignored.

    Args:
        tensors (`Iterable[Optional[torch.Tensor]]`): A list of tensors to verify.

    Raises:
        `RuntimeError`: Raised when the verification fails.

    Returns:
        `Literal[True]`
    TNr^   FzZAll input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 c                 S      g | ]}|j |jfqS r   rQ   rB   .0r1   r   r   r   
<listcomp>      zis_on_gpu.<locals>.<listcomp>r   zcInput tensors need to be on the same GPU, but found the following tensor and device combinations:
 c                 S   r   r   r   r   r   r   r   r     r   )setrq   is_cudaaddrB   r:   r   r   )r   on_gpugpu_idsr1   r   r   r   	is_on_gpu  s    
r   zDThis function is deprecated and will be removed in a future release.r   returnc                 C   s   t j| jS r"   )r;   r<   current_streamrB   r   r   r   r   get_tensor_stream  s   r   c                 C   s   t tj| jjS r"   )r?   r@   r;   _C_cuda_getCurrentRawStreamrB   r:   r   r   r   r   _get_tensor_stream  s   r   rj   c                 C   s   | du rdS t |  S )zGets the memory address of the first element of a tenso

    Args:
        A (`Optional[Tensor]`): A PyTorch tensor.

    Returns:
        `Optional[ct.c_void_p]`: A pointer to the underlying tensor data.
    N)r?   r@   data_ptr)rj   r   r   r   rh     s   	rh   c                 C   s   t j }t j|  |S r"   )r;   r<   r=   r>   )rB   rC   r   r   r   pre_call  s   
r   c                 C   s   t j|  d S r"   )r;   r<   r>   )rC   r   r   r   	post_call  s   r   zkThe layout transformation operations will be removed in a future release. Please use row-major layout only.c              	   C   sn   d| t jkrdnd d| d| d|rdnd }tt|s2t| td| d	| d
|  d| tt|S )Nctransform_r       __to_r1   nz"Transform function not supported:  to z for data type z and transpose=)r;   int8hasattrr   print
ValueErrorrq   )rO   orderAorderOut	transposenamer   r   r   get_transform_func   s   2

r   r   c                 C   sD  t j}t| }|dkr| d }n|dkr| d | d  }| d }	| |f}
|r5|}|	}|}	| d d d |f}
|dks=|dkrF|| ||d|
fS |d	kr]d
|	d d
  }	|||	f||d|
fS |dkr|d
|	d d
  }	d|d d  }|||	f||d|
fS |dkrd
|	d d
  }	d
|d d
  }|||	f||d|
fS td| )NrG   r      r   r.   r   colrN   col32r      
col_turingr   r   
col_amperezTo_order not supported: )r;   zerosr   ru   )rQ   rO   rB   to_order
from_orderr   	init_funcdimsrowscolsstatetmpr   r   r   get_transform_buffer  s6   
r
  c                    s(  |d u r
| j |f}n|d }|d u r#t|d | j| j||d \}}n|d |f}t| j|||}|d  t dkrJt d }	t d }
n2|d urjt }t fdd|D }	t||	 }
t|	}	nt d  d  }	t d }
t	
 | j}||t| t||	|
 ||fS )Nr   r   rG   c                    s   g | ]} | qS r   r   )r   r   rP   r   r   r   W  s    z$nvidia_transform.<locals>.<listcomp>)rQ   r
  rO   rB   r   r   r?   ri   r   r8   r,   rA   rh   )rj   r  r  rd   r   r  ld	new_stater|   dim1dim2r   ptrr   rP   r   nvidia_transform:  s*   "r        `?r   rd   r   c              	   C   sB  |   dk rtd|    d|dkrtd| |dk r)|dkr)dd|  }|du r7tjd	tj| jd
}t| |g t| j}| jtjkr]t	
t| t|t|t|    n#| jtjkrxt	t| t|t|t|    ntd| j t| |dk rtd| }tdd| | j}|| }|S )a  
    Estimates 256 equidistant quantiles on the input tensor eCDF.

    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
    and the extreme quantiles close to 0 and 1 have high variance / large estimation
    errors. These large errors can be avoided by using the offset variable which trims
    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
    usually has a much lower error but is not a minimum entropy encoding. Given an offset
    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor. Any shape.
    out : torch.Tensor
        Tensor with the 256 estimated quantiles.
    offset : float
        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
    num_quantiles : int
        The number of equally spaced quantiles.

    Returns
    -------
    torch.Tensor:
        The 256 quantiles in float32 datatype.
    r   zQQuantile estimation needs at least 256 values in the Tensor, but Tensor had only z values.zgCurrently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles=r  r   rG   N)r   rN   zNot supported data type r      )rf   ru   r;   r  rp   rB   r   r   rO   r   cestimate_quantiles_fp32rh   r?   rr   rX   float16cestimate_quantiles_fp16r   roundr   longto)rj   rd   r   r   rB   stepidxr   r   r   r   d  s2   "
**r   c                   @   s   e Zd ZdZdZdd eD Zg dZ							dddZd	d
 Ze	de
eef dejdd fddZdddZdd Zdd ZdS )
QuantStatezWcontainer for quantization state components to work with Params4bit and similar classesfp4nf4c                 C   s   g | ]}d | qS )bitsandbytes__r   )r   xr   r   r   r     s    zQuantState.<listcomp>)absmax	quant_mapnested_absmaxnested_quant_mapquant_state
quant_type	blocksizerO   rQ   nested_blocksizenested_dtypenested_offsetNc	           	      C   s>   || _ || _|| _|| _|| _|| _|| _|| _|d u| _d S r"   )	r!  rQ   r   rO   r'  r&  r   state2nested)	r   r!  rQ   r   r'  r&  rO   r   r+  r   r   r   r      s   zQuantState.__init__c                 C   sR   | j r| j| j| j| j| j| jg| jg}|| S | j| j| j| jd| jg}|| S )a$  
        ensures compatibility with older quant state scheme with nested lists.
        assumes the following layout:
        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
        N)r,  r!  rQ   rO   r'  r   r+  r&  )r   r  	list_reprr   r   r   __get_item__  s   

zQuantState.__get_item__qs_dictrB   r   c              
   C   sp  dd |  D }t|sd|vrtdt|dks'|d dd | jvr3td	| j d
| dt|dkrG|d }|t|| dd |  D }t|	 
| js\J d|v rtt|d |}| |d ||d |d |tt|d d}nd\}}| |d |d ||d |d |tt|d |d durt|d nd||d}|S )aO  
        unpacks components of state_dict into QuantState
        where necessary, convert into strings, torch.dtype, ints, etc.

        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.

        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
        c                 S   s(   g | ]\}}d |v rt |tjr|qS )r%  
isinstancer;   r	   r   kr   r   r   r   r     s   ( z(QuantState.from_dict.<locals>.<listcomp>r&  z<Expected packed or unpacked quant_state items, found neitherr   r   .r.   z@There should be exactly one `quant_state` item with ending from z.
Detected c                 S   s    i | ]\}}| d d |qS )r4  r.   )splitr2  r   r   r   
<dictcomp>  s     z(QuantState.from_dict.<locals>.<dictcomp>r#  r*  r(  r$  r)  )r!  r'  r   rO   NNr!  r'  r"  rO   rQ   N)r&  r!  r'  r   rO   rQ   r   r+  )itemsr   r   r5  valid_qs_type_keysupdater   popr   keysissubsetvalid_qs_keysr;   r   floatr  rq   Size)r+   r/  rB   qs_keyfirst_qs_keyr   r+  r%  r   r   r   	from_dict  s@   $
zQuantState.from_dictFc                 C   s   | j | j| j| jt| jdt| jd}| j	r6|
| jj| jj| jj t| jjd| j d |s:|S dd | D }dd | D }t||d| j  < |S )z
        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
        ztorch.)r&  r!  r'  r"  rO   rQ   )r#  r(  r$  r)  r*  c                 S   s"   i | ]\}}t |tjr||qS r   r0  r2  r   r   r   r6  1     " z&QuantState.as_dict.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |tjs||qS r   r0  r2  r   r   r   r6  2  rD  zquant_state.bitsandbytes__)r&  r!  r'  r   strrO   striptuplerQ   r,  r:  r+  cloner   itemr8  r   )r   packedr/  qs_packed_dictnon_tensor_dictr   r   r   as_dict  s,   
	zQuantState.as_dictc                 C   s\   | j || _ | j|| _| jr,| j|| _| jj|| j_| jj || j_ d S d S r"   )r   r  r!  r,  r   r+  )r   rB   r   r   r   r  6  s   zQuantState.toc                 C   s   t |tsdS tj| j|jddo^| j|jko^tj| j|jddo^| j|jko^| j|jko^| j	|j	ko^| j
d urC|j
d urC| j
|j
kn| j
|j
u o^| jd urY|jd urY| j|jkS | j|ju S )NFgư>)atol)r1  r  r;   allcloser!  rQ   r   rO   r'  r&  r   r+  )r   otherr   r   r   __eq__?  s,   







zQuantState.__eq__)NNNNNNNr3   )r4   r5   r6   __doc__valid_quant_typesr9  r>  r    r.  r7   r   rE  r   r;   rB   rC  rM  r  rQ  r   r   r   r   r    s&    
"
2 	r     r   r!  c                 C   s  |du rdt vrt | jt d< t d }|du r.|  }||   }tj|f| jtjd}|du r:tj| tj	d}| jj
dkr|dv sFJ || j}t| ||g t| L t|t| t|t|t|t|  f}| jtjkr|tj|  n | jtjkrtj|  n| jtjkrtj|  ntd| j W d   n1 sw   Y  n| }tt|t| t|t|t|t|   |r| }	||	8 }t||dd	\}
}t|
||| j|	|d
}||fS t|||| jd}||fS )aW  Quantize a tensor in blocks of values.

    The input tensor is quantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is calculated for scaling
    the non-linear quantization.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor.
        - [`QuantState`]: The state object used to undo the quantization.
    NdynamicrB   rO   r   cpurT           r      @   ?Blockwise quantization only supports 16/32-bit floats, but got F)r'  r,  )r!  r   r'  rO   r   r+  r!  r   r'  rO   )	name2qmapr   r  rB   rf   r;   r  rp   
zeros_likers   typer   rK   rh   r?   ri   rX   rO   r  r   cquantize_blockwise_fp16bfloat16cquantize_blockwise_bf16cquantize_blockwise_fp32r   rW  cquantize_blockwise_cpu_fp32
c_longlongmeanquantize_blockwiser  )rj   r   r!  rd   r'  r,  r   blocksargsr   qabsmaxr+  r%  r   r   r   rj  W  sp   #
		rj  r%  r'  c                 C   s  |dus
|dus
J |du r#|du r#dt vrt | jt d< t d }|du r0t|||tjd}|j}|jrLt	|j|j
}||j7 }|jtjkrL| }|du r[tj| j|j| jd}| jjdkr|j| j}|jdvrvtd|j dt| ||g t| b t|jt| t|t|t|jt|  t| f}|jtjkrtj|  n |jtjkrtj|  n|jtjkrtj |  ntd	|j W d   |S W d   |S W d   |S 1 sw   Y  |S |j! }t"t|t| t|t|t#|jt#|   |S )
a  Dequantize a tensor in blocks of values.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_blockwise`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
            Ignored when `quant_state` is provided.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
            Ignored when `quant_state` is provided.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `torch.Tensor`:
            The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
    NrU  r_  rN   rW  rX  zThe blocksize of zJ is not supported. Supported values: [4096, 2048, 1024, 512, 256, 128, 64]r^  )$r`  r   r  rB   r  r;   rp   r!  r,  dequantize_blockwiser+  r   rO   r?  emptyrQ   rb  r   r'  r   r   rK   rh   r?   rX   rf   r   r  r   cdequantize_blockwise_fp16rd  cdequantize_blockwise_bf16cdequantize_blockwise_fp32rW  cdequantize_blockwise_cpu_fp32rh  )rj   r%  r!  r   rd   r'  r,  rl  r   r   r   rn    sv   )









	rn  r]  c                 C   s   |d u rd}d }| dkr	 g d}n(| dkrg d}n| dkr$g d}n| dkr:|d	kr6g d
d d d }nt d|d u rFt d|  dtj||d}||   | dks^J |S )Nr<   r  )r   g    6Gg    fg    TFٿg   I4ҿg   ০ǿg    Or   g   __?g   `\?g   ?g   @g?g    4?g   ` ?g   `v"?r   r  )r   g      ?g       @g      (@g      @g      @r   g      @r   g      g       g      (g      g      g       g      int4)r      r   rF   r   rG   r   r   r   r.   iiaf4r]  )r   g|8geg:Kڞ׿gH2퓊cпg}Yu-ÿgQ	#(Dr   gF?g`_?g
0E?gL_߹E?gƶ=?ga@?gкv-?r   r.   z94-bit AbnormalFloats currently only support blocksize 64.z	Typename z not supportedrB      )ru   r;   r   div_r   r   rf   )typenamerB   r'  r   r   r   r   get_4bit_type%  s,   




r  c                 C      t | ||||d|S Nr  quantize_4bitrj   r!  rd   r'  compress_statisticsquant_storager   r   r   quantize_fp4w     r  c                 C   r  Nr  r  r  r   r   r   quantize_nf4  r  r  r  c              
   C   s  | j jdkrtd| j j |dvrtd| d|  }| j}|du r7||   }	tj|	f| j tjd}|du rPt| d }
tj|d	 |
 d	f|| j d
}|dv sVJ t	| ||g t
| f dt| t|t|t|t|f}| jtjkr|dkrtj|  n:tj|  n4| jtjkr|dkrtj|  n$tj|  n| jtjkr|dkrtj|  ntj|  ntd| j W d   n1 sw   Y  t|| j d}|r| }||8 }t|dd\}}~t||| j|||||d}||fS t||| j|||d}||fS )a  Quantize tensor A in blocks of 4-bit values.

    Quantizes tensor A by dividing it into blocks which are independently quantized.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 64.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
        quant_storage (`torch.dtype`, *optional*): The dtype of the tensor used to store the result. Defaults to `torch.uint8`.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        Tuple[`torch.Tensor`, `QuantState`]: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor with packed 4-bit values.
        - [`QuantState`]: The state object used to undo the quantization.
    r<   z0Device type not supported for FP4 quantization: r  4-bit quantization data type  is not implemented.NrV  rG   r   rN   rX  r  r^  r{  r   )r'  )r!  rQ   rO   r'  r   r&  r   r+  )r!  rQ   rO   r'  r   r&  )rB   rb  ru   rf   rQ   r;   r  rp   rS   r   rK   rh   r?   ri   rX   rO   rd  r   cquantize_blockwise_bf16_fp4cquantize_blockwise_bf16_nf4r  cquantize_blockwise_fp16_fp4cquantize_blockwise_fp16_nf4cquantize_blockwise_fp32_fp4cquantize_blockwise_fp32_nf4r   r  ri  rj  r  )rj   r!  rd   r'  r  r&  r  r   input_shaperk  modrl  r   r   rm  r+  r  r   r   r   r    s~   !
		r  c                 C      t | ||||dS r  dequantize_4bitrj   r%  r!  rd   r'  r   r   r   dequantize_fp4     r  c                 C   r  r  r  r  r   r   r   dequantize_nf4  r  r  c           	   	   C   s  |dvrt d| d|dvrtd| d|du r2|dur$|dus&J t||j|j||d}n|j}|jrNt|j|j}||j	7 }|jt
jkrN| }|du r]t
j|j|j| jd	}| }t| ||g t| }t| k dt| t|t|t|jt||f}|jt
jkr|jd
krtj|  n<tj|  n6|jt
jkr|jd
krtj|  n%tj|  n|jt
jkr|jd
krtj|  ntj |  nt d|j W d   n1 sw   Y  | jd dkr|! S |S )a  Dequantizes a packed 4-bit quantized tensor.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_4bit`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 64.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.

    Raises:
        ValueError: Raised when the input data type or blocksize is not supported.

    Returns:
        `torch.Tensor`: The dequantized tensor.
    )rY  rT  rZ  r[  r   r\  r]  zThe blockwise of zJ is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]r  r  r  N)r!  rQ   rO   r'  r&  rN   r  r^  r   r   )"r   ru   r  rQ   rO   r!  r,  rn  r+  r   r;   rp   r?  ro  rB   rf   r   r   rK   rh   r?   rX   r'  rd  r&  r   cdequantize_blockwise_bf16_fp4cdequantize_blockwise_bf16_nf4r  cdequantize_blockwise_fp16_fp4cdequantize_blockwise_fp16_nf4cdequantize_blockwise_fp32_fp4cdequantize_blockwise_fp32_nf4r1   )	rj   r%  r!  rd   r'  r&  r   streamrl  r   r   r   r    sj   #
	






r  c                 C   sx   |d u rdt vrt | jt d< t d }|| j}t|  }|jtjkr,|	 }| | }t
|||}|||ffS )NrU  )r`  r   r  rB   r;   r   r   rO   rp   r?  quantize_no_absmax)rj   r   rd   r!  inpr   r   r   quantizex  s   r  r  c                 C   s~   |d us
|d us
J |d u r)|d u r)dt vrt | jt d< t d }|| j}|d u r1||f}t| |d |}||d  S )NrU  r   r   )r`  r   r  rB   dequantize_no_absmax)rj   r  r!  r   rd   r   r   r   
dequantize  s   r  c              	   C   s`   t | j}|du rtj| tjd}t| |g tt|t| t|t	
|   t| |S )a  
    Quantizes input tensor to 8-bit.

    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
    `out` using the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor, optional
        The output tensor. Needs to be of type byte.

    Returns
    -------
    torch.Tensor:
        Quantized 8-bit tensor.
    Nr   )r   rB   r;   ra  rs   r   r   	cquantizerh   r?   rX   rf   r   )rj   r   rd   rC   r   r   r   r    s   
&r  c              	   C   sl   t | j}|du rtj| tjd}t|| |g t| }tt	|t	| t	|t
|  | t| |S )a  
    Dequantizes the 8-bit tensor to 32-bit.

    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
    the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The 8-bit input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        The 32-bit output tensor.

    Returns
    -------
    torch.Tensor:
        32-bit output tensor.
    Nr   )r   rB   r;   ra  rp   r   r   r   cdequantizerh   r?   rX   rf   r   )rj   r   rd   rC   r  r   r   r   r    s   
(r  r   r   optimizer_namegr   state1beta1epsr  lrr+  beta2beta3alphaweight_decaygnorm_scale	unorm_vec	max_unormc                 C   sf  d}|dkrt |j }d}|jt jkrt|  d }n.|jt jkr*t|  d }n!|jt jkr?t	t|  dkr?t|  d }nt
d|j d|j t|||||g t|Q |t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|  W d   dS 1 sw   Y  dS )	az  
    Performs an inplace optimizer update with one or two optimizer states.

    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer: {adam}.
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Optimizer state 1.
    beta1 : float
        Optimizer beta1.
    eps : float
        Optimizer epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    state2 : torch.Tensor
        Optimizer state 2.
    beta2 : float
        Optimizer beta2.
    beta3 : float
        Optimizer beta3.
    alpha : float
        Optimizer alpha.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    skip_zeros : bool
        Whether to skip zero-valued gradients or not (default: False).
    r   Nr   r   r   rG   AGradient+optimizer bit data type combination not supported: grad , optimizer )r;   r   r   r?  rO   rp   str2optimizer32bitr  rd  r   r   r   rK   rh   r?   rr   ri   c_boolrf   )r  r  r   r  r  r  r  r  r+  r  r  r  r  r  r  r  
skip_zeros
param_norm
optim_funcr   r   r   optimizer_update_32bit  sH   >
"r  zyThis function is deprecated and will be removed in a future release. Please use optimizer_update_8bit_blockwise instead. qmap1qmap2max1max2new_max1new_max2c                 C   s  d}|dkrt |j }t|j}t||||||
|||||g |jt jkr|jt j	krt
|  d t|t|t|t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|  nk|jt jkr|jt j	krt
|  d t|t|t|t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|  ntd|j d|j t| dS )a  
    Performs an inplace Adam update.

    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
    Uses AdamW formulation if weight decay > 0.0.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer. Choices {adam, momentum}
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Adam state 1.
    state2 : torch.Tensor
        Adam state 2.
    beta1 : float
        Adam beta1.
    beta2 : float
        Adam beta2.
    eps : float
        Adam epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    qmap1 : torch.Tensor
        Quantization map for first Adam state.
    qmap2 : torch.Tensor
        Quantization map for second Adam state.
    max1 : torch.Tensor
        Max value for first Adam state update.
    max2 : torch.Tensor
        Max value for second Adam state update.
    new_max1 : torch.Tensor
        Max value for the next Adam update of the first state.
    new_max2 : torch.Tensor
        Max value for the next Adam update of the second state.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    r   r   r   r  r  N)r;   r   r   r?  r   rB   r   rO   rp   rs   str2optimizer8bitrh   r?   rr   ri   rf   r  r   r   )r  r  r   r  r+  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rC   r   r   r   optimizer_update_8bitG  sr   M


r  absmax1absmax2c                 C   sv  d }|j tjkr|j tjkrt|  d }n:|j tjkr(|j tjkr(t|  d }n'|j tjkrC|j tjkrCtt|  dkrCt|  d }ntd|j  d|j  t	||||||||g t
|R |t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|t|  W d    d S 1 sw   Y  d S )Nr   r   r   rG   r  r  )rO   r;   rp   rs   str2optimizer8bit_blockwiser  rd  r   r   r   rK   rh   r?   rr   ri   r  rf   )r  r  r   r  r+  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   optimizer_update_8bit_blockwise  sH   
"r  grad	gnorm_vec
percentilec           
   	   C   s   t | j}t| |g | jtjkr&tt| t|t	
|t	
|   n$| jtjkrAtt| t|t	
|t	
|   n	td| j dt| t||d  }t|\}}t|| }d}	||kro|| }	|||	fS )a  Applies percentile clipping

    grad: torch.Tensor
        The gradient tensor.
    gnorm_vec: torch.Tensor
        Vector of gradient norms. 100 elements expected.
    step: int
        The current optimization steps (number of past gradient norms).

    zGradient type z not supported!d   r   )r   rB   r   rO   r;   rp   r   cpercentile_clipping_g32rh   r?   ri   rf   r  cpercentile_clipping_g16r   r   sqrtr   )
r  r  r  r  rC   current_gnormvalsr  
clip_valuer  r   r   r   percentile_clipping  s2   

r  	histogramindex1index2sourcec                 C   s   t | jdks	J | jtjksJ |jtjksJ |jtjks!J |jtjks)J | jjdks1J |jjdks9J |jjdksAJ |jjdksIJ t	| jd }t	|
 }t| |||g tt| t|t|t||| d S )NrG   r<   r   )r   rQ   rO   r;   rp   int32rB   rb  r?   ri   rf   r   r   chistogram_scatter_add_2drh   )r  r  r  r  maxdim1r   r   r   r   histogram_scatter_add_2d<  s   (r  c              
   C   s  t j s
t j  | j|ks|j|kr td| j d|j | j}|j}|}|}	d}
t|dkr|t|dkr||sI|	sI| jd |jd krId}
n|rZ|	sZ| jd |jd krZd}
n|rk|	rk| jd |jd krkd}
n|s{|	r{| jd |jd kr{d}
nt|dkrt|dkr|s|	s| jd |jd krd}
n|r|	s| jd |jd krd}
ny|r|	r| jd |jd krd}
nh|s|	r| jd |jd krd}
nWt|dkr#t|dkr#|s|	s| jd |jd krd}
n8|r|	s| jd |jd krd}
n'|r|	r| jd |jd krd}
n|s#|	r#| jd |jd kr#d}
|d urc|j}|
sbt|dkrbt|dkrb|d |d krb|d |d krb|d |d krb|d |d krbd}
nt|dkrt|dkr|s|	s|d |d f}n|r|	r|d |d f}n|r|	s|d |d f}n|s|	r|d |d f}nt|dkrt|dkr|s|	s|d |d |d f}n|r|	r|d |d |d f}ny|r|	s|d |d |d f}ng|s|	r|d |d |d f}nUt|dkrXt|dkrX|s#|	s#|d |d |d f}n5|r5|	r5|d |d |d f}n#|rG|	sG|d |d |d f}n|sX|	rX|d |d |d f}|
sltd	| d
| d| d
|	 d	|S )Nz3Expected torch.int8 input tensors A and B, but got  and TrG   r   r   Fr   z?Tensor dimensions incorrect for matrix mulitiplication: A x B:  x z with transpose for A x B: r4  )	r;   r<   is_initializedinitrO   	TypeErrorrQ   r   r   )rj   ry   rd   transposed_Atransposed_Bexpected_typesAsBtAtBcorrectsoutr   r   r   check_matmulN  s   

""
"Hr  ry   c                 C   s  |d u rt d|  | jd krt d|j}|d }|j}|jr.t|j|j}||j7 }|d u r\t| jdkrMt	j
| jd | jd |f| j| jd}nt	j
| jd |f| j| jd}d}	|d }
|d }|d }|d }| jd d d }t|| |||jg t|
}
t|	}	t|}t|}t|}t|}t| }t|  |jt	jt	jt	jt	jfv r5| jt	jkrt|
|	|t| t|t|t|jt||||t|j| n`| jt	jkrt|
|	|t| t|t|t|jt||||t|j| n@| jt	jkr-t|
|	|t| t|t|t|jt||||t|j| n td	| j td	| j W d    |S W d    |S W d    |S 1 sYw   Y  |S )
NzIstate cannot be None. gemv_4bit() requires the state from quantize_4bit()r.   zcDimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]r   r   r   sizerO   rB   rG   z%Matmul not implemented for data type )r   rf   rQ   r!  r,  rn  r+  r   r   r;   ro  rO   rB   r   r   r?   ri   r   rK   rs   rd  r  rp   r   cgemm_4bit_inference_naive_fp16rh   r'  cgemm_4bit_inference_naive_bf16cgemm_4bit_inference_naive_fp32ru   )rj   ry   rd   r  r  r  Bshapeboutr!  r   mr3  ldaldcldbr  r   r   r   	gemv_4bit  s   	
(










8
8
88r  c                 C   sv  t | ||||}|d u rtj|tj| jd}t| jdkr>t|jdkr>| jd |jd kr>| jd |jd kr>t| ||S | j}|j}|rUt|dkrU|d |d f}n|rht|dkrh|d |d |d f}|ryt|dkry|d |d f}n|rt|dkr|d |d |d f}t|dkr4| d |jd krd}n| d |jd krd}t| jdkr|  d | jd krd}n)|  d | jd krd}n|  d | jd krd}n|  d | jd krd}t|dkr|d }|  |rdnd }	nt|dkrt|dkr|d |d  }|d }	|d }
|d }| |r,dnd }|d }nHt|dkr|t|dksDJ |d |d krV|d |d ks`t	d| d	| d}d}|d }
|d }|d |d  }|
}|d }	|
}t
 | j}t|| |g t|t|t|t|
t|t|t|t| t|t|t|	t| |S )
Nr  r   r   rG   r   FTzMOnly bsi,bso->io supported for tensor contractions, but dims for A x B were: r  )r  r;   r  r  rB   r   rQ   batched_igemmstrider   r8   r,   rA   r   r   cigemmr?   r  ri   rh   )rj   ry   rd   r  r  r  r  r  r   r  r  r3  r  r  r  r   r   r   igemm
  s   (

$r  c                 C   s  t | jdkrt |jdkstd| j d|j t| ||||}|d u r0tj|tj| jd}| r=|	 d }d}nV|	 }|d |jd krU|
 }|	 d }n>|d |jd krgd	}|	 d }n,|d dkrx|
 }|	 d }n|d dkr|
 }|	 d }n
|
 }|	 d }|  r| 	 d }d}n8| 	 }|d | jd kr| 
 } | 	 d }d}n|d | jd kr| 	 d }d	}n| 
 } | 	 d }d}| jd }	| jd }
|jd }|jd }|}|jd |jd  }| jd | jd  }| jd |jd  }t | j}t|| |g t|t|t|t|t|
t|t|t| t|t|t|t|t|t|t|t|	 |S )
Nr   z@Expected 3-dimensional tensors for bmm, but got shapes A and B: r  r  r   Fr   rG   T)r   rQ   r   r  r;   r  r  rB   is_contiguousr  
contiguousr8   r,   rA   r   r   cbatched_igemmr?   r  ri   rh   c_longc_uint32)rj   ry   rd   r  r  r  r  sr  	num_batchr   r  r3  r  strideAstrideBstrideCr  r   r   r   r  o  s   



r  zeigemmlt is deprecated and will be removed in a future release. Please use int8_linear_matmul instead.SASBSoutc                 C   sn   |d ur|d dkrt d|d  d|d ur(|d dkr(t d|d  dt| |||d}||jdffS )Nr   r   z<Only row-major format inputs are supported, but got format ``zAOnly row-major format is supported for matrix B, but got format `)rd   rO   )ru   int8_linear_matmulrQ   )rj   ry   r  r  rd   r	  rO   resultr   r   r   igemmlt  s   r  c                 C   s  || } }| j }|j }| jtjksJ |jtjksJ | jdks$J d|jdv s-J dt|dks:J d| |du sE|j|ksEJ g |dd |d R }|\}}t|dd }	|d }
|d }|d }|
|ksxJ d	| d
| |
d dkrt| |   	tj
}|dur||}|S |du rtj|| j|d}t| ||g t| g t | j}t| }t|}t|}d}t|}t|	}	t|}t|
}
t|}t|}t| }|tj
krt|||	||||||
|||}nt|||	||||||
|||}W d   n	1 sw   Y  |dkr*td|rFtd|d|d|d|
||fd||	|f
|S )aL  Performs an 8-bit integer matrix multiplication.

    A linear transformation is applied such that `out = A @ B.T`. When possible, integer tensor core hardware is
    utilized to accelerate the operation.

    Args:
        A (`torch.Tensor`): The first matrix operand with the data type `torch.int8`.
        B (`torch.Tensor`): The second matrix operand with the data type `torch.int8`.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor used to store the result.
        dtype (`torch.dtype`, *optional*): The expected data type of the output. Defaults to `torch.int32`.

    Raises:
        `NotImplementedError`: The operation is not supported in the current environment.
        `RuntimeError`: Raised when the cannot be completed for any other reason.

    Returns:
        `torch.Tensor`: The result of the operation.
    rG   z:Only two dimensional matrices are supported for argument B)rG   r   zCOnly two or three dimensional matrices are supported for argument Ar   z(Input tensor dimensions need to be > 0: Nr.   zQint8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = z @ rF   rV  r  z#int8_linear_matmul not implemented!z$cublasLt ran into an error!
	shapeA=z	, shapeB=z	, shapeC=z
	(lda, ldb, ldc)=z
	(m, n, k)=)rQ   rO   r;   r   ndimr   matmulr?  r1   r  r  copy_ro  rB   r   rK   r8   r,   rA   rh   r?   ri   r   r   cigemmlt_32
cigemmlt_8ru   r   )rj   ry   rd   rO   shapeAshapeBshapeCr3  r  r   r  r  r  r  ctxptrAptrBptrCptrRowScaler  	has_errorr   r   r   r    sz   
 
 







" 
r  	row_stats	col_statsr   c                 C   s   | j tjksJ |dur|j tjksJ |du r tj| tjd}t| }t|}t|}t|}t|}	tt| j	dd }
t| j	d }t
| ||||g t|  t|||||	|
|t|  W d   |S 1 sow   Y  |S )a  Performs dequantization on the result of a quantized int8 matrix multiplication.

    Args:
        A (`torch.Tensor` with dtype `torch.int32`): The result of a quantized int8 matrix multiplication.
        row_stats (`torch.Tensor`): The row-wise quantization statistics for the lhs operand of the matrix multiplication.
        col_stats (`torch.Tensor`): The column-wise quantization statistics for the rhs operand of the matrix multiplication.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor to store the output of the operation.
        bias (`torch.Tensor`, *optional*): An optional bias vector to add to the result.

    Returns:
        `torch.Tensor`: The dequantized result with an optional bias, with dtype `torch.float16`.
    Nr   r.   )rO   r;   r  r  
empty_likerh   r?   ri   r   rQ   r   rK   r   cdequant_mm_int32_fp16r   )rj   r  r  rd   r   r  ptrOutptrRowStatsptrColStatsptrBiasnumRowsnumColsr   r   r   int8_mm_dequantK	  s*   

r&  z?mm_dequant is deprecated. Please use int8_mm_dequant() instead.c                 C   s   t | ||||S r"   )r&  )rj   r%  r  r  rd   new_row_statsnew_col_statsr   r   r   r   
mm_dequanty	  s   r)  nnz_block_ptrc                 C   s   |   sJ d}|du s|du r?|  d| jd }|dkr)||k}||d |du r2t| |}|du r?|jddd }|||fS )a   "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The row-wise and column-wise absmax values are determined.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
    The column-wise quantization scales are not typically needed in inference scenarios.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): Input tensor.
        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped.
        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped.
        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the input tensor.
    Nr.   r   r   Fdimkeepdim)is_floating_pointr   r]   rQ   masked_fill_get_row_absmaxamaxr?  )rj   r  r  r*  	thresholdoutlier_maskabsAr   r   r   get_colrow_absmax	  s   !

r5  c              
   C   s   | j tjksJ t| jdd }| jd }tj|ftj| jd}t| g t	| " t
t| t|t|t|t|t|  W d   |S 1 sOw   Y  |S )aT  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
    Nr.   rN   )rO   r;   r  r   rQ   ro  rp   rB   r   rK   r   cget_row_statsrh   r?   rr   ri   r   )rj   r2  r  r  r  r   r   r   r0  	  s$   





r0  c                   @   s4   e Zd ZdedededejdejdejfddZd	S )
COOSparseTensorr  r  nnzrowidxcolidxr   c                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |ks0J || _|| _|| _|| _|| _	|| _
d S r"   )rO   r;   r  r  rf   r  r  r8  r9  r:  r   )r   r  r  r8  r9  r:  r   r   r   r   r    	  s   
zCOOSparseTensor.__init__N)r4   r5   r6   r   r;   r	   r    r   r   r   r   r7  	  s    r7  c                   @      e Zd Zdd ZdS )CSRSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |d ks2J || _|| _|| _|| _|| _	|| _
d S Nr   )rO   r;   r  r  rf   r  r  r8  rowptrr:  r   )r   r  r  r8  r>  r:  r   r   r   r   r    	     
zCSRSparseTensor.__init__Nr4   r5   r6   r    r   r   r   r   r<  	      r<  c                   @   r;  )CSCSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |d ks2J || _|| _|| _|| _|| _	|| _
d S r=  )rO   r;   r  r  rf   r  r  r8  colptrr9  r   )r   r  r  r8  rC  r9  r   r   r   r   r    

  r?  zCSCSparseTensor.__init__Nr@  r   r   r   r   rB  	
  rA  rB  c                 C   sz   t j| jdd\}}|d t j| jd ft j| jjd}|j|	 |
 dd |d t| j| j| j|| j| jS NTreturn_countsr   rN   r   )r:   srcr,  )r;   uniquer9  add_r  r  r  rB   scatter_r  r   cumsum_r<  r  r8  r:  r   )cooAr   countsr>  r   r   r   coo2csr
  s   

rN  c                 C   s   t | j\}}| j| }| j| }t j|dd\}}|d t j| jd ft j	| jj
d}|j| | dd |d t| j| j| j|||S rD  )r;   r   r:  r9  r   rH  rI  r  r  r  rB   rJ  r  r   rK  rB  r  r8  )rL  r   
col2rowidxr9  r   	colvaluesrM  rC  r   r   r   coo2csc#
  s   



rQ  c                 C   sL   t j|ft j|d}t j|ft j|d}t j|f||d}t| |||||S )NrN   )r;   r  r  r7  )r  r  r8  rB   rO   r9  r:  r   r   r   r   	coo_zeros/
  s   rR  zDThis function is deprecated. Please use `int8_double_quant` instead.out_colout_rowc              
   C   s   d}t | |||||d\}}}}}	|dkrN|	durNtjd| jd | jtjd}
| dd|	f }t| jd | jd | |
|	d|	
|	d |}||||  |fS )aT  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The statistics are determined both row-wise and column-wise (transposed).

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip warning={true}>
    This function exists for backwards compatibility only. It is advised to use [`int8_double_quant`] instead.
    The difference is that this function will return a [`COOSparseTensor`] for outliers instead of a column index.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
        row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
        out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
        out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
        - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
        - `COOSparseTensor`, *optional*: A structure representing the outlier values from the input tensor.
    Nr2  r   r   rV  r   )int8_double_quantr;   r   rQ   rB   r  r7  rf   repeat_interleaver  r   r   flattenr?  )rj   r  r  rS  rT  r2  
coo_tensor	quant_row	quant_coloutlier_colsoutlier_rowsoutliersr   r   r   double_quant6
  s*   (		r_  c                 C   s   t | |d\}}}t| |d\}}}	|dkr |	dur | |	d} t| t|d tj	}
|dur:|
|}|durC|
|
}
||
||  |fS )aL  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The statistics are determined both row-wise and column-wise (transposed).

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`int8_vectorwise_quant`] instead.
    This implementation performs additional column-wise transposed calculations which are not optimized.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
        row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
        out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
        out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
        - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    rU  r   Nr   )int8_vectorwise_quantr5  masked_fillr;   r  mulC	unsqueezer  r   r  rX  r?  )rj   r  r  rS  rT  r2  rZ  r\  r   r3  r[  r   r   r   rV  x
  s   *"

rV  statsc                 C   s   | | dd d S )aY  Dequantizes a tensor with dtype `torch.int8` to `torch.float32`.

    Args:
        A (`torch.Tensor` with dtype `torch.int8`): The quantized int8 tensor.
        stats (`torch.Tensor` with dtype `torch.float32`): The row-wise quantization statistics.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
    r.   r   g   @ ?)r]   )rj   re  r   r   r   int8_vectorwise_dequant
  s   rf  c                 C   s"  | j tjksJ t| g t| jdd }| jd }tj|| jtjd}tj| j| jtj	d}d}|dkrL| 
 |k}| rLt|jddd}t| $ tt| t|t|t|t|t|t|  W d   n1 sww   Y  |dkr|durd|dd|f< |||fS )aw  Quantizes a tensor with dtype `torch.float16` to `torch.int8` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input tensor.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    Nr.   rV  r   r   )r,  r   )rO   r;   halfr   r   rQ   ro  rB   rp   r   r   anyargwherer]   rK   r   cint8_vector_quantrh   r?   rr   ri   r   )rj   r2  r  r  r  rT  r\  r^  r   r   r   r`  
  s2   



r`  c                 C   s  t | j}|d u r| j|f}n|d }|d u r)t|d | j| j||d |\}}n|d |f}|d }	t|	dkrHt|	d }
t|	d }nt|	d |	d  }
t|	d }t| |g |dkr|rst	
t| t||
| n}t	t| t||
| np|dkr|rt	t| t||
| n]t	t| t||
| nP|dkr|rt	t| t||
| n=t	t| t||
| n0|dkr|dkrt	t| t||
| n|dkrt	t| t||
| n
td| d	| t| ||fS )
Nr   r   rG   r   r   r   r   z)Transform function not implemented: From r   )r   rB   rQ   r
  rO   r   r?   ri   r   r   ctransform_row2col32Trh   ctransform_row2col32ctransform_row2turingTctransform_row2turingctransform_row2ampereTctransform_row2amperectransform_turing2rowctransform_ampere2rowru   r   )rj   r  r  rd   r   r  r  rC   r  rQ   r  r  r   r   r   	transform
  sF   
$rs  rL  c                 C   s  t | ts2| jr| jtjksJ dt| jd | jd |  |  d 	 |  d 	 | 
 d} |d u rFtj| j|jd f|j|jd}| j}| j |ksRJ | j |ks[J | j
 |ksdJ | j|jd ksnJ | rtdnd}| |r}dnd }|jd }t j}t| j}t| j}	t| j
}
t|}t|}t| j}t| j}t| j}t|jd }t|}t|}t| j| j| j
||g t|||	|
||||||||t| |S )Nz8Tensor must be `COOSparseTensor or a PyTorch COO tensor.r   r   )r  r  r8  r9  r:  r   rV  FT) r1  r7  	is_sparselayoutr;   
sparse_coorQ   _nnzindicesr   r   ro  r  rB   rO   r8  r9  rf   r:  r  r  r  rD   r,   r9   rh   r?   ri   r   r   	cspmm_coor  )rL  ry   rd   r8  r  r  r  r  	ptrRowidx	ptrColidx	ptrValuesr  r  cnnzcrowsAccolsAccolsBcldbcldcr   r   r   spmm_coo,  sj   
	 






r  c                 C   s~  |d u rt j| j|jd f|j| jjd}| j}t|j}| j	
 |ks&J | j
 |ks/J | j
 |ks8J | j|jd ksKJ | j d|j | rQdnd}| |rZdnd }|jd }t j| j	dd\}	}
|
d }t j|
dd\}}| }| }|d d	ksJ d
|d  d|jt jt jfv sJ t|}t|}t|}t| j	}t| j}t| j}t|}t|}t|}t|

 }t| j}t| j}t| j}t|jd }t|jd }t|}t|}t| j	| j| j|||g |jt jkr t|||||||||||||| n|jt jkr9t|||||||||||||| t| |S )Nr   rV  r   z vs FTrE  )
descendingr   z)Current max count per row is 8 but found r4  )r;   r  r  rQ   rB   r   rO   r8  r   r9  rf   r:  r  r  r  rH  cumsumr   r   r  r   rh   r?   ri   r   r    cspmm_coo_very_sparse_naive_fp16 cspmm_coo_very_sparse_naive_int8r   )rL  ry   dequant_statsrd   r8  rC   r  r  r  r   rM  r   	max_countmax_idx	ptrOffsetptrMaxCount	ptrMaxIdxrz  r{  r|  r  r  ptrDequantStats	cnnz_rowsr}  r~  r  crowsBr  r  r  r   r   r   spmm_coo_very_sparseo  s   "
&
 




r  g     _@ztThis function is deprecated and will be removed in a future release. Consider using `int8_vectorwise_quant` instead.vectorc                 C   s$  |dkrt |   }t | | d t j}||fS |dv r>t jt | |dd}t | t|  t j}||fS |dkru| j	}|  } |  | 
  }|dkrWd}d	| }| 
 }t || }	t ||  |	 |	 } | |fS |d
v r| j	}|  } t j| |ddt j| |dd }d||dk< d	| }t j| |dd}t || }	t ||  |	 |	 } | |fS |dkrt  B t | }
t j|
|dd}|d }|
||
k}t | | }||
| | | |< t | | t t j}W d    ||fS 1 sw   Y  ||fS d S )Nlinear   )r  r   Tr+  	zeropointr   r   g     o@)vector-zeropointrow-zeropointtruncated-vectorgffffff?)r;   r   r   r?  r  r  r   r1  rc  rO   minaminno_grad	expand_asr   )r   r,  r&  r  xqrO   dynaqxminxzpxabsxr  r   r   r   r   vectorwise_quant  sV    



r  zvThis function is deprecated and will be removed in a future release. Consider using `int8_vectorwise_dequant` instead.c                 C   s$   |dkr| t  | tj}|S d S )Nr  )rc  r  r;   rp   )r  r  r&  r   r   r   r   vectorwise_dequant  s   r  znThis function is deprecated and will be removed in a future release. Consider using `int8_mm_dequant` instead.c                 C   s  |dkr|| t t   }|  | |S |dkr(d||  }|  | |S |dkrqd||  }|  }t|jdkrIt|jdkrI|d}t|jdkr\t|jdkr\|d}t|jdkrh||9 }n||9 }||S |dkr|  }t|jdkrt|jdkr|d}t|jdkrt|jdkr|d}t|jdkr|d| 9 }n|d| 9 }|d|  9 }||S |d	kr|  }t|jdkrt|jdkr|d}t|jdkrt|jdkr|d}t|jdkr||| t t   9 }n
||| t t   9 }||S |d
v rd|  }t|jdkr/t|jdkr/|d}t|jdkrDt|jdkrD|d}t|jdkrS||t  9 }n||t  9 }||t  9 }||S d S )Nr  r  r   r  r   rG   r   r  r   )r  r  )rc  r?  r  r   rQ   squeezer1   )r  S1S2rO   r&  r   r   r   r   r   vectorwise_mm_dequant  sd   











 
 

r  c                 C   s   |   d|d |d   }|   }t| jdkr(t|jdkr(|d}t|jdkr8|| d 9 }n||d 9 }||d d 9 }||7 }||S )Nr   r   rG   r   r  )r?  r1   sumr   rQ   r  r  )r  rj   ry   r  r  rO   r   r   r   r   r   dequant_min_max@  s   "

r  c                 C   s   |d }|d }|dv sJ | j jdksJ tj|d | ftj| j d}t| }t|d }t|d }t| }	t|}
t|}t	| j }|dkr[t
|	|
|||| n|dkrit
|	|
|||| t| |S )Nr   r   )r   r   r<   rN   r   r   )rB   rb  r;   r  rf   r   r?   ri   rh   r   r   cextractOutliers_turingcextractOutliers_amperer   )rj   r  r  r  formatArd   idx_sizer  r  r  ptrIdxr   rC   r   r   r   extract_outliersO  s$    
r  c                 C   s6   t | }tt| t|t|  t| |S r"   )r;   ra  r   cpipeline_testrh   r?   rU   rf   )rj   
batch_sizerd   r   r   r   pipeline_testi  s   
(r  r3   )T)NTr"   )Tr   T)r   T)Tr   rG   r   )Tr   r   )r   )r   F)r   NFNN)Nr  r   )NNNrT  F)NNNNrT  F)Nr]  )NNNr]  )NNNr]  r  r7  )NNNN)	Nr   r   r   r   r   Nr   F)r   r   Nr   )r   r   F)r   )NFFN)NFF)NNNr   )r   )NNNNr   )r   r  )r  )ctypesr?   r   mathr   typingr   r   r   r   r   r   numpyrY   r;   r	   typing_extensionsr
   bitsandbytes.utilsr   r   
cextensionr   r`  compiled_with_cudacadam32bit_grad_fp32cadam32bit_grad_fp16cadam32bit_grad_bf16cmomentum32bit_grad_32cmomentum32bit_grad_16crmsprop32bit_grad_32crmsprop32bit_grad_16clion32bit_grad_fp32clion32bit_grad_fp16clion32bit_grad_bf16cadagrad32bit_grad_32cadagrad32bit_grad_16cademamix32bit_grad_fp32cademamix32bit_grad_fp16cademamix32bit_grad_bf16r  cadam_static_8bit_grad_32cadam_static_8bit_grad_16cmomentum_static_8bit_grad_32cmomentum_static_8bit_grad_16crmsprop_static_8bit_grad_32crmsprop_static_8bit_grad_16clion_static_8bit_grad_32clion_static_8bit_grad_16r  cadam_8bit_blockwise_grad_fp32cadam_8bit_blockwise_grad_fp16cadam_8bit_blockwise_grad_bf16"cmomentum_8bit_blockwise_grad_fp32"cmomentum_8bit_blockwise_grad_fp16"cmomentum_8bit_blockwise_grad_bf16!crmsprop_8bit_blockwise_grad_fp32!crmsprop_8bit_blockwise_grad_fp16!crmsprop_8bit_blockwise_grad_bf16clion_8bit_blockwise_grad_fp32clion_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_bf16!cadagrad_8bit_blockwise_grad_fp32!cadagrad_8bit_blockwise_grad_fp16!cadagrad_8bit_blockwise_grad_bf16"cademamix_8bit_blockwise_grad_fp32"cademamix_8bit_blockwise_grad_fp16"cademamix_8bit_blockwise_grad_bf16r  r   r8   rD   rS   rp   r  rd  rs   r   rB   FIRST_CUDA_DEVICEr<   device_countrK   rL   re   r/   r~   r   FutureWarningr   r   r   r   r   r   r   r   r   Streamr   r@   r   rh   r   r   r   r
  r  r?  r   r  rj  r   rn  r  r  r  r  r  r  r  r  r  r  r  rE  r  r  r  r  r  r  r  r  r  r  r@  r  r  r&  r)  r5  r0  r7  r<  rB  rN  rQ  rg  rR  r_  rV  rf  r`  rs  r  r  rc  r  r  r  r  r  r  r   r   r   r   <module>   s   
%"












+7


$



'(
B 4
j
fT


q



h

$
$(	

g	
 	


@ *W
k
h
`	(k

.
9%	
C
:83
CR,
;


