o
    2wi#I                 ,   @   s0
  d dl mZ d dlZd dlZd dlmZ d dlmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZmZ dd	lmZ i Z	 ejejejfejejfejejfejejej fej!ej"fejejejfej#ej$ej%fd
Z&ej'ej(fej)ej*fej+ej,fej-ej.fej'ej(fej)ej*fdZ/ej0ej1ej2fej3ej4ej5fej6ej7ej8fej9ej:ej;fej<ej=ej>fej?ej@ejAfdZBG dd dZCG dd dZDG dd dZEejFdd dZGejHI dkrdejfddZJnd dlKZKdejfddZJejLeGdddZMddejfddZNdd!d"ZOdd#d$ZPdd%d&ZQdd(d)ZRdd+d,ZSdd/d0ZTdd2d3ZUed4eVd5dd6d7ZWd8ee	ej  fd9d:ZXd;ed<ejYfd=d>ZZde	e d<e	ejY fd?d@Z[ed4eVd5		A	BddedCe	ej dDe\d<efdEdFZ]G dGdH dHZ^				I	ddejdJe	ej dKe	ej dCe	ej d<e_eje^f f
dLdMZ`					I	ddejdNe	e^ dKe	ej dJe	ej dCe	ej dOead<ejfdPdQZbddSdTZcdddRdejdfdejdKe	ej dCe	ej fdUdVZedddRdejdfdejdKe	ej dCe	ej fdWdXZfdddRddYejdfdejdKe	ej dCe	ej d<e_eje^f fdZd[Zg				RddejdNe	e^ dKe	ej dCe	ej dOead<ejfd\d]Zh				RddejdNe	e^ dKe	ej dCe	ej dOead<ejfd^d_Zi				R	YddejdNe	e^ dKe	ej dCe	ej dOead<ejfd`daZjed4eVd5		ddedJe	ej dCe	ej d<e_ee_eef f fdbdcZked4eVd5				ddedde	e_eef  dKe	ej dJe	ej dCe	ej d<efdedfZled4eVd5ddedJedCe	ej d<efdgdhZmed4eVd5ddedJedCe	ej d<efdidjZn		k	k	k	k	l		k	ddmeodnedoedpedqe\dre\dseadte\due	ej dve\dwe\dxe\dye\dze\d{e	ej d|e\d<df"d}d~ZpedeVd5	k	l		kddmeodnedoedpedue	ej dqe\dve\dre\dseadte\dede	ej dede	ej dede	ej dye\dze\d{e	ej d|e\d<df*ddZq	k	l	ddmeodnedoedpedue	ej dqe\dve\dwe\dxe\dre\dseadte\dede	ej dede	ej dye\dze\d<df&ddZred4eVd5ddededseadeafddZsed4eVd5dedededefddZtejufddZv				ddededCe	ej fddZw			ddededCe	ej fddZx			ddededCe	ej fddZydejzfdejdejdCe	ej fddZ{		ddejdejdejdCe	ej de	ej f
ddZ|ed4eVd5				kddejde	ej de	ej de	ej d<e_ejeje	ej f f
ddZ}ed4eVd5ddejfddZ~G dd dZG dd dZG dd dZdd Zdd ZejfddZ					kddejde	ej de	ej de	ej de	ej f
ddZdejdejfddZddejfddZ	dde
eejf dejdCe	ej fddÄZdddńZdZedeVd5dddʄZed4eVd5ejdfdd̄ZdS )    )IterableN)prod)AnyOptionalUnion)Tensor)
deprecated)pack_dict_to_tensorunpack_tensor_to_dict   )lib)adammomentumrmsproplionadagradlambademamix)r   r   r   r   r   lars)r   r   r   r   r   r   c                   @   s6   e Zd ZdZdd Zdd Zedd Zdd	d
ZdS )GlobalPageManagerNc                 C      t dNzCall get_instance() insteadRuntimeErrorself r   T/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/bitsandbytes/functional.py__init__z      zGlobalPageManager.__init__c                 C   s
   g | _ d S N)paged_tensorsr   r   r   r   
initialize}      
zGlobalPageManager.initializec                 C   &   | j d u r| | | _ | j   | j S r    	_instance__new__r"   clsr   r   r   get_instance      

zGlobalPageManager.get_instanceFc                 C   s$   | j d d d D ]}t|| qd S )N)r!   prefetch_tensor)r   to_cputr   r   r   prefetch_all   s   zGlobalPageManager.prefetch_allF)	__name__
__module____qualname__r&   r   r"   classmethodr*   r0   r   r   r   r   r   w   s    
r   c                   @   s4   e Zd ZdZdd Zdd Zedd Zdd	 ZdS )
CUBLAS_ContextNc                 C   r   r   r   r   r   r   r   r      r   zCUBLAS_Context.__init__c                 C   s
   i | _ d S r    )contextr   r   r   r   r"      r#   zCUBLAS_Context.initializec                 C   r$   r    r%   r(   r   r   r   r*      r+   zCUBLAS_Context.get_instancec                 C   sP   |j | jvr"tj }tj| tt	 | j|j < tj| | j|j  S r    )
indexr7   torchcudacurrent_device
set_devicectc_void_pr   get_context)r   deviceprev_devicer   r   r   r?      s   
zCUBLAS_Context.get_context)	r2   r3   r4   r&   r   r"   r5   r*   r?   r   r   r   r   r6      s    
r6   c                   @   s,   e Zd ZdZdd Zdd Zedd ZdS )Cusparse_ContextNc                 C   r   r   r   r   r   r   r   r      r   zCusparse_Context.__init__c                 C   s   t t | _d S r    )r=   r>   r   get_cusparser7   r   r   r   r   r"      s   zCusparse_Context.initializec                 C   r$   r    r%   r(   r   r   r   r*      r+   zCusparse_Context.get_instance)r2   r3   r4   r&   r   r"   r5   r*   r   r   r   r   rB      s    rB   r:   )r8   ac                 C   s   t j| S r    )r9   r:   	device_ofrD   r   r   r   _cuda_device_of   s   rG   c                 C   s   t  S r    )
contextlibnullcontextrF   r   r   r   rG      r   dtyper@   c                 G   sn   | j t| }tt|}t|ttj}t	j
j||d}tj|| t|d|}d|_|j|_|S )N)shape)rK   countT)itemsizer   r   cget_managed_ptrr=   c_size_tcastPOINTERc_intnp	ctypeslibas_arrayr9   
frombufferviewis_pagedr8   page_deviceid)rK   r@   rL   	num_bytescuda_ptrc_ptr	new_arrayoutr   r   r   	get_paged   s   r`   FAc                 C   sB   | j sJ d|rd}n| j}tt| t| jt| d S )Nz%Only paged tensors can be prefetched!r,   )	rY   rZ   r   	cprefetchget_ptrr=   rP   nbytesc_int32)ra   r.   deviceidr   r   r   r-      s
   $r-   Tc                 C   s   d }|j tjkrttd|  dd }t|}n|j tjkr-ttd|  dd }t|}|d u r8t	d|  t|dd}|rN|rNt
| |d urNt
| |t|t||t|  |jsd|jrktj  d S d S )Nc_fp32_uint8zFunction not implemented: 
is_managedF)rK   r9   float32getattrr   r=   c_floatuint8c_uint8NotImplementedErrorr-   rc   c_int64numelrY   r:   synchronize)	func_namera   Bvalueprefetchfunccvaluerj   r   r   r   elementwise_func   s$   
 rz   c                 C   s   t d| d | d S )Nfillrz   )ra   rv   r@   rw   r   r   r   r{         r{   c                 C   s   t d| |d d S )N_mulr   r|   )ra   ru   r@   r   r   r   r~      r}   r~      c                 C   s   | rdnd}d| }|s|dk r| sd| nd| d }t |d|}d|  }|dkr/|S | d }t |d |  dg|  ||d    S )	N                 r   r         ?   r   )r9   linspacerr   r   tolist)signed
total_bitsadd_zerosigntotal_valuesvaluesgaplr   r   r   create_linear_map  s   0r   +ew?c           	   
   C   s  zddl m} W n ty } ztd|d }~ww |rC|t| ddd d  }dgd }|t| ddd d   }n&|t| ddd d  }dgd	 }|t| ddd d   }|| | }t|}| j	}||
  }| d
ksJ |S )Nr   )normzZScipy is required for `create_normal_map`. Install `bitsandbytes` with the `[test]` extra.g      ?	   r,      r      r   )scipy.statsr   ImportErrorppfr9   r   r   r   sortr   maxrr   )	offsetuse_extra_valuer   iev1v2v3vr   r   r   r   create_normal_map  s.    
$ 
"

r      r   c                 C   s  |}|}| rdnd}|| || ksJ g }g }t td||   d||  dD ]\}	}
|d|
  q)g }ttjddg|d}d|d  }td| D ]I}|D ]D}|dkrZdnd}t t|D ]\}	}||d|	d    7 }qb|dkr~|d|   }n|d|| d    }|| | r||  qRqNt|d| ksJ |  |dk rdt| }t|D ]}	|d q|  t	|}||
  }|S )Nr   r   r   )repeatr   r   )	enumeraterangeappendlist	itertoolsproductlenr   r9   tensorr   )r   exponent_bitsprecision_bitsr   ephas_signevaluespvaluesivalr   lstbiasevaluebit_patternrv   pvalr   coder   r   r   create_fp8_map2  sD   *

r      c                 C   s  g }|d }d||  d }t |D ]U}t| r!d|| |  d nd|| | d  d }tjdd|tjd}|dd |dd  d }	|d|d  |  |	  7 }| rg|d|d  |   |	  7 }q|d	krtjdd|d tjd}|dd |dd  d }	|d|d  |  |	  7 }| r|d|d  |   |	  7 }|d	 |d
 t|d| ksJ dt| }
t |
D ]}|d	 q|  tj	|tjdS )a+  
    Creates the dynamic quantiztion map.

    The dynamic data type is made up of a dynamic exponent and
    fraction. As the exponent increase from 0 to -7 the number
    of bits available for the fraction shrinks.

    This is a generalization of the dynamic type where a certain
    number of the bits and be reserved for the linear quantization
    region (the fraction). n determines the maximum number of
    exponent bits.

    For more details see
    (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561]
    r   r   g?rK   Nr,          @
   r   r   r   )
r   intr9   r   rk   r   r   r   r   r   )r   max_exponent_bitsr   datanon_sign_bitsadditional_itemsr   fraction_items
boundariesmeansr   r   r   r   create_dynamic_map]  s:     

r   zDThis function is deprecated and will be removed in a future release.)categoryc                 C   sn   t | d| d d}| }|d dt| }t|D ]}|d q|  t|}||   }|S )Nr   r   )num_quantilesr   r   )	estimate_quantilesr   r   r   r   r   r   absr   )ra   r   qr   r   r   r   r   create_quantile_map  s   
r   tensorsc                 C   s   d}t  }| D ]}|durt|dds||jM }||jj q|s.tddd | D  t|dkr@td	d
d | D  |S )ap  Verifies that the input tensors are all on the same device.

    An input tensor may also be marked as `paged`, in which case the device placement is ignored.

    Args:
        tensors (`Iterable[Optional[torch.Tensor]]`): A list of tensors to verify.

    Raises:
        `RuntimeError`: Raised when the verification fails.

    Returns:
        `Literal[True]`
    TNrY   FzZAll input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
 c                 S      g | ]}|j |jfqS r   rL   r@   .0r/   r   r   r   
<listcomp>      zis_on_gpu.<locals>.<listcomp>r   zcInput tensors need to be on the same GPU, but found the following tensor and device combinations:
 c                 S   r   r   r   r   r   r   r   r     r   )setrl   is_cudaaddr@   r8   r   r   )r   on_gpugpu_idsr/   r   r   r   	is_on_gpu  s    
r   r   returnc                 C   s   t tj| jjS r    )r=   r>   r9   _C_cuda_getCurrentRawStreamr@   r8   )r   r   r   r   _get_tensor_stream  s   r   c                 C   s   | du rdS t |  S )zGets the memory address of the first element of a tenso

    Args:
        A (`Optional[Tensor]`): A PyTorch tensor.

    Returns:
        `Optional[ct.c_void_p]`: A pointer to the underlying tensor data.
    N)r=   r>   data_ptr)ra   r   r   r   rc     s   	rc         `?r   r_   r   c              
   C   sX  |   dk rtd|    d|dkrtd| |dk r)|dkr)dd|  }|du r7tjd	tj| jd
}t| L t| |g | jtjkr]t	
t| t|t|t|    n#| jtjkrxt	t| t|t|t|    ntd| j W d   n1 sw   Y  |dk rtd| }tdd| | j}|| }|S )a  
    Estimates 256 equidistant quantiles on the input tensor eCDF.

    Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles
    via the eCDF of the input tensor `A`. This is a fast but approximate algorithm
    and the extreme quantiles close to 0 and 1 have high variance / large estimation
    errors. These large errors can be avoided by using the offset variable which trims
    the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it
    trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02
    usually has a much lower error but is not a minimum entropy encoding. Given an offset
    of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor. Any shape.
    out : torch.Tensor
        Tensor with the 256 estimated quantiles.
    offset : float
        The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles)
    num_quantiles : int
        The number of equally spaced quantiles.

    Returns
    -------
    torch.Tensor:
        The 256 quantiles in float32 datatype.
    r   zQQuantile estimation needs at least 256 values in the Tensor, but Tensor had only z values.zgCurrently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles=r   r   r   N)r   rJ   zNot supported data type r      )rr   rp   r9   zerosrk   r@   rG   r   rK   r   cestimate_quantiles_fp32rc   r=   rm   rS   float16cestimate_quantiles_fp16roundr   longto)ra   r_   r   r   stepidxr   r   r   r     s4   #
**
r   c                   @   s   e Zd ZdZdZdd eD Zg dZ							dddZd	d
 Ze	de
eef dejdd fddZdddZdd Zdd ZdS )
QuantStatezWcontainer for quantization state components to work with Params4bit and similar classes)fp4nf4c                 C   s   g | ]}d | qS )bitsandbytes__r   )r   xr   r   r   r   %  s    zQuantState.<listcomp>)absmax	quant_mapnested_absmaxnested_quant_mapquant_state
quant_type	blocksizerK   rL   nested_blocksizenested_dtypenested_offsetNc	           	      C   s>   || _ || _|| _|| _|| _|| _|| _|| _|d u| _d S r    )	r   rL   r   rK   r   r   r   state2nested)	r   r   rL   r   r   r   rK   r   r   r   r   r   r   5  s   zQuantState.__init__c                 C   sR   | j r| j| j| j| j| j| jg| jg}|| S | j| j| j| jd| jg}|| S )a$  
        ensures compatibility with older quant state scheme with nested lists.
        assumes the following layout:
        state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type]
        state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type]
        N)r   r   rL   rK   r   r   r   r   )r   r   	list_reprr   r   r   __getitem__J  s   

zQuantState.__getitem__qs_dictr@   r   c              
   C   sp  dd |  D }t|sd|vrtdt|dks'|d dd | jvr3td	| j d
| dt|dkrG|d }|t|| dd |  D }t|	 
| js\J d|v rtt|d |}| |d ||d |d |tt|d d}nd\}}| |d |d ||d |d |tt|d |d durt|d nd||d}|S )aO  
        unpacks components of state_dict into QuantState
        where necessary, convert into strings, torch.dtype, ints, etc.

        qs_dict: based on state_dict, with only relevant keys, striped of prefixes.

        item with key `quant_state.bitsandbytes__[nf4/fp4]` may contain minor and non-tensor quant state items.
        c                 S   s(   g | ]\}}d |v rt |tjr|qS )r   
isinstancer9   r   r   kr   r   r   r   r   j  s   ( z(QuantState.from_dict.<locals>.<listcomp>r   z<Expected packed or unpacked quant_state items, found neitherr   r   .r,   z@There should be exactly one `quant_state` item with ending from z.
Detected c                 S   s    i | ]\}}| d d |qS )r  r,   )splitr  r   r   r   
<dictcomp>w  s     z(QuantState.from_dict.<locals>.<dictcomp>r   r   r   r   r   )r   r   r   rK   NNr   r   r   rK   rL   N)r   r   r   r   rK   rL   r   r   )itemsr   
ValueErrorr  valid_qs_type_keysupdater
   popr   keysissubsetvalid_qs_keysr9   r   floatr   rl   Size)r)   r   r@   qs_keyfirst_qs_keyr   r   r   r   r   r   	from_dict^  s@   $
zQuantState.from_dictFc                 C   s   | j | j| j| jt| jdt| jd}| j	r6|
| jj| jj| jj t| jjd| j d |s:|S dd | D }dd | D }t||d| j  < |S )z
        returns dict of tensors and strings to use in serialization via _save_to_state_dict()
        param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving
        ztorch.)r   r   r   r   rK   rL   )r   r   r   r   r   c                 S   s"   i | ]\}}t |tjr||qS r   r   r  r   r   r   r       " z&QuantState.as_dict.<locals>.<dictcomp>c                 S   s"   i | ]\}}t |tjs||qS r   r   r  r   r   r   r    r  zquant_state.bitsandbytes__)r   r   r   r   strrK   striptuplerL   r   r  r   cloner   itemr  r	   )r   packedr   qs_packed_dictnon_tensor_dictr   r   r   as_dict  s,   
	zQuantState.as_dictc                 C   s\   | j || _ | j|| _| jr,| j|| _| jj|| j_| jj || j_ d S d S r    )r   r   r   r   r   r   )r   r@   r   r   r   r     s   zQuantState.toc                 C   s   t |tsdS tj| j|jddo^| j|jko^tj| j|jddo^| j|jko^| j|jko^| j	|j	ko^| j
d urC|j
d urC| j
|j
kn| j
|j
u o^| jd urY|jd urY| j|jkS | j|ju S )NFgư>)atol)r  r   r9   allcloser   rL   r   rK   r   r   r   r   )r   otherr   r   r   __eq__  s,   







zQuantState.__eq__)NNNNNNNr1   )r2   r3   r4   __doc__valid_quant_typesr
  r  r   r   r5   dictr  r   r9   r@   r  r  r   r"  r   r   r   r   r   !  s&    
"
2 	r      r   r   c                 C   s   |du rdt vrt | jt d< t d }tjjj| || j|\}}|rJ|	 }||8 }t||dd\}	}
t
|	|j| jdd|| j||
d}nt
||j| jdd|| jd}|durb||n|}|duro||j|_||fS )	aW  Quantize a tensor in blocks of values.

    The input tensor is quantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is calculated for scaling
    the non-linear quantization.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor.
        - [`QuantState`]: The state object used to undo the quantization.
    NdynamicF)r   r   T)copy)r   r   r   rK   r   r   r   r   r   rK   )	name2qmapr   r   r@   r9   opsbitsandbytesquantize_blockwisedefaultmeanr   rK   copy_r   )ra   r   r   r_   r   r   _out_absmaxr   qabsmaxr   r   r   r   r   r-    s4   #

	r-  r   r   c                 C   s   |dus
|dus
J |du r#|du r#dt vrt | jt d< t d }|du r0t|||tjd}|j}|jrLt	|j|j
}||j7 }|jtjkrL| }|durgtjjj	j| ||j| j|j|j|d |S tjjj	| ||j| j|j|jS )a  Dequantize a tensor in blocks of values.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_blockwise`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        code (`torch.Tensor`, *optional*):
            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
            For more details, see  (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561].
            Ignored when `quant_state` is provided.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 4096.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
            Ignored when `quant_state` is provided.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        `torch.Tensor`:
            The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
    Nr'  r)  r_   )r*  r   r   r@   r   r9   rk   r   r   dequantize_blockwiser   r   rK   r  r+  r,  r_   r   r   r.  )ra   r   r   r   r_   r   r   r   r   r   r5    s<   )


r5  @   c                 C   s   |d u rd}d }| dkr	 g d}n(| dkrg d}n| dkr$g d}n| dkr:|d	kr6g d
d d d }nt d|d u rFt d|  dtj||d}||   | dks^J |S )Nr:   r   )r   g    6Gg    fg    TFٿg   I4ҿg   ০ǿg    Or   g   __?g   `\?g   ?g   @g?g    4?g   ` ?g   `v"?r   r   )r   g      ?g       @g      (@g      @g      @r   g      @r   g      g       g      (g      g      g       g      int4)r      r         r   r   r   r   r,   iiaf4r6  )r   g|8geg:Kڞ׿gH2퓊cпg}Yu-ÿgQ	#(Dr   gF?g`_?g
0E?gL_߹E?gƶ=?ga@?gкv-?r   r,   z94-bit AbnormalFloats currently only support blocksize 64.z	Typename z not supportedr@      )rp   r9   r   div_r   r   rr   )typenamer@   r   r   r   r   r   get_4bit_typef  s,   




rD  c                 C      t | ||||d|S Nr   quantize_4bitra   r   r_   r   compress_statisticsquant_storager   r   r   quantize_fp4     rL  c                 C   rE  Nr   rG  rI  r   r   r   quantize_nf4  rM  rO  r   c              
   C   s   | j }tjjj| |||\}}	t|| jd}
|r6|	 }t	|	| dd\}}~	t
||| j||
|||d}nt
|	|| j||
|d}|durJ||n|}|durW||j|_||fS )a  Quantize tensor A in blocks of 4-bit values.

    Quantizes tensor A by dividing it into blocks which are independently quantized.

    Args:
        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 64.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
        quant_storage (`torch.dtype`, *optional*): The dtype of the tensor used to store the result. Defaults to `torch.uint8`.

    Raises:
        ValueError: Raised when the input data type is not supported.

    Returns:
        Tuple[`torch.Tensor`, `QuantState`]: A tuple containing the quantization results.
        - `torch.Tensor`: The quantized tensor with packed 4-bit values.
        - [`QuantState`]: The state object used to undo the quantization.
    r@  r   )r   )r   rL   rK   r   r   r   r   r   )r   rL   rK   r   r   r   N)rL   r9   r+  r,  rH  r.  rD  r@   r/  r-  r   rK   r0  r   )ra   r   r_   r   rJ  r   rK  input_shaper1  r2  r   r   r3  r   stater   r   r   rH    sD    

rH  c                 C      t | ||||dS rF  dequantize_4bitra   r   r   r_   r   r   r   r   dequantize_fp4     rV  c                 C   rR  rN  rS  rU  r   r   r   dequantize_nf4%  rW  rX  c              	   C   s   |du r|dur|dusJ t ||j|j||d}n|j}|jr6t|j|j}||j7 }|jtj	kr6|
 }|durNtjjjj| ||j|j|j|j|d ntjjj| ||j|j|j|j}| jd dkrj| S |S )a  Dequantizes a packed 4-bit quantized tensor.

    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
    The the absolute maximum value within these blocks is used for scaling
    the non-linear dequantization.

    Args:
        A (`torch.Tensor`): The quantized input tensor.
        quant_state ([`QuantState`], *optional*):
            The quantization state as returned by [`quantize_4bit`].
            Required if `absmax` is not provided.
        absmax (`torch.Tensor`, *optional*):
            A tensor containing the scaling values.
            Required if `quant_state` is not provided and ignored otherwise.
        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
        blocksize (`int`, *optional*):
            The size of the blocks. Defaults to 64.
            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.

    Raises:
        ValueError: Raised when the input data type or blocksize is not supported.

    Returns:
        `torch.Tensor`: The dequantized tensor.
    N)r   rL   rK   r   r   r4  r   r   )r   rL   rK   r   r   r5  r   r   r9   rk   r  r+  r,  rT  r_   r   r   r.  r/   )ra   r   r   r_   r   r   r   r   r   rT  /  s<   "	


	rT  c                 C   sx   |d u rdt vrt | jt d< t d }|| j}t|  }|jtjkr,|	 }| | }t
|||}|||ffS )Nr'  )r*  r   r   r@   r9   r   r   rK   rk   r  quantize_no_absmax)ra   r   r_   r   inpr   r   r   quantizex  s   r[  rQ  c                 C   s~   |d us
|d us
J |d u r)|d u r)dt vrt | jt d< t d }|| j}|d u r1||f}t| |d |}||d  S )Nr'  r   r   )r*  r   r   r@   dequantize_no_absmax)ra   rQ  r   r   r_   r   r   r   
dequantize  s   r]  c              
   C   sx   t | . |du rtj| tjd}t| |g tt|t| t|t	| 
  W d   |S 1 s5w   Y  |S )a  
    Quantizes input tensor to 8-bit.

    Quantizes the 32-bit input tensor `A` to the 8-bit output tensor
    `out` using the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor, optional
        The output tensor. Needs to be of type byte.

    Returns
    -------
    torch.Tensor:
        Quantized 8-bit tensor.
    Nr   )rG   r9   
zeros_likern   r   r   	cquantizerc   r=   rS   rr   )ra   r   r_   r   r   r   rY    s   
(
rY  c              
   C   s   t | 4 |du rtj| tjd}t|| |g t| }tt|t| t|t	
|  | W d   |S 1 s;w   Y  |S )a  
    Dequantizes the 8-bit tensor to 32-bit.

    Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via
    the quantization map `code`.

    Parameters
    ----------
    A : torch.Tensor
        The 8-bit input tensor.
    code : torch.Tensor
        The quantization map.
    out : torch.Tensor
        The 32-bit output tensor.

    Returns
    -------
    torch.Tensor:
        32-bit output tensor.
    Nr   )rG   r9   r^  rk   r   r   r   cdequantizerc   r=   rS   rr   )ra   r   r_   streamr   r   r   r\    s   
*
r\  r   r   optimizer_namegr   state1beta1epsr   lrr   beta2beta3alphaweight_decaygnorm_scale	unorm_vec	max_unormc                 C   sf  d}|dkrt |j }d}|jt jkrt|  d }n.|jt jkr*t|  d }n!|jt jkr?t	t|  dkr?t|  d }nt
d|j d|j t|||||g t|Q |t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|  W d   dS 1 sw   Y  dS )	az  
    Performs an inplace optimizer update with one or two optimizer states.

    Universal optimizer update for 32-bit state and 32/16-bit gradients/weights.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer: {adam}.
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Optimizer state 1.
    beta1 : float
        Optimizer beta1.
    eps : float
        Optimizer epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    state2 : torch.Tensor
        Optimizer state 2.
    beta2 : float
        Optimizer beta2.
    beta3 : float
        Optimizer beta3.
    alpha : float
        Optimizer alpha.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    skip_zeros : bool
        Whether to skip zero-valued gradients or not (default: False).
    r   Nr   r   r:  r   AGradient+optimizer bit data type combination not supported: grad , optimizer )r9   r   r   r  rK   rk   str2optimizer32bitr   bfloat16r   r	  r   rG   rc   r=   rm   re   c_boolrr   )rb  rc  r   rd  re  rf  r   rg  r   rh  ri  rj  rk  rl  rm  rn  
skip_zeros
param_norm
optim_funcr   r   r   optimizer_update_32bit  sH   >
"rw  zyThis function is deprecated and will be removed in a future release. Please use optimizer_update_8bit_blockwise instead. qmap1qmap2max1max2new_max1new_max2c                 C   s  d}|dkrt |j }t| t||||||
|||||g |jt jkr|jt jkrt	|  d t
|t
|t
|t
|t
|t|t|t|t|t|t|t|	t
|
t
|t
|t
|t
|t
|t|t|t|  nk|jt jkr|jt jkrt	|  d t
|t
|t
|t
|t
|t|t|t|t|t|t|t|	t
|
t
|t
|t
|t
|t
|t|t|t|  ntd|j d|j W d   dS W d   dS 1 sw   Y  dS )a  
    Performs an inplace Adam update.

    Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights.
    Uses AdamW formulation if weight decay > 0.0.

    Parameters
    ----------
    optimizer_name : str
        The name of the optimizer. Choices {adam, momentum}
    g : torch.Tensor
        Gradient tensor.
    p : torch.Tensor
        Parameter tensor.
    state1 : torch.Tensor
        Adam state 1.
    state2 : torch.Tensor
        Adam state 2.
    beta1 : float
        Adam beta1.
    beta2 : float
        Adam beta2.
    eps : float
        Adam epsilon.
    weight_decay : float
        Weight decay.
    step : int
        Current optimizer step.
    lr : float
        The learning rate.
    qmap1 : torch.Tensor
        Quantization map for first Adam state.
    qmap2 : torch.Tensor
        Quantization map for second Adam state.
    max1 : torch.Tensor
        Max value for first Adam state update.
    max2 : torch.Tensor
        Max value for second Adam state update.
    new_max1 : torch.Tensor
        Max value for the next Adam update of the first state.
    new_max2 : torch.Tensor
        Max value for the next Adam update of the second state.
    gnorm_scale : float
        The factor to rescale the gradient to the max clip value.
    unorm_vec : torch.Tensor
        The tensor for the update norm.
    max_unorm : float
        The maximum update norm relative to the weight norm.
    r   r   r   ro  rp  N)r9   r   r   r  rG   r   rK   rk   rn   str2optimizer8bitrc   r=   rm   re   rr   r   r	  )rb  rc  r   rd  r   re  rh  rf  r   rg  rx  ry  rz  r{  r|  r}  rk  rl  rm  rn  ru  r   r   r   optimizer_update_8bitG  sx   M


"r  absmax1absmax2c                 C   sv  d }|j tjkr|j tjkrt|  d }n:|j tjkr(|j tjkr(t|  d }n'|j tjkrC|j tjkrCtt|  dkrCt|  d }ntd|j  d|j  t	||||||||g t
|R |t|t|t|t|t|t|t|t|t|	t|
t|t|t|t|t|t|t|t|t|  W d    d S 1 sw   Y  d S )Nr   r   r:  r   ro  rp  )rK   r9   rk   rn   str2optimizer8bit_blockwiser   rr  r   r	  r   rG   rc   r=   rm   re   rs  rr   )rb  rc  r   rd  r   re  rh  ri  rj  rf  r   rg  rx  ry  r  r  rk  rl  rt  rv  r   r   r   optimizer_update_8bit_blockwise  sH   
"r  grad	gnorm_vec
percentilec           	   
   C   s   t | M t| |g | jtjkr&tt| t|t	|t	| 
  n$| jtjkrAtt| t|t	|t	| 
  n	td| j dW d   n1 sTw   Y  t||d  }t|\}}t|| }d}||krz|| }|||fS )a  Applies percentile clipping

    grad: torch.Tensor
        The gradient tensor.
    gnorm_vec: torch.Tensor
        Vector of gradient norms. 100 elements expected.
    step: int
        The current optimization steps (number of past gradient norms).

    zGradient type z not supported!Nd   r   )rG   r   rK   r9   rk   r   cpercentile_clipping_g32rc   r=   re   rr   r   cpercentile_clipping_g16r	  sqrtr   )	r  r  r   r  current_gnormvalsr   
clip_valuerl  r   r   r   percentile_clipping  s4   

r  	histogramindex1index2sourcec                 C   s   t | jdks	J | jtjksJ |jtjksJ |jtjks!J |jtjks)J | jjdks1J |jjdks9J |jjdksAJ |jjdksIJ t	| jd }t	|
 }t| |||g tt| t|t|t||| d S )Nr   r:   r   )r   rL   rK   r9   rk   int32r@   typer=   re   rr   r   r   chistogram_scatter_add_2drc   )r  r  r  r  maxdim1nr   r   r   histogram_scatter_add_2d:  s   (r  c              
   C   s  t j s
t j  | j|ks|j|kr td| j d|j | j}|j}|}|}	d}
t|dkr|t|dkr||sI|	sI| jd |jd krId}
n|rZ|	sZ| jd |jd krZd}
n|rk|	rk| jd |jd krkd}
n|s{|	r{| jd |jd kr{d}
nt|dkrt|dkr|s|	s| jd |jd krd}
n|r|	s| jd |jd krd}
ny|r|	r| jd |jd krd}
nh|s|	r| jd |jd krd}
nWt|dkr#t|dkr#|s|	s| jd |jd krd}
n8|r|	s| jd |jd krd}
n'|r|	r| jd |jd krd}
n|s#|	r#| jd |jd kr#d}
|d urc|j}|
sbt|dkrbt|dkrb|d |d krb|d |d krb|d |d krb|d |d krbd}
nt|dkrt|dkr|s|	s|d |d f}n|r|	r|d |d f}n|r|	s|d |d f}n|s|	r|d |d f}nt|dkrt|dkr|s|	s|d |d |d f}n|r|	r|d |d |d f}ny|r|	s|d |d |d f}ng|s|	r|d |d |d f}nUt|dkrXt|dkrX|s#|	s#|d |d |d f}n5|r5|	r5|d |d |d f}n#|rG|	sG|d |d |d f}n|sX|	rX|d |d |d f}|
sltd	| d
| d| d
|	 d	|S )Nz3Expected torch.int8 input tensors A and B, but got  and Tr   r   r   Fr:  z?Tensor dimensions incorrect for matrix mulitiplication: A x B:  x z with transpose for A x B: r  )	r9   r:   is_initializedinitrK   	TypeErrorrL   r   r	  )ra   ru   r_   transposed_Atransposed_Bexpected_typesAsBtAtBcorrectsoutr   r   r   check_matmulM  s   

""
"Hr  ru   c              	   C   s~   |d u rt d|j}|jrt||j|j }|d ur/tjjj	j
| ||j||j|j|d |S tjjj	| ||j||j|jS )NzIstate cannot be None. gemv_4bit() requires the state from quantize_4bit()r4  )r	  r   r   r5  r   r   r9   r+  r,  	gemv_4bitr_   rL   r   r   r.  )ra   ru   r_   r  r  rQ  r   r   r   r   r    s0   
	
r  c                 C   sv  t | ||||}|d u rtj|tj| jd}t| jdkr>t|jdkr>| jd |jd kr>| jd |jd kr>t| ||S | j}|j}|rUt|dkrU|d |d f}n|rht|dkrh|d |d |d f}|ryt|dkry|d |d f}n|rt|dkr|d |d |d f}t|dkr4| d |jd krd}n| d |jd krd}t| jdkr|  d | jd krd}n)|  d | jd krd}n|  d | jd krd}n|  d | jd krd}t|dkr|d }|  |rdnd }	nt|dkrt|dkr|d |d  }|d }	|d }
|d }| |r,dnd }|d }nHt|dkr|t|dksDJ |d |d krV|d |d ks`t	d| d	| d}d}|d }
|d }|d |d  }|
}|d }	|
}t
 | j}t|| |g t|t|t|t|
t|t|t|t| t|t|t|	t| |S )
NsizerK   r@   r:  r   r   r   FTzMOnly bsi,bso->io supported for tensor contractions, but dims for A x B were: r  )r  r9   r   r  r@   r   rL   batched_igemmstrider	  r6   r*   r?   r   r   cigemmr=   rs  re   rc   )ra   ru   r_   r  r  r  r  r  r  ldbmr  ldaldcptrr   r   r   igemm  s   (

$r  c                 C   s  t | jdkrt |jdkstd| j d|j t| ||||}|d u r0tj|tj| jd}| r=|	 d }d}nV|	 }|d |jd krU|
 }|	 d }n>|d |jd krgd	}|	 d }n,|d dkrx|
 }|	 d }n|d dkr|
 }|	 d }n
|
 }|	 d }|  r| 	 d }d}n8| 	 }|d | jd kr| 
 } | 	 d }d}n|d | jd kr| 	 d }d	}n| 
 } | 	 d }d}| jd }	| jd }
|jd }|jd }|}|jd |jd  }| jd | jd  }| jd |jd  }t | j}t|| |g t|t|t|t|t|
t|t|t| t|t|t|t|t|t|t|t|	 |S )
Nr:  z@Expected 3-dimensional tensors for bmm, but got shapes A and B: r  r  r   Fr   r   T)r   rL   r	  r  r9   r   r  r@   is_contiguousr  
contiguousr6   r*   r?   r   r   cbatched_igemmr=   rs  re   rc   c_longc_uint32)ra   ru   r_   r  r  r  r  sr  	num_batchr  r  r  r  strideAstrideBstrideCr  r   r   r   r  +  s   



r  c                 C   s2   |durt jjj| || |S t jjj| |S )aL  Performs an 8-bit integer matrix multiplication.

    A linear transformation is applied such that `out = A @ B.T`. When possible, integer tensor core hardware is
    utilized to accelerate the operation.

    Args:
        A (`torch.Tensor`): The first matrix operand with the data type `torch.int8`.
        B (`torch.Tensor`): The second matrix operand with the data type `torch.int8`.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor used to store the result.
        dtype (`torch.dtype`, *optional*): The expected data type of the output. Defaults to `torch.int32`.

    Raises:
        `NotImplementedError`: The operation is not supported in the current environment.
        `RuntimeError`: Raised when the cannot be completed for any other reason.

    Returns:
        `torch.Tensor`: The result of the operation.
    N)r9   r+  r,  int8_linear_matmulr_   r.  )ra   ru   r_   rK   r   r   r   r    s   r  	row_stats	col_statsr   c                 C   s2   t jjjj| ||t j|d}|dur||S |S )a  Performs dequantization on the result of a quantized int8 matrix multiplication.

    Args:
        A (`torch.Tensor` with dtype `torch.int32`): The result of a quantized int8 matrix multiplication.
        row_stats (`torch.Tensor`): The row-wise quantization statistics for the lhs operand of the matrix multiplication.
        col_stats (`torch.Tensor`): The column-wise quantization statistics for the rhs operand of the matrix multiplication.
        out (`torch.Tensor`, *optional*): A pre-allocated tensor to store the output of the operation.
        bias (`torch.Tensor`, *optional*): An optional bias vector to add to the result.

    Returns:
        `torch.Tensor`: The dequantized result with an optional bias, with dtype `torch.float16`.
    )rK   r   N)r9   r+  r,  int8_mm_dequantr.  r   r0  )ra   r  r  r_   r   resultr   r   r   r    s   
r  nnz_block_ptrc                 C   s   |   sJ d}|du s|du r?|  d| jd }|dkr)||k}||d |du r2t| |}|du r?|jddd }|||fS )a   "Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The row-wise and column-wise absmax values are determined.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`get_row_absmax`] instead.
    The column-wise quantization scales are not typically needed in inference scenarios.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): Input tensor.
        row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped.
        col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped.
        nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization statistics.
        - `torch.Tensor` with dtype `torch.bool`, *optional*: A mask indicating the locations of outliers in the input tensor.
    Nr,   r   r   Fdimkeepdim)is_floating_pointr   rX   rL   masked_fill_get_row_absmaxamaxr  )ra   r  r  r  	thresholdoutlier_maskabsAr   r   r   get_colrow_absmax  s   "

r  c              
   C   s   | j tjksJ t| jdd }| jd }tj|ftj| jd}t| g t	| " t
t| t|t|t|t|t|  W d   |S 1 sOw   Y  |S )aT  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.
            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The absolute maximum value for each row, with outliers ignored.
    Nr,   rJ   )rK   r9   r   r   rL   emptyrk   r@   r   rG   r   cget_row_statsrc   r=   rm   re   r   )ra   r  rowscolsr  r   r   r   r    s$   





r  c                   @   s4   e Zd ZdedededejdejdejfddZd	S )
COOSparseTensorr  r  nnzrowidxcolidxr   c                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |ks0J || _|| _|| _|| _|| _	|| _
d S r    )rK   r9   r  r   rr   r  r  r  r  r  r   )r   r  r  r  r  r  r   r   r   r   r   "  s   
zCOOSparseTensor.__init__N)r2   r3   r4   r   r9   r   r   r   r   r   r   r  !  s    r  c                   @      e Zd Zdd ZdS )CSRSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |d ks2J || _|| _|| _|| _|| _	|| _
d S Nr   )rK   r9   r  r   rr   r  r  r  rowptrr  r   )r   r  r  r  r  r  r   r   r   r   r   5     
zCSRSparseTensor.__init__Nr2   r3   r4   r   r   r   r   r   r  4      r  c                   @   r  )CSCSparseTensorc                 C   s   |j tjksJ |j tjksJ |j tjksJ | |ks J | |ks(J | |d ks2J || _|| _|| _|| _|| _	|| _
d S r  )rK   r9   r  r   rr   r  r  r  colptrr  r   )r   r  r  r  r  r  r   r   r   r   r   F  r  zCSCSparseTensor.__init__Nr  r   r   r   r   r  E  r  r  c                 C   sz   t j| jdd\}}|d t j| jd ft j| jjd}|j|	 |
 dd |d t| j| j| j|| j| jS NTreturn_countsr   rJ   r   )r8   srcr  )r9   uniquer  add_r   r  r  r@   scatter_r   r   cumsum_r  r  r  r  r   )cooAr   countsr  r   r   r   coo2csrV  s   

r  c                 C   s   t | j\}}| j| }| j| }t j|dd\}}|d t j| jd ft j	| jj
d}|j| | dd |d t| j| j| j|||S r  )r9   r   r  r  r   r  r  r   r  r  r@   r  r   r   r  r  r  r  )r  r   
col2rowidxr  r   	colvaluesr  r  r   r   r   coo2csc_  s   



r  c                 C   sL   t j|ft j|d}t j|ft j|d}t j|f||d}t| |||||S )NrJ   )r9   r   r  r  )r  r  r  r@   rK   r  r  r   r   r   r   	coo_zerosk  s   r  out_colout_rowc                 C   sT   |durt d|durt d|durt d|dur t dtjjjj| |dS )aL  Determine the quantization statistics for input matrix `A` in accordance to the `LLM.int8()` algorithm.

    The statistics are determined both row-wise and column-wise (transposed).

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    <Tip>
    This function is useful for training, but for inference it is advised to use [`int8_vectorwise_quant`] instead.
    This implementation performs additional column-wise transposed calculations which are not optimized.
    </Tip>

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input matrix.
        col_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantization scales.
        row_stats (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantization scales.
        out_col (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the column-wise quantized data.
        out_row (`torch.Tensor`, *optional*): A pre-allocated tensor to hold the row-wise quantized data.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The row-wise quantized data.
        - `torch.Tensor` with dtype `torch.int8`: The column-wise quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The row-wise quantization scales.
        - `torch.Tensor` with dtype `torch.float32`: The column-wise quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    NzUrow_stats must be None. int8_double_quant() does not support pre-allocated row_stats.zUcol_stats must be None. int8_double_quant() does not support pre-allocated col_stats.zQout_col must be None. int8_double_quant() does not support pre-allocated out_col.zQout_row must be None. int8_double_quant() does not support pre-allocated out_row.)r  )r	  r9   r+  r,  int8_double_quantr.  )ra   r  r  r  r  r  r   r   r   r  r  s   'r  statsc                 C      t jjj| |S )aY  Dequantizes a tensor with dtype `torch.int8` to `torch.float32`.

    Args:
        A (`torch.Tensor` with dtype `torch.int8`): The quantized int8 tensor.
        stats (`torch.Tensor` with dtype `torch.float32`): The row-wise quantization statistics.

    Returns:
        `torch.Tensor` with dtype `torch.float32`: The dequantized tensor.
    )r9   r+  r,  int8_vectorwise_dequantr.  )ra   r  r   r   r   r    s   r  c                 C   r  )aw  Quantizes a tensor with dtype `torch.float16` to `torch.int8` in accordance to the `LLM.int8()` algorithm.

    For more information, see the [LLM.int8() paper](https://arxiv.org/abs/2208.07339).

    Args:
        A (`torch.Tensor` with dtype `torch.float16`): The input tensor.
        threshold (`float`, *optional*):
            An optional threshold for sparse decomposition of outlier features.

            No outliers are held back when 0.0. Defaults to 0.0.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]`: A tuple containing the quantized tensor and relevant statistics.
        - `torch.Tensor` with dtype `torch.int8`: The quantized data.
        - `torch.Tensor` with dtype `torch.float32`: The quantization scales.
        - `torch.Tensor` with dtype `torch.int32`, *optional*: A list of column indices which contain outlier features.
    )r9   r+  r,  int8_vectorwise_quantr.  )ra   r  r   r   r   r    s   r  r  c                 C   s  t | ts2| jr| jtjksJ dt| jd | jd |  |  d 	 |  d 	 | 
 d} |d u rFtj| j|jd f|j|jd}| j}| j |ksRJ | j |ks[J | j
 |ksdJ | j|jd ksnJ | rtdnd}| |r}dnd }|jd }t j}t| j}t| j}	t| j
}
t|}t|}t| j}t| j}t| j}t|jd }t|}t|}t| j| j| j
||g t|||	|
||||||||t| |S )Nz8Tensor must be `COOSparseTensor or a PyTorch COO tensor.r   r   )r  r  r  r  r  r   r@   rK   FT) r  r  	is_sparselayoutr9   
sparse_coorL   _nnzindicesr   r   r  r  r@   rK   r  r  rr   r  r  r  r  rB   r*   r7   rc   r=   re   r   r   	cspmm_coors  )r  ru   r_   r  r  r  r  r  	ptrRowidx	ptrColidx	ptrValuesptrBptrCcnnzcrowsAccolsAccolsBcldbcldcr   r   r   spmm_coo  sf   
	 






r  c                 C   s  |d u rt j| j|jd f|j| jjd}| j}| j	 |ks!J | j
	 |ks*J | j	 |ks3J | j|jd ksFJ | j d|j | rLdnd}| |rUdnd }|jd }t j| jdd\}}	|	d }
t j|	dd\}}| }| }|d d	ksJ d
|d  d|jt jt jfv sJ t|
}t|}t|}t| j}t| j
}t| j}t|}t|}t|}t|		 }t| j}t| j}t| j}t|jd }t|jd }t|}t|}t|Y t| j| j
| j|||g |jt jkr t|||||||||||||| n)|jt jkrAt|||||||||||||| W d    |S W d    |S W d    |S 1 sUw   Y  |S )Nr   r  r   z vs FTr  )
descending    z)Current max count per row is 8 but found r  )r9   r   r  rL   r@   r   rK   r  r  rr   r  r  r  r  r  cumsumr   r   r   int8rc   r=   re   rG   r   r    cspmm_coo_very_sparse_naive_fp16 cspmm_coo_very_sparse_naive_int8)r  ru   dequant_statsr_   r  r  r  r  r   r  r   	max_countmax_idx	ptrOffsetptrMaxCount	ptrMaxIdxr  r  r  r   r  ptrDequantStats	cnnz_rowsr  r  r  crowsBr  r  r  r   r   r   spmm_coo_very_sparse	  s   "&
 






&
&
&&r  g     _@ztThis function is deprecated and will be removed in a future release. Consider using `int8_vectorwise_quant` instead.vectorc                 C   s$  |dkrt |   }t | | d t j}||fS |dv r>t jt | |dd}t | t|  t j}||fS |dkru| j	}|  } |  | 
  }|dkrWd}d	| }| 
 }t || }	t ||  |	 |	 } | |fS |d
v r| j	}|  } t j| |ddt j| |dd }d||dk< d	| }t j| |dd}t || }	t ||  |	 |	 } | |fS |dkrt  B t | }
t j|
|dd}|d }|
||
k}t | | }||
| | | |< t | | t t j}W d    ||fS 1 sw   Y  ||fS d S )Nlinear   )r  rowTr  	zeropointr   r   g     o@)vector-zeropointrow-zeropointtruncated-vectorgffffff?)r9   r   r   r  r   r   r  r  CrK   minaminno_grad	expand_asr   )r   r  r   rz  xqrK   dynaqxminxzpxabsxr   r   r   r   r   vectorwise_quant`	  sV    



r,  c                 C   s  |dkr|| t t   }|  | |S |dkr(d||  }|  | |S |dkrqd||  }|  }t|jdkrIt|jdkrI|d}t|jdkr\t|jdkr\|d}t|jdkrh||9 }n||9 }||S |dkr|  }t|jdkrt|jdkr|d}t|jdkrt|jdkr|d}t|jdkr|d| 9 }n|d| 9 }|d|  9 }||S |d	kr|  }t|jdkrt|jdkr|d}t|jdkrt|jdkr|d}t|jdkr||| t t   9 }n
||| t t   9 }||S |d
v rd|  }t|jdkr/t|jdkr/|d}t|jdkrDt|jdkrD|d}t|jdkrS||t  9 }n||t  9 }||t  9 }||S d S )Nr  r  r   r  r:  r   r   r  r  )r   r  )r!  r  r   r   rL   squeezer/   )r&  S1S2rK   r   r   r   r   r   r   vectorwise_mm_dequant	  sd   











 
 

r0  r1   )T)NTr    )Tr   T)r   T)Tr   r   r   )Tr   r   )r   )Nr   r   )NNNr&  F)NNNNr&  F)Nr6  )NNNr6  )NNNr6  r   r  )NNNN)	Nr   r   r   r   r   Nr   F)r   r   Nr   )r   r   F)r   )NFFN)NFF)NNNr   )r   )NNNNr   )r   r  )collections.abcr   ctypesr=   r   mathr   typingr   r   r   numpyrT   r9   r   typing_extensionsr   bitsandbytes.utilsr	   r
   
cextensionr   r*  cadam32bit_grad_fp32cadam32bit_grad_fp16cadam32bit_grad_bf16cmomentum32bit_grad_32cmomentum32bit_grad_16crmsprop32bit_grad_32crmsprop32bit_grad_16clion32bit_grad_fp32clion32bit_grad_fp16clion32bit_grad_bf16cadagrad32bit_grad_32cadagrad32bit_grad_16cademamix32bit_grad_fp32cademamix32bit_grad_fp16cademamix32bit_grad_bf16rq  cadam_static_8bit_grad_32cadam_static_8bit_grad_16cmomentum_static_8bit_grad_32cmomentum_static_8bit_grad_16crmsprop_static_8bit_grad_32crmsprop_static_8bit_grad_16clion_static_8bit_grad_32clion_static_8bit_grad_16r~  cadam_8bit_blockwise_grad_fp32cadam_8bit_blockwise_grad_fp16cadam_8bit_blockwise_grad_bf16"cmomentum_8bit_blockwise_grad_fp32"cmomentum_8bit_blockwise_grad_fp16"cmomentum_8bit_blockwise_grad_bf16!crmsprop_8bit_blockwise_grad_fp32!crmsprop_8bit_blockwise_grad_fp16!crmsprop_8bit_blockwise_grad_bf16clion_8bit_blockwise_grad_fp32clion_8bit_blockwise_grad_fp16clion_8bit_blockwise_grad_bf16!cadagrad_8bit_blockwise_grad_fp32!cadagrad_8bit_blockwise_grad_fp16!cadagrad_8bit_blockwise_grad_bf16"cademamix_8bit_blockwise_grad_fp32"cademamix_8bit_blockwise_grad_fp16"cademamix_8bit_blockwise_grad_bf16r  r   r6   rB   r@   FIRST_CUDA_DEVICEr:   device_countrG   rH   rk   r`   r-   rz   r{   r~   r   r   r   r   FutureWarningr   r   r>   r   rc   r  r   r   r  r-  r   r5  rD  rn   rL  rO  rH  rV  rX  rT  r[  r]  rY  r\  r  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  halfr  r  r  r  r  r  r!  r,  r0  r   r   r   r   <module>   s*  %"







+
7$
C 4
I

MT


O



I

$
$(	

g	
 	


@
)W
(
h
(`


9%		
3

CR,