o
    3wieE                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlZd dl	m
Z
mZmZmZ ddlmZ ddlmZ ed	d
dejdejfddZedd
dejdejdejfddZdejdejdejfddZedd
		d?dejdejdejdeej deej dejfddZedd
d@dejfddZedd
	d@dejdeejejejejeej f fddZ	d@dejdeejeej f fd d!Zed"d
dejd#ejd$edeejejf fd%dZed&d
dejd'ejd#ejd$edejdejfd(dZed)d
dejd'ejd#ejd$edejdejddfd*dZdejd'ejd#ejd$edejdejddfd+d,Zed-d
dejd$ed.ed/ejdeejejf f
d0dZed1d
dejd'ejd$ed.ed2ee dejdejfd3dZed4d
dejd'ejd$ed.ed2ee dejdejddfd5dZdejd'ejd$ed.edejdejddfd6d7Zed8d
dejdejd9ee d'ejd#ejd$edejfd:dZed;d
dejdejd9ee d'ejd#ejd$edejddfd<dZdejdejd9ee d'ejd#ejd$edejddfd=d>ZdS )A    )SequenceN)prod)Optional)CUBLAS_Context_cuda_device_of_get_tensor_streamget_ptr   )register_kernel)libz bitsandbytes::int8_linear_matmulcudaABc                 C   s:   t jg | jd d |jd R | jt jd}t| ||S )Nr   devicedtype)torchemptyshaper   int32_int8_linear_matmul_implr   r   out r   [/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/bitsandbytes/backends/cuda/ops.py_   s   .r   z$bitsandbytes::int8_linear_matmul.outr   c                 C   s   t | || d S )N)r   r   r   r   r   r      s   c                    s^  || } }| j |j t| jtjkdd  t|jtjkdd  t| jdkdd  t|jdv dd  ttdkfd	d t jtjk g d d
 d R t j k fdd \}}td d
 }d
 }d
 }d
 }t||kfdd |d dkrt|	 | 	 
 tj}	 |	S t| P t | j}
t| }t|}t }d }t|}t|}t|}t|}t|}t|}t| }t|
|||||||||||}W d    n	1 sw   Y  |r-|dkrtdtdddd|||fd|||f
 S )Nc                   S      dS )NzB must be int8r   r   r   r   r   <lambda>       z*_int8_linear_matmul_impl.<locals>.<lambda>c                   S   r   )NzA must be int8r   r   r   r   r   r       r      c                   S   r   )Nz:Only two dimensional matrices are supported for argument Br   r   r   r   r   r   !   r   )r    r	   c                   S   r   )NzCOnly two or three dimensional matrices are supported for argument Ar   r   r   r   r   r   "   r   r   c                      
   d  S )Nz(Input tensor dimensions need to be > 0: r   r   )shapeBr   r   r   #      
 r   c                      s   d j  d S )NzOutput shape z does not match expected shape r   r   )r   shapeCr   r   r   '       c                      s   d d  S )NzQint8_linear_matmul only supports B^T @ A. Inner dimensions do not match: B @ A = z @ r   r   )shapeAr"   r   r   r   1   s       d   z#int8_linear_matmul not implemented!z$cublasLt ran into an error!
	shapeA=z	, shapeB=z	, shapeC=z
	(lda, ldb, ldc)=z
	(m, n, k)=)r   r   _checkr   int8ndimr   r   matmulfloatttocopy_r   r   get_instanceget_contextr   r   ctc_int32r   r   cigemmlt_32NotImplementedErrorRuntimeError)r   r   r   kmnldaldbldcresultctxptrAptrBptrCptrRowScalestream	has_errorr   )r   r'   r"   r%   r   r      sZ   
 







"
,r   zbitsandbytes::int8_mm_dequant	row_stats	col_statsr   biasreturnc                    s>  t  jt jk fdd t jt jkfdd t jt jkfdd t j t jd}t }t|}t}t}	t	t
 jd d }
t	 jd }|d urd|jt jkrdt|nd }t  t|||	|||
|t  W d    n1 sw   Y  |d ur|jt jkr|| ||pt jS )Nc                         d j  S )NzA must be int32, got r   r   r   r   r   r   `       _.<locals>.<lambda>c                      rK   )Nzrow_stats must be float32, got rL   r   )rG   r   r   r   a   rN   c                      rK   )Nzcol_stats must be float32, got rL   r   )rH   r   r   r   b   rN   rL   r   )r   r*   r   r   float32
empty_likefloat16r   r4   r5   r   r   r   r   cdequant_mm_int32_fp16r   add_r0   )r   rG   rH   r   rI   r   rA   ptrOutptrRowStatsptrColStatsnumRowsnumColsptrBiasr   )r   rH   rG   r   r   X   s&    

z#bitsandbytes::int8_vectorwise_quant        c                    sN  t  jt jk fdd t |dkdd  t jd d } jd }t j| jt jd}t j j jt j	d}d }|dkrb 
 |k}| rXt |jddd}n
t jd jt jd}t $ tt t|t|t|t|t|t  W d    n1 sw   Y  |d	kr|d urd|d d |f< |||fS )
Nc                      rK   )NzA must be float16, got rL   r   rM   r   r   r      rN   rO   r[   c                   S   r   )Nzthreshold must be non-negativer   r   r   r   r   r      r   r   r   r   )dim   )r   r*   r   rR   r   r   r   r   rP   r+   absanyargwhereviewint64r   r   cint8_vector_quantr   r4   c_floatr5   r   )r   	thresholdrowscolsrG   out_rowoutlier_colsoutliersr   rM   r   r      s4   


zbitsandbytes::int8_double_quantc                 C   s~   t jjjj| |d\}}}t| |d\}}|dkr#|d ur#| |d} t | d|	d 
t j}||||  |fS )N)re   r[   g     _@r   )r   opsbitsandbytesint8_vectorwise_quantdefault_get_col_absmaxmasked_fillroundmul	unsqueezer0   r+   flattenr.   )r   re   	quant_rowrG   ri   rH   outlier_mask	quant_colr   r   r   r      s   
"c                 C   s^   t |   d }|  d| jd }|dkr"||k}||d |jddd }||fS )Nr   r[   r   F)r\   keepdim)	r   r*   is_floating_pointr^   ra   r   masked_fill_amaxr.   )r   re   rv   absArH   r   r   r   ro      s   ro   z bitsandbytes::quantize_blockwisecode	blocksizec              
      sT  t | t |dv  t  jt jk fdd |  }||   }t j|f| jt jd}t j| t j	d}t
| c t t| t|t|t|t|  f}| jt jkr`tj|  n | jt jkrltj|  n| jt jkrxtj|  ntd| j W d    ||fS W d    ||fS W d    ||fS 1 sw   Y  ||fS )Ni   i   i   i         @   c                      rK   )Nzcode must be float32, got rL   r   r}   r   r   r      rN   rO   r   rL   z?Blockwise quantization only supports 16/32-bit floats, but got )r   _check_is_sizer*   r   rP   numelr   r   rQ   uint8r   r   r4   r5   c_intrR   r   cquantize_blockwise_fp16bfloat16cquantize_blockwise_bf16cquantize_blockwise_fp32
ValueError)r   r}   r~   r;   blocksabsmaxr   argsr   r   r   r      sB   

	


z"bitsandbytes::dequantize_blockwiser   c                 C   s&   t j| |d}t| |||||d |S )NrL   r   )r   rQ   _dequantize_blockwise_implr   r   r}   r~   r   r   r   r   r   r      s   z&bitsandbytes::dequantize_blockwise.outc                    sR   t jkfdd t j jk fdd t |||d d S )Nc                         d  dj  S NzExpected out.dtype == , got rL   r   r   r   r   r   r     r&   rO   c                         d j  dj  S NExpected out.shape == r   r$   r   r   r   r   r   r         r   )r   r*   r   r   r   r   r   )r   r   r   r   r      s   	c              
      s"  t |dv  t  jt jk fdd t t jt jt jfv fdd t \ t|t t|t|t	
|t	
  t f}t jkrRtj|  nt jkr]tj|  n"t jkrwtj|  W d    d S W d    d S W d    d S W d    d S 1 sw   Y  d S )Nr   c                      rK   )NzA must be uint8, got rL   r   rM   r   r   r     rN   z,_dequantize_blockwise_impl.<locals>.<lambda>c                      r!   )NzGBlockwise dequantization only supports 16bit/32bit floating types, got r   r   rL   r   r   r     r#   )r   r*   r   r   rR   r   rP   r   r   r4   r   r   r   r   cdequantize_blockwise_fp16cdequantize_blockwise_bf16cdequantize_blockwise_fp32)r   r   r}   r~   r   r   r   r   )r   r   r   r     s8   





"r   zbitsandbytes::quantize_4bit
quant_typequant_storagec           	   	      s  t |dv  t |dv  t  jt jt jt jfv  fdd   }||   }t j|f jt jd}t j|d |j	d  df j|d}t
  d t t|t|t|t|f} jt jkrw|dkrqtj|  nEtj|  n5 jt jkr|dkrtj|  nMtj|  n= jt jkr|dkrtj|  nAtj|  W d    ||fS W d    ||fS W d    ||fS W d    ||fS W d    ||fS W d    ||fS W d    ||fS 1 sw   Y  ||fS )	Nr   fp4nf4c                      rK   )NzDBlockwise 4bit quantization only supports 16/32-bit floats, but got rL   r   rM   r   r   r   -  rN   rO   r   r]   r    r   )r   r*   r   r   rR   rP   r   r   r   itemsizer   r   r4   r5   r   r   cquantize_blockwise_bf16_fp4cquantize_blockwise_bf16_nf4cquantize_blockwise_fp16_fp4cquantize_blockwise_fp16_nf4cquantize_blockwise_fp32_fp4cquantize_blockwise_fp32_nf4)	r   r~   r   r   r;   r   r   r   r   r   rM   r   r   %  sh   
$
	






zbitsandbytes::dequantize_4bitr   c                 C   s*   t j||| jd}t| |||||d |S )N)r   r   r   )r   r   r   _dequantize_4bit_implr   r   r~   r   r   r   r   r   r   r   r   R  s   	z!bitsandbytes::dequantize_4bit.outc                    sP   t jkfdd t j k fdd t| ||| d d S )Nc                      s   d d j  S r   r$   r   )r   r   r   r   r   j  r&   rO   c                      r   r   rL   r   r   r   r   r   k  r&   r   )r   r*   r   r   r   r   r   )r   r   r   r   r   `  s   
c              
      s  t |dv  t |dv  t  t jt jt jfv  fdd t|  d t| t|t|t|t|	 t
| f}|jt jkrT|dkrNtj|  nAtj|  n3|jt jkrj|dkrdtj|  nCtj|  n5|jt jkr|dkrztj|  n5tj|  W d    d S W d    d S W d    d S W d    d S W d    d S W d    d S W d    d S 1 sw   Y  d S )Nr   r   c                      r!   )NzFBlockwise 4bit dequantization only supports 16/32-bit floats, but got r   r   rL   r   r   r   {  r#   z'_dequantize_4bit_impl.<locals>.<lambda>r   )r   r*   r   rR   rP   r   r   r4   r   r   r   r   r   cdequantize_blockwise_bf16_fp4cdequantize_blockwise_bf16_nf4cdequantize_blockwise_fp16_fp4cdequantize_blockwise_fp16_nf4cdequantize_blockwise_fp32_fp4cdequantize_blockwise_fp32_nf4)r   r   r~   r   r   r   r   r   rL   r   r   o  sP   


"r   zbitsandbytes::gemv_4bitr"   c              	   C   sJ   g | j d d |d R }tj|| j| jd}t| ||||||d |S )Nr   r   r   r   )r   r   r   r   r   _gemv_4bit_impl)r   r   r"   r   r}   r~   r   r   r   r   r   r     s   zbitsandbytes::gemv_4bit.outc              	      sn   t jg  jd d d R k fdd t j jk fdd t ||||d d S )Nr   r   c                      s*   dg  j d d d R  dj  S )Nr   r   r   r   r$   r   r   r   r"   r   r   r     s   * rO   c                      r   r   rL   r   r   r   r   r     r   r   )r   r*   r   r   r   )r   r   r"   r   r}   r~   r   r   r   r   r     s   
 c                 C   s  t | t|d }td}t|d }	|}
t| jd d d }|}t| }t|  | jt jkrVt	
|||	t| t|t|t|t||
||t|| nQ| jt jkr{t	|||	t| t|t|t|t||
||t|| n<| jt jkrt	|||	t| t|t|t|t||
||t|| W d    d S W d    d S W d    d S W d    d S 1 sw   Y  d S )Nr   r]   r   r    )r   r   r4   r5   r   r   r   r   rR   r   cgemm_4bit_inference_naive_fp16r   r   cgemm_4bit_inference_naive_bf16rP   cgemm_4bit_inference_naive_fp32)r   r   r"   r   r}   r~   r   r:   r;   r9   r<   r=   r>   rE   r   r   r   r     s   
	

!"r   )NN)r[   )collections.abcr   ctypesr4   mathr   typingr   r   bitsandbytes.functionalr   r   r   r   _opsr
   
cextensionr   Tensorr   r   r   tuplero   intr   strr   r   r   r   r   r   <module>   s~   ?&)
, .
,
+	