o
    3wiQ                     @   s  d dl Z d dlmZ e sdd Zde jfddZdS d dlZd dlmZ	 ej
ejdd	id
dejddiddgdgdejde	jfddZde jfddZej
ejddddd
dejddddd
dgddgdejde	jde	jde	jfddZdd ZdS )    N)is_triton_availablec                 C      d S N )inputr   r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/bitsandbytes/triton/quantize_global.pyquantize_global_transpose      r   xc                 C   r   r   r   )r
   r   r   r   quantize_global
   r	   r   
BLOCK_SIZEi      )	num_warpsi      )
num_stages
n_elements)configskeyc                 C   st   t jdd}|| }|t d| }||k }t j| | |d}	t |}
t jd|	|
  }t j|| ||d d S )Nr   )axismask     _@)tl
program_idarangeload	libdevicellrintstore)x_ptrabsmax_inv_ptr
output_ptrr   r   pidblock_startoffsetsr   r
   
absmax_invoutputr   r   r   _quantize_global   s   
r'   c                    sn   |    d}d| }tj| jdtjd}| jr|js J |   fdd}t	| | ||  ||fS )Nr         ?cudadevicedtypec                    s   t  | d fS )Nr   tritoncdiv)metar   r   r   <lambda>/   s    z!quantize_global.<locals>.<lambda>)
absmax	unsqueezetorchemptyshapeint8is_cudanumelr'   )r
   absmaxr%   r&   gridr   r1   r   r   )   s         )BLOCK_MBLOCK_NGROUP_MMNr@   rA   rB   c                 C   s  t d}||	 d |	 }||
 d |
 }|| }|| }t|||  |}|| ||  }|| | }||	 t d|	 }||
 t d|
 }| |d d d f | |d d d f |   } ||k d d d f ||k d d d f @ }t j| |d}t |}||	 t d|	 }||
 t d|
 }||d d d f | |d d d f |   }||k d d d f ||k d d d f @ }t jd||  }t j|||d d S )Nr   r   r   r   )r   r   minr   r   r   r   r   )Ar    B	stride_am	stride_an	stride_bn	stride_bmrC   rD   r@   rA   rB   r"   grid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnr   ar%   r&   r   r   r   _quantize_global_transpose4   s(   
,(
,(rV   c              
      s   |    d}d| }| j\ tj dtjd}|dkr*|d ks,J | ddks<| ddks<J |ddksL|ddksLJ  fdd}t	| | ||| d| d|d|d 	 ||fS )Nr   r(   r)   r*   r   c                    s"   t  | d t | d  fS )Nr@   rA   r-   )METArC   rD   r   r   r2   p   s   " z+quantize_global_transpose.<locals>.<lambda>)
r3   r4   r5   r8   r6   r7   r9   sizestriderV   )r   r<   r%   outr=   r   rX   r   r   f   s(   
   )r6    bitsandbytes.triton.triton_utilsr   r   Tensorr   r.   triton.languagelanguager   autotuneConfigjit	constexprr'   rV   r   r   r   r   <module>   sB    
)