o
    پi                     @   s   d dl Z d dlZd dlmZ ejdd ZejdejfddZejdejdejd	ejd
ejdejf
ddZ	dddZ
dddZdS )    Nc                 C   s`  t | jt jkd | jd }| j| }|dkrd}n|}t | ||| |dg}|t dddddddf ? d@ }t |d	}|d
krHd}n|| }t |||||  |ddg}|dt dd	 ddddddf ? d@ }t |d	}|||  }t ||d|dg}|dt dd dddddf ? d@ }t |d	}t || jdd dg }|S )z
    Vertical popcount
    Input  x : uint32[..., N]
    Output y : uint32[..., 32]
    semantics : y[..., i] = sum_j((x[..., j] >> i) & 1)
    credits: @apgoucher
    z,x should consist of 32-bit unsigned integers      r      Ni         i       )	tlstatic_assertdtypeuint32shapenumelreshapearangesum)xBLOCK_NBATCHESsa1ysa2sa3 r   e/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/reduction_details/reduce_bitmatrix.pyvpopc   s*   


$*(r   BLOCKc                 C   s2   t d}|| t d| }t | | d d S )Nr   )r   
program_idr   store)Retr   pidoffsr   r   r   _sum_bitmatrix_memset-   s   
r$   	stride_bm	stride_bn	stride_pmBLOCK_MMBLOCK_Mc                 C   s  t |	|
 dk |	|
 }t|t jr|j rt |}t d}t d}||	 t d|	 }|d t dd }|}t j| ||  ||  ||k dd}t 	|||
g}t
|}|| t d| }t j|| t |ddd t ||d d d f |  |d d d f |  | d S )Nr   r   r
   )maskotherrelaxed)sem)r   r   
isinstancetensorr   is_ptrloadr   r   r   r   
atomic_addr   r    )Bshape_bmr%   r&   r!   Partialsr'   	stride_pnshape_pnr(   r)   	TILE_SIZEpid_mpid_noffs_moffs_nn_rowsbitsretoffs_tr   r   r   _sum_bitmatrix_rows4   s   


$8rA      c                 C   s<   t j}|| |}tj|| f|tjd}t|f || |S )Ndevicer   )tritoncdivtorchemptyint32r$   )n_colsrD   MEMSET_BLOCKrF   blocksout_retr   r   r   
clear_sumsL   s
   
rN   c                 C   s   |d usJ t j}|}| j\}}| jd }|j|fksJ tdd| }|| }	|||	}
||d}tj|d |
| f|jtjd}t	|dd}t
|
|f | jj|| d| d|||d|d|jd ||	dd |d |||d d f }||fS )Nr   r   r   r
   rC   r   )r)   r(   	num_warps)rE   rF   r   	shape_maxmaxrG   rH   rD   rI   	transposerA   storagedatastride)r   rM   partials_block_sizerF   PARTIALS_BLOCK_Mr=   rJ   
n_rows_maxr8   r(   pids_xpids_yout_partialsr   r   r   sum_bitmatrix_rowsT   s,   



 
r\   )rB   )N)rG   rE   triton.languagelanguager   jitr   	constexprr$   rA   rN   r\   r   r   r   r   <module>   s"    
&
