o
    پil                     @   s^   d dl mZmZmZ d dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ G dd dZdS )    )TypeTupleOptionalN)Int32Int64Float32
const_exprc                   @   s   e Zd Zefdeej dedefddZdd Z	dd	 Z
d
d Zd defddZdejdefddZ	d!dejjdejdedeejeej f fddZej	d!dedejdedefddZdS )"ReductionBasedtypeNstagec                 C   s   || _ || _|| _|| _d S N)r
   r   r   reduction_dtype)selfr
   r   r   r    r   H/home/ubuntu/.local/lib/python3.10/site-packages/quack/reduction_base.py__init__   s   
zReductionBase.__init__c                 C   s   t  r   )NotImplementedErrorr   r   r   r   _threads_per_row   s   zReductionBase._threads_per_rowc                 C   s   | j dkrdS dS )Ni @        )r   r   r   r   r   _num_threads   s   zReductionBase._num_threadsc                 C   s
   d| _ d S N   )	cluster_nr   r   r   r   _set_cluster_n   s   
zReductionBase._set_cluster_nr   vecsizec                 C   s   | j | dksJ d| j  d| |  }|  }|tjj dks$J t| j | || j }|| || | f}t	| j
|||}|||fS )Nr   zInput N z! is not divisible by vector size )r   r   r   cutearch	WARP_SIZEceil_divr   
copy_utilstiled_copy_2dr
   )r   r   threads_per_rownum_threadsnum_blocks_Ntiler_mn
tiled_copyr   r   r   _get_tiled_copy   s   $
zReductionBase._get_tiled_copy	tv_layoutr   c                 C   sj   t j|dgdt jj }t |jd dkr|nt|jd d t jj d}t j|| ||f| jfddS )Nr   )moder   )r   r      )order)	r   sizer   r    rankshapemaxmake_ordered_layoutr   )r   r*   r   	num_warpswarps_per_rowr   r   r   _get_reduction_buffer_layout&   s   z*ReductionBase._get_reduction_buffer_layoutFsmemis_persistentreturnc                 C   s\   |j | j| || jdd}t| jdkr(|jt|s| jn| jd d}||fS d }||fS )N   )byte_alignmentr   r,   )	num_elems)allocate_tensorr   r5   r   r   allocate_arrayr   r   )r   r6   r*   r7   reduction_buffermbar_ptrr   r   r   #_allocate_reduction_buffer_and_mbar2   s   z1ReductionBase._allocate_reduction_buffer_and_mbartidxr?   r3   c                 C   sl   t | jdkr4|| jk r(tj|| d t |r(tj|| j | || j  tj  tj  d S d S r   )r   r   r   r   r   mbarrier_initmbarrier_init_fencecluster_arrive_relaxed)r   rA   r?   r3   r7   r   r   r   _initialize_clusterB   s   

z!ReductionBase._initialize_clusterN)r   )F)__name__
__module____qualname__r   r   cutlassNumericintr   r   r   r   r)   r   Layoutr5   utilsSmemAllocatorboolr   Tensorr   Pointerr@   jitr   rE   r   r   r   r   r	      s8     

r	   )typingr   r   r   rI   cutlass.cuter   r   r   r   r   quack.copy_utilsr"   r	   r   r   r   r   <module>   s   