o
    پi                     @   s.  d dl mZmZ d dlmZ d dlZd dlmZ d dlm	Z	 ddl
mZmZ ddlmZ dd	lmZmZ eG d
d dZeG dd dZeG dd dZedZeddddZdeeB ejB fddZeG dd dZeG dd deZdejeB dB fddZd"ddZdedee fd d!ZdS )#    )	dataclassfields)TypeN)TensorDescriptor)create_ragged_descriptor   )
clear_sumssum_bitmatrix_rows)cuda_capability_geq)LayoutStridedLayoutc                   @   sT   e Zd ZU ejed< dZeed< dd Ze	dd Z
dd	 ZdddZdddZdS )StoragedataNlayoutc                 C   s2   t | jtjs	J | jd u rt| jj| _d S d S N)
isinstancer   torchTensorr   r   shapeself r   I/home/ubuntu/.local/lib/python3.10/site-packages/triton_kernels/tensor.py__post_init__   s   
zStorage.__post_init__c                 C      | j jS r   )r   devicer   r   r   r   r         zStorage.devicec                    s   t ddsdS t| jjdvrdS t| j zdW n ty*   dY nw | jj}| jj	t
jkr8dn| j d   fd	d
t|D }t|S )N	   r   F)         r         c                    s(   g | ]}|kr|   d  dkqS )   r   r   ).0ibitwidth	major_dimstridesr   r   
<listcomp>+   s   ( z,Storage.is_tma_compliant.<locals>.<listcomp>)r
   lenr   r   liststrideindex
ValueErrorndimdtyper   uint8element_sizerangeall)r   r1   	compliantr   r'   r   is_tma_compliant   s   
 zStorage.is_tma_compliantFc                 C   s   t | j }t | jj}| j d dk}|rB|d d |d |d g }|d d |d |d g }|d d |d |d g }| jjtjkrh| jjdkrh|	d}|| d ||< |d d dkrht
d| j|}t| j|||S )	Nr!   r   BLACKWELL_VALUEr   r$   r   z^inner shape need to be multiple of 128 for mxfp4 (CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B) TMAs.)r-   r   r.   r   r2   r   r3   r   namer/   r0   swizzle_block_shaper   )r   block_shape	transposer*   r   indxr   r   r   make_dense_tma.   s   
zStorage.make_dense_tmac                 C   s@   |dv r
|  ||S |dksJ t| jjd }t| j||dS )N)densegatherscatterraggedr   )
ragged_dim)r@   r,   r   r   r   )r   r=   moder>   rE   r   r   r   make_tma?   s
   zStorage.make_tma)F)__name__
__module____qualname__r   r   __annotations__r   r   r   propertyr   r8   r@   rG   r   r   r   r   r      s   
 


r   c                   @   s   e Zd ZU eed< dS )IntegerTyper(   N)rH   rI   rJ   intrK   r   r   r   r   rM   G   s   
 rM   c                   @   s.   e Zd ZU eed< eed< eed< dd ZdS )	FloatTypebitwidth_exponentbitwidth_mantissa	is_signedc                 C   s   t | j| j | j | _d S r   )rN   rR   rP   rQ   r(   r   r   r   r   r   R   s   zFloatType.__post_init__N)rH   rI   rJ   rN   rK   boolr   r   r   r   r   rO   L   s
   
 rO   r   T)rP   rQ   rR   typec                 C   s   t | tjr| jd S | jS Nr#   )r   r   r2   itemsizer(   )rT   r   r   r   r(   Z   s   
r(   c                   @   s   e Zd ZU eejB ed< dZee	B ejB ed< dZ
ee dB ed< dZee dB ed< dd Zedd	 Zed
d ZdddZdd Zdd Zdd Zedd Zdd ZdddZdS )r   storageNr2   r   	shape_maxc                    s&  t | jtjrt| j| _| jd u r| jjj| _t| jdk r(| jd u r(t	d| jd u r5t
| jjj| _dd  dd tt fdd| jsLJ | jd u rZd gt| j | _tt| j| jD ]#\}\}}|d ur} |s}t	d| dt| |d u r|| j|< qctt | jsJ d S )	Nr#   z)shape must be provided for sub-byte typesc                 S   s
   t | tS r   )r   rN   sr   r   r   <lambda>t   s   
 z&Tensor.__post_init__.<locals>.<lambda>c                 S   s   t | do
|  dkS )Nnumelr   )hasattrr\   rY   r   r   r   r[   u   s    c                    s    | p| S r   r   rY   is_intis_itemr   r   r[   v   s    z
shape_max[z] must be `int` or `None`; got )r   rW   r   r   r   r2   r   r(   r   r0   r-   r6   maprX   r,   	enumerateziprT   )r   r&   rZ   smaxr   r^   r   r   g   s(   



zTensor.__post_init__c                 C   s
   t | jS r   )r,   r   r   r   r   r   r1      s   
zTensor.ndimc                 C   r   r   )rW   r   r   r   r   r   r      r   zTensor.devicec                 C   s"   |d u r
| j j S | j j|S r   )rW   r   r.   r   r&   r   r   r   r.      s   "zTensor.stridec                 C      | j j S r   )rW   r   data_ptrr   r   r   r   rg         zTensor.data_ptrc                 C   rf   r   )rW   r   r\   r   r   r   r   r\      rh   zTensor.numelc                 C   s   t | jd S rU   )r(   r2   r   r   r   r   r4      s   zTensor.element_sizec                 C   s   | j }t|tr|jS |S r   )rW   r   r   r   )r   tr   r   r   r      s   zTensor.datac                 C   s   | j S r   )r1   r   r   r   r   dim   s   z
Tensor.dimc                 C   s   |d u r| j S | j | S r   )r   re   r   r   r   size   s   
zTensor.sizer   )rH   rI   rJ   r   r   r   rK   r2   rM   rO   r   r-   rN   rX   r   rL   r1   r   r.   rg   r\   r4   r   rj   rk   r   r   r   r   r   `   s$   
 



r   c                       s:   e Zd ZU dZdZejed< d fdd	Zdd Z	  Z
S )		Bitmatrixa@  
    Represents a boolean matrix in a packed format where each element occupies
    a single bit of memory.

    _scratchpad is either None or an all-zero array of size >= shape[-1]; we pass it along
    with the actual bitmatrix to avoid having to launch a separate memset
    kernel when we call Bitmatrix::sum().
    N
scratchpadc                    s   t  j|t||d || _d S )N)r2   r   rX   )super__init__BITrm   )r   rW   r   rX   rm   	__class__r   r   ro      s   
zBitmatrix.__init__c                 C   sF   | j \}}| j}| jd u rt||| _| jd | }d | _t| ||S r   )r   r   rm   r   r	   )r   partials_block_size_n_colsdevout_retr   r   r   sum   s   

zBitmatrix.sum)NN)rH   rI   rJ   __doc__rm   r   r   rK   ro   rx   __classcell__r   r   rq   r   rl      s
   
 	rl   tensorc                 C   s"   | d u rd S t | tr| jjS tS r   )r   r   rW   r   r   r{   r   r   r   
get_layout   s
   
r}   c                 C   sR   |d u r| j }t| j}||  d  t| j t| 9  < tt| ||dS )Nr   )r2   r   )r2   r-   r   r.   r/   r(   r   r   )torch_tensorr2   r   r   r   r   wrap_torch_tensor   s
   
(r   
layout_clsc                    sj   t  tsJ  j}|j|j}||jfi |}||} fddt D }tt	||fi |S )Nc                    s&   i | ]}|j d kr|j t |j qS )rW   )r;   getattr)r%   kr|   r   r   
<dictcomp>   s   & z"convert_layout.<locals>.<dictcomp>)
r   r   rW   r   unswizzle_datar   r   swizzle_datar   r   )r{   r   layout_kwargsold_storageold_data
new_layoutnew_dataattrsr   r|   r   convert_layout   s   
r   r   )dataclassesr   r   typingr   r   triton.tools.tensor_descriptorr   triton.tools.ragged_tmar   "reduction_details.reduce_bitmatrixr   r	   target_infor
   tensor_details.layoutr   r   r   rM   rO   rp   FP4r2   r(   r   rl   r}   r   r   r   r   r   r   <module>   s0    9	D
