o
    ܹi_                     @   sl   d dl Z d dlmZ d dlmZ dddZe jdd Ze jddej	fdd	Z
e jddej	fd
dZdS )    N)TensorDescriptorc                    s   t |}t  j}t|}|dk r||7 }d|  kr$|d k s)J d J d|dks1J dt||ks;J dd}d}|| |ksIJ d	|||<  |}d
| |g fddt|D  }||g| }	ddg| }
t |	||
S )a  
    Given a 2- or 3-dimensional tensor T, this creates a 'ragged descriptor'
    which behaves like a concatenation (along the first axis) of subarrays
    of potentially unequal size.

    The load_ragged and store_ragged device functions can be used to read
    and write from subarrays T[batch_offset : batch_offset + batch_size]
    with hardware bounds-checking preventing any sort of leakage outside
    the subarray.
    r      zlast dimension cannot be ragged   z<read-write ragged descriptors must have at most 3 dimensionsz1block shape must have same length as tensor shapei     @z#number of rows may not exceed 2**30l        c                    s   g | ]}  |qS  )stride).0iTr   T/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/triton/tools/ragged_tma.py
<listcomp>)   s    z,create_ragged_descriptor.<locals>.<listcomp>)listshapelenr   ranger   )r   block_shape
ragged_dimtensor_shaperankmax_intbillionragged_stride
tma_stride	tma_shape	box_shaper   r
   r   create_ragged_descriptor   s"   
(
"r   c                 C   s"   d}|| | }| | }|||fS )z;
    Helper function for load_ragged and store_ragged.
    r   r   )batch_offset
batch_sizerowr   xyr   r   r   to_ragged_indices0   s   
r"   r   c           	      C   s~   t t| jt|d kd t|||| \}}}| ||g|d|  |g ||d d  }t ||jdd }|S )z
    Read from a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where reading outside the subarray gives zeros.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.load().
       z*TMA must be a read-write ragged descriptorNr   )tlstatic_assertr   r   r"   loadreshape)	TMAr   r   coordsr   c0c1c2datar   r   r   load_ragged=   s
   
0r.   c           	      C   sb   t |||| \}}}t|ddg|j }| ||g|d|  |g ||d d  | dS )a  
    Write to a subarray T[batch_offset : batch_offset + batch_size] with
    hardware bounds-checking, where writes outside the subarray are masked
    correctly.

    Coords should be an appropriately-sized list of integers, just like in
    TMA.store().
    r   N)r"   r$   r'   r   store)	r(   r   r   r)   r-   r   r*   r+   r,   r   r   r   store_raggedO   s   6r0   )r   )tritontriton.languagelanguager$   triton.tools.tensor_descriptorr   r   jitr"   	constexprr.   r0   r   r   r   r   <module>   s    
(
