o
    ۾i                     @   s   d dl mZmZ d dlZzd dlZd dlmZ W n ey) Z	 zede	dZ	[	ww ej
dejdejfddZ		ddejd	ejd
ee deee  fddZdS )    )ListOptionalNzTriton is not installedNUM_SMS
BLOCK_SIZEc	                 C   s  t d}	t ||}
t |	||
 |D ]p}||
 }||
 | }|du r&|nt || }|t d| }|d t d|d  }||k }||k }t |||  | |}|dddf t dddddf ? d@ dk}||}t | ||  | td ||@  qdS )a  Apply a bitmask to logits in-place using Triton. The bitmask is a 01 bitwise compressed tensor,
    where 0 means the token is masked and 1 means the token is not masked. After applying the bitmask,
    the masked logits will be set to -inf.

    Parameters
    ----------
    logits_ptr : tl.tensor
        Pointer to the logits tensor to apply the bitmask to.

    bitmask_ptr : tl.tensor
        Pointer to the bitmask tensor to apply.

    indices_ptr : Optional[tl.tensor]
        Optional pointer to indices tensor specifying which rows to apply the mask to.

    num_rows : int
        Number of rows to process. If indices_ptr is provided, this is the number of unique indices.

    vocab_size : int
        Size of the vocabulary dimension. If the logits does not have a vocab padding, this is the
        same as the logits's second dimension. Otherwise, this is the actual size of the vocabulary.

    logits_strides : int
        Stride between rows in the logits tensor.

    bitmask_strides : int
        Stride between rows in the bitmask tensor.

    NUM_SMS : int
        Number of streaming multiprocessors to use.

    BLOCK_SIZE : int
        Size of processing blocks.
    r   N       inf)	tl
program_idcdivrangeloadarangereshapestorefloat)
logits_ptrbitmask_ptrindices_ptrnum_rows
vocab_sizelogits_stridesbitmask_stridesr   r   pid
num_blockswork_idrow_idblock_offsetbatch_idoffsetsbitmask_offsets
vocab_maskpacked_bitmask_maskpacked_bitmaskbitmask r%   g/home/ubuntu/.local/lib/python3.10/site-packages/xgrammar/kernels/apply_token_bitmask_inplace_triton.py"apply_token_bitmask_inplace_kernel   s&   
/0
r'   logitsr$   r   indicesc                 C   s2  t jdj}d}t jdj}t jjd urd|vrd}nd}|jt jks)J dt	| j
d |j
d d }|d u r=|}n||ksKJ d	| d
| |d urSt|n| jdkr]| j
d nd}	|d urst j|t jd}
|
j| jdd}|f}t| | |||	||  d | d |||| d|    dd d S )Ncudai   r   gfx1@   r   zbitmask must be of type int32zvocab_size z( is larger than the detected vocab_size    r   )dtypeT)devicenon_blocking      )	num_warps
num_stages)torchr*   get_device_propertiesmulti_processor_countgcnArchNameversionhipr/   int32minshapelenndimtensortor0   r'   strideelement_size)r(   r$   r   r)   r   r   arch	WARP_SIZEdetected_vocab_sizer   indices_cpugridr%   r%   r&   "apply_token_bitmask_inplace_tritonP   s>   
(


rJ   )NN)typingr   r   r6   tritontriton.languagelanguager	   ImportErrorerrjit	constexprr'   TensorintrJ   r%   r%   r%   r&   <module>   s4    
	F
