o
    پi                  
   @   s   d dl mZmZmZ d dlZd dlZd dlmZ d dl	m
Z
 ejdejdejfddZ	ddejd	ejd
eeee ejf  fddZdS )    )ListOptionalUnionN)get_device_core_countNUM_SMS
BLOCK_SIZEc	                 C   s  t d}	t ||}
t |	||
 |D ]p}||
 }||
 | }|du r&|nt || }|t d| }|d t d|d  }||k }||k }t |||  | |}|dddf t dddddf ? d@ dk}||}t | ||  | td ||@  qdS )a  Apply a bitmask to logits in-place using Triton. The bitmask is a 01 bitwise compressed tensor,
    where 0 means the token is masked and 1 means the token is not masked. After applying the bitmask,
    the masked logits will be set to -inf.

    Parameters
    ----------
    logits_ptr : tl.tensor
        Pointer to the logits tensor to apply the bitmask to.

    bitmask_ptr : tl.tensor
        Pointer to the bitmask tensor to apply.

    indices_ptr : Optional[tl.tensor]
        Optional pointer to indices tensor specifying which rows to apply the mask to.

    num_rows : int
        Number of rows to process. If indices_ptr is provided, this is the number of unique indices.

    vocab_size : int
        Size of the vocabulary dimension. If the logits does not have a vocab padding, this is the
        same as the logits's second dimension. Otherwise, this is the actual size of the vocabulary.

    logits_strides : int
        Stride between rows in the logits tensor.

    bitmask_strides : int
        Stride between rows in the bitmask tensor.

    NUM_SMS : int
        Number of streaming multiprocessors to use.

    BLOCK_SIZE : int
        Size of processing blocks.
    r   N       inf)	tl
program_idcdivrangeloadarangereshapestorefloat)
logits_ptrbitmask_ptrindices_ptrnum_rows
vocab_sizelogits_stridesbitmask_stridesr   r   pid
num_blockswork_idrow_idblock_offsetbatch_idoffsetsbitmask_offsets
vocab_maskpacked_bitmask_maskpacked_bitmaskbitmask r'   a/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/constrained/triton_ops/bitmask_ops.py"apply_token_bitmask_inplace_kernel   s,   
/0
r)   logitsr&   indicesc                 C   s  t  }d}d}|jtjksJ d| j}|j}| jdkr"d|d f}|jdkr-d|d f}|d | d | }||d ksNJ d| d|d  d|d  t|d |d | }	d }
t|tsft|tj	rvtj
|tj| jd	}|jd }
n|d |d ksJ d
|d  d|d  |d }
|dkr|f}nt|	|}|
| f}t|d }t| | |||
|	|d |d |||d d|    dd d S )Ni   r   zbitmask must be of type int32r	   r   z'Bitmask width too large: allow at most z int32s for logits' width z
, but got )dtypedevicezbatch size mismatch: logits z vs bitmask       )	num_warps
num_stages)r   r,   torchint32shapendimmin
isinstancelistTensortensorr-   tritonr   next_power_of_2r)   element_size)r*   r&   r+   r   r   BITS_PER_BLOCKlogits_shapebitmask_shaperequired_bitmask_widthr   r   gridr   r'   r'   r(   "apply_token_bitmask_inplace_tritonT   sZ   



rC   )N)typingr   r   r   r2   r;   triton.languagelanguager   sglang.srt.utilsr   jit	constexprr)   r9   intrC   r'   r'   r'   r(   <module>   s&   	I