o
    Y۷i                     @   s   d dl Zd dlmZmZmZ d dlmZmZm	Z	 d dl
mZ dd Zdd Zd	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  ZG d!d" d"e	Zed#krhe  dS dS )$    N)cudaint32float32)skip_on_cudasimunittestCUDATestCase)ENABLE_CUDASIMc                 C      t d}t   || |< d S N   )r   gridsyncthreadsaryi r   W/home/ubuntu/vllm_env/lib/python3.10/site-packages/numba/cuda/tests/cudapy/test_sync.pyuseless_syncthreads      
r   c                 C   r	   r
   r   r   syncwarpr   r   r   r   useless_syncwarp   r   r   c                 C   s    t d}t d || |< d S )Nr     r   r   r   r   r   useless_syncwarp_with_mask   s   

r   c                 C   s   t jdt}t d}|||< t   |dk r)|| ||d   ||< t d |dk r>|| ||d   ||< t d |dk rS|| ||d   ||< t d |d	k rh|| ||d	   ||< t d
 |dkrx|d |d  | d< d S d S )N    r      r                     r   )r   sharedarrayr   r   r   )ressmr   r   r   r   coop_syncwarp   s&   




r&   c                 C   sR   d}t j|t}t d}|dkrt|D ]}|||< qt   || | |< d S )Nd   r   r   )r   r"   r#   r   r   ranger   )r   Nr%   r   jr   r   r   simple_smem4   s   

r+   c                 C   sT   t d\}}t jdt}|d |d  |||f< t   |||f | ||f< d S )Nr    
      r   r   r   r"   r#   r   r   )r   r   r*   r%   r   r   r   coop_smem2d?   s
   r0   c                 C   s<   t d}t jdt}|d ||< t   || | |< d S )Nr   r   r    r/   )r   r   r%   r   r   r   dyn_shared_memoryG   s
   
r1   c                 C   ,   | d  d7  < t   | d  d7  < d S Nr   {   iA  )r   threadfencer   r   r   r   use_threadfenceO      r7   c                 C   r2   r3   )r   threadfence_blockr6   r   r   r   use_threadfence_blockU   r8   r:   c                 C   r2   r3   )r   threadfence_systemr6   r   r   r   use_threadfence_system[   r8   r<   c                 C       t d}t | | ||< d S r
   )r   r   syncthreads_countary_inary_outr   r   r   r   use_syncthreads_counta      
rB   c                 C   r=   r
   )r   r   syncthreads_andr?   r   r   r   use_syncthreads_andf   rC   rE   c                 C   r=   r
   )r   r   syncthreads_orr?   r   r   r   use_syncthreads_ork   rC   rG   c                 C   s   t rdS t j| kS )NT)r   r   get_current_devicecompute_capability)ccr   r   r   _safe_cc_checkp   s   rK   c                   @   s   e Zd Zdd Zdd Zeddd Zedee	dd	d
d Z
edee	dd	dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2S )3TestCudaSyncc                 C   sT   t d|}d}tj|tjd}tj|tjd}|d|f | tj|| d S )Nvoid(int32[::1])r-   dtyper   )r   jitnpemptyr   arangetestingassert_equal)selfkernelcompilednelemr   expr   r   r   _test_uselessx   s   zTestCudaSync._test_uselessc                 C      |  t d S N)r[   r   rV   r   r   r   test_useless_syncthreads   s   z%TestCudaSync.test_useless_syncthreadsz#syncwarp not implemented on cudasimc                 C   r\   r]   )r[   r   r^   r   r   r   test_useless_syncwarp   s   z"TestCudaSync.test_useless_syncwarp)   r   z'Partial masks require CC 7.0 or greaterc                 C   r\   r]   )r[   r   r^   r   r   r   test_useless_syncwarp_with_mask   s   z,TestCudaSync.test_useless_syncwarp_with_maskc                 C   sP   d}d}d}t dt}tjdtjd}|||f | tj||d  d S )Ni  r   r   rM   rN   r   )r   rP   r&   rQ   zerosr   rT   rU   )rV   expectednthreadsnblocksrX   r$   r   r   r   test_coop_syncwarp   s   zTestCudaSync.test_coop_syncwarpc              	   C   sV   t dt}d}tj|tjd}|d|f | | t|tj|tjdk d S )NrM   r'   rN   r   )	r   rP   r+   rQ   rR   r   
assertTrueallrS   )rV   rX   rY   r   r   r   r   test_simple_smem   s
   $zTestCudaSync.test_simple_smemc                 C   s   t dt}d}tj|tjd}|d|f | t|}t|jd D ]}t|jd D ]}|d |d  |||f< q.q%| 	t
|| d S )Nzvoid(float32[:,::1])r,   rN   r   r   )r   rP   r0   rQ   rR   r   
empty_liker(   shaperh   allclose)rV   rX   rl   r   rZ   r   r*   r   r   r   test_coop_smem2d   s   
zTestCudaSync.test_coop_smem2dc              
   C   sf   t dt}d}tj|tjd}|d|d|jd f | | t|dtj	|jtj
d k d S )Nzvoid(float32[::1])2   rN   r   r   r   r    )r   rP   r1   rQ   rR   r   sizerh   ri   rS   r   )rV   rX   rl   r   r   r   r   test_dyn_shared_memory   s
   *z#TestCudaSync.test_dyn_shared_memoryc                 C   f   t d d  f}t|t}tjdtj d}|d | | d|d  ts1| d|	| d S d S )Nr-   rN   r   r     r   z
membar.gl;)
r   r   rP   r7   rQ   rc   assertEqualr   assertIninspect_asmrV   sigrX   r   r   r   r   test_threadfence_codegen      z%TestCudaSync.test_threadfence_codegenc                 C   rr   )Nr-   rN   rs   rt   r   zmembar.cta;)
r   r   rP   r:   rQ   rc   ru   r   rv   rw   rx   r   r   r   test_threadfence_block_codegen   r{   z+TestCudaSync.test_threadfence_block_codegenc                 C   rr   )Nr-   rN   rs   rt   r   zmembar.sys;)
r   r   rP   r<   rQ   rc   ru   r   rv   rw   rx   r   r   r   test_threadfence_system_codegen   r{   z,TestCudaSync.test_threadfence_system_codegenc                 C   s^   t t}tjd|d}tjdtjd}d|d< d|d< |d || | t|dk d S )NH   rN   r      *   )r   r~   F   )	r   rP   rB   rQ   onesrc   r   rh   ri   )rV   in_dtyperX   r@   rA   r   r   r   _test_syncthreads_count   s   
z$TestCudaSync._test_syncthreads_countc                 C      |  tj d S r]   )r   rQ   r   r^   r   r   r   test_syncthreads_count      z#TestCudaSync.test_syncthreads_countc                 C   r   r]   )r   rQ   int16r^   r   r   r   test_syncthreads_count_upcast   r   z*TestCudaSync.test_syncthreads_count_upcastc                 C   r   r]   )r   rQ   int64r^   r   r   r   test_syncthreads_count_downcast   r   z,TestCudaSync.test_syncthreads_count_downcastc                 C   s   t t}d}tj||d}tj|tjd}|d|f || | t|dk d|d< |d|f || | t|dk d S Nr'   rN   r   r   r   )	r   rP   rE   rQ   r   rc   r   rh   ri   rV   r   rX   rY   r@   rA   r   r   r   _test_syncthreads_and      
z"TestCudaSync._test_syncthreads_andc                 C   r   r]   )r   rQ   r   r^   r   r   r   test_syncthreads_and   r   z!TestCudaSync.test_syncthreads_andc                 C   r   r]   )r   rQ   r   r^   r   r   r   test_syncthreads_and_upcast   r   z(TestCudaSync.test_syncthreads_and_upcastc                 C   r   r]   )r   rQ   r   r^   r   r   r   test_syncthreads_and_downcast   r   z*TestCudaSync.test_syncthreads_and_downcastc                 C   s   t t}d}tj||d}tj|tjd}|d|f || | t|dk d|d< |d|f || | t|dk d S r   )r   rP   rG   rQ   rc   r   rh   ri   r   r   r   r   _test_syncthreads_or   r   z!TestCudaSync._test_syncthreads_orc                 C   r   r]   )r   rQ   r   r^   r   r   r   test_syncthreads_or  r   z TestCudaSync.test_syncthreads_orc                 C   r   r]   )r   rQ   r   r^   r   r   r   test_syncthreads_or_upcast  r   z'TestCudaSync.test_syncthreads_or_upcastc                 C   r   r]   )r   rQ   r   r^   r   r   r   test_syncthreads_or_downcast
  r   z)TestCudaSync.test_syncthreads_or_downcastN)__name__
__module____qualname__r[   r_   r   r`   r   
skipUnlessrK   rb   rg   rj   rn   rq   rz   r|   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rL   w   sB    





	rL   __main__)numpyrQ   numbar   r   r   numba.cuda.testingr   r   r   numba.core.configr   r   r   r   r&   r+   r0   r1   r7   r:   r<   rB   rE   rG   rK   rL   r   mainr   r   r   r   <module>   s.     