o
    ¦¸¢iä–  ã                
   @   sz  d dl mZ d dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d
dlmZ d dlmZ d dlmZmZmZ d dlm Z m!Z! e
ƒ Z"e"j#Z#e"j$Z%e"j&Z&dd„ Z'e%e (e¡dƒdd„ ƒZ)e%e (e¡dƒdd„ ƒZ*e%e (e¡dƒdd„ ƒZ+e%e (e¡dƒdd„ ƒZ,e%e (e¡dƒdd„ ƒZ-e%e d ƒd!d"„ ƒZ.e%e d#ƒd$d%„ ƒZ/e%e d&ƒd'd(„ ƒZ0e#ej1j2ej3ƒd)d*„ ƒZ4d a5d+d,„ Z6e#ej7j8ej9ej:ƒd-d.„ ƒZ;e#ej7j8ej<ej:ƒe#ej7j8ej=ej:ƒd/d0„ ƒƒZ>e#ej?j8ej9ej:ƒd1d2„ ƒZ@e#ej?j8ej<ej:ƒe#ej?j8ej=ej:ƒd3d4„ ƒƒZAe#ejBƒd5d6„ ƒZCe#ejDƒd7d8„ ƒZEe#ejFƒd9d:„ ƒZGe#ejHƒd;d<„ ƒZIe#ejHejJƒd=d>„ ƒZKe#ejLejJejJejJejJejJƒe#ejLejJejJejMejJejJƒe#ejLejJejJejNejJejJƒe#ejLejJejJejOejJejJƒd?d@„ ƒƒƒƒZPe#ejQejJejJejRƒdAdB„ ƒZSe#ejTejJejJƒe#ejTejJejMƒe#ejTejJejNƒe#ejTejJejOƒdCdD„ ƒƒƒƒZUe#ejVejJejJƒe#ejVejJejMƒe#ejVejJejNƒe#ejVejJejOƒdEdF„ ƒƒƒƒZWe#ejXƒdGdH„ ƒZYe#ejZƒdIdJ„ ƒZ[e#ej\ej:ƒdKdL„ ƒZ]e#ej^ej:ej:ej:ƒdMdN„ ƒZ_dOdP„ Z`eejaejbƒdQdR„ ƒZceejbejaƒdSdT„ ƒZddUdV„ ZeeejaejfƒdWdX„ ƒZgeejfejaƒeej9ejaƒdYdZ„ ƒƒZhd[d\„ Zieiejjjkd]ƒ eiejld]ƒ eiejmd]ƒ eiejjjnd^ƒ eiejod^ƒ eiejpd^ƒ eiejjjqd_ƒ eiejrd_ƒ eiejsd_ƒ e#ejjjtejaƒd`da„ ƒZue#ejvejaƒdbdc„ ƒZwe#ejjjxejaƒddde„ ƒZye#ezejaƒdfdg„ ƒZ{e#ejjj|ejaejaejaƒdhdi„ ƒZ}e#ej~ejaejaƒe#ejejaejaƒdjdk„ ƒƒZ€dlZdmdn„ Z‚e#ejjjƒejaejaƒe‚doƒƒ e#ej„ejaejaƒe‚doƒƒ e#ejjj…ejaejaƒe‚dpƒƒ e#ej†ejaejaƒe‚dpƒƒ e#ejjj‡ejaejaƒe‚dqƒƒ e#ejˆejaejaƒe‚dqƒƒ e#ejjj‰ejaejaƒe‚drƒƒ e#ejŠejaejaƒe‚drƒƒ e#ejjj‹ejaejaƒe‚dsƒƒ e#ejŒejaejaƒe‚dsƒƒ e#ejjjejaejaƒe‚dtƒƒ e#ejŽejaejaƒe‚dtƒƒ dudv„ Zeejjjdwdrƒ eejjj‘dxdtƒ ej’dyej“dziZ”e#ej•ej’ƒe#ej•ej“ƒd{d|„ ƒƒZ–e#ej—ej˜ƒd}d~„ ƒZ™e#ej—ejšƒdd€„ ƒZ›e#ejœej:ƒdd‚„ ƒZe#ejžejJƒe#ejžej˜ƒdƒd„„ ƒƒZŸe#ejžejMƒe#ejžejšƒd…d†„ ƒƒZ e#ej¡ej:ej:ej:ƒd‡dˆ„ ƒZ¢e#e£ejNejNƒd‰dŠ„ ƒZ¤e#e£ejOejNƒe#e£ejNejOƒe#e£ejOejOƒd‹dŒ„ ƒƒƒZ¥e#e¦ejNejNƒddŽ„ ƒZ§e#e¦ejOejNƒe#e¦ejNejOƒe#e¦ejOejOƒdd„ ƒƒƒZ¨e#e©ejNƒe#e©ejOƒd‘d’„ ƒƒZªe#e©ejNejfƒe#e©ejOejfƒd“d”„ ƒƒZ«d•d–„ Z¬ej­d— Z®d—ej­ Z¯e#ej°ejNƒe¬e®ƒƒ e#ej°ejOƒe¬e®ƒƒ e#ej±ejNƒe¬e¯ƒƒ e#ej±ejOƒe¬e¯ƒƒ d˜d™„ Z²dšd›„ Z³e#ej´jlej3ejµej:ƒe#ej´jlej3ej=ej:ƒe#ej´jlej3ej<ej:ƒe³dœd„ ƒƒƒƒZ¶e#ej´joej3ejµej:ƒe#ej´joej3ej=ej:ƒe#ej´joej3ej<ej:ƒe³dždŸ„ ƒƒƒƒZ·e#ej´j¸ej3ejµej:ƒe#ej´j¸ej3ej=ej:ƒe#ej´j¸ej3ej<ej:ƒe³d d¡„ ƒƒƒƒZ¹e#ej´jºej3ejµej:ƒe#ej´jºej3ej=ej:ƒe#ej´jºej3ej<ej:ƒe³d¢d£„ ƒƒƒƒZ»d¤d¥„ Z¼e¼ej´j½d¦ƒ e¼ej´j¾d§ƒ e¼ej´j¿d¨ƒ e#ej´jÀej3ejµej:ƒe#ej´jÀej3ej=ej:ƒe#ej´jÀej3ej<ej:ƒe³d©dª„ ƒƒƒƒZÁe#ej´j£ej3ejµej:ƒe#ej´j£ej3ej<ej:ƒe#ej´j£ej3ej=ej:ƒe³d«d¬„ ƒƒƒƒZÂe#ej´j¦ej3ejµej:ƒe#ej´j¦ej3ej<ej:ƒe#ej´j¦ej3ej=ej:ƒe³d­d®„ ƒƒƒƒZÃe#ej´jÄej3ejµej:ƒe#ej´jÄej3ej<ej:ƒe#ej´jÄej3ej=ej:ƒe³d¯d°„ ƒƒƒƒZÅe#ej´jÆej3ejµej:ƒe#ej´jÆej3ej<ej:ƒe#ej´jÆej3ej=ej:ƒe³d±d²„ ƒƒƒƒZÇe#ej´jÈej3ej:ej:ƒd³d´„ ƒZÉe#ej´jÊej3ejµej:ej:ƒe#ej´jÊej3ej<ej:ej:ƒe#ej´jÊej3ej=ej:ej:ƒdµd¶„ ƒƒƒZËe#ejÌejÍƒd·d¸„ ƒZÎ	¹d¾dºd»„ZÏe&e!ƒd¼d½„ ƒZÐee Ñ¡ e#ƒ dS )¿é    )ÚreduceN)Úir)ÚRegistryÚ
lower_cast)Úparse_dtype)Úmodels)ÚtypesÚcgutils)Úufunc_db)Úregister_ufuncsé   )Únvvm)Úcuda)Ú	nvvmutilsÚstubsÚerrors)Údim3ÚCUDADispatcherc                 C   sB   t  | d| ¡}t  | d| ¡}t  | d| ¡}t | |||f¡S )Nz%s.xz%s.yz%s.z)r   Ú	call_sregr	   Úpack_struct)ÚbuilderÚprefixÚxÚyÚz© r   úQ/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/numba/cuda/cudaimpl.pyÚinitialize_dim3   s   r   Ú	threadIdxc                 C   ó
   t |dƒS )NÚtid©r   ©Úcontextr   ÚsigÚargsr   r   r   Úcuda_threadIdx    ó   
r&   ÚblockDimc                 C   r   )NÚntidr!   r"   r   r   r   Úcuda_blockDim%   r'   r*   ÚblockIdxc                 C   r   )NÚctaidr!   r"   r   r   r   Úcuda_blockIdx*   r'   r-   ÚgridDimc                 C   r   )NÚnctaidr!   r"   r   r   r   Úcuda_gridDim/   r'   r0   Úlaneidc                 C   s   t  |d¡S )Nr1   )r   r   r"   r   r   r   Úcuda_laneid4   ó   r2   r   c                 C   ó   |  |d¡S ©Nr   ©Úextract_valuer"   r   r   r   Údim3_x9   r3   r8   r   c                 C   r4   )Nr   r6   r"   r   r   r   Údim3_y>   r3   r9   r   c                 C   r4   )Né   r6   r"   r   r   r   Údim3_zC   r3   r;   c                 C   s   |d S r5   r   r"   r   r   r   Úcuda_const_array_likeJ   s   r<   c                 C   s   t d7 a d | t ¡S )zÍDue to bug with NVVM invalid internalizing of shared memory in the
    PTX output.  We can't mark shared memory to be internal. We have to
    ensure unique name is generated for shared memory symbol.
    r   z{0}_{1})Ú_unique_smem_idÚformat©Únamer   r   r   Ú_get_unique_smem_idT   s   rA   c              	   C   s8   |j d j}t|j d ƒ}t| ||f|tdƒtjddS )Nr   r   Ú_cudapy_smemT©ÚshapeÚdtypeÚsymbol_nameÚ	addrspaceÚcan_dynsized)r%   Úliteral_valuer   Ú_generic_arrayrA   r   ÚADDRSPACE_SHARED©r#   r   r$   r%   ÚlengthrE   r   r   r   Úcuda_shared_array_integer^   s   ýrN   c              	   C   s>   dd„ |j d D ƒ}t|j d ƒ}t| |||tdƒtjddS )Nc                 S   ó   g | ]}|j ‘qS r   ©rI   ©Ú.0Úsr   r   r   Ú
<listcomp>k   ó    z+cuda_shared_array_tuple.<locals>.<listcomp>r   r   rB   TrC   )r%   r   rJ   rA   r   rK   ©r#   r   r$   r%   rD   rE   r   r   r   Úcuda_shared_array_tupleh   s   
ýrW   c              	   C   s4   |j d j}t|j d ƒ}t| ||f|dtjddS )Nr   r   Ú_cudapy_lmemFrC   )r%   rI   r   rJ   r   ÚADDRSPACE_LOCALrL   r   r   r   Úcuda_local_array_integers   s   ýrZ   c              	   C   s:   dd„ |j d D ƒ}t|j d ƒ}t| |||dtjddS )Nc                 S   rO   r   rP   rQ   r   r   r   rT   €   rU   z(ptx_lmem_alloc_array.<locals>.<listcomp>r   r   rX   FrC   )r%   r   rJ   r   rY   rV   r   r   r   Úptx_lmem_alloc_array}   s   
ýr[   c                 C   óD   |rJ ‚d}|j }t t ¡ d¡}t |||¡}| |d¡ |  ¡ S )Nzllvm.nvvm.membar.ctar   ©Úmoduler   ÚFunctionTypeÚVoidTyper	   Úget_or_insert_functionÚcallÚget_dummy_value©r#   r   r$   r%   ÚfnameÚlmodÚfntyÚsyncr   r   r   Úptx_threadfence_blockˆ   ó   ri   c                 C   r\   )Nzllvm.nvvm.membar.sysr   r]   rd   r   r   r   Úptx_threadfence_system“   rj   rk   c                 C   r\   )Nzllvm.nvvm.membar.glr   r]   rd   r   r   r   Úptx_threadfence_devicež   rj   rl   c                 C   s*   |   tjd¡}t tj¡}t| |||gƒS )Nl   ÿÿ )Úget_constantr   Úint32ÚnoneÚptx_syncwarp_mask)r#   r   r$   r%   ÚmaskÚmask_sigr   r   r   Úptx_syncwarp©   s   rs   c                 C   sD   d}|j }t t ¡ t d¡f¡}t |||¡}| ||¡ |  ¡ S )Nzllvm.nvvm.bar.warp.syncé    )	r^   r   r_   r`   ÚIntTyper	   ra   rb   rc   rd   r   r   r   rp   °   s   rp   c              
   C   sü  |\}}}}}|j d }	|	tjv r| |t |	j¡¡}d}
|j}t t 	t d¡t d¡f¡t d¡t d¡t d¡t d¡t d¡f¡}t
 |||
¡}|	jdkr| ||||||f¡}|	tjkr}| |d¡}| |d¡}| |t ¡ ¡}t
 |||f¡}|S | |t d¡¡}| ||  tjd¡¡}| |t d¡¡}| ||||||f¡}| ||||||f¡}| |d¡}| |d¡}| |d¡}| |t d¡¡}| |t d¡¡}| ||  tjd¡¡}| ||¡}|	tjkrô| |t ¡ ¡}t
 |||f¡}|S )a  
    The NVVM intrinsic for shfl only supports i32, but the cuda intrinsic
    function supports both 32 and 64 bit ints and floats, so for feature parity,
    i64, f32, and f64 are implemented. Floats by way of bitcasting the float to
    an int, then shuffling, then bitcasting back. And 64-bit values by packing
    them into 2 32bit values, shuffling thoose, and then packing back together.
    r:   zllvm.nvvm.shfl.sync.i32rt   r   r   é@   )r%   r   Úreal_domainÚbitcastr   ru   Úbitwidthr^   r_   ÚLiteralStructTyper	   ra   rb   Úfloat32r7   Ú	FloatTypeÚmake_anonymous_structÚtruncÚlshrrm   Úi8ÚzextÚshlÚor_Úfloat64Ú
DoubleType)r#   r   r$   r%   rq   ÚmodeÚvalueÚindexÚclampÚ
value_typere   rf   rg   ÚfuncÚretÚrvÚpredÚfvÚvalue1Ú
value_lshrÚvalue2Úret1Úret2Úrv1Úrv2Úrv1_64Úrv2_64Úrv_shlr   r   r   Úptx_shfl_sync_i32º   sJ   

ÿþ

ñ
rš   c                 C   s^   d}|j }t t t d¡t d¡f¡t d¡t d¡t d¡f¡}t |||¡}| ||¡S )Nzllvm.nvvm.vote.syncrt   r   )r^   r   r_   rz   ru   r	   ra   rb   )r#   r   r$   r%   re   rf   rg   r‹   r   r   r   Úptx_vote_syncð   s   ÿþr›   c                 C   s†   |\}}|j d j}|j d tjv r| |t |¡¡}d |¡}|j}t 	t d¡t d¡t |¡f¡}	t
 ||	|¡}
| |
||f¡S )Nr   zllvm.nvvm.match.any.sync.i{}rt   )r%   ry   r   rw   rx   r   ru   r>   r^   r_   r	   ra   rb   ©r#   r   r$   r%   rq   r‡   Úwidthre   rf   rg   r‹   r   r   r   Úptx_match_any_syncû   s   
"rž   c                 C   s–   |\}}|j d j}|j d tjv r| |t |¡¡}d |¡}|j}t 	t 
t d¡t d¡f¡t d¡t |¡f¡}	t ||	|¡}
| |
||f¡S )Nr   zllvm.nvvm.match.all.sync.i{}rt   )r%   ry   r   rw   rx   r   ru   r>   r^   r_   rz   r	   ra   rb   rœ   r   r   r   Úptx_match_all_sync  s   
ÿþrŸ   c                 C   ó,   t jt  t  d¡g ¡dddd}| |g ¡S )Nrt   zactivemask.b32 $0;ú=rT©Úside_effect©r   Ú	InlineAsmr_   ru   rb   ©r#   r   r$   r%   Ú
activemaskr   r   r   Úptx_activemask  s   ÿr¨   c                 C   r    )Nrt   zmov.u32 $0, %lanemask_lt;r¡   Tr¢   r¤   r¦   r   r   r   Úptx_lanemask_lt$  s
   þr©   c                 C   s   |  |d ¡S r5   )Úctpopr"   r   r   r   Úptx_popc,  ó   r«   c                 C   s
   |j |Ž S ©N)Úfmar"   r   r   r   Úptx_fma1  r'   r¯   c                 C   s:   dddœ}z||  W S  t y   d| › d}t |¡‚w )N)Úf32Úf)Úf64Úd)rt   rv   z$Conversion between float16 and floatú unsupported©ÚKeyErrorr   ÚCudaLoweringError©ry   ÚtypemapÚmsgr   r   r   Úfloat16_float_ty_constraint6  s   


þr»   c           	      C   sd   |j |j kr|S t|j ƒ\}}t |  |¡t d¡g¡}t |d|› dd|› d¡}| ||g¡S )Né   zcvt.ú.f16 $0, $1;ú=ú,h)ry   r»   r   r_   Úget_value_typeru   r¥   rb   ©	r#   r   ÚfromtyÚtotyÚvalÚtyÚ
constraintrg   Úasmr   r   r   Úfloat16_to_float_cast@  s   rÈ   c           	      C   sb   |j |j kr|S t|j ƒ\}}t t d¡|  |¡g¡}t |d|› dd|› ¡}| ||g¡S )Nr¼   úcvt.rn.f16.ú $0, $1;ú=h,)ry   r»   r   r_   ru   rÀ   r¥   rb   rÁ   r   r   r   Úfloat_to_float16_castL  s   rÌ   c                 C   s>   dddddœ}z||  W S  t y   d| › d}t |¡‚w )NÚcÚhÚrÚl)é   r¼   rt   rv   z"Conversion between float16 and intr´   rµ   r¸   r   r   r   Úfloat16_int_constraintX  s   

þrÒ   c           
      C   sf   |j }t|ƒ}|jrdnd}t |  |¡t d¡g¡}t |d|› |› dd|› d¡}	| |	|g¡S )NrS   Úur¼   zcvt.rni.r½   r¾   r¿   )	ry   rÒ   Úsignedr   r_   rÀ   ru   r¥   rb   ©
r#   r   rÂ   rÃ   rÄ   ry   rÆ   Ú
signednessrg   rÇ   r   r   r   Úfloat16_to_integer_castb  s   
þr×   c           
      C   sd   |j }t|ƒ}|jrdnd}t t d¡|  |¡g¡}t |d|› |› dd|› ¡}	| |	|g¡S )NrS   rÓ   r¼   rÉ   rÊ   rË   )	ry   rÒ   rÔ   r   r_   ru   rÀ   r¥   rb   rÕ   r   r   r   Úinteger_to_float16_casto  s   
ÿþrØ   c                    s    t | tjtjƒ‡ fdd„ƒ}d S )Nc                    sB   t  t  d¡t  d¡t  d¡g¡}t  |ˆ › dd¡}| ||¡S )Nr¼   z.f16 $0,$1,$2;ú=h,h,h©r   r_   ru   r¥   rb   ©r#   r   r$   r%   rg   rÇ   ©Úopr   r   Úptx_fp16_binary  s
   ÿz*lower_fp16_binary.<locals>.ptx_fp16_binary©Úlowerr   Úfloat16)ÚfnrÝ   rÞ   r   rÜ   r   Úlower_fp16_binary~  ó   rã   ÚaddÚsubÚmulc                 C   ó4   t  t  d¡t  d¡g¡}t  |dd¡}| ||¡S )Nr¼   zneg.f16 $0, $1;ú=h,hrÚ   rÛ   r   r   r   Úptx_fp16_hneg’  ó   rê   c                 C   ó   t | |||ƒS r­   )rê   r"   r   r   r   Úoperator_hneg™  r¬   rí   c                 C   rè   )Nr¼   zabs.f16 $0, $1;ré   rÚ   rÛ   r   r   r   Úptx_fp16_habsž  rë   rî   c                 C   rì   r­   )rî   r"   r   r   r   Úoperator_habs¥  r¬   rï   c                 C   sH   t  d¡t  d¡t  d¡g}t  t  d¡|¡}t  |dd¡}| ||¡S )Nr¼   zfma.rn.f16 $0,$1,$2,$3;z=h,h,h,h)r   ru   r_   r¥   rb   )r#   r   r$   r%   Úargtysrg   rÇ   r   r   r   Úptx_hfmaª  s   rñ   c                 C   ó   dd„ }|   ||||¡S )Nc                 S   s   t j | |¡S r­   )r   Úfp16Úhdiv)r   r   r   r   r   Úfp16_divµ  s   zfp16_div_impl.<locals>.fp16_div©Úcompile_internal)r#   r   r$   r%   rõ   r   r   r   Úfp16_div_impl²  s   rø   z’{{
          .reg .pred __$$f16_cmp_tmp;
          setp.{op}.f16 __$$f16_cmp_tmp, $1, $2;
          selp.u16 $0, 1, 0, __$$f16_cmp_tmp;
        }}c                    ó   ‡ fdd„}|S )Nc           	         sr   t  t  d¡t  d¡t  d¡g¡}t  |tjˆ dd¡}| ||¡}|  tj	d¡}| 
|t  d¡¡}| d||¡S )Nr¼   rÜ   rÙ   r   z!=)r   r_   ru   r¥   Ú	_fp16_cmpr>   rb   rm   r   Úint16rx   Úicmp_unsigned)	r#   r   r$   r%   rg   rÇ   ÚresultÚzeroÚ
int_resultrÜ   r   r   Úptx_fp16_comparisonÃ  s   "z*_gen_fp16_cmp.<locals>.ptx_fp16_comparisonr   )rÝ   r   r   rÜ   r   Ú_gen_fp16_cmpÂ  s   r  ÚeqÚneÚgeÚgtÚleÚltc                    s    t | tjtjƒ‡ fdd„ƒ}d S )Nc                    s(   t ˆ ƒ| |||ƒ}| ||d |d ¡S )Nr   r   )r  Úselect)r#   r   r$   r%   ÚchoicerÜ   r   r   Úptx_fp16_minmaxÝ  s   z*lower_fp16_minmax.<locals>.ptx_fp16_minmaxrß   )râ   re   rÝ   r
  r   rÜ   r   Úlower_fp16_minmaxÜ  rä   r  ÚmaxÚminÚ
__nv_cbrtfÚ	__nv_cbrtc           
      C   sF   |j }t| }|  |¡}|j}t ||g¡}t |||¡}	| |	|¡S r­   )	Úreturn_typeÚ
cbrt_funcsrÀ   r^   r   r_   r	   ra   rb   )
r#   r   r$   r%   rÅ   re   Úftyrf   rg   râ   r   r   r   Úptx_cbrtñ  s   
r  c              	   C   ó2   t  |jt t d¡t d¡f¡d¡}| ||¡S )Nrt   Ú	__nv_brev©r	   ra   r^   r   r_   ru   rb   ©r#   r   r$   r%   râ   r   r   r   Úptx_brev_u4ý  ó   ýr  c              	   C   r  )Nrv   Ú__nv_brevllr  r  r   r   r   Úptx_brev_u8	  r  r  c                 C   s   |  |d |  tjd¡¡S r5   )Úctlzrm   r   Úbooleanr"   r   r   r   Úptx_clz  s   þr  c              	   C   r  )Nrt   Ú__nv_ffsr  r  r   r   r   Ú
ptx_ffs_32  ó   ýr   c              	   C   s2   t  |jt t d¡t d¡f¡d¡}| ||¡S )Nrt   rv   Ú
__nv_ffsllr  r  r   r   r   Ú
ptx_ffs_64&  r!  r#  c                 C   s   |\}}}|  |||¡S r­   )r  )r#   r   r$   r%   ÚtestÚaÚbr   r   r   Úptx_selp0  s   
r'  c              	   C   ó4   t  |jt t ¡ t ¡ t ¡ f¡d¡}| ||¡S )NÚ
__nv_fmaxf©r	   ra   r^   r   r_   r|   rb   r  r   r   r   Ú
ptx_max_f46  ó   þûr+  c              
   C   óh   t  |jt t ¡ t ¡ t ¡ f¡d¡}| ||  ||d |jd t	j
¡|  ||d |jd t	j
¡g¡S )NÚ	__nv_fmaxr   r   ©r	   ra   r^   r   r_   r…   rb   Úcastr%   r   Údoubler  r   r   r   Ú
ptx_max_f8A  ó   þûþr2  c              	   C   r(  )NÚ
__nv_fminfr*  r  r   r   r   Ú
ptx_min_f4R  r,  r5  c              
   C   r-  )NÚ	__nv_fminr   r   r/  r  r   r   r   Ú
ptx_min_f8]  r3  r7  c              	   C   sJ   t  |jt t d¡t ¡ f¡d¡}| ||  ||d |j	d t
j¡g¡S )Nrv   Ú__nv_llrintr   )r	   ra   r^   r   r_   ru   r…   rb   r0  r%   r   r1  r  r   r   r   Ú	ptx_roundn  s   þûÿr9  c                 C   rò   )Nc                 S   sÂ   t  | ¡s
t  | ¡r| S |dkr1|dkrd|d  }d}nd| }d}| | | }t  |¡r0| S n	d|  }| | }t|ƒ}t  || ¡dkrOdt|d ƒ }|dkr[|| | }|S ||9 }|S )Nr   é   g      $@g’ÕMÏð€Dg      ð?g      à?g       @)ÚmathÚisinfÚisnanÚroundÚfabs)r   ÚndigitsÚpow1Úpow2r   r   r   r   r   Úround_ndigitsƒ  s,   
ÿ
þz$round_to_impl.<locals>.round_ndigitsrö   )r#   r   r$   r%   rC  r   r   r   Úround_to_impl€  s   !rD  c                    rù   )Nc                    s$   |j \}|  |ˆ ¡}| ||d ¡S r5   )r%   rm   Úfmul)r#   r   r$   r%   ÚargtyÚfactor©Úconstr   r   Úimpl¨  s   zgen_deg_rad.<locals>.implr   )rI  rJ  r   rH  r   Úgen_deg_rad§  s   rK  g     €f@c                    s˜   |t jv rt j|dd}|g}n
tjˆ |t|ƒd}‡ ‡fdd„t||ƒD ƒ}|j}||kr6td||f ƒ‚|j	t|ƒkrHtd|j	t|ƒf ƒ‚||fS )z4
    Convert integer indices into tuple of intp
    r   )rE   Úcount)rL  c                    s"   g | ]\}}ˆ  ˆ ||tj¡‘qS r   )r0  r   Úintp)rR   ÚtÚi©r   r#   r   r   rT   À  s    ÿz&_normalize_indices.<locals>.<listcomp>zexpect %s but got %sz#indexing %d-D array with %d-D index)
r   Úinteger_domainÚUniTupler	   Úunpack_tupleÚlenÚziprE   Ú	TypeErrorÚndim)r#   r   ÚindtyÚindsÚarytyÚvaltyÚindicesrE   r   rP  r   Ú_normalize_indices·  s   
ÿÿr]  c                    rù   )Nc                    sj   |j \}}}|\}}}	|j}
t| |||||ƒ\}}|  |¡| ||ƒ}tj| ||||dd}ˆ | ||
||	ƒS )NT©Ú
wraparound)r%   rE   r]  Ú
make_arrayr	   Úget_item_pointer)r#   r   r$   r%   rZ  rX  r[  ÚaryrY  rÄ   rE   r\  ÚlaryÚptr©Údispatch_fnr   r   ÚimpÏ  s   

ÿÿz_atomic_dispatcher.<locals>.impr   )rf  rg  r   re  r   Ú_atomic_dispatcherÎ  s   rh  c                 C   ó\   |t jkr|j}| t |¡||f¡S |t jkr&|j}| t |¡||f¡S | d||d¡S )Nrå   Ú	monotonic)	r   r{   r^   rb   r   Údeclare_atomic_add_float32r„   Údeclare_atomic_add_float64Ú
atomic_rmw©r#   r   rE   rd  rÄ   rf   r   r   r   Úptx_atomic_add_tupleà  ó   
ÿ
ÿro  c                 C   ri  )Nræ   rj  )	r   r{   r^   rb   r   Údeclare_atomic_sub_float32r„   Údeclare_atomic_sub_float64rm  rn  r   r   r   Úptx_atomic_subñ  rp  rs  c                 C   óL   |t jjv r|j}|j}ttd|› ƒ}| ||ƒ||f¡S td|› dƒ‚)NÚdeclare_atomic_inc_intzUnimplemented atomic inc with ú array©	r   ÚcudadeclÚunsigned_int_numba_typesry   r^   Úgetattrr   rb   rV  ©r#   r   rE   rd  rÄ   Úbwrf   râ   r   r   r   Úptx_atomic_inc  ó   r}  c                 C   rt  )NÚdeclare_atomic_dec_intzUnimplemented atomic dec with rv  rw  r{  r   r   r   Úptx_atomic_dec  r~  r€  c                    s@   t ‡ fdd„ƒ}tjtjtjfD ]}t| tj|tjƒ|ƒ qd S )Nc                    s2   |t jjv r| ˆ ||d¡S tdˆ › d|› dƒ‚)Nrj  zUnimplemented atomic z with rv  ©r   rx  Úinteger_numba_typesrm  rV  ©r#   r   rE   rd  rÄ   rÜ   r   r   Úimpl_ptx_atomic  s   z+ptx_atomic_bitwise.<locals>.impl_ptx_atomic)rh  r   rM  rR  ÚTuplerà   ÚArrayÚAny)ÚstubrÝ   r„  rÅ   r   rÜ   r   Úptx_atomic_bitwise  s
   ÿr‰  ÚandÚorÚxorc                 C   s,   |t jjv r| d||d¡S td|› dƒ‚)NÚxchgrj  zUnimplemented atomic exch with rv  r  rƒ  r   r   r   Úptx_atomic_exch/  s   rŽ  c                 C   ó–   |j }|tjkr| t |¡||f¡S |tjkr#| t |¡||f¡S |tjtj	fv r4|j
d||ddS |tjtjfv rE|j
d||ddS td| ƒ‚©Nr  rj  ©ÚorderingÚumaxz&Unimplemented atomic max with %s array)r^   r   r„   rb   r   Údeclare_atomic_max_float64r{   Údeclare_atomic_max_float32rn   Úint64rm  Úuint32Úuint64rV  rn  r   r   r   Úptx_atomic_max:  ó   
ÿ
ÿr™  c                 C   r  ©Nr  rj  r‘  Úuminz&Unimplemented atomic min with %s array)r^   r   r„   rb   r   Údeclare_atomic_min_float64r{   Údeclare_atomic_min_float32rn   r–  rm  r—  r˜  rV  rn  r   r   r   Úptx_atomic_minN  rš  rŸ  c                 C   r  r  )r^   r   r„   rb   r   Údeclare_atomic_nanmax_float64r{   Údeclare_atomic_nanmax_float32rn   r–  rm  r—  r˜  rV  rn  r   r   r   Úptx_atomic_nanmaxb  rš  r¢  c                 C   r  r›  )r^   r   r„   rb   r   Údeclare_atomic_nanmin_float64r{   Údeclare_atomic_nanmin_float32rn   r–  rm  r—  r˜  rV  rn  r   r   r   Úptx_atomic_nanminv  rš  r¥  c                 C   sT   |  |jd tj|jd |jd ¡}|d |  tjd¡|d |d f}t| |||ƒS )Nr   r   r:   )r  r%   r   rM  rm   Úptx_atomic_casr"   r   r   r   Úptx_atomic_compare_and_swapŠ  s   $"r§  c                 C   s–   |j \}}}}|\}}	}
}t| |||	||ƒ\}}|  |¡| ||ƒ}tj| ||||dd}|jtjjv rD|j	}|jj
}t |||||
|¡S td|j ƒ‚)NTr^  z&Unimplemented atomic cas with %s array)r%   r]  r`  r	   ra  rE   r   rx  r‚  r^   ry   r   Úatomic_cmpxchgrV  )r#   r   r$   r%   rZ  rX  Úoldtyr[  rb  rY  ÚoldrÄ   r\  rc  rd  rf   ry   r   r   r   r¦  ‘  s   ÿÿr¦  c                 C   s@   t jt  t  ¡ t  d¡g¡dddd}|d }| ||g¡ d S )Nrt   znanosleep.u32 $0;rÏ   Tr¢   r   )r   r¥   r_   r`   ru   rb   )r#   r   r$   r%   Ú	nanosleepÚnsr   r   r   Úptx_nanosleep©  s
   ÿr­  Fc               	      sb  t tj|dƒ}|dko|ot|ƒdk}|dkr|stdƒ‚ˆ j| }	t|tjtj	fƒp5t|	t
jƒp5|tjk}
|tjvrC|
sCtd| ƒ‚ˆ  |¡}t ||¡}|tjkr\tj|||d}n4|j}t ||||¡}ˆ  |¡}d|d  ¡ > |_|r{d|_nt |tj¡|_| |t  t !d¡¡d¡}t" #t $¡ j%¡}ˆ  |¡}| &|¡}|}g }t't(|ƒƒD ]\}}| )|¡ ||9 }q¬d	d
„ t(|ƒD ƒ}‡ fdd
„|D ƒ}|rútj*t +t !d¡g ¡dddd}| ,| -|g ¡t !d¡¡}ˆ  .tj/|¡}| 0||¡g}n	‡ fdd
„|D ƒ}t|ƒ}tj1||dd}ˆ  2|¡ˆ |ƒ}ˆ j3|| 4||j5j6¡||ˆ  .tj/|¡d d | 7¡ S )Nr   r   zarray length <= 0zunsupported type: %sr?   ÚexternalrÑ   Úgenericc                 S   s   g | ]}|‘qS r   r   rQ   r   r   r   rT   õ  s    z"_generic_array.<locals>.<listcomp>c                    ó   g | ]	}ˆ   tj|¡‘qS r   ©rm   r   rM  rQ   ©r#   r   r   rT   ö  ó    rt   zmov.u32 $0, %dynamic_smem_size;r¡   Tr¢   rv   c                    r°  r   r±  rQ   r²  r   r   rT     r³  ÚC)rE   rW  Úlayout)ÚdatarD   ÚstridesÚitemsizeÚmeminfo)8r   Úoperatorrç   rT  Ú
ValueErrorÚdata_model_managerÚ
isinstancer   ÚRecordÚBooleanr   ÚStructModelrá   Únumber_domainrV  Úget_data_typer   Ú	ArrayTyper   rY   r	   Úalloca_oncer^   Úadd_global_variableÚget_abi_sizeofÚ
bit_lengthÚalignÚlinkageÚConstantÚ	UndefinedÚinitializerÚaddrspacecastÚPointerTyperu   ÚllÚcreate_target_dataÚNVVMÚdata_layoutÚget_abi_sizeÚ	enumerateÚreversedÚappendr¥   r_   r   rb   rm   rM  Úudivr†  r`  Úpopulate_arrayrx   r¶  ÚtypeÚ	_getvalue) r#   r   rD   rE   rF   rG   rH   Ú	elemcountÚdynamic_smemÚ
data_modelÚother_supported_typeÚlldtypeÚlarytyÚdataptrrf   ÚgvmemrÈ  Ú
targetdatar¸  Ú
laststrideÚrstridesrO  Úlastsizer·  ÚkstridesÚget_dynshared_sizeÚdynsmem_sizeÚ	kitemsizeÚkshaperW  rZ  rb  r   r²  r   rJ   ´  sx   

ÿý


ÿ
ÿ



þÿûrJ   c                 C   s   |   ¡ S r­   )rc   )r#   r   rÅ   Úpyvalr   r   r   Úcuda_dispatcher_const  s   rí  )F)ÒÚ	functoolsr   rº  r;  Úllvmliter   Úllvmlite.bindingÚbindingrÏ  Únumba.core.imputilsr   r   Únumba.core.typing.npydeclr   Únumba.core.datamodelr   Ú
numba.corer   r	   Únumba.npr
   Únumba.np.npyimplr   Úcudadrvr   Únumbar   Ú
numba.cudar   r   r   Únumba.cuda.typesr   r   Úregistryrà   Úlower_getattrÚ
lower_attrÚlower_constantr   ÚModuler&   r*   r-   r0   r2   r8   r9   r;   rI  Ú
array_liker†  r<   r=   rA   ÚsharedÚarrayÚIntegerLiteralr‡  rN   r…  rR  rW   ÚlocalrZ   r[   Úthreadfence_blockri   Úthreadfence_systemrk   Úthreadfencerl   Úsyncwarprs   Úi4rp   Úshfl_sync_intrinsicr€   Úf4Úf8rš   Úvote_sync_intrinsicr  r›   Úmatch_any_syncrž   Úmatch_all_syncrŸ   r§   r¨   Úlanemask_ltr©   Úpopcr«   r®   r¯   r»   rá   ÚFloatrÈ   rÌ   rÒ   ÚIntegerr×   rØ   rã   ró   Úhaddrå   ÚiaddÚhsubræ   ÚisubÚhmulrç   ÚimulÚhnegrê   Únegrí   Úhabsrî   Úabsrï   Úhfmarñ   ÚtruedivÚitruedivrø   rú   r  Úheqr  Úhner  Úhger  Úhgtr  Úhler  Úhltr  r  ÚhmaxÚhminr{   r„   r  Úcbrtr  ÚbrevÚu4r  Úu8r  Úclzr  Úffsr   r#  Úselpr'  r  r+  r2  r  r5  r7  r>  r9  rD  rK  ÚpiÚ_deg2radÚ_rad2degÚradiansÚdegreesr]  rh  ÚatomicrM  ro  rs  Úincr}  Údecr€  r‰  Úand_rƒ   rŒ  ÚexchrŽ  r™  rŸ  Únanmaxr¢  Únanminr¥  Úcompare_and_swapr§  Úcasr¦  r«  r—  r­  rJ   rí  Ú
get_ufuncsr   r   r   r   Ú<module>   sú   










		
		







	ÿÿÿÿ.










	





þ










%






ÿd
