o
    ic                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlZddlmZmZ eejZed jd	d
Zg dZerKejededndZG dd dZdd ZedZedZedZedZedZedZedZ edZ!ededZ"edZ#edZ$edZ%edZ&ed Z'ed!Z(ed"Z)ed#Z*ed$Z+ed%Z,ed&Z-ed'Z.ed(Z/ed)Z0ed*Z1ed+Z2ed,Z3ed-Z4ed.Z5ed/Z6ed0Z7ed1Z8ed2Z9ed3Z:ed4Z;ed5Z<ed6Z=ed7Z>ed8Z?ed9Z@ed:ZAed;ZBed<ZCed=ZDed>ZEed?ZFed@ZGedAZHedBZIedCZJdDdEdFeKfdGdHZLddFeKfdIdJZMddLdLdMdNdOZNdPdQdRdRdQdLdLdSdTdUZOdLdLdMdVdWZPdPdLdLdXdYdZZQdPd[dLdLd\d]d^ZRd_d` ZSddLdLdadbdcZTdLdLdMdddeZUdPdfdLdLd\dgdhZVdLdLdMdidjZWdLdLdMdkdlZXdLdLdMdmdnZYdPdodLdLd\dpdqZZddLdLdadrdsZ[dQdRdRdQdPdLdLdtdue\dve\dwe\dxe\dyeKf
dzd{Z]dPdLdLdXdyeKfd|d}Z^dPdLdLdXdyeKfd~dZ_dPdLdLdXdyeKfddZ`dPd[dLdLd\dyeKfddZadLdLdMddZbdPdfdLdLd\dyeKfddZcdLdLdMddZddLdLdMddZedLdLdMddZfdPdodLdLd\ddZgdLdLdMddZhddde
e fddZiddde
e fddZjd ddekfddZldekfddZmdekdekdekfddZnerueodddddddndZpdefddZqdS )    N)defaultdict)reduce)Path)CallableOptionalTuple   )cupyhas_cupy_gpuz_custom_kernels.cuutf8encoding)0backprop_clipped_linear<double>backprop_clipped_linear<float>backprop_dish<double>backprop_dish<float>backprop_gelu<double>backprop_gelu<float>backprop_hard_swish<double>backprop_hard_swish<float>%backprop_hard_swish_mobilenet<double>$backprop_hard_swish_mobilenet<float>backprop_maxout<double>backprop_maxout<float>backprop_mish<double>backprop_mish<float>backprop_reduce_max<double>backprop_reduce_max<float>backprop_reduce_mean<double>backprop_reduce_mean<float>backprop_reduce_sum<double>backprop_reduce_sum<float>backprop_seq2col<double>backprop_seq2col<float>backprop_swish<double>backprop_swish<float>clipped_linear<double>clipped_linear<float>dish<double>dish<float>gather_add<double>gather_add<float>gelu<double>gelu<float>maxout<double>maxout<float>mish<double>mish<float>pad<double>
pad<float>pad<int>pad<long long>reduce_max<double>reduce_max<float>reduce_sum<double>reduce_sum<float>seq2col<double>seq2col<float>swish<double>swish<float>)z--std=c++11)codeoptionsname_expressionsc                   @   sz   e Zd ZU dZeed< ed ed< eeg df  ed< g dZddded	eeg df  d
dfddZ	dd Z
dd ZdS )
LazyKernelzWraps around `cupy.RawModule` and `cupy.RawKernel` to verify CuPy availability
    and lazily compile the latter on first invocation.

    The default CuPy behaviour triggers the compilation as soon as the `cupy.RawKernel` object
    is accessed.namezcupy.RawKernel_kernel_compile_callbackrB   rC   rD   Ncompile_callbackrG   returnc                C   s   || _ d | _|| _d S NrE   )selfrB   rG    rK   R/home/ubuntu/.local/lib/python3.10/site-packages/thinc/backends/_custom_kernels.py__init__V   s   
zLazyKernel.__init__c                 O   s   |    | j|i | d S rI   )_compile_kernelrC   )rJ   argskwargsrK   rK   rL   __call__`   s   zLazyKernel.__call__c                 C   sZ   | j d urd S | jd ur|  | _ ntd urt| j| _ | j d u r+td| j dd S )Nzcouldn't compile Cupy kernel '')rC   rD   KERNELSget_functionrB   
ValueError)rJ   rK   rK   rL   rN   d   s   


zLazyKernel._compile_kernel)__name__
__module____qualname____doc__str__annotations__r   r   	__slots__rM   rQ   rN   rK   rK   rK   rL   rA   I   s    
 

rA   c                   C   s    t sd S ttd jdddS )Nz_murmur3.cur   r   	hash_data)r
   r	   	RawKernelPWD	read_textrK   rK   rK   rL   compile_mmhq   s   ra   r'   r&   r)   r(   r+   r*   r-   r,   r]   rF   r/   r.   r1   r0   r3   r2   r4   r5   r7   r6   r9   r8   r;   r:   r=   r<   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   Tzerosrc   c                C   s   |rt | |S t | |S rI   )r	   rc   empty)shapedtyperc   rK   rK   rL   _alloc   s   rg   c                 C   s   |rt | S t | S rI   )r	   
zeros_like
empty_like)arrayrc   rK   rK   rL   _alloc_like   s   

rk         )threads_per_block
num_blocksc             	   C   s  |dk rt d| | D ]}t| qdd | D }t|}|| | 7 }tj|dd}t| |f| d jdd   }tj|| d jd}t	t| d}	t
| D ]
\}
}|jj|	|
< qRt|	}	ttj| d jdd  d}|jd	krt|f|f||	||t| |f |S |jd
krt|f|f||	||t| |f |S |jdkrt|f|f||	||t| |f |S |jdkrt|f|f||	||t| |f |S )Nrl   z.Rounding for padding must at least be 1, was: c                 S   s   g | ]}t |qS rK   )len).0seqrK   rK   rL   
<listcomp>   s    zpad.<locals>.<listcomp>int32rf   r   int64float32float64)rU   _is_float_or_int_arraymaxr	   rj   rp   re   rd   rf   numpy	enumeratedataptrr   operatormulpad_kernel_floatpad_kernel_doublepad_kernel_int32pad_kernel_int64)seqsround_torn   ro   rr   seq_lensmax_seq_lenfinal_shapeoutptrsidxstriderK   rK   rL   pad   s`   





r   Fg      ?g        )inplaceslopeoffsetmin_valmax_valrn   ro   c          	   
   C   sn   t |  | }|st| dd}| jdkr%t|f|f|| ||||| jf |S t|f|f|| ||||| jf |S NFrb   rw   )_is_float_arrayrk   rf   clipped_linear_kernel_floatsizeclipped_linear_kernel_double)	Xr   r   r   r   r   rn   ro   r   rK   rK   rL   clipped_linear   s"   
r   c          	   
   C   s   | j dkrtd| j  |j dkrtd|j  t|  |d}t|| jd  |jd }|jd }| jd }| jd }t||f| jdd}| jd	kr_t|f|f|| |||||f |S t	|f|f|| |||||f |S )
Nr   z5gather_add expects table with dimensionality 2, was: z7gather_add expects indices with dimensionality 2, was: rt   r   rl   Trf   rc   rw   )
ndimrU   r   astype_check_indicesre   rg   rf   gather_add_kernel_floatgather_add_kernel_double)	tableindicesrn   ro   BKTOr   rK   rK   rL   
gather_add  s2   









r   )r   rn   ro   c                C   s^   t |  | }|st| dd}| jdkr!t|f|f|| | jf |S t|f|f|| | jf |S r   )r   rk   rf   dish_kernel_floatr   dish_kernel_double)r   r   rn   ro   r   rK   rK   rL   dish5  s   
r   g      @)r   	thresholdrn   ro   c                C   b   t |  | }|st| dd}| jdkr"t|f|f|| || jf |S t|f|f|| || jf |S r   )r   rk   rf   gelu_kernel_floatr   gelu_kernel_doubler   r   r   rn   ro   r   rK   rK   rL   geluB     
r   c                 C   s*   | d u rt j|gdd} | S t| | | S )Nrt   ru   )r	   rj   _check_lengths)lengthsr   rK   rK   rL   check_seq2col_lengthsS  s
   
r   )r   rn   ro   c          
   
   C   s   t |  | jd }|d d }| jd }t||}|jd }t||| f| jdd}	| jdkrY|jdkrY| jdkrJt|f|f|	| |||||f |	S t|f|f|	| |||||f |	S Nr   r   rl   Tr   rw   )r   re   r   rg   rf   r   seq2col_kernel_floatseq2col_kernel_double)
rr   nWr   rn   ro   r   nFInLr   rK   rK   rL   seq2col[  s"   




r   c          	   	   C   s   t |  | j\}}}||f}t|| jdd}t|ddd}| jdkr4t|f|f||| |||f ||fS t|f|f||| |||f ||fS )NFr   irw   )r   re   rg   rf   maxout_kernel_floatmaxout_kernel_double)	r   rn   ro   r   r   P	out_shapebestwhichrK   rK   rL   maxoutt  s   
r      c                C   r   r   )r   rk   rf   mish_kernel_floatr   mish_kernel_doubler   rK   rK   rL   mish  s   
r   c             	   C   s   t |  t|}| jd }| jd }t|| t||f| jdd}| jdkr6t|f|f|| ||||f |S t|f|f|| ||||f |S Nr   rl   Tr   rw   )r   rp   re   r   rg   rf   reduce_sum_kernel_floatreduce_sum_kernel_doubler   r   rn   ro   r   r   r   r   rK   rK   rL   
reduce_sum  s   



r   c             	   C   s   t |  t|}| jd }| jd }t|| t||f| jdd}| jdkr5t|f|f|| ||||f nt|f|f|| ||||f ||dd  }|S )Nr   rl   Tr   rw   )rl   g|=)	r   rp   re   r   rg   rf   r   r   reshaper   rK   rK   rL   reduce_mean  s   



r   c          
   
   C   s   t |  t|}| jd }| jd }t||dd ||f}t|| jdd}t|ddd}	| jdkrDt|f|f||	| ||||f ||	fS t|f|f||	| ||||f ||	fS )Nr   rl   
min_lengthFr   r   rw   )r   rp   re   r   rg   rf   reduce_max_kernel_floatreduce_max_kernel_double)
r   r   rn   ro   r   r   r   r   maxesr   rK   rK   rL   
reduce_max  s"   


r   g      1@c                C   r   r   )r   rk   rf   swish_kernel_floatr   swish_kernel_doubler   rK   rK   rL   swish  r   r   c          
   
   C   s   t |  | jd }|d d }| jd | }t||}|jd }t||f| jdd}	| jdkrY|jdkrY| jdkrJt|f|f|	| |||||f |	S t|f|f|	| |||||f |	S r   )r   re   r   rg   rf   r   backprop_seq2col_kernel_floatbackprop_seq2col_kernel_double)
dYr   r   rn   ro   r   r   r   r   r   rK   rK   rL   backprop_seq2col  s"   



r   )r   r   r   r   r   rn   ro   r   r   r   r   r   c          
      C   s   t |  t || jd | }	|st| dd}	| jdkr-t|f|f|	| ||||||	jf |	S t|f|f|	| ||||||	jf |	S Nre   Frb   rw   )r   re   rk   rf   $backprop_clipped_linear_kernel_floatr   %backprop_clipped_linear_kernel_double)
r   r   r   r   r   r   r   rn   ro   r   rK   rK   rL   backprop_clipped_linear  s$   
r   c                C   p   t |  t || jd | }|st| dd}| jdkr)t|f|f|| ||jf |S t|f|f|| ||jf |S r   )r   re   rk   rf    backprop_hard_swish_kernel_floatr   !backprop_hard_swish_kernel_doubler   r   r   rn   ro   r   rK   rK   rL   backprop_hard_swish2     
r   c                C   r   r   )r   re   rk   rf   *backprop_hard_swish_mobilenet_kernel_floatr   +backprop_hard_swish_mobilenet_kernel_doubler   rK   rK   rL   backprop_hard_swish_mobilenetH  r   r   c                C   r   r   )r   re   rk   rf   backprop_dish_kernel_floatr   backprop_dish_kernel_doubler   rK   rK   rL   backprop_dish^  s   
r   c                C   st   t |  t || jd | }|st| dd}| jdkr*t|f|f|| |||jf |S t|f|f|| |||jf |S r   )r   re   rk   rf   backprop_gelu_kernel_floatr   backprop_gelu_kernel_doubler   r   r   r   rn   ro   r   rK   rK   rL   backprop_geluy  s   	
r   c             	   C   s   t |  | jd }| jd }t|||f| jdd}t|||| | jdkr5t|f|f|| ||||f |S t|f|f|| ||||f |S r   )r   re   rg   rf   _check_which_maxoutbackprop_maxout_kernel_floatbackprop_maxout_kernel_double)r   r   r   rn   ro   r   r   r   rK   rK   rL   backprop_maxout  s   


r   c                C   st   t |  t || jd | }|st| dd}| jdkr*t|f|f|| ||| jf |S t|f|f|| ||| jf |S r   )r   re   rk   rf   backprop_mish_kernel_floatr   backprop_mish_kernel_doubler   rK   rK   rL   backprop_mish  s   
r   c             	   C      t |  t|}t| }| jd }t|| t||f| jdd}| jdkr7t|f|f|| ||||f |S t	|f|f|| ||||f |S Nrl   Fr   rw   )
r   rp   intsumre   r   rg   rf    backprop_reduce_sum_kernel_float!backprop_reduce_sum_kernel_double)d_sumsr   rn   ro   r   r   r   r   rK   rK   rL   backprop_reduce_sum     


r  c             	   C   r   r   )
r   rp   r   r   re   r   rg   rf   !backprop_reduce_mean_kernel_float"backprop_reduce_mean_kernel_double)d_meansr   rn   ro   r   r   r   r   rK   rK   rL   backprop_reduce_mean  r  r  c          	   
   C   s   t |  t|}t| }| jd }t||dd t||f| jdd}t|||f| | jdkrBt	|f|f|| |||||f |S t
|f|f|| |||||f |S )Nrl   r   Tr   rw   )r   rp   r   r   re   r   rg   rf   _check_which_reduce_max backprop_reduce_max_kernel_float!backprop_reduce_max_kernel_double)	d_maxesr   r   rn   ro   r   r   r   r   rK   rK   rL   backprop_reduce_max  s    

r  c             	   C   s   t |  t || jd t || jd | }|st| dd}| jdkr2t|f|f|| ||||jf |S t|f|f|| ||||jf |S r   )r   re   rk   rf   backprop_swish_kernel_floatr   backprop_swish_kernel_double)r   r   Yr   r   rn   ro   r   rK   rK   rL   backprop_swish  s   
r  c             	   C   sF   t | jd dfddd}d}d}t|f|f|| ||| jd |f |S )Nr      uint32Tr         )rg   re   hash_data_kernel)idsseedrn   ro   r   out_sizein_sizerK   rK   rL   hash  s   r  r   re   c                C   F   | j dv s	J d|d ur| j|kr!d| d| j }t|d S d S )N)rw   rx   z/CUDA kernel can only handle float32 and float64%array has incorrect shape, expected: , was: rf   re   rU   r   re   msgrK   rK   rL   r   -  s   r   c                C   r  )N)rw   rx   rt   rv   z=CUDA kernel can only handle float32, float64, int32 and int64r  r  r  r  rK   rK   rL   ry   7  s   ry   r   n_elemsc                C   sH   | j dks	J dt| |kstd| t| |kr"tdd S )Nrt   z,lengths should be encoded as 32-bit integersz all sequence lengths must be >= z%lengths must sum up to the batch size)rf   r	   allrU   r   
IndexError)r   r   r   rK   rK   rL   r   C  s   r   nc                 C   s0   | j dks	J dt| d|std| d S )Nrt   z,indices should be encoded as 32-bit integersr   z'index out of bounds, must be >= 0 && < )rf   _values_within_ranger"  )r   r#  rK   rK   rL   r   K  s   r   r   r   r   c                 C   sZ   ||f}d}| j dksJ || j|kr!d| d| j }t|t| d|s+tdd S )N:maximum index (which) should be encoded as 32-bit integersrt   5maximum index (which) has incorrect shape, expected: r  r   )maximum index (which) value out of bounds)rf   re   rU   r$  r"  )r   r   r   r   re   r  rK   rK   rL   r   R  s   
r   zT x, T lower, T upperzbool rzx >= lower && x < upperza & bzr = atruewithin_rangec                 C   sd   d}| j dksJ || j|krd| d| j }t|t| dk| t|dk @ s0tdd S )Nr%  rt   r&  r  r   r   r'  )rf   re   rU   r	   r!  expand_dimsr"  )r   re   r   r  rK   rK   rL   r  l  s   
r  )T)rl   )rr   recollectionsr   	functoolsr   pathlibr   typingr   r   r   r{   compatr	   r
   __file__parentr_   r`   KERNELS_SRCKERNELS_LIST	RawModulerS   rA   ra   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	  r  r  r  r   r   r   r   r  r  boolrg   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   r   floatr   r   r   r   r   r   r   r  r  r  r  r  r   ry   r   r   r   r   ReductionKernelr$  r  rK   rK   rK   rL   <module>   sH   
6	(>
$






