o
    W۷i/y                     @  sp  d dl mZ d dlZd dlZd dlmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ dad	d
 Zdd Zdd ZdGddZdGddZdd ZdGddZdd ZdGddZdGddZdGddZdGdd Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Z d-d. Z!d/d0 Z"d1d2 Z#dHd4d5Z$d6d7 Z%d8d9 Z&d:d; Z'dId>d?Z(dGd@dAZ)dJdCdDZ*dKdEdFZ+dS )L    )annotationsN)linalg)_core)cublas)device)_util   c                   C  s   t S N_batched_gesv_limit r   r   A/home/ubuntu/vllm_env/lib/python3.10/site-packages/cupy/cublas.pyget_batched_gesv_limit   s   r   c                 C  s   | a d S r	   r
   )limitr   r   r   set_batched_gesv_limit   s   r   c                 C  s8  t | | t |  t |  | j|jks| j|jd kr/| jdd |jd| jd  ks3tdt | |\}}|jdkrGt	
|j|S |dkrNd}n|dkrUd}n|d	kr\d
}n|dkrcd}ntdtt|d }tt|d }| jdkrt| jdd nd}| jd }| j|jkr|jd nd}	|j}
| jj}|jj}t	j| |||ddd|d} t	j||||	ddd|d}| jj|kr|  } |jj|kr| }|t krtd|t  t }|}|| | j }t	j| jj| jj||  |t	jd}|}||	 |j }t	j|jj|jj||  |t	jd}t	j
||ftj d}t	j
|ftj d}tj
dtj d}||||jj||jj|jj| t !|| ||tj"||	|jj||jj|jj||j#j| |d dkrd|j$}|d dk r|d|d  7 }t%&||ddd|
j'|ddS )a  Solves multiple linear matrix equations using cublas<t>getr[fs]Batched().

    Computes the solution to system of linear equation ``ax = b``.

    Args:
        a (cupy.ndarray): The matrix with dimension ``(..., M, M)``.
        b (cupy.ndarray): The matrix with dimension ``(..., M)`` or
            ``(..., M, K)``.

    Returns:
        cupy.ndarray:
            The matrix with dimension ``(..., M)`` or ``(..., M, K)``.
       NzEa must have (..., M, M) shape and b must have (..., M) or (..., M, K)r   fsdFcDzinvalid dtypegetrfBatchedgetrsBatched   dtypez/The matrix size ({}) exceeds the set limit ({}))r   z Error reported by {} in cuBLAS. z)The {}-th parameter had an illegal value.F)copy)(r   _assert_cupy_array_assert_stacked_2d_assert_stacked_squarendimshape
ValueErrorlinalg_common_typesizecupyempty	TypeErrorgetattrr   mathproddataptrascontiguousarrayreshape	transposer!   r   warningswarnformatr   get_cublas_handleitemsizearangeuintpnumpyint323_check_cublas_info_array_if_synchronization_allowedCUBLAS_OP_Nctypes__name__r   LinAlgErrorastype)abr    	out_dtypetgetrfgetrsbsnnrhsb_shape
a_data_ptr
b_data_ptrhandleldaa_stepa_arrayldbb_stepb_arraypivotdinfoinfomsgr   r   r   batched_gesv   s   

"
"


 
r[   c                 C     t | |dS )zFinds the (smallest) index of the element with the maximum magnitude.

    Note: The result index is 1-based index (not 0-based index).
    amax	_iamaxminxoutr   r   r   iamaxw      rc   c                 C  r\   )zFinds the (smallest) index of the element with the minimum magnitude.

    Note: The result index is 1-based index (not 0-based index).
    aminr^   r`   r   r   r   iamin   rd   rf   c              
   C  s   | j dkrtd| j | jj}|dkrd}n|dkrd}n|dkr&d}n|dkr-d	}ntd
ttd| | }t	 }d}t
|||\}}	}
z||| j| jjd| W t||
 nt||
 w |d u rl|	}|S |j|krwt|	| |S )Nr   !x must be a 1D array (actual: {})r   r   r   r   r   r   r   r   i)r%   r'   r7   r    charr,   r-   r   r   r8   _setup_result_ptrr)   r0   r1   setPointerModer   elementwise_copy)ra   rb   namer    rG   funcrP   result_dtype
result_ptrresult	orig_moder   r   r   r_      s6   


r_   c           	   
   C     | j dkrtd| j | jj}|dkrtj}n|dkr!tj}n|dkr)tj}n|dkr1tj	}nt
dt }| }t|||\}}}z||| j| jjd| W t|| nt|| w |du ri|}|S |j|krtt|| |S )	z&Computes the sum of the absolute of x.r   rg   r   r   r   r   r   N)r%   r'   r7   r    ri   r   sasumdasumscasumdzasumr,   r   r8   lowerrj   r)   r0   r1   rk   r   rl   	ra   rb   r    rn   rP   ro   rp   rq   rr   r   r   r   asum   4   


rz   c              
   C  s   t || |jj}|dkrtj}n|dkrtj}n|dkr!tj}n|dkr)tj}ntdt	
 }t|| |\} }}z|||j||jjd|jjd W t|| dS t|| w )z5Computes y += a * x.

    (*) y will be updated.
    r   r   r   r   r   r   N)_check_two_vectorsr    ri   r   saxpydaxpycaxpyzaxpyr,   r   r8   _setup_scalar_ptrr)   r0   r1   rk   )rD   ra   yr    rn   rP   a_ptrrr   r   r   r   axpy   s    
 r   c           
   
   C  s   | j j}|dkrtj}n|dkrtj}n|dv rtdtdt| | t }|}t	|||\}}}	z||| j
| jjd|jjd| W t||	 nt||	 w |du r[|}|S |j |krft|| |S )$Computes the dot product of x and y.r   r   FDz&Use dotu() or dotc() for complex dtyper   r   N)r    ri   r   sdotddotr,   r|   r   r8   rj   r)   r0   r1   rk   r   rl   
ra   r   rb   r    rn   rP   ro   rp   rq   rr   r   r   r   dot   s.   

 
r   c           
   
   C     | j j}|dv rt| ||dS |dkrtj}n|dkrtj}ntdt| | t	 }|}t
|||\}}}	z||| j| jjd|jjd| W t||	 nt||	 w |du r^|}|S |j |krit|| |S )r   fdrb   r   r   r   r   N)r    ri   r   r   cdotuzdotur,   r|   r   r8   rj   r)   r0   r1   rk   r   rl   r   r   r   r   dotu  .   

 
r   c           
   
   C  r   )z+Computes the dot product of x.conj() and y.r   r   r   r   r   r   N)r    ri   r   r   cdotczdotcr,   r|   r   r8   rj   r)   r0   r1   rk   r   rl   r   r   r   r   dotc  r   r   c           	   
   C  rs   )	z(Computes the Euclidean norm of vector x.r   rg   r   r   r   r   r   N)r%   r'   r7   r    ri   r   snrm2dnrm2scnrm2dznrm2r,   r   r8   rx   rj   r)   r0   r1   rk   r   rl   ry   r   r   r   nrm2;  r{   r   c              
   C  s   |j dkrtd|j |jj}|dkrtj}n|dkr!tj}n|dkr)tj}n|dkr1tj	}nt
dt }t|| |\} }}z|||j||jjd W t|| dS t|| w )	z1Computes x *= a.

    (*) x will be updated.
    r   rg   r   r   r   r   r   N)r%   r'   r7   r    ri   r   sscaldscalcscalzscalr,   r   r8   r   r)   r0   r1   rk   )rD   ra   r    rn   rP   r   rr   r   r   r   scal\  s"   
r   c                 C  sx   | j dkrtd| j |j dkrtd|j | j|jkr*td| j|j| j|jkr:td| j|jd S )Nr   rg   z!y must be a 1D array (actual: {})z1x and y must be the same size (actual: {} and {})z2x and y must be the same dtype (actual: {} and {}))r%   r'   r7   r)   r    r,   )ra   r   r   r   r   r|   x  s   

r|   c                 C  s   t | }|d u st|tjr.|d u s|j|kr tjg |d}n|}|jj}t 	| t j
 n%t|tjrO|j|krAtjg |d}n|}|jj}t 	| t j ntd|||fS )Nr   z(out must be either cupy or numpy ndarray)r   getPointerMode
isinstancer*   ndarrayr    r+   r0   r1   rk   CUBLAS_POINTER_MODE_DEVICEr<   r@   CUBLAS_POINTER_MODE_HOSTr,   )rP   rb   r    moderq   rp   r   r   r   rj     s   


rj   c                 C  sL   t ||\}}t| }t|tjrt| tj nt| tj |||fS r	   )	_get_scalar_ptrr   r   r   r*   r   rk   r   r   )rP   rD   r    r   r   r   r   r   r     s   

r   c                 C  sh   t | tjr| j|krtj| |d} | jj}| |fS t | tjr%| j|ks,tj| |d} | jj}| |fS )Nr   )	r   r*   r   r    arrayr0   r1   r<   r@   )rD   r    r   r   r   r   r     s   
r   c                 C  s  |j j}|dkrtj}n|dkrtj}n|dkrtj}n|dkr$tj}ntd|jdks/J |j|j  kr<dks?J  J |j |j   krM|j ksPJ  J |j	\}}	t
| } | tjkrd|	|}
}n||	}
}|j	d |
ksrJ |j	d |ks{J t||j \}}t||j \}}t }t|}t|tjst|tjrt|tjst|}|jj}t|tjst|}|jj}t|tj nt|tj z{|jr||| ||	||jj||jjd||jjd nP|jr| tjkr| tjkrtj} ntj} ||| |	|||jj|	|jjd||jjd n-|jdd	}||| ||	||jj||jjd||jjd W t|| d
S W t|| d
S W t|| d
S t|| w )zComputes y = alpha * op(a) @ x + beta * y

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.

    Note: ''y'' will be updated.
    r   r   r   r   r   r   r   r   orderN)r    ri   r   sgemvdgemvcgemvzgemvr,   r%   r&   _trans_to_cublas_opr?   r   r   r8   r   r   r*   r   r   r0   r1   rk   r   r   _f_contiguous_c_contiguousCUBLAS_OP_CCUBLAS_OP_Tr!   )transaalpharD   ra   betar   r    rn   mrK   xlenylen	alpha_ptrbeta_ptrrP   rr   r   r   r   gemv  sl    "









r   c                 C  s  |j j}|dkrtj}n|dkrtj}n|dv rtdtd|jdks'J |j|j  kr4dks7J  J |j |j   krE|j ksHJ  J |j\}}|jd |ksVJ |jd |ks_J t	 }t
|| |\} }	}
|jj|jj}}z\|jr|||||	|d|d|jj|
 n7|jr|||||	|d|d|jj|
 n-|jd	d
}|||||	|d|d|jj|
 t|| W t||
 dS W t||
 dS W t||
 dS t||
 w )DComputes a += alpha * x @ y.T

    Note: ''a'' will be updated.
    r   r   r   z#Use geru or gerc for complex dtypesr   r   r   r   r   r   N)r    ri   r   sgerdgerr,   r%   r&   r   r8   r   r0   r1   r   r   r!   r   rl   rk   r   ra   r   rD   r    rn   r   rK   rP   r   rr   x_ptry_ptraar   r   r   ger  s<    "
  r   c                 C  s  |j j}|dv rt| |||S |dkrtj}n|dkrtj}ntd|jdks*J |j|j  kr7dks:J  J |j |j   krH|j ksKJ  J |j\}}|jd |ksYJ |jd |ksbJ t	
 }t|| |\} }	}
|jj|jj}}z\|jr|||||	|d|d|jj|
 n7|jr|||||	|d|d|jj|
 n-|jdd}|||||	|d|d|jj|
 t|| W t||
 d	S W t||
 d	S W t||
 d	S t||
 w )
r   r   r   r   r   r   r   r   r   N)r    ri   r   r   cgeruzgerur,   r%   r&   r   r8   r   r0   r1   r   r   r!   r   rl   rk   r   r   r   r   geru  s<    "
  r   c                 C  s  |j j}|dv rt| |||S |dkrtj}n|dkrtj}ntd|jdks*J |j|j  kr7dks:J  J |j |j   krH|j ksKJ  J |j\}}|jd |ksYJ |jd |ksbJ t	
 }t|| |\} }	}
|jj|jj}}z@|jr|||||	|d|d|jj|
 n$|jdd}|||||	|d|d|jj|
 t|| W t||
 d	S W t||
 d	S t||
 w )
zKComputes a += alpha * x @ y.T.conj()

    Note: ''a'' will be updated.
    r   r   r   r   r   r   r   r   N)r    ri   r   r   cgerczgercr,   r%   r&   r   r8   r   r0   r1   r   r!   r   rl   rk   r   r   r   r   gerc<  s4    "
 r   Fc                 C  s  |j j}|dkrtj}n|dkrtj}ntd|jdksJ |j|j  kr,dks/J  J |j |j   kr=|j ks@J  J |j\}	}
|jd |
ksNJ |jd |
ksWJ |js`|j	dd}t
||j \}}t
||j \}}t }t|}t|tjst|tjrt|tjst|}|jj}t|tjst|}|jj}t|tj nt|tj |rtj}ntj}t }z||||
| ||jj|	|jjd||jjd W t|| |S t|| w )	z)Computes y = alpha*A @ x + beta * y

    r   r   zComplex dtypes not supportedr   r   r   r   r   )r    ri   r   ssbmvdsbmvr,   r%   r&   r   r!   r   r   r8   r   r   r*   r   r   r0   r1   rk   r   r   CUBLAS_FILL_MODE_LOWERCUBLAS_FILL_MODE_UPPER)kr   rD   ra   r   r   rx   r    rn   r   rK   r   r   rP   rr   uplor   r   r   sbmv_  sN    "





r   c                 C  sb   | dks	| t jkrt j} | S | dks| t jkrt j} | S | dks%| t jkr*t j} | S td| )NNTHzinvalid trans (actual: {}))r   r?   r   r   r,   r7   )transr   r   r   r     s   r   c                 C  sL   d }|t jt jfv r"| jr| jd }||fS | jr"| jd }d| }||fS )Nr   r   )r   r?   r   r   r&   r   )rD   r   ldr   r   r   _decide_ld_and_trans  s   

r   c                 C  s,   |d u r| j d }| js| jdd} | |fS )Nr   r   r   )r&   r   r!   )rD   rQ   r   r   r   _change_order_if_necessary  s
   
r         ?        c                 C  sj  |j |j   krdksJ  J |j|jksJ |jj}|dkr$tj}n|dkr,tj}n|dkr4tj}n|dkr<tj}ntdt	| } t	|}| tj
krS|j\}	}
n|j\}
}	|tj
krl|jd }|jd |
kskJ n|jd }|jd |
kszJ |d	u rtj|	|f|dd
}d}n|j dksJ |j|	|fksJ |j|ksJ t||j\}}t||j\}}t }t|}t|tjst|tjrt|tjst|}|jj}t|tjst|}|jj}t|tj nt|tj t|| \}} t||\}}|d	u sh|d	u sh|jr8z ||| ||	||
||jj||jj|||jj|	 W t|| |S t|| w |jrhz$||d| d|  ||	|
||jj||jj|||jj| W t|| |S t|| w t||\}}t||\}}|}|js|jdd}z||| ||	||
||jj||jj|||jj|	 W t|| nt|| w |jst || |S )a  Computes out = alpha * op(a) @ op(b) + beta * out

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.
    r   r   r   r   r   r   r   r   Nr    r   r   r   )!r%   r    ri   r   sgemmdgemmcgemmzgemmr,   r   r?   r&   r*   r+   r   r   r8   r   r   r   r   r0   r1   rk   r   r   r   r   r   r   r!   r   rl   )r   transbrD   rE   rb   r   r   r    rn   r   r   rK   r   r   rP   rr   rQ   rT   r   r   r   r   gemm  s    







r   c                 C  sL  |j |j   krdksJ  J |j|jksJ |jj}|dkr$tj}n|dkr,tj}n|dkr4tj}n|dkr<tj}ntdt	| } t	|}| tj
krS|j\}	}
n|j\}
}	|tj
krg|j|	|
fksfJ n	|j|
|	fkspJ |du rtj|	|
f|dd}n|j dksJ |j|	|
fksJ |j|ksJ t||j\}}t||j\}}t }t|}t|tjst|tjrt|tjst|}|jj}t|tjst|}|jj}t|tj nt|tj t|| \}} t||\}}|du sZ|du sZ|jr+z||| ||	|
||jj|||jj||jj|	 W t|| |S t|| w |jrZz#||d	|  d	| |
|	||jj|||jj||jj|
 W t|| |S t|| w t||\}}t||\}}|}|jst|jdd
}z||| ||	|
||jj|||jj||jj|	 W t|| nt|| w |jst || |S )zComputes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.
    r   r   r   r   r   r   Nr   r   r   )!r%   r    ri   r   sgeamdgeamcgeamzgeamr,   r   r?   r&   r*   r+   r   r   r8   r   r   r   r   r0   r1   rk   r   r   r   r   r   r   r!   r   rl   )r   r   r   rD   r   rE   rb   r    rn   r   rK   r   r   rP   rr   rQ   rT   r   r   r   r   geam  s    





r   r   c                 C  s0  |j dksJ d|j   krdksJ  J |j|jksJ |jj}|dkr*tj}n|dkr2tj}n|dkr:tj}n|dkrBtj}ntd| dksO| tj	krStj	} n| d	ks\| tj
kr`tj
} ntd
| |j\}}| tj	kr|j|d t| d ksJ n|j|d t| d ksJ |du r|jrd}	nd}	tj||f||	d}n|j dksJ |j|jksJ |j|jksJ t }
|jr|js|jdd}||
d|  |||jj||jj||jj|
 |S |js|jdd}|}|js|jdd}||
| |||jj||jj||jj|
 |jst|| |S )znComputes diag(x) @ a or a @ diag(x)

    Computes diag(x) @ a if side is 'L', a @ diag(x) if side is 'R'.
    r   r   r   r   r   r   r   LRzinvalid side (actual: {})r   NCr   r   )r%   r    ri   r   sdgmmddgmmcdgmmzdgmmr,   CUBLAS_SIDE_LEFTCUBLAS_SIDE_RIGHTr'   r7   r&   r)   absr   r*   r+   r   r8   r!   r0   r1   r   r   rl   )siderD   ra   rb   incxr    rn   r   rK   r   rP   r   r   r   r   dgmmh  sb   

 r   c                 C  s  |j dksJ |jj}|dkrtj}n|dkrtj}n|dkr#tj}n|dkr+tj}ntdt	| } | tj
kr>|j\}}	n|j\}	}|du rTtj||f|dd}d	}n|j dks[J |j||fksdJ |j|kskJ |rqtj}
ntj}
t||j\}}t||j\}}t }t|}t|tjst|tjrt|tjst|}|jj}t|tjst|}|jj}t|tj nt|tj t|| \}} t|| \}}|jr|js|jd
d}d|  } |jd }z||d|
 | ||	||jj|||jj| W t|| |S t|| w |js"|jdd}|jd }d|  } |}|js.|jdd}z|||
| ||	||jj|||jj| W t|| nt|| w |jsX||d< |S )a"  Computes out := alpha*op1(a)*op2(a) + beta*out

    op1(a) = a if trans is 'N', op2(a) = a.T if transa is 'N'
    op1(a) = a.T if trans is 'T', op2(a) = a if transa is 'T'
    lower specifies  whether  the  upper  or  lower triangular
    part  of the  array  out  is to be  referenced
    r   r   r   r   r   r   Nr   r   r   r   r   r   .) r%   r    ri   r   ssyrkdsyrkcsyrkzsyrkr,   r   r?   r&   r*   zerosr   r   r   r   r8   r   r   r   r   r0   r1   rk   r   r   r   r   r!   r   )r   rD   rb   r   r   rx   r    rn   rK   r   r   r   r   rP   rr   rQ   ldo_r   r   r   r   syrk  s   










r   r	   )F)Nr   r   )Nr   )Nr   r   F),
__future__r   r.   r<   r   r5   r*   r   cupy_backends.cuda.libsr   	cupy.cudar   cupy.linalgr   r   r   r   r[   rc   rf   r_   rz   r   r   r   r   r   r   r|   rj   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   sP    
Z

!!



!
A&%
#5

]
X<