o
    پi'                     @   sb   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ e r'd dl	Z	d dl
Z
G dd deZdS )    N)BaseLoRABackend)LoRABatchInfo)ForwardBatch)is_npuc                       s  e Zd ZdZdedejf fddZdejdejdejfd	d
Z		d%dejdejdejdejfddZ
	d%dejdejdejdejdejdedejdejfddZ	d%dejdejdejdejdejf
ddZdedefddZdedee d ee d!ee d"ef
d#d$Z  ZS )&AscendLoRABackendascendmax_loras_per_batchdevicec                    s   t  || d S N)super__init__)selfr   r	   kwargs	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/lora/backend/ascend_backend.pyr      s   zAscendLoRABackend.__init__xweightsreturnc           
      O   s   |j \}}|j \}}}tj||f|j|jd}tjj||| jj	| jj
|d | jjd| jj	j| jj
|dd}	||	9 }|S )Ndtyper	         ?r   output_size)shapetorchzerosr   r	   opsnpusgmv_shrink
batch_infoweight_indicesseg_lensscalingsgatherrepeat_interleave	unsqueeze)
r   r   r   argsr   total_seq_len_weight_out_dimoutput_tensorscalingr   r   r   run_lora_a_sgemm   s(   
	
z"AscendLoRABackend.run_lora_a_sgemmNbase_outputc           
   	   O   sb   |j \}}|j \}}}|d u rtj||f|j|jd}	n|}	tjj||| jj	| jj
|	d| |	S )Nr	   r   r   )r   r   r   r	   r   r   r    sgmv_expandr"   r#   r$   )
r   r   r   r0   r)   r   r*   r+   r,   r-   r   r   r   run_lora_b_sgemm3   s"   

z"AscendLoRABackend.run_lora_b_sgemm
qkv_lora_a
qkv_lora_boutput_offsetoutput_offset_cpumax_qkv_out_dimc              
   O   sF  d}
t |tjs
J |j\}}|j\}}}|j\}}}||
 }|d u r0tj||f|j|jd}n|}tj|||j|jd}tjj	||| j
j| j
j|d | j
jd| j
jj| j
j|dd}||9 }t|
D ]7}|| }||d  }|| }tjj|d d || ||d  f |d d ||f | j
j| j
j||| qi|S )	N   r1   r   r   r   r   r      
isinstancer   Tensorr   r   r	   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   ranger2   )r   r   r4   r5   r6   r7   r8   r0   r)   r   
num_slicesr*   r+   weight_intermediate_dimr,   max_rankr-   lora_a_outputr.   slice_idslice_offsetslice_offset_next
slice_sizer   r   r   run_qkv_loraQ   sT   



zAscendLoRABackend.run_qkv_loragate_up_lora_agate_up_lora_bc              
   O   sB  d}t |tjs
J |j\}}	|j\}	}
}	|j\}	}}	|| }|
| }|d u r4tj||f|j|jd}n|}tj||
|j|jd}tjj	||| j
j| j
j|d | j
jd| j
jj| j
j|dd}||9 }d}t|D ]/}tjj|d d || ||d  f |d d ||| f | j
j| j
j||| ||7 }qo|S )	N   r1   r   r   r   r   r   r:   r;   )r   r   rH   rI   r0   r)   r   r?   r*   r+   r@   r,   rF   rA   r-   rB   r.   rD   rC   r   r   r   run_gate_up_lora   sT   




	z"AscendLoRABackend.run_gate_up_loramax_bs_in_cuda_graphnum_tokens_per_bsc                 C   s   t dR t|dd t j|f|t jdt j|d t jd|t j|t jdt j| jt jdt j| jt jdd d
| _	t j
| j	jd | d| j	jd|d  d W d    d S 1 sZw   Y  d S )Nr    T)r   r:   )
bsuse_cuda_graphnum_segmentsr$   
seg_indptrmax_lenr#   
lora_ranksr%   permutationr   )dimout)r   r	   r   fullint32emptyr   r   floatnpu_graph_batch_infocumsumr$   rQ   )r   rL   rM   r   r   r   init_cuda_graph_batch_info   s*   
"z,AscendLoRABackend.init_cuda_graph_batch_infoforward_batchr#   rS   r%   rO   c                 C   s  t j|t jddd}t j|t jddd}t j|t jddd}|j}	|r8| jd us,J d| j}
|j|
_|j|
_nc|j	 rBt
|jnd}|j	 rL|jn	t j|	t j| jd}t j|	d ft j| jd}t j|dd|dd < t|j|j|d	||t j|	ft j| jdt j| jft j| jdt j| jft j| jdd d

}
|
jd | j j|dd |
jd | j j|dd |
jd |	 j|dd |
| _d S )NTcpu)r   
pin_memoryr	   z(NPU Graph batch info is not initialized.r:   r   r   )rU   F)
rN   rP   rR   rO   r$   rQ   r#   rS   r%   rT   )non_blocking)r   tensorrX   rZ   
batch_sizer[   rN   rP   forward_mode	is_extendmaxextend_seq_lens_cpuextend_seq_lensonesr	   r   r\   r   rY   r   rS   copy_r%   r#   r"   )r   r^   r#   rS   r%   rO   weight_indices_tensorlora_ranks_tensorscalings_tensorrN   r"   rR   r$   rQ   r   r   r   prepare_lora_batch   sj   	




z$AscendLoRABackend.prepare_lora_batchr
   )__name__
__module____qualname__nameintr   r	   r   r=   r/   r3   rG   rK   r]   r   listrZ   boolrn   __classcell__r   r   r   r   r      s    

&
C
=
r   )r   $sglang.srt.lora.backend.base_backendr   sglang.srt.lora.utilsr   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.utilsr   sgl_kernel_npu	torch_npur   r   r   r   r   <module>   s    