o
    پi                     @   s\   d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
 d dlmZ G dd deZdS )    N)BaseLoRABackend)embedding_lora_a_fwdgate_up_lora_b_fwdqkv_lora_b_fwdsgemm_lora_a_fwdsgemm_lora_b_fwd)LoRABatchInfo)ForwardBatchc                       s6  e Zd ZdZdedejf fddZ	d)dejdejd	ed
ejdejf
ddZ	dejdejdejfddZ
	d)dejdejdejdejfddZ	d)dejdejdejdejdedejdejfddZ	d)dejdejdejdejdejf
ddZdedefd d!Zd"ed#ee d$ee d%ee d&ef
d'd(Z  ZS )*TritonLoRABackendtritonmax_loras_per_batchdevicec                    s   t  || d S N)super__init__)selfr   r   kwargs	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/lora/backend/triton_backend.pyr      s   zTritonLoRABackend.__init__N	input_idsweights
vocab_sizeextra_embeddingsreturnc                 O   s   t ||| j||dS )z0Run LoRA A embedding lookup using Triton kernel.)r   r   
batch_infor   r   )r   r   )r   r   r   r   r   argsr   r   r   r   run_lora_a_embedding   s   
z&TritonLoRABackend.run_lora_a_embeddingxc                 O   s   t ||| jS r   )r   r   )r   r   r   r   r   r   r   r   run_lora_a_sgemm,   s   z"TritonLoRABackend.run_lora_a_sgemmbase_outputc                 O   s   t ||| j|S r   )r   r   )r   r   r   r!   r   r   r   r   r   run_lora_b_sgemm1   s   z"TritonLoRABackend.run_lora_b_sgemm
qkv_lora_a
qkv_lora_boutput_offsetmax_qkv_out_dimc                 O   s:   t |tjsJ t||| jdd}	t|	|| j|||}
|
S )N   	stack_num)
isinstancetorchTensorr   r   r   )r   r   r#   r$   r%   r&   r!   r   r   lora_a_outputlora_outputr   r   r   run_qkv_lora;   s   zTritonLoRABackend.run_qkv_loragate_up_lora_agate_up_lora_bc           
      O   sF   t |tjsJ |jd d }t||| jdd}t||| j||}	|	S )N   r(   )r*   r+   r,   shaper   r   r   )
r   r   r0   r1   r!   r   r   
output_dimr-   r.   r   r   r   run_gate_up_loraW   s   
z"TritonLoRABackend.run_gate_up_loramax_bs_in_cuda_graphnum_tokens_per_bsc                 C   s   t dR t|dd t j|f|t jdt j|d t jd|t j|t jdt j| jt jdt j| jt jdd d
| _t j	| jj
d | d| jjd|d  d W d    d S 1 sZw   Y  d S )NcudaT)dtype   )
bsuse_cuda_graphnum_segmentsseg_lens
seg_indptrmax_lenweight_indices
lora_ranksscalingspermutationr   )dimout)r+   r   r   fullint32zerosr   floatcuda_graph_batch_infocumsumr?   r@   )r   r7   r8   r   r   r   init_cuda_graph_batch_infot   s*   
"z,TritonLoRABackend.init_cuda_graph_batch_infoforward_batchrB   rC   rD   r=   c                 C   s  t j|t jddd}t j|t jddd}t j|t jddd}|j}	|r8| jd us,J d| j}
|j|
_|j|
_nc|j	 rBt
|jnd}|j	 rL|jn	t j|	t j| jd}t j|	d ft j| jd}t j|dd|dd < t|j|j|d	||t j|	ft j| jdt j| jft j| jdt j| jft j| jdd d

}
|
jd | j j|dd |
jd | j j|dd |
jd |	 j|dd |
| _d S )NTcpu)r:   
pin_memoryr   z)CUDA Graph batch info is not initialized.r;   )r:   r   r   )rF   F)
r<   r>   rA   r=   r?   r@   rB   rC   rD   rE   )non_blocking)r+   tensorrI   rK   
batch_sizerL   r<   r>   forward_mode	is_extendmaxextend_seq_lens_cpuextend_seq_lensonesr   rJ   rM   r   emptyr   int64rC   copy_rD   rB   r   )r   rO   rB   rC   rD   r=   weight_indices_tensorlora_ranks_tensorscalings_tensorr<   r   rA   r?   r@   r   r   r   prepare_lora_batch   sj   	




z$TritonLoRABackend.prepare_lora_batchr   )__name__
__module____qualname__nameintr+   r   r   r,   r   r    r"   r/   r6   rN   r	   listrK   boolra   __classcell__r   r   r   r   r
      s    

	


!

r
   )r+   $sglang.srt.lora.backend.base_backendr   sglang.srt.lora.triton_opsr   r   r   r   r   sglang.srt.lora.utilsr   ,sglang.srt.model_executor.forward_batch_infor	   r
   r   r   r   r   <module>   s    