o
    پi
5                     @   sd   d dl Z d dlmZ d dlmZmZ d dlmZmZ d dl	m
Z
 d dlmZ dZG dd	 d	eZdS )
    N)BaseLoRABackend) chunked_sgmv_lora_expand_forward chunked_sgmv_lora_shrink_forward)LoRABatchInfogenerate_sequence_lengths)ForwardBatch)
ServerArgs   c                       sX  e Zd ZdZdZdedejdef fddZ	dej
d	ej
d
ej
fddZ	d.dej
d	ej
dej
dej
d
ej
f
ddZ	d.dej
dej
dej
dej
dedej
d
ej
fddZ	d.dej
dej
dej
dej
dej
d
ej
fddZded
efddZdedefd d!Zded"ee d#ee d$ee d%ef
d&d'Zedefd(d)Zd*ej
d+efd,d-Z  ZS )/ChunkedSgmvLoRABackenda  
    Chunked LoRA backend using segmented matrix-vector multiplication.

    This backend is largely based on the SGMV (Segmented Gather Matrix-Vector multiplication) algorithm
    introduced in the Punica paper (https://arxiv.org/pdf/2310.18547). One main variation made here is to
    segment the input sequences into fixed-size chunks, which reduces excessive kernel launches especially
    when the LoRA distribution is skewed.
    csgmvmax_loras_per_batchdeviceserver_argsc                    s   t  || |j| _d S N)super__init__max_lora_chunk_sizemax_chunk_size)selfr   r   r   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/lora/backend/chunked_backend.pyr      s   zChunkedSgmvLoRABackend.__init__xweightsreturnc                 O   s   t ||| jddS )N   r   r   
batch_info
num_slices)r   r   )r   r   r   argskwargsr   r   r   run_lora_a_sgemm$   s   z'ChunkedSgmvLoRABackend.run_lora_a_sgemmNoutput_offsetbase_outputc           	      O   s$   |j d }|}t||| j|||dS )Nr   r   r   slice_offsetsmax_slice_sizer$   )shaper   r   )	r   r   r   r#   r$   r    r!   
output_dimr(   r   r   r   run_lora_b_sgemm.   s   

z'ChunkedSgmvLoRABackend.run_lora_b_sgemm
qkv_lora_a
qkv_lora_bmax_qkv_out_dimc                 O   s<   t |tjsJ t||| jdd}	t|	|| j|||d}
|
S )N   r   r&   )
isinstancetorchTensorr   r   r   )r   r   r,   r-   r#   r.   r$   r    r!   lora_a_outputlora_outputr   r   r   run_qkv_loraC   s    z#ChunkedSgmvLoRABackend.run_qkv_loragate_up_lora_agate_up_lora_bc                 O   sJ   t |tjsJ |jd d }t||| jdd}	t|	|| j|||d}
|
S )Nr%      r   r&   )r0   r1   r2   r)   r   r   r   )r   r   r6   r7   r#   r$   r    r!   r*   r3   r4   r   r   r   run_gate_up_lorad   s"   z'ChunkedSgmvLoRABackend.run_gate_up_loraforward_batchc                 C   sP   | j tkrtS |j r|jn|j}|dkrd}n	|dkr d}nd}t| j |S )a  
        Heuristically determine the chunk size based on token token number in a batch.

        Args:
            forward_batch (ForwardBatch): The batch information containing sequence lengths.

        Returns:
            The determined chunk size
              @       r	   )r   MIN_CHUNK_SIZEforward_mode	is_extendextend_num_tokens
batch_sizemin)r   r:   
num_tokens
chunk_sizer   r   r   _determine_chunk_size   s   
z,ChunkedSgmvLoRABackend._determine_chunk_sizemax_bs_in_cuda_graphnum_tokens_per_bsc                 C   s   |t  d t  | }|| }td@ t|dtj|tjdtj|d tjdtj|tjdtj|tjdtj| jtjdtj| jtjdd d d
| _W d    d S 1 sVw   Y  d S )Nr   cudaTdtype)
bsuse_cuda_graphseg_lens
seg_indptrweight_indicespermutation
lora_ranksscalingsnum_segmentsmax_len)	r?   r1   r   r   zerosint32r   floatcuda_graph_batch_info)r   rH   rI   max_num_segmentsmax_num_tokensr   r   r   init_cuda_graph_batch_info   s$   
"z1ChunkedSgmvLoRABackend.init_cuda_graph_batch_inforQ   rS   rT   rN   c                 C   s  |  |}tj||d\}}| j||d\}	}
t|	}tj|tjddd}tj|tjddd}|stt	|j
||dtj|d ftj| jdtj|ftj| jdtj| jftj| jdtj| jftj| jdtjt|ftj| jdd d	
}n| j}|j
|_||_||_|jd | j j|dd
 |jd | j j|dd
 |jd | j|	dd
 |jd |d  j|
dd
 |jd t| j|dd
 || _d S )N)seq_weight_indicesr:   )weights_reorderedrF   Tcpu)rL   
pin_memoryr   Fr   )rL   r   )
rM   rU   rV   rN   rP   rQ   rS   rT   rR   rO   )non_blocking)rG   r
   _get_permutation_get_segments_infolenr1   tensorrX   rY   r   rC   emptyr   r   rZ   rM   rU   rV   rS   copy_rT   rQ   rP   rR   r   )r   r:   rQ   rS   rT   rN   rF   rR   weight_indices_reorderedseg_weight_indicesrP   rU   lora_ranks_tensorscalings_tensorr   r   r   r   prepare_lora_batch   sn   





z)ChunkedSgmvLoRABackend.prepare_lora_batchc                 C   s   t d6 t j| t jd} t|}t | |}t jt|ft jdd}t j	|d|d || }||fW  d   S 1 s>w   Y  dS )a  
        Computes permutation indices for reordering tokens by their LoRA adapter assignments.

        This function implements the "gather" step in Chunked Segmented Gather Matrix Vector
        multiplication by creating a permutation that groups tokens by their LoRA adapter.
        Tokens using the same LoRA adapter are placed together to enable efficient batched
        computation.

        Example:
            seq_weight_indices = [0, 1, 0]  # 3 sequences using adapters [0, 1, 0]
            extend_seq_lens = [2, 1, 3]     # sequence lengths [2, 1, 3 tokens]

            # Creates row_weight_indices: [0, 0, 1, 0, 0, 0] (6 tokens total)
            # Returns permutation: [0, 1, 3, 4, 5, 2] (groups adapter 0 tokens together)
            # weights_reordered: [0, 0, 0, 0, 0, 1] (sorted by adapter)

        Args:
            seq_weight_indices: List of LoRA adapter indices for each sequence
            forward_batch (ForwardBatch): Batch information containing sequence lengths

        Returns:
            tuple: (permutation, weights_reordered) where:
                - permutation: Token reordering indices to group by adapter
                - weights_reordered: Sorted adapter indices for each token
        r`   rK   TrL   ra   )stableoutN)
r1   r   rf   rX   r   repeat_interleaverg   re   longargsort)r^   r:   seg_lens_cpurow_weight_indicesrR   r_   r   r   r   rc     s   $z'ChunkedSgmvLoRABackend._get_permutationr_   rF   c                 C   s  t d| t j|dd\}}g }g }t||D ]/\}}| }|| d | }	|| g|	  ||g|	d   |||	d |   qt j|t jd}
t j|t jdd}t j	t
|
d ft jdd}d|d< t j|
dd|dd	< ||fW  d	   S 1 sw   Y  d	S )
a  
        Computes segment information for chunked SGMV operations.

        This function takes the reordered weight indices and creates segments of fixed size
        (self.segment_size) for efficient kernel execution. Each segment contains tokens
        that use the same LoRA adapter, enabling vectorized computation.

        The segmentation is necessary because:
        1. GPU kernels work efficiently on fixed-size blocks
        2. Large groups of tokens using the same adapter are split into manageable chunks
        3. Each segment can be processed independently in parallel

        Example:
            weights_reordered = [0, 0, 0, 0, 0, 1]  # 5 tokens with adapter 0, 1 with adapter 1
            segment_size = 3

            # Creates segments:
            # Segment 0: tokens 0-2 (adapter 0), length=3
            # Segment 1: tokens 3-4 (adapter 0), length=2
            # Segment 2: token 5 (adapter 1), length=1

            # Returns:
            # weight_indices_list: [0, 0, 1] (adapter for each segment)
            # seg_indptr: [0, 3, 5, 6] (cumulative segment boundaries)

        Args:
            weights_reordered (torch.Tensor): Sorted adapter indices for each token
            chunk_size (int): Fixed size for each segment

        Returns:
            tuple: (weight_indices_list, seg_indptr) where:
                - weight_indices_list: LoRA adapter index for each segment
                - seg_indptr: Cumulative segment boundaries (CSR-style indptr)
        r`   T)return_countsr   rK   rn   r   )dimN)r1   r   unique_consecutivezipitemextendappendrf   rX   rg   re   cumsum)r   r_   rF   unique_weightscountsweight_indices_listseg_lens_list
weight_idx	group_lennum_segsrO   rP   r   r   r   rd   +  s.   #
$z)ChunkedSgmvLoRABackend._get_segments_infor   )__name__
__module____qualname____doc__nameintr1   r   r   r   r2   r"   r+   r5   r9   r   rG   r]   listrY   boolrm   staticmethodrc   rd   __classcell__r   r   r   r   r
      s    		



'	
"

I)r
   )r1   $sglang.srt.lora.backend.base_backendr   sglang.srt.lora.triton_opsr   r   sglang.srt.lora.utilsr   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.server_argsr   r?   r
   r   r   r   r   <module>   s    