o
    پi                     @   s6   d dl mZmZ d dlZd dlmZ G dd dZdS )    )TupleUnionN)ForwardBatchc                   @   sH  e Zd ZdZdedejfddZ	d)dejdejd	ed
ejdejf
ddZ	dejdejd
ejd	edejf
ddZ
dejdejdejfddZdejdejdejfddZdejdejdeejeej f dejfddZdejdejdeejeej f dejfddZdedefd d!Zd"ed#ee d$ee d%ee d&ef
d'd(ZdS )*BaseLoRABackenda<  Base class for different Lora backends.
       Each backend has its own implementation of Lora kernels.

    Args:
        max_loras_per_batch: maximum number of different lora weights
                             that can be applied in a single forward batch.
        device: the device where the backend runs.
    max_loras_per_batchdevicec                 C   s   || _ || _d S N)r   r   )selfr   r    r
   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/lora/backend/base_backend.py__init__   s   
zBaseLoRABackend.__init__N	input_idsweights
vocab_sizeextra_embeddingsreturnc                 O      dS )a=  Run LoRA A embedding lookup with CUDA graph support.

        Args:
            input_ids: token IDs with shape (s,), where s is the sum of all sequence lengths
            weights: LoRA A embedding weights with shape (num_loras, rank, vocab_size)
            vocab_size: base vocabulary size (tokens >= vocab_size are extra tokens)
            extra_embeddings: extra token embeddings with shape (num_loras, num_extra_tokens, rank)
            Only needed if there are added tokens beyond base vocabulary.

        Returns:
            result with shape (s, rank)
        Nr
   )r	   r   r   r   r   argskwargsr
   r
   r   run_lora_a_embedding      z$BaseLoRABackend.run_lora_a_embeddingoutputc                 O   s   t )ay  
        Apply extra token embeddings to output in-place.

        Args:
            input_ids: (s,) token IDs
            output: (s, embed_dim) output tensor to be modified
            extra_embeddings: (num_loras, num_extra_tokens, embed_dim) extra embeddings
            vocab_size: base vocabulary size

        Returns:
            output: modified output tensor
        )NotImplementedError)r	   r   r   r   r   r   r   r
   r
   r   run_extra_token_embedding-   r   z)BaseLoRABackend.run_extra_token_embeddingxc                 O   r   )ad  Run segment Gemm of lora a modules with current backend.
        The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html.

        Args:
             x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
             weights: a set of lora weights with shape (num_lora, c * r, input_dim),
                      here r is lora rank, c is a multiplier for stacked modules (e.g., c=3 for qkv_proj, c=2 for gate_up_proj)
                      usually input_dim is much larger than r
        Returns:
             result with shape (s, c * r)
        Nr
   r	   r   r   r   r   r
   r
   r   run_lora_a_sgemmD   s   z BaseLoRABackend.run_lora_a_sgemmc                 O   r   )a  Run segment Gemm of lora b modules with current backend.
        The definition of segment Gemm can be referred to https://docs.flashinfer.ai/api/gemm.html.

        Args:
             x: input matrix with shape (s, r), here s is the sum of all sequence lengths, r is lora rank
             weights: a set of lora weights with shape (num_lora, output_dim, r)
                      usually output_dim is much larger than r
        Returns:
             result with shape (s, output_dim)
        Nr
   r   r
   r
   r   run_lora_b_sgemmT   s   z BaseLoRABackend.run_lora_b_sgemm
qkv_lora_a
qkv_lora_bc                 O   r   )a  Run the lora pass for QKV Layer.

        Args:
            x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
            qkv_lora_a: lora_a module for qkv, with shape (num_lora, 3 * r, input_dim)
            qkv_lora_b: lora_b module for qkv.
                        If passed in as a tensor, its shape should be (num_lora,output_dim_q + 2 * output_dim_kv, r)
                        If passed in as a tuple of two tensors, it should contain:
                           a lora_b module for q, with shape (1, num_lora, output_dim_q, r)
                           and a combined lora_b module for kv, with shape (2, num_lora, output_dim_kv, r)
        Returns:
            result with shape (s, output_dim_q + 2 * output_dim_kv)
        Nr
   )r	   r   r   r   r   r   r
   r
   r   run_qkv_lorac   r   zBaseLoRABackend.run_qkv_loragate_up_lora_agate_up_lora_bc                 O   r   )a}  Run the lora pass for gate_up_proj, usually attached to MergedColumnParallelLayer.

        Args:
            x: input matrix with shape (s, input_dim), here s is the sum of all sequence lengths
            gate_up_lora_a: lora_a module for gate_up_proj, with shape (num_lora, 2 * r, input_dim)
            gate_up_lora_b: lora_b module for qkv.
                        If passed in as a tensor, its shape should be (num_lora, 2 * output_dim, r)
                        If passed in as a tuple, it should contain two tensors with shape (num_lora, output_dim, r)
        Returns:
            result with shape (s, 2 * output_dim)
        Nr
   )r	   r   r!   r"   r   r   r
   r
   r   run_gate_up_loraz   s   z BaseLoRABackend.run_gate_up_loramax_bs_in_cuda_graphnum_tokens_per_bsc                 C   r   )a  Initialize the batch info for CUDA Graph mode.

        This method provides a hook for each backend to conduct its own initialization
        logic for CUDA Graph mode.

        Args:
            cuda_graph_batch_info: the LoRABatchInfo object created in LoraManager
            max_bs_in_cuda_graph: maximum batch size for CUDA Graph mode
            num_tokens_per_bs: number of tokens per sequence (1 for decoding, >1 for target_verify)
        Nr
   )r	   r$   r%   r
   r
   r   init_cuda_graph_batch_info   s   z*BaseLoRABackend.init_cuda_graph_batch_infoforward_batchweight_indices
lora_ranksscalingsuse_cuda_graphc                 C   r   )a[  Prepare the lora weights and batch info for current forward batch.

        This method provides a hook for each backend to conduct its own preparation
        logic for each forward batch.

        Args:
            forward_batch: the ForwardBatch object for current forward pass
            weight_indices: list of indices of lora weights to be applied for current batch
            lora_ranks: list of lora ranks corresponding to weight_indices
            scalings: list of scaling factors corresponding to weight_indices
            use_cuda_graph: whether to use CUDA Graph for this batch
        Nr
   )r	   r'   r(   r)   r*   r+   r
   r
   r   prepare_lora_batch   s   z"BaseLoRABackend.prepare_lora_batchr   )__name__
__module____qualname____doc__inttorchr   r   Tensorr   r   r   r   r   r   r    r#   r&   r   listfloatboolr,   r
   r
   r
   r   r      s    		






r   )typingr   r   r2   ,sglang.srt.model_executor.forward_batch_infor   r   r
   r
   r
   r   <module>   s    