o
    -i                     @   s   d dl mZ d dlZerd dlmZ dejdeejejejeeeef fddZ	dd	d
e
edB  dedededejdeejejejeje
e f fddZdS )    )TYPE_CHECKINGN)LoRAMappingtoken_lora_tensorreturnc           	      C   s   t j| dd\}}t j|dd}t |}|dd |dd  |  }|  }|d}d}|dkr?|dkr?d}|||||||fS )	at  
    Get the information required for the sgmv kernel. With the  features:
    1. If consecutive requests in the batch use the same LoRA, this function
    will combine them into a single request, improving sgmv kernel inference
    performance.
    2. At the beginning of each prefill stage inference, recalculations are
    needed based on the input, but only once.
    T)return_countsr   )dim   NF)	torchunique_consecutivecumsum
zeros_likecopy_maxitemsumsize)	r   lora_indices_tensorseq_length_tensor
cum_resultb_seq_start_tensor
max_length
token_nums
batch_sizeno_lora r   [/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/lora/punica_wrapper/utils.pycompute_meta   s(   


r   mappingr   lora_index_to_id	max_loras
vocab_sizeextra_vocab_sizedevicec                    sl  t | j }| }| } fdd| jD }	d}
tt|D ]!}|| dkr0 || nd}
|| dkr:|
nd||< |
||< q!|||g}tj|tj	|d}tj|	tj	|d}t
|d | |d ||  g}t|dk|d |}|d }|}| }t|dk|d |}tjdt||tj	d	|t|  }|jd |jd |jd |jd g}|||||fS )
ax  Converts LoRAMapping to index tensors.

    Args:
        mapping: LoRAMapping mapping rows in a batch to LoRA ids.
        lora_index_to_id: List mapping LoRA ids to LoRA indices.
        max_loras: Maximum number of LoRAs.
        vocab_size: Model vocab size.
        extra_vocab_size: Extra vocab size each LoRA can have.

    Returns:
        A tuple of tensors:
            base_indices: Tensor of shape [batch_size] mapping batch rows to
                LoRA indices.
            sampler_indices: Tensor of shape [batch_size] mapping requests to
                LoRA indices for sampler. For generation, this will be the
                same as base_indices. For prefill, this will map requests
                to LoRA indices.
            sampler_indices_padded: Tensor of shape [batch_size] mapping
                requests to LoRA indices for sampler with padding.
                Same as sampler_indices, but -1 is replaced with
                max_loras.
            embeddings_indices: Tensor of shape [2, batch_size] mapping
                requests to embedding indices. First row is for embeddings
                added by the LoRAs, second row is for the LoRA.lora_a
                embeddings.
            indices_len: List of lengths of the above tensors. It contains
                (base_indices, sampler_indices, sampler_indices_padded,
                embeddings_indices).
    c                    s"   g | ]}|d kr  |ndqS )r   r	   )index).0xr   r   r   
<listcomp>]   s    z#convert_mapping.<locals>.<listcomp>Nr   r	   )dtyper#      r   )r#   r)   )listindex_mappingcopyprompt_mappingrangelenr$   r
   tensorlongstackwhereclonearangeshape)r   r   r    r!   r"   r#   index_mapping_indicesembedding_indiceslora_indicesr.   lora_idxiindices_listindicesprompt_mapping_tensorembeddings_indicesbase_indicessampler_indicessampler_indices_paddedindices_lenr   r'   r   convert_mapping4   sh   %



rE   )typingr   r
   vllm.lora.layersr   Tensortupleintboolr   r+   r#   rE   r   r   r   r   <module>   s0   
'
