"""
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
"""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

import torch

from .utils import compute_meta, convert_mapping

if TYPE_CHECKING:
    from vllm.lora.layers import LoRAMapping


class PunicaWrapperABC(ABC):
    """
    PunicaWrapper ABC.
    """

    @abstractmethod
    def update_metadata(
        self,
        mapping: "LoRAMapping",
        lora_index_to_id: list[int | None],
        max_loras: int,
        vocab_size: int,
        **kwargs,
    ) -> None:
        """
        Update the lora-related metadata
        """
        raise NotImplementedError

    @abstractmethod
    def add_shrink(
        self,
        y: tuple[torch.Tensor, ...] | torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        scale: float,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Performs GEMM for multiple slices of lora_a.
        """
        raise NotImplementedError

    @abstractmethod
    def add_expand(
        self,
        y: torch.Tensor,
        x: tuple[torch.Tensor, ...] | torch.Tensor,
        lora_b_stacked: tuple[torch.Tensor, ...],
        output_slices: tuple[int, ...],
        offset_start: int = 0,
        add_inputs: bool = True,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Performs GEMM for multiple slices of lora_b.
        """
        raise NotImplementedError

    @abstractmethod
    def add_lora_embedding(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        add_inputs: bool = True,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Applies lora specifically for VocabParallelEmbeddingWithLoRA,
        and this layer only requires the expand operation.
        """
        raise NotImplementedError

    @abstractmethod
    def add_lora_linear(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        lora_b_stacked: tuple[torch.Tensor, ...],
        scale: float,
        output_slices: tuple[int, ...],
        *,
        buffer: tuple[torch.Tensor, ...] | None = None,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Applicable to linear-related lora.
        """
        raise NotImplementedError

    @abstractmethod
    def add_lora_logits(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        scale: float,
        *,
        buffer: torch.Tensor | None = None,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Applies lora specifically for LogitsProcessorWithLoRA.
        """
        raise NotImplementedError
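

# ---------------------------------------------------------------------------
# Illustrative reference (not part of vLLM's API): the fused punica kernels
# implement the shrink/expand contract documented in PunicaWrapperABC above.
# The eager sketch below spells that contract out in plain PyTorch so the
# semantics are easy to follow. The function name is invented here, and the
# stacked-weight layout, (max_loras, 1, rank, in_features) for lora_a and
# (max_loras, 1, slice_size, rank) for lora_b, is an assumption made for the
# illustration; real implementations batch this work on the GPU instead of
# looping per token.
# ---------------------------------------------------------------------------
def _reference_shrink_expand(
    x: torch.Tensor,
    lora_a_stacked: tuple[torch.Tensor, ...],
    lora_b_stacked: tuple[torch.Tensor, ...],
    token_lora_indices: torch.Tensor,
    output_slices: tuple[int, ...],
    scale: float,
) -> torch.Tensor:
    """Eager-mode sketch of add_shrink followed by add_expand."""
    num_tokens = x.shape[0]
    y = torch.zeros(num_tokens, sum(output_slices), dtype=x.dtype, device=x.device)
    offset = 0
    for i, slice_size in enumerate(output_slices):
        for t in range(num_tokens):
            lora_idx = int(token_lora_indices[t])
            if lora_idx == -1:
                # An index of -1 means no lora should be applied to this token.
                continue
            a = lora_a_stacked[i][lora_idx, 0]  # (rank, in_features)
            b = lora_b_stacked[i][lora_idx, 0]  # (slice_size, rank)
            # Shrink: project the hidden state down to the LoRA rank.
            buffer = (a @ x[t]) * scale
            # Expand: project back up into this slice of the output.
            y[t, offset : offset + slice_size] += b @ buffer
        offset += slice_size
    return y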


class PunicaWrapperBase(PunicaWrapperABC):
    """
    PunicaWrapperBase is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the punica kernel.
    """

    def __init__(
        self,
        max_num_batched_tokens: int,
        max_batches: int,
        device: torch.device | str,
        **kwargs,
    ):
        self._token_lora_indices = torch.empty(
            max_num_batched_tokens, dtype=torch.long, device=device
        )
        self._sampler_indices = torch.empty(
            max_num_batched_tokens, dtype=torch.long, device=device
        )
        self._sampler_indices_padded = torch.empty(
            max_num_batched_tokens, dtype=torch.long, device=device
        )
        self._embeddings_indices = torch.empty(
            2, max_num_batched_tokens, dtype=torch.long, device=device
        )

        # 4 is the number of indices tensors: base_indices, sampler_indices,
        # sampler_indices_padded and embeddings_indices.
        self.indices_len: list[int | None] = [None] * 4
        # These attributes are the information required for the sgmv kernel.
        self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, device=device)
        self._seq_lengths = torch.empty(max_batches, dtype=torch.long, device=device)
        self._lora_indices_per_batch = torch.empty(
            max_batches, dtype=torch.long, device=device
        )
        self.device = device
        self.max_length: int = 0
        self.token_nums: int = 0
        self.batch_size: int = -1
        self.is_prefill = False
        self.no_lora = False

    def _update_base_metadata(
        self,
        mapping: "LoRAMapping",
        lora_index_to_id: list[int | None],
        max_loras: int,
        vocab_size: int,
    ):
        # Extra-vocab padding is no longer used, so convert_mapping always
        # receives 0 here.
        extra_vocab_size = 0
        (
            base_indices,
            sampler_indices,
            sampler_indices_padded,
            embeddings_indices,
            indices_len,
        ) = convert_mapping(
            mapping,
            lora_index_to_id,
            max_loras,
            vocab_size,
            extra_vocab_size,
            self.device,
        )
        self._token_lora_indices[: base_indices.shape[0]].copy_(base_indices)
        self._sampler_indices[: sampler_indices.shape[0]].copy_(sampler_indices)
        self._sampler_indices_padded[: sampler_indices_padded.shape[0]].copy_(
            sampler_indices_padded
        )
        self._embeddings_indices[
            : embeddings_indices.shape[0], : embeddings_indices.shape[1]
        ].copy_(embeddings_indices)
        self.indices_len[:] = indices_len

    def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None:
        (
            b_seq_start_tensor,
            seq_length_tensor,
            lora_indices_tensor,
            batch_size,
            max_length,
            token_nums,
            no_lora,
        ) = compute_meta(token_lora_tensor)

        self._seq_start_locs[: b_seq_start_tensor.shape[0]].copy_(b_seq_start_tensor)
        self._seq_lengths[: seq_length_tensor.shape[0]].copy_(seq_length_tensor)
        self._lora_indices_per_batch[: lora_indices_tensor.shape[0]].copy_(
            lora_indices_tensor
        )
        self.batch_size = batch_size
        self.max_length = max_length
        self.token_nums = token_nums
        self.no_lora = no_lora

    @property
    def prefill_metadata(
        self,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]:
        """
        This property provides a convenient way to access the necessary
        metadata for prefill-related kernel computations.
            1. seq_start_locs: Tensor of sequence start positions.
            2. seq_lengths: Tensor of sequence lengths.
            3. lora_indices_per_batch: Tensor of lora indices, and an index of
                -1 means no lora should be applied.
            4. batch_size: Batch size after clustering identical lora indices.
            5. max_length: The maximum sequence length in the batch.
            6. token_nums: The token numbers in the batch.
        """
        return (
            self._seq_start_locs[: self.batch_size],
            self._seq_lengths[: self.batch_size],
            self._lora_indices_per_batch[: self.batch_size],
            self.batch_size,
            self.max_length,
            self.token_nums,
        )

    @property
    def token_lora_indices(self) -> torch.Tensor:
        """
        This property provides the lora indices corresponding to each token
        in the batch. An index of -1 means no lora should be applied.
        """
        token_lora_len = self.indices_len[0]
        return self._token_lora_indices[:token_lora_len]

    @property
    def sampler_indices(self) -> torch.Tensor:
        """
        This property is used to access the lora indices specifically for
        LogitsProcessorWithLoRA.
        """
        sampler_indices_len = self.indices_len[1]
        return self._sampler_indices[:sampler_indices_len]

    @property
    def sampler_indices_padded(self) -> torch.Tensor:
        """
        This property provides access to padded sampler indices.
        """
        indices_padded_len = self.indices_len[2]
        return self._sampler_indices_padded[:indices_padded_len]

    @property
    def embeddings_indices(self) -> torch.Tensor:
        """
        This property provides access to the indices used for lora embeddings,
        specifically for VocabParallelEmbeddingWithLoRA.
        """
        embeddings_indices_len = self.indices_len[3]
        return self._embeddings_indices[:, :embeddings_indices_len]

    def update_metadata(
        self,
        mapping: "LoRAMapping",
        lora_index_to_id: list[int | None],
        max_loras: int,
        vocab_size: int,
        **kwargs,
    ) -> None:
        self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size)
        if mapping.is_prefill:
            # Update the metadata required by the prefill-related operators.
            self._update_prefill_metadata(self.token_lora_indices)
            self.is_prefill = True
        else:
            self.is_prefill = False

    @abstractmethod
    def add_shrink(
        self,
        y: tuple[torch.Tensor, ...] | torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        scale: float,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Performs GEMM for multiple slices of lora_a.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        """
        raise NotImplementedError

    @abstractmethod
    def add_expand(
        self,
        y: torch.Tensor,
        x: tuple[torch.Tensor, ...] | torch.Tensor,
        lora_b_stacked: tuple[torch.Tensor, ...],
        output_slices: tuple[int, ...],
        offset_start: int = 0,
        add_inputs: bool = True,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            offset = offset_start
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            offset_start (int): The starting position of y, defaults to 0
            add_inputs (bool): Defaults to True.
        """
        raise NotImplementedError

    @abstractmethod
    def add_lora_embedding(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        add_inputs: bool = True,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Applies lora specifically for VocabParallelEmbeddingWithLoRA,
        and this layer only requires the expand operation.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        """
        raise NotImplementedError

    @abstractmethod
    def add_lora_linear(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        lora_b_stacked: tuple[torch.Tensor, ...],
        scale: float,
        output_slices: tuple[int, ...],
        *,
        buffer: tuple[torch.Tensor, ...] | None = None,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
        """
        raise NotImplementedError

    @abstractmethod
    def add_lora_logits(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        scale: float,
        *,
        buffer: torch.Tensor | None = None,
        **kwargs,
    ) -> torch.Tensor | None:
        """
        Applies lora specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]): Default to None.
        """
        raise NotImplementedError

    def moe_lora_align_block_size(
        self,
        topk_ids: torch.Tensor,
        num_tokens: int,
        block_size: int,
        num_experts: int,
        max_loras: int,
        adapter_enabled: torch.Tensor,
        expert_map: torch.Tensor | None = None,
        pad_sorted_ids: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Aligns tokens and experts into block-sized chunks for LoRA-based
        mixture-of-experts (MoE) execution.
        """
        raise NotImplementedError
expert_idsnum_tokens_post_paddedmax_lora_rank	top_k_numfully_shardedoffsettoken_lora_mappingc                 C   r   )zj
        Performs a fused forward computation for LoRA of
        Mixture-of-Experts (MoE) layer.
        """
        raise NotImplementedError
r:   )r2   abcr   r   typingr   r6   utilsr   r   vllm.lora.layersr   r	   r:   r   r   r   r   <module>   s   f