"""
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
"""

from typing import final

import torch

from vllm.lora.layers import LoRAMapping
from vllm.lora.utils import get_captured_lora_counts
from vllm.triton_utils import HAS_TRITON, triton
from vllm.utils.math_utils import round_up

if HAS_TRITON:
    from vllm.lora.ops.triton_ops import (
        LoRAKernelMeta,
        fused_moe_lora,
        lora_expand,
        lora_shrink,
    )

from vllm import _custom_ops as ops

from .punica_base import PunicaWrapperBase


@final
class PunicaWrapperGPU(PunicaWrapperBase):
    """
    PunicaWrapperGPU is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the punica triton kernel.
    """

    def __init__(
        self,
        max_num_batched_tokens: int,
        max_batches: int,
        device: torch.device | str,
        **kwargs,
    ):
        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
                                   device)

        self.lora_config = kwargs["lora_config"]
        self.max_loras = self.lora_config.max_loras
        captured_lora_counts = get_captured_lora_counts(
            self.max_loras, self.lora_config.specialize_active_lora)

        self.token_mapping_meta = LoRAKernelMeta.make(
            self.max_loras,
            max_num_batched_tokens,
            device=device,
            captured_lora_counts=captured_lora_counts)
        self.prompt_mapping_meta = LoRAKernelMeta.make(
            self.max_loras,
            max_batches,
            device=device,
            captured_lora_counts=captured_lora_counts)

    def update_metadata(
        self,
        mapping: LoRAMapping,
        lora_index_to_id: list[int | None],
        max_loras: int,
        vocab_size: int,
        **kwargs,
    ):
        self.is_prefill = mapping.is_prefill
        self._update_base_metadata(mapping, lora_index_to_id, max_loras,
                                   vocab_size)

        self.token_mapping_meta.prepare_tensors(self.token_lora_indices)
        self.prompt_mapping_meta.prepare_tensors(self.sampler_indices)

    def add_shrink(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        scale: float,
        **kwargs,
    ):
        """
        Performs GEMM for multiple slices of lora_a.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (torch.Tensor): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        """
        x = x.view(-1, x.shape[-1])
        lora_shrink(x, lora_a_stacked, y,
                    *self.token_mapping_meta.meta_args(x.size(0)), scale)

    def add_expand(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_b_stacked: tuple[torch.Tensor, ...],
        output_slices: tuple[int, ...],
        offset_start: int = 0,
        add_inputs: bool = True,
        **kwargs,
    ) -> None:
        """
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            add_inputs (bool): Defaults to True.
        """
        y_org = y
        y = y.view(-1, y.shape[-1])
        assert x.ndim == 3
        assert x.size(0) == len(output_slices)
        # The first dimension of x indexes the slices; the second is tokens.
        num_tokens = x.size(1)

        lora_expand(x, lora_b_stacked, y,
                    *self.token_mapping_meta.meta_args(num_tokens),
                    offset_start=offset_start,
                    add_inputs=True)

        y = y.view_as(y_org)

    def add_lora_embedding(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        add_inputs: bool = True,
        **kwargs,
    ) -> None:
        """
        Applies lora specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        """
        lora_expand(x.unsqueeze(dim=0), (lora_b_stacked, ), y,
                    *self.token_mapping_meta.meta_args(x.size(0)),
                    offset_start=0,
                    add_inputs=add_inputs)

    def add_lora_linear(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        lora_b_stacked: tuple[torch.Tensor, ...],
        scale: float,
        output_slices: tuple[int, ...],
        *,
        buffer: torch.Tensor | None = None,
        **kwargs,
    ) -> None:
        """
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[torch.Tensor]): Defaults to None.
        """
        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
        assert buffer is None, (
            "To minimize overhead, the buffer should be created by "
            ".add_lora_linear() instead of being passed in.")

        r = lora_b_stacked[0].size(-1)
        # The intermediate shrink output is kept in float32, consistent with
        # the triton shrink kernel.
        buffer = torch.zeros((len(output_slices), x.size(0), r),
                             dtype=torch.float32,
                             device=x.device)

        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
        self.add_expand(y,
                        buffer,
                        lora_b_stacked,
                        output_slices,
                        add_inputs=True,
                        **kwargs)

    def add_lora_logits(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: torch.Tensor,
        lora_b_stacked: torch.Tensor,
        scale: float,
        *,
        buffer: torch.Tensor | None = None,
        **kwargs,
    ) -> None:
        """
        Applies lora specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]): Default to None.
        """
        y_org = y
        y = y.view(-1, y.shape[-1])
        x = x.view(-1, x.shape[-1])

        r = lora_b_stacked.size(-1)
        assert buffer is None, (
            "To minimize overhead, the buffer should be created by "
            ".add_lora_linear() instead of being passed in.")
        # The intermediate shrink output is kept in float32, consistent with
        # the triton shrink kernel.
        buffer = torch.zeros((x.size(0), r),
                             dtype=torch.float32,
                             device=x.device)

        # Logits LoRA is indexed per sequence (sampler indices), so the
        # prompt mapping metadata is used instead of the token mapping.
        lora_shrink(x, [lora_a_stacked], buffer.unsqueeze(dim=0),
                    *self.prompt_mapping_meta.meta_args(x.size(0)), scale)
        lora_expand(buffer.unsqueeze(dim=0), [lora_b_stacked], y,
                    *self.prompt_mapping_meta.meta_args(buffer.size(0)),
                    add_inputs=True)

        y = y.view_as(y_org)

    def moe_lora_align_block_size(
        self,
        topk_ids: torch.Tensor,
        num_tokens: int,
        block_size: int,
        num_experts: int,
        max_loras: int,
        adapter_enabled: torch.Tensor,
        expert_map: torch.Tensor | None = None,
        pad_sorted_ids: bool = False,
        naive_block_assignment: bool = False,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Aligns tokens and experts into block-sized chunks for LoRA-based
        mixture-of-experts (MoE) execution.
        """
        # Only the per-token LoRA mapping is needed from the kernel metadata;
        # the remaining fields of the metadata tuple are unused here.
        token_lora_mapping = self.token_mapping_meta.meta_args(num_tokens)[0]

        if naive_block_assignment:
            sorted_ids = topk_ids.reshape(-1)
            expert_ids = None
            num_tokens_post_pad = None
        else:
            max_num_tokens_padded = (topk_ids.numel() +
                                     num_experts * (block_size - 1))
            if pad_sorted_ids:
                max_num_tokens_padded = round_up(max_num_tokens_padded,
                                                 block_size)
            sorted_ids = torch.empty((max_loras * max_num_tokens_padded, ),
                                     dtype=torch.int32,
                                     device=topk_ids.device)
            max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
            expert_ids = torch.empty((max_loras * max_num_m_blocks, ),
                                     dtype=torch.int32,
                                     device=topk_ids.device)
            num_tokens_post_pad = torch.empty((max_loras, ),
                                              dtype=torch.int32,
                                              device=topk_ids.device)

            # NOTE: the exact argument order of the custom alignment op is an
            # assumption and should be verified against
            # _custom_ops.moe_lora_align_block_size.
            ops.moe_lora_align_block_size(topk_ids, token_lora_mapping,
                                          num_experts, block_size, max_loras,
                                          max_num_tokens_padded,
                                          max_num_m_blocks, sorted_ids,
                                          expert_ids, num_tokens_post_pad,
                                          adapter_enabled)

        if expert_map is not None:
            expert_ids = expert_map[expert_ids]

        return None, sorted_ids, expert_ids, num_tokens_post_pad

    def add_lora_fused_moe(
        self,
        y: torch.Tensor,
        x: torch.Tensor,
        lora_a_stacked: tuple[torch.Tensor, ...],
        lora_b_stacked: tuple[torch.Tensor, ...],
        topk_weights: torch.Tensor,
        sorted_token_ids: torch.Tensor | None,
        expert_ids: torch.Tensor,
        num_tokens_post_padded: torch.Tensor | None,
        max_lora_rank: int,
        top_k_num: int,
        shrink_config,
        expand_config,
        adapter_enabled: torch.Tensor,
        mul_routed_weight: bool = False,
        fully_sharded: bool = False,
        offset: int = 0,
        expert_map: torch.Tensor | None = None,
    ):
        """
        Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer.
        """
        # NOTE: the position of lora_ids / num_active_loras inside the
        # metadata tuple and the launch-configuration defaults passed to the
        # kernel below are assumptions; verify against
        # LoRAKernelMeta.meta_args and the fused_moe_lora triton op.
        (token_lora_mapping, _, _, _, lora_ids, _,
         num_active_loras) = self.token_mapping_meta.meta_args(x.size(0))

        fused_moe_lora(
            y,
            x,
            lora_a_stacked,
            lora_b_stacked,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            token_lora_mapping,
            lora_ids,
            num_active_loras,
            adapter_enabled,
            max_lora_rank,
            top_k_num,
            shrink_config.get("BLOCK_SIZE_M", 64),
            shrink_config.get("BLOCK_SIZE_N", 64),
            shrink_config.get("BLOCK_SIZE_K", 32),
            shrink_config.get("GROUP_SIZE_M", 8),
            shrink_config.get("NUM_WARPS", 4),
            shrink_config.get("NUM_STAGES", 3),
            shrink_config.get("SPLIT_K", 1),
            expand_config.get("BLOCK_SIZE_M", 64),
            expand_config.get("BLOCK_SIZE_N", 64),
            expand_config.get("BLOCK_SIZE_K", 32),
            expand_config.get("GROUP_SIZE_M", 8),
            expand_config.get("NUM_WARPS", 4),
            expand_config.get("NUM_STAGES", 3),
            expand_config.get("SPLIT_K", 1),
            mul_routed_weight,
            fully_sharded,
            offset,
            expert_map,
        )
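
# Illustrative usage sketch: not part of the module's API, and the names used
# below (`lora_config`, `mapping`, `lora_index_to_id`, the stacked weight
# tuples and sizes) are hypothetical placeholders. It only shows the intended
# call order: construct the wrapper once, refresh its metadata for the current
# batch, then invoke the add_* kernels for each LoRA-enabled layer.
#
#     wrapper = PunicaWrapperGPU(max_num_batched_tokens=8192,
#                                max_batches=256,
#                                device="cuda",
#                                lora_config=lora_config)
#     wrapper.update_metadata(mapping, lora_index_to_id, max_loras=8,
#                             vocab_size=32000)
#     wrapper.add_lora_linear(output, hidden_states,
#                             lora_a_stacked, lora_b_stacked,
#                             scale=1.0,
#                             output_slices=(q_size, k_size, v_size))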