o
    
۾iB"                     @   s\   d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ eG dd	 d	eZdS )
z
Based on:
Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023).
Punica: Multi-Tenant LoRA Serving.
https://arxiv.org/abs/2310.18547
    )finalN)LoRAMapping)bgmv_expandbgmv_expand_slicebgmv_shrink   )PunicaWrapperBasec                   @   s  e Zd ZdZdededejeB fddZde	de
ed	B  d
edefddZdejdejfddZdejdejdejdefddZdejdejdejdededefddZdejdejdeejdf defddZ	 	!d1dejdejd"eejdf d#eedf d$edd	fd%d&Z	!d2dejdejd"ejdedd	f
d'd(Zd	d)dejdejdeejdf d"eejdf ded#eedf d*ejd	B dd	fd+d,Zedejfd-d.Zd	d)dejdejdejd"ejd*ejd	B dd	fd/d0Zd	S )3PunicaWrapperXPUz
    PunicaWrapperXPU is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the punica ipex kernel.
    max_num_batched_tokensmax_batchesdevicec                 K   sD   t | ||| tj| jd tj| jd tj| jd d S )Nr   r   )r   __init__torch_dynamomark_dynamic_token_lora_indices_embeddings_indices_sampler_indices_padded)selfr
   r   r   kwargs r   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_xpu.pyr      s   zPunicaWrapperXPU.__init__mappinglora_index_to_idN	max_loras
vocab_sizec                 K   s   |j | _ | |||| d S N)
is_prefill_update_base_metadata)r   r   r   r   r   r   r   r   r   update_metadata(   s   z PunicaWrapperXPU.update_metadataxreturnc                 C   s   t | jdd|dS )Nr   )r   narrowr   size)r   r    r   r   r   _get_token_lora_indices3   s   z(PunicaWrapperXPU._get_token_lora_indicesyw_t_allscalec                 C   s   t |||| || d S r   )r   r$   )r   r%   r    r&   r'   r   r   r   _apply_shrink6   s   zPunicaWrapperXPU._apply_shrinky_offsety_slice_size
add_inputsc                 C   s"   |  |}t||||||| d S r   )r$   r   )r   r%   r    r&   r)   r*   r+   token_lora_indicesr   r   r   _apply_expand?   s   
	zPunicaWrapperXPU._apply_expandlora_a_stacked.c                 K   s@   | d|jd }tt|D ]}| || ||| | qdS )a  
        Performs GEMM  for multiple slices of lora_a.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (torch.Tensor): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        N)viewshaperangelenr(   )r   r%   r    r.   r'   r   	slice_idxr   r   r   
add_shrinkM   s   zPunicaWrapperXPU.add_shrinkr   Tlora_b_stackedoutput_slicesoffset_startc           
   	   K   s   |}| d|jd }|jdksJ |dt|ksJ tt|D ]}	| j|||	 ||	 |||	 |d |||	 7 }q#|| dS )a2  
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            add_inputs (bool): Defaults to True.
        r/      r   r+   N)r0   r1   ndimr#   r3   r2   r-   view_as)
r   r%   r    r6   r7   r8   r+   r   y_orgr4   r   r   r   
add_expandg   s   zPunicaWrapperXPU.add_expandc                 K   s   |  |}t||||| dS )a]  
        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        N)r$   r   )r   r%   r    r6   r+   r   r,   r   r   r   add_lora_embedding   s   
z#PunicaWrapperXPU.add_lora_embedding)bufferr@   c          
      K   s   t |t |  krt |ksJ  J |du r1|d d}	tjt ||d|	ftj|jd}| j||||fi | | j||||fddi| dS )a6  
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[torch.Tensor]): Defaults to None.
        Nr   r/   dtyper   r+   T)r3   r#   r   zerosfloat32r   r5   r>   )
r   r%   r    r.   r6   r'   r7   r@   r   rr   r   r   add_lora_linear   s4   ("
z PunicaWrapperXPU.add_lora_linearc                 C   s   | j dd S )zJ
        This property provides access to padded sampler indices.
        N)r   )r   r   r   r   sampler_indices_padded   s   z'PunicaWrapperXPU.sampler_indices_paddedc                K   s   |}| d|jd }| d|jd }|d}	|du r,tj|d|	ftj|jd}t| jdd|d}
t	||||
| t
||||
dd ||S )a  
        Applies lora  specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]): Default to None.
        r/   Nr   rA   Tr:   )r0   r1   r#   r   rC   rD   r   r"   _sampler_indicesr   r   r<   )r   r%   r    r.   r6   r'   r@   r   r=   rE   sampler_indicesr   r   r   add_lora_logits   s   

z PunicaWrapperXPU.add_lora_logits)r   T)T)__name__
__module____qualname____doc__intr   r   strr   r   listr   Tensor	IntTensorr$   floatr(   boolr-   tupler5   r>   r?   rF   propertyrG   rJ   r   r   r   r   r	      s    



	

 
	
2
 
	
=
r	   )rN   typingr   r   vllm.lora.layersr   vllm.lora.ops.ipex_opsr   r   r   punica_baser   r	   r   r   r   r   <module>   s   