o
    
۾iA*                     @   sT   d dl mZ d dlZd dlmZmZmZmZmZm	Z	 ddl
mZ G dd deZdS )    )CallableN)bgmv_expandbgmv_expand_slicebgmv_shrinksgmv_expandsgmv_expand_slicesgmv_shrink   )PunicaWrapperBasec                   @   s  e Zd ZdZdededejeB fddZdej	dej	d	ej	d
e
fddZdej	dej	d	ej	d
e
fddZdej	dej	d	ej	defddZdej	dej	d	ej	defddZdej	dej	d	ej	dededefddZdej	dej	d	ej	dededefddZ	d3dej	dej	d	ej	dededefddZdej	dej	d	ej	d
e
fddZdeej	df ej	B dej	d eej	df d
e
fd!d"Z	#	d4dej	deej	df ej	B d$eej	df d%eedf d&ed'd(fd)d*Z	d3dej	dej	d$ej	ded'd(f
d+d,Zd(d-dej	dej	d eej	df d$eej	df d
e
d%eedf d.eej	df d(B d'd(fd/d0Zd(d-dej	dej	d ej	d$ej	d.ej	d(B d'd(fd1d2Zd(S )5PunicaWrapperCPUz
    PunicaWrapperCPU is designed to manage and provide metadata for the punica
    kernel. The main function is to maintain the state information for
    Multi-LoRA, and to provide the interface for the pytorch punica ops.
    max_num_batched_tokensmax_batchesdevicec                 K   s   t | ||| d S N)r
   __init__)selfr   r   r   kwargs r   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/lora/punica_wrapper/punica_cpu.pyr      s   zPunicaWrapperCPU.__init__yxw_t_allscalec                 C   (   | j rd S t|||g| j|R   d S r   )no_lorar   prefill_metadatar   r   r   r   r   r   r   r   _shrink_prefill&      z PunicaWrapperCPU._shrink_prefillc                 C      t |||| j| d S r   )r   token_lora_indicesr   r   r   r   _shrink_decode8      zPunicaWrapperCPU._shrink_decode
add_inputsc                 C   r   r   )r   r   r   r   r   r   r   r#   r   r   r   _expand_prefillA   r   z PunicaWrapperCPU._expand_prefillc                 C   r   r   )r   r    r$   r   r   r   _expand_decodeS   r"   zPunicaWrapperCPU._expand_decodey_offsety_slice_sizec                 C   s0   | j rd S t|||g| j|||R   d S r   )r   r   r   r   r   r   r   r'   r(   r#   r   r   r   _expand_slice_prefill\   s   
z&PunicaWrapperCPU._expand_slice_prefillc                 C   s   t |||| j||| d S r   )r   r    r)   r   r   r   _expand_slice_decoder   s   	z%PunicaWrapperCPU._expand_slice_decodeTc                 C   s(   | j r| jn| j}||||||| dS )z
        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all`
        computation, which is suitable for the
        GEMM of lora'b.
        N)
is_prefillr*   r+   )r   r   r   r   r'   r(   r#   expand_slice_funr   r   r   _apply_expand   s   zPunicaWrapperCPU._apply_expandc                 C   sD   |}| d|jd }| jr| jn| j}||||| ||}dS )af  
        Perform the ` y+=x@w_t_all` computation, which is suitable for the
        GEMM of lora'a.
        When `is_prefill is` true, it indicates that it is currently the
        prefill stage, and the `_shrink_prefill` function should be called.
        Otherwise, it is the decode stage, and the _shrink_decode function
        should be called.
        N)viewshaper,   r   r!   view_as)r   r   r   r   r   y_org
shrink_funr   r   r   _apply_shrink   s   zPunicaWrapperCPU._apply_shrink.lora_a_stackedc                 K   s@   | d|jd }tt|D ]}| || ||| | qdS )a  
        Performs GEMM  for multiple slices of lora_a.
        When `is_prefill is` true, it indicates that it is currently the
        prefill stage, and the `_shrink_prefill` function should be called.
        Otherwise, it is the decode stage, and the _shrink_decode function
        should be called.

        Semantics:
        for i in range(len(lora_a_stacked)):
            y[i] += (x @ lora_a_stacked[i]) * scale

        Args:
            y (Union[tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weights
            scale (float): Scaling factor for the operation
        r/   N)r0   r1   rangelenr5   )r   r   r   r6   r   r   	slice_idxr   r   r   
add_shrink   s   zPunicaWrapperCPU.add_shrinkr   lora_b_stackedoutput_slicesoffset_startreturnNc              	   K   sh   |}| d|jd }|}	tt|D ]}
| j|||
 ||
 |	||
 |d |	||
 7 }	q||}dS )aT  
        Performs GEMM for multiple slices of lora_b.

        Semantics:
            for i in range(len(lora_b_stacked)):
                slice = output_slices[i]
                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                offset += slice

        Args:
            y (torch.Tensor): Output tensor.
            x (Union[tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight
            output_slices (tuple[int, ...]): Every slice's size
            add_inputs (bool):  Defaults to True.
        r/   r#   N)r0   r1   r7   r8   r.   r2   )r   r   r   r;   r<   r=   r#   r   r3   offset_leftr9   r   r   r   
add_expand   s   zPunicaWrapperCPU.add_expandc                 K   s$   | j r| jn| j}||||| dS )a]  
        Applies lora  specifically for VocabParallelEmbeddingWithLoRA.

        Semantics:
            y += x @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_b_stacked (torch.Tensor): lora_b's weights.
            add_inputs (bool): Default to True.
        N)r,   r%   r&   )r   r   r   r;   r#   r   
expand_funr   r   r   add_lora_embedding   s   z#PunicaWrapperCPU.add_lora_embedding)bufferrD   c          	         s   t |t |  krt |ksJ  J |du r/|d d t fddtt |D }| j|||fi | | j||||fddi| dS )aB  
        Applicable to linear-related lora.

        Semantics:
            for i in range(len(lora_a_stacked)):
                y[i] += (
                    x[i].unsqueeze(0)
                    @ lora_a_stacked[indices[i], layer_idx, :, :]
                    @ lora_b_stacked[indices[i], layer_idx, :, :]
                    * scale
                    ).squeeze(0)

        Args:
            y (torch.Tensor): Output tensor. Will be changed in-place.
            x (torch.Tensor): Input tensor
            lora_a_stacked (tuple[torch.Tensor, ...]): lora_a's weight.
            lora_b_stacked (tuple[torch.Tensor, ...]): lora_b's weight.
            scale (float): Scaling factor.
            output_slices (tuple[int, ...]): Every slice's size.
            buffer (Optional[tuple[torch.Tensor, ...]]): Defaults to None.
        Nr   r/   c                 3   s.    | ]}t jd  ft jjdV  qdS )r   dtyper   N)torchzerossizefloat32r   ).0_rr   r   r   	<genexpr>1  s
    
z3PunicaWrapperCPU.add_lora_linear.<locals>.<genexpr>r#   T)r8   rI   tupler7   r:   rA   )	r   r   r   r6   r;   r   r<   rD   r   r   rM   r   add_lora_linear	  s   ("

z PunicaWrapperCPU.add_lora_linearc          
      K   s   |}| d|jd }| d|jd }|d}	|du r,tj|d|	ftj|jd}t|||| j| t	|||| jdd |
|}dS )a  
        Applies lora  specifically for LogitsProcessorWithLoRA.

        Semantics:
            buffer = (x @ lora_a_stacked) * scale
            y += buffer @ lora_b_stacked

        Args:
            y (torch.Tensor): Output tensor.
            x (torch.Tensor): Input tensor.
            lora_a_stacked (torch.Tensor): lora_a's weights.
            lora_b_stacked (torch.Tensor):lora_b's weights.
            scale (float): Scaling factor.
            buffer (Optional[torch.Tensor]):Default to None.
        r/   Nr   rE   Tr?   )r0   r1   rI   rG   rH   rJ   r   r   sampler_indicesr   r2   )
r   r   r   r6   r;   r   rD   r   r3   rN   r   r   r   add_lora_logits:  s   
z PunicaWrapperCPU.add_lora_logits)T)r   T)__name__
__module____qualname____doc__intrG   r   strr   Tensorfloatr   r!   boolr%   r&   r*   r+   r.   r5   rP   r:   rA   rC   rQ   rS   r   r   r   r   r      sD   
	

	

	




%
	
.
$
	
9
r   )collections.abcr   rG   vllm.lora.ops.torch_opsr   r   r   r   r   r   punica_baser
   r   r   r   r   r   <module>   s
    	