from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from sglang.srt.distributed import (
    get_tensor_model_parallel_rank,
    split_tensor_along_last_dim,
    tensor_model_parallel_all_gather,
    tensor_model_parallel_all_reduce,
)
from sglang.srt.layers.linear import (
    ColumnParallelLinear,
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.lora.backend.base_backend import BaseLoRABackend
from sglang.srt.lora.utils import LoRABatchInfo


class BaseLayerWithLoRA(nn.Module):
    def __init__(self, base_layer: nn.Module, lora_backend: BaseLoRABackend):
        super().__init__()
        self.base_layer = base_layer
        self.set_lora = False
        self.lora_backend = lora_backend
        if hasattr(self.base_layer, "weight"):
            self.weight = self.base_layer.weight

    def forward(self, x: torch.Tensor):
        return self.base_layer.forward(x)

    def set_lora_info(self, *args):
        pass

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        pass

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
        pass


class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
    """
    Vocab parallel embedding layer with LoRA support (simplified for TP=1, no extra tokens).

    For embedding layers: output = base_embedding(x) + lora_B @ lora_A[x],
    where lora_A[x] is a direct embedding lookup from the lora_A weights.
    """

    def __init__(
        self,
        base_layer: VocabParallelEmbedding,
        lora_backend: BaseLoRABackend,
    ) -> None:
        super().__init__(base_layer, lora_backend)
        self.embed_dim = base_layer.embedding_dim
        self.vocab_size = base_layer.org_vocab_size
        # Output boundaries of the single embedding slice: [0, embed_dim].
        self.output_offset = torch.tensor(
            [0, self.embed_dim],
            dtype=torch.int32,
            device=next(self.parameters()).device,
        )

    def set_lora_info(
        self,
        new_embeddings_buffer: Optional[torch.Tensor],
        embedding_A_buffer: torch.Tensor,
        embedding_B_buffer: torch.Tensor,
    ):
        """Set LoRA buffers for embedding layer."""
        self.set_lora = True
        self.new_embeddings_buffer = new_embeddings_buffer
        self.embedding_A_buffer = embedding_A_buffer
        self.embedding_B_buffer = embedding_B_buffer

    def apply_lora(
        self, base_output: torch.Tensor, input_: torch.Tensor
    ) -> torch.Tensor:
        """
        Apply LoRA to the base embedding output.
        Formula: output = base_output + lora_B @ lora_A_embedding(input_)
        """
        lora_a_output = self.run_lora_a_embedding(input_)
        lora_output = self.lora_backend.run_lora_b_sgemm(
            x=lora_a_output,
            weights=self.embedding_B_buffer,
            output_offset=self.output_offset,
            base_output=base_output,
        )
        return lora_output

    def run_lora_a_embedding(self, input_: torch.Tensor) -> torch.Tensor:
        """
        Apply LoRA A weights using an efficient embedding lookup with CUDA graph support.
        Maps tokens to their corresponding LoRA adapters internally.
        It also includes added/extra token processing.
        """
        lora_a_output = self.lora_backend.run_lora_a_embedding(
            input_ids=input_,
            weights=self.embedding_A_buffer,
            vocab_size=self.vocab_size,
            extra_embeddings=(
                self.new_embeddings_buffer
                if hasattr(self, "new_embeddings_buffer")
                and self.new_embeddings_buffer is not None
                else None
            ),
        )
        return lora_a_output

    def extra_token_embedding(
        self, input_: torch.Tensor, base_output: torch.Tensor
    ) -> torch.Tensor:
        """
        Need to impl:

        Process extra tokens (tokens >= vocab_size) by looking up their embeddings
        from the new_embeddings_buffer and replacing them in base_output.

        Args:
            input_: (s,) token IDs
            base_output: (s, embed_dim) base embedding output to be modified in-place

        Returns:
            base_output: (s, embed_dim) modified base_output with extra token embeddings
        """
        raise NotImplementedError(
            "Error in sglang/python/sglang/srt/lora/layers.py - VocabParallelEmbeddingWithLoRA\n"
            "The current SGLang codebase does not support tuned LoRA with extra/added tokens.\n"
            "[TODO]:\n"
            "1. Refer to this commit: https://github.com/yushengsu-thu/sglang/commit/90415211eee8a28a316de262583d4d33fa615d10#diff-191177438bcc223837963de63c005850371f8c8a860acb153b26744b66ecc623 to complete it.\n"
            "2. Then modify the en/decoder tokenizer (tokenizer_manager.py) to support extra_token_embedding in-place."
        )

    def forward(self, input_: torch.Tensor):
        """
        Forward pass with LoRA support and CUDA graph compatibility.

        Extra tokens (tokens >= vocab_size) are now handled efficiently
        in the backend's run_lora_a_embedding method.
        """
        added_tokens_mask = input_ > self.vocab_size - 1
        # Replace out-of-vocabulary (added) token IDs with 0 before the base lookup.
        base_output = self.base_layer.forward(input_.masked_fill(added_tokens_mask, 0))
        if hasattr(self, "new_embeddings_buffer") and self.new_embeddings_buffer is not None:
            base_output = self.extra_token_embedding(input_, base_output)
        if self.set_lora:
            base_output = self.apply_lora(base_output, input_)
        return base_output

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        if tp_rank >= 1:
            raise NotImplementedError(
                "VocabParallelEmbeddingWithLoRA does not support tensor parallelism > 1. "
                f"Got tp_size={tp_rank}"
            )

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
        if tp_rank >= 1:
            raise NotImplementedError(
                "VocabParallelEmbeddingWithLoRA does not support tensor parallelism > 1. "
                f"Got tp_size={tp_rank}"
            )


class ParallelLMHeadWithLoRA(BaseLayerWithLoRA):
    """
    Parallel LM Head layer with LoRA support (simplified for TP=1).

    The LM head computes logits = hidden_states @ (W + B @ A)^T.
    """

    def __init__(
        self, base_layer: ParallelLMHead, lora_backend: BaseLoRABackend
    ) -> None:
        super().__init__(base_layer, lora_backend)
        self.embed_dim = base_layer.embedding_dim
        self.vocab_size = base_layer.org_vocab_size
        # Output boundaries of the single logits slice: [0, vocab_size].
        self.output_offset = torch.tensor(
            [0, self.vocab_size],
            dtype=torch.int32,
            device=next(self.parameters()).device,
        )

    def set_lora_info(
        self, lm_head_A_buffer: torch.Tensor, lm_head_B_buffer: torch.Tensor
    ):
        """Set LoRA buffers for LM head layer."""
        self.set_lora = True
        self.lm_head_A_buffer = lm_head_A_buffer
        self.lm_head_B_buffer = lm_head_B_buffer

    def apply_lora(
        self, base_output: torch.Tensor, hidden_states: torch.Tensor
    ) -> torch.Tensor:
        """
        Apply LoRA to the LM head layer.

        For the LM head: output = hidden @ (W + B @ A)^T
                                = hidden @ W^T + hidden @ A^T @ B^T
                                = base_output + (hidden @ A^T) @ B^T
        """
        lora_a_output = self.lora_backend.run_lora_a_sgemm(
            hidden_states, self.lm_head_A_buffer
        )
        lora_output = self.lora_backend.run_lora_b_sgemm(
            x=lora_a_output,
            weights=self.lm_head_B_buffer,
            output_offset=self.output_offset,
            base_output=base_output,
        )
        return lora_output

    def forward(self, hidden_states: torch.Tensor):
        base_output = F.linear(
            hidden_states, self.weight, bias=getattr(self.base_layer, "bias", None)
        )
        if self.set_lora:
            base_output = self.apply_lora(base_output, hidden_states)
        return base_output

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        if tp_rank >= 1:
            raise NotImplementedError(
                "ParallelLMHeadWithLoRA does not support tensor parallelism > 1. "
                f"Got tp_size={tp_rank}"
            )

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
        if tp_rank >= 1:
            raise NotImplementedError(
                "ParallelLMHeadWithLoRA does not support tensor parallelism > 1. "
                f"Got tp_size={tp_rank}"
            )

class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
    def __init__(
        self, base_layer: ColumnParallelLinear, lora_backend: BaseLoRABackend
    ) -> None:
        super().__init__(base_layer, lora_backend)
        shard_size = self.base_layer.output_partition_sizes[0]
        self.output_offset = torch.tensor(
            [0, shard_size],
            dtype=torch.int32,
            device=next(self.parameters()).device,
        )

    def set_lora_info(self, A_buffer: torch.Tensor, B_buffer: torch.Tensor):
        self.set_lora = True
        self.A_buffer = A_buffer
        self.B_buffer = B_buffer

    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
        lora_output = self.lora_backend.run_lora_b_sgemm(
            x=lora_a_output,
            weights=self.B_buffer,
            output_offset=self.output_offset,
            base_output=base_output,
        )
        return lora_output

    def forward(self, input_: torch.Tensor):
        # Duplicate the logic in ColumnParallelLinear, applying LoRA to the
        # partitioned output before the optional all-gather.
        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None
        output_parallel = self.base_layer.quant_method.apply(
            self.base_layer, input_, bias
        )

        if self.set_lora:
            output_parallel = self.apply_lora(output_parallel, input_)

        if self.base_layer.gather_output:
            output = tensor_model_parallel_all_gather(output_parallel)
        else:
            output = output_parallel

        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
        return output, output_bias

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        return A

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
        shard_size = self.base_layer.output_partition_sizes[0]
        start_idx = tp_rank * shard_size
        end_idx = (tp_rank + 1) * shard_size
        B = B[start_idx:end_idx, :]
        return B


class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
    def __init__(
        self, base_layer: MergedColumnParallelLinear, lora_backend: BaseLoRABackend
    ) -> None:
        super().__init__(base_layer, lora_backend)

    def set_lora_info(self, A_buffer: torch.Tensor, B_buffer: torch.Tensor):
        self.set_lora = True
        self.A_buffer_gate_up = A_buffer
        self.B_buffer_gate_up = B_buffer
        shard_size = self.base_layer.output_partition_sizes[0]
        # Output boundaries of the fused gate/up projection: [0, gate, gate + up].
        self.output_offset = torch.tensor(
            [0, shard_size, 2 * shard_size],
            dtype=torch.int32,
            device=next(self.parameters()).device,
        )

    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        lora_output = self.lora_backend.run_gate_up_lora(
            x=x,
            gate_up_lora_a=self.A_buffer_gate_up,
            gate_up_lora_b=self.B_buffer_gate_up,
            output_offset=self.output_offset,
            base_output=base_output,
        )
        return lora_output

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        return A

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
        shard_size = self.base_layer.output_partition_sizes[0]
        gate_size = self.base_layer.output_sizes[0]
        start_idx = tp_rank * shard_size
        end_idx = (tp_rank + 1) * shard_size
        return torch.concat(
            (
                B[start_idx:end_idx, :],
                B[gate_size + start_idx : gate_size + end_idx],
            ),
            dim=0,
        )


class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
    def __init__(
        self, base_layer: QKVParallelLinear, lora_backend: BaseLoRABackend
    ) -> None:
        super().__init__(base_layer, lora_backend)
        q_proj_shard_size = self.base_layer.q_proj_shard_size
        kv_proj_shard_size = self.base_layer.kv_proj_shard_size
        # Output boundaries of the fused QKV projection: [0, q, q + k, q + k + v].
        self.output_offset = torch.tensor(
            [
                0,
                q_proj_shard_size,
                q_proj_shard_size + kv_proj_shard_size,
                q_proj_shard_size + 2 * kv_proj_shard_size,
            ],
            dtype=torch.int32,
            device=next(self.parameters()).device,
        )
        self.output_offset_cpu = self.output_offset.cpu()
        self.max_qkv_out_dim = max(q_proj_shard_size, kv_proj_shard_size)

    def set_lora_info(self, A_buffer_qkv: torch.Tensor, B_buffer_qkv: torch.Tensor):
        self.set_lora = True
        self.A_buffer_qkv = A_buffer_qkv
        self.B_buffer_qkv = B_buffer_qkv

    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        lora_output = self.lora_backend.run_qkv_lora(
            x=x,
            qkv_lora_a=self.A_buffer_qkv,
            qkv_lora_b=self.B_buffer_qkv,
            base_output=base_output,
            output_offset=self.output_offset,
            output_offset_cpu=self.output_offset_cpu,
            max_qkv_out_dim=self.max_qkv_out_dim,
        )
        return lora_output

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        return A

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int) -> torch.Tensor:
        base_layer = self.base_layer
        q_proj_shard_size = base_layer.q_proj_shard_size
        kv_proj_shard_size = base_layer.kv_proj_shard_size
        num_kv_head_replicas = base_layer.num_kv_head_replicas

        q_start_idx = q_proj_shard_size * tp_rank
        q_end_idx = q_start_idx + q_proj_shard_size

        kv_shard_id = tp_rank // num_kv_head_replicas
        kv_start_idx = kv_proj_shard_size * kv_shard_id
        kv_end_idx = kv_start_idx + kv_proj_shard_size

        q_size, k_size, _ = base_layer.output_partition_sizes
        B_q_shard = B[q_start_idx:q_end_idx, :]
        B_k_shard = B[q_size + kv_start_idx : q_size + kv_end_idx, :]
        B_v_shard = B[q_size + k_size + kv_start_idx : q_size + k_size + kv_end_idx, :]
        return torch.concat((B_q_shard, B_k_shard, B_v_shard), dim=0)


class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
    def __init__(
        self, base_layer: RowParallelLinear, lora_backend: BaseLoRABackend
    ) -> None:
        super().__init__(base_layer, lora_backend)

    def set_lora_info(self, A_buffer: torch.Tensor, B_buffer: torch.Tensor):
        self.set_lora = True
        self.A_buffer = A_buffer
        self.B_buffer = B_buffer
        output_size = self.base_layer.output_size
        self.output_offset = torch.tensor(
            [0, output_size],
            dtype=torch.int32,
            device=next(self.parameters()).device,
        )

    def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
        lora_output = self.lora_backend.run_lora_b_sgemm(
            x=lora_a_output,
            weights=self.B_buffer,
            output_offset=self.output_offset,
            base_output=base_output,
        )
        return lora_output

    def forward(self, input_: torch.Tensor, skip_all_reduce=False):
        if self.base_layer.input_is_parallel:
            input_parallel = input_
        else:
            tp_rank = get_tensor_model_parallel_rank()
            splitted_input = split_tensor_along_last_dim(
                input_, num_partitions=self.base_layer.tp_size
            )
            input_parallel = splitted_input[tp_rank].contiguous()

        output_parallel = self.base_layer.quant_method.apply(
            self.base_layer, input_parallel
        )

        if self.set_lora:
            output_parallel = self.apply_lora(output_parallel, input_parallel)

        if (
            self.base_layer.reduce_results
            and self.base_layer.tp_size > 1
            and not skip_all_reduce
        ):
            output_ = tensor_model_parallel_all_reduce(output_parallel)
        else:
            output_ = output_parallel

        if not self.base_layer.skip_bias_add:
            output = (
                output_ + self.base_layer.bias
                if self.base_layer.bias is not None
                else output_
            )
            output_bias = None
        else:
            output = output_
            output_bias = self.base_layer.bias
        return output, output_bias

    def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
        shard_size = self.base_layer.input_size_per_partition
        start_idx = tp_rank * shard_size
        end_idx = (tp_rank + 1) * shard_size
        A = A[:, start_idx:end_idx].contiguous()
        return A

    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
        return B
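
# Note on the shared structure of the apply_lora() methods above (explanatory
# comment summarizing the docstrings and constructors in this module): with a
# base weight W and LoRA factors A (rank x in) and B (out x rank), every layer
# realizes the decomposition
#
#     x @ (W + B @ A)^T == x @ W^T + (x @ A^T) @ B^T,
#
# where the backend's "lora_a" kernel computes x @ A^T and the "lora_b" kernel
# adds (x @ A^T) @ B^T onto base_output. The output_offset tensors built in the
# constructors mark the output-slice boundaries of each projection: [0, out]
# for plain layers, [0, gate, gate + up] for the fused gate/up projection, and
# [0, q, q + k, q + k + v] for the fused QKV projection.
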

def get_lora_layer(
    layer: nn.Module, lora_backend: BaseLoRABackend
) -> BaseLayerWithLoRA:
    supported_layer_types = {
        # The order matters: subclasses must be checked before their parents.
        ParallelLMHead: ParallelLMHeadWithLoRA,
        VocabParallelEmbedding: VocabParallelEmbeddingWithLoRA,
        QKVParallelLinear: QKVParallelLinearWithLoRA,
        MergedColumnParallelLinear: MergedColumnParallelLinearWithLoRA,
        ColumnParallelLinear: ColumnParallelLinearWithLoRA,
        RowParallelLinear: RowParallelLinearWithLoRA,
    }
    for src_layer_type, lora_layer_type in supported_layer_types.items():
        if isinstance(layer, src_layer_type):
            ret = lora_layer_type(layer, lora_backend)
            return ret
    raise Exception(f"No corresponding LoRA layer supported for {type(layer)}.")
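

if __name__ == "__main__":
    # Minimal self-check (illustrative sketch only, not part of the SGLang
    # runtime): mirrors the slicing convention used by
    # MergedColumnParallelLinearWithLoRA.slice_lora_b_weights for a toy fused
    # gate/up projection (full size 8 each), LoRA rank 4, and 2 TP ranks.
    gate_size, shard_size, lora_rank, tp_rank = 8, 4, 4, 1
    B = torch.arange(2 * gate_size * lora_rank, dtype=torch.float32).reshape(
        2 * gate_size, lora_rank
    )
    start_idx, end_idx = tp_rank * shard_size, (tp_rank + 1) * shard_size
    B_shard = torch.concat(
        (B[start_idx:end_idx, :], B[gate_size + start_idx : gate_size + end_idx]),
        dim=0,
    )
    # Each rank keeps its rows of the gate block followed by the matching rows
    # of the up block, giving a (2 * shard_size, lora_rank) slice.
    assert B_shard.shape == (2 * shard_size, lora_rank)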