o
    پim                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZmZm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* e +e,Z-G dd dZ.dS )    N)DictIterableListOptional)
LoadConfig)get_layer_id)ParallelLMHeadVocabParallelEmbedding)BaseLoRABackend)get_backend_from_name)BaseLayerWithLoRAget_lora_layer)LoRAAdapter)
LoRAConfig)LoRARef)LoRAMemoryPool)LoRATypeget_normalized_target_modulesget_target_module_name)LoRAUpdateOutput)ForwardBatch)
ServerArgs)replace_submodule)
AutoConfigc                   @   s  e Zd Z						dFdejjdededed	ej	d
e
dedededee deee  deee  fddZdedefddZ	dGdededefddZdedefddZd edefd!d"Zdedefd#d$Zd%eee  defd&d'Ze fd(eee  d)eee  fd*d+Zd,efd-d.Zd/d0 Z			dHdee deee  deee  fd1d2ZdIdeee  fd3d4Z 		dJdee deee  fd5d6Z!defd7d8Z"ded9e#eej$f fd:d;Z%	dIded9e#eej$f d<e#d=ee# def
d>d?Z&d@dA Z'dBdC Z(dDdE Z)dS )KLoRAManagertriton   r   N
base_modelbase_hf_configmax_loras_per_batchload_configdtypeserver_argslora_backendtp_sizetp_rankmax_lora_ranktarget_modules
lora_pathsc                 C   s   || _ || _|| _|| _|| _t| j  j| _|| _|	| _	d | _
|j| _|j| _td| d t|}||| j|d| _| j|
||d d S )NzUsing z as backend of LoRA kernels.)r   devicer"   )r&   r'   r(   )r   r   r   r    r!   next
parametersr)   r$   r%   lora_added_tokens_sizeenable_lora_overlap_loadinglora_eviction_policyeviction_policyloggerinfor   r#   
init_state)selfr   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   backend_type r5   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/lora/lora_manager.py__init__3   s0   
zLoRAManager.__init__max_bs_in_cuda_graphnum_tokens_per_bsc                 C   s   || _ | jj||d d S )N)r8   r9   )r8   r#   init_cuda_graph_batch_info)r3   r8   r9   r5   r5   r6   r:   b   s
   
z&LoRAManager.init_cuda_graph_batch_info successerror_messagereturnc                 C   s   t ||dd | j D dS )Nc                 S   s   i | ]}|j |jqS r5   )	lora_name	lora_path).0lora_refr5   r5   r6   
<dictcomp>q   s    z9LoRAManager.create_lora_update_result.<locals>.<dictcomp>)r<   r=   loaded_adapters)r   	lora_refsvalues)r3   r<   r=   r5   r5   r6   create_lora_update_resultk   s   z%LoRAManager.create_lora_update_resultrB   c              
   C   s   |j dur
|jdusJ d|j| jvsJ d|j dz(t|j}| || || j|j< | | || j|j< |  j	t
|j7  _	W n ty` } z| jdt|dW  Y d}~S d}~ww | jddS )	z
        Load a single LoRA adapter from the specified path.

        Args:
            lora_ref (LoRARef): The LoRARef object containing the LoRA name, path, and ID.
        N?LoRARef must have both lora_name and lora_path set for loading.LoRA adapter with ID Y is already loaded. This should have been verified before request is sent to the backend.Fr<   r=   Tr<   )r?   r@   lora_idlorasr   validate_new_adapterconfigsload_lora_weightsrE   num_pinned_lorasintpinned	ExceptionrG   str)r3   rB   new_adapterer5   r5   r6   load_lora_adapterw   s*   

zLoRAManager.load_lora_adapterlora_configc                 C   s   |j dkr	td| j D ]'}|j|jkrtd|j d|j|jkr5t|j d|j d|j  qt| dd}|oC|	| }|rStd	|j d
|j
 d|jrg| j| jd kritd|j ddS dS )z
        Validate if an adapter can be loaded into the current LoRA memory pool and generate error if it is incompatible.
        r   zQLoRA serving currently doesn't support adapters that add tokens to the vocabularyFailed to load LoRA adapter z because it is already loadedz is already loaded with name: z., but another copy is being loaded with name: memory_poolNzLoRA adapter z with rank z is incompatible with the current LoRA memory pool configuration. Please ensure that the LoRA adapter's rank is within the configured `--max-lora-rank` and that the target modules are included in `--lora-target-modules`.r   z as a pinned adapter. It is not allowed to pin all slots in the LoRA memory pool to avoid starvation for unpinned adapters and base models. Please increase your `--max-loras-per-batch` or load it as unpinned LoRA adapters.)r,   
ValueErrorrE   rF   r?   r@   r0   warninggetattrcan_supportrrT   rR   r   )r3   rZ   rB   existing_lora_refr\   incompatibler5   r5   r6   rO      s6   
z LoRAManager.validate_new_adapterc              
   C   s   | j |j}| j|j}|dur|dusJ d|j dz| j |j= | j|j= | j|j= |  jt|j8  _W n tyU } z| j	dt
|dW  Y d}~S d}~ww | j	ddS )z
        Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
        delete the corresponding LoRA modules.
        NrI   zU is not loaded. This should have been verified before request is sent to the backend.FrK   TrL   )rP   getrM   rE   rN   rR   rS   rT   rU   rG   rV   )r3   rB   adapterrX   r5   r5   r6   unload_lora_adapter   s$   


zLoRAManager.unload_lora_adapterlora_idsc                 C   s   t || jkr	dS | jdkrdS d}|D ]}|dur3| j|}|dus,J d| d|t|j7 }q|| jksEJ d| d| j d	t || }| jj| j }||kS )
zh
        Validate if the LoRA IDs in the batch can be loaded into the current LoRA memory pool.
        Fr   TNzLoRA ID z not found in lora_refs.z-Number of pinned LoRA adapters in the batch (z/) exceeds the total number of pinned adapters (z2). This indicates a bug in the LoRA loading logic.)lenr   rR   rE   rd   rS   rT   r\   )r3   rg   pinned_loras_in_batchrM   rB   required_slotsmem_pool_vacancyr5   r5   r6   validate_lora_batch   s*   


zLoRAManager.validate_lora_batch	new_lorasrunning_lorasc                 C   sD   ||B }t || jksJ | jj|| j| j| j | j| j	d d S )N)cur_uidslora_adapterslora_modulesrE   lora_embed_tokens_modulelora_lm_head_module)
rh   r   r\   prepare_lora_batchrN   rq   rE   copyembed_tokens_modulelm_head_module)r3   rm   rn   ro   r5   r5   r6   fetch_new_loras   s   
zLoRAManager.fetch_new_lorasforward_batchc           
      C   s   |j }t| do|| jko|j }dgt|j }dg| j }dg| j }t|jD ]$\}}| j	
|||< |d urO| j| }	|	jj||| < |	j||| < q+| jj|||||d d S )Nr8   r   )ry   weight_indices
lora_ranksscalingsuse_cuda_graph)
batch_sizehasattrr8   forward_modeis_cuda_graphrh   rg   r   	enumerater\   get_buffer_idrN   configra   scalingr#   rt   )
r3   ry   bsr}   rz   r{   r|   iuidlorar5   r5   r6   rt   	  s.   


zLoRAManager.prepare_lora_batchc              
   C   s   t | jD ]*\}}| D ]!\}}t|| jj}|| jj||tj	d| jj||tj
d qq| jdurO| j| jdtj	| jdtj	| jdtj
 | jduri| j| jdtj	| jdtj
 dS dS )zZ
        Update all LoRA modules to associate them with the latest memory buffer.
        )target_modulelayer_id	lora_typeNadded_tokensembed_tokenslm_head)r   rq   itemsr   r\   r'   set_lora_info
get_tensorr   LORA_ALORA_Brv   get_embedding_tensorrw   )r3   r   layer_modulesmodule_namemoduler   r5   r5   r6   update_lora_info&  s<   

zLoRAManager.update_lora_infoc                 C   sP   |s|dur
|dusJ d|  | | j||d |   |   |   dS )z
        Initialize the internal (mutable) state of the LoRAManager.

        When `lora_paths` is provided and not empty, it might be used for inferring LoRA shape info such as
        the target modules and max_lora_rank.
        NzWhen no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization.)r&   r'   )init_lora_adaptersinit_lora_shapesinit_lora_modulesinit_memory_poolr   )r3   r&   r'   r(   r5   r5   r6   r2   K  s   
zLoRAManager.init_statec                 C   sV   i | _ i | _i | _d| _|r'|D ]}| |}|js&td|j d|j qd S d S )Nr   r[   z: )	rP   rN   rE   rR   rY   r<   RuntimeErrorr?   r=   )r3   r(   rB   resultr5   r5   r6   r   e  s   
zLoRAManager.init_lora_adaptersc           	      C   s  |rt |nt | _| j D ]u\}}t|jtr>|jdv r5|dur#q| j| j}t	d| d|j dt	d|j dt|jt
sHt	dt |j}|dur~|| js}|| j }| j| j}t	d| d	t| d
t| j dt| j|B  d	q| j| q|dur|| _ntdd | j D dd| _| jdu rtdd | j D d}|dkrtd| d || _dS dS )zQInfer LoRA target modules and max_lora_rank from loaded adapters if not provided.)z
all-linearallNzLoRA adapter 'z' uses target_modules='z' which cannot be resolved automatically. Please explicitly specify --lora-target-modules during server startup. You can specify 'all' to enable all supported module types.z*SGLang does not recognize target_modules='z'. Please use a list of module name suffixes in the adapter's PEFT config, or explicitly specify --lora-target-modules during server startup.a  SGLang currently only supports inferring LoRA target modules when a list of suffixes is provided in `target_modules` field of PEFT config. Please explicitly specify `--lora-target-modules` during server startup. You can specify `all` to enable all support modules types. z' contains target modules z> that are not included in the specified --lora-target-modules zG. Please update --lora-target-modules to include all required modules: z/, or use 'all' to enable all supported modules.c                 S   s   g | ]}|j qS r5   )ra   rA   xr5   r5   r6   
<listcomp>  s    z0LoRAManager.init_lora_shapes.<locals>.<listcomp>r   )defaultc                 s   s     | ]}|j d kr|j V  qdS )r   N)r,   r   r5   r5   r6   	<genexpr>  s    
z/LoRAManager.init_lora_shapes.<locals>.<genexpr>zself.lora_added_tokens_size=z from LoRA adapters.)r   setr'   rP   r   
isinstancerV   rE   r?   r]   listissubsetsortedupdater&   maxrF   r,   r*   r0   r1   )	r3   r&   r'   rM   r   r?   adapter_target_modulesunsupported_modulesinferred_extra_vocab_sizer5   r5   r6   r   z  sr   




zLoRAManager.init_lora_shapesc                 C   sF   t |j| j|j | j| j| j}|  | jr|  || j	|j< dS )zh
        Load the weights of a LoRA adapter to CPU memory and conducts post-loading validation.
        N)
r   rM   rP   r   r    r#   initialize_weightsr-   pin_weights_in_cpurN   )r3   rB   lora_adapterr5   r5   r6   rQ     s   
zLoRAManager.load_lora_weightstensorsc                 C   s:   t |j| j|j | j| j| j}|| || j|j< dS )zP
        Load the weights of a LoRA adapter from tensors to CPU memory.
        N)r   rM   rP   r   r    r#   initialize_weights_from_tensorsrN   )r3   rB   r   r   r5   r5   r6   load_lora_weights_from_tensors  s   

z*LoRAManager.load_lora_weights_from_tensorsconfig_dictadded_tokens_configc              
   C   s   |j dur
|jdusJ d|j| jvsJ d|j dz*t||}| || || j|j< | || || j	|j< |  j
t|j7  _
W n tyb } z| jdt|dW  Y d}~S d}~ww | jddS )	zJ
        Load a single LoRA adapter from tensors and config dict.
        NrH   rI   rJ   FrK   TrL   )r?   r@   rM   rN   r   	from_dictrO   rP   r   rE   rR   rS   rT   rU   rG   rV   )r3   rB   r   r   r   rW   rX   r5   r5   r6   load_lora_adapter_from_tensors  s*   z*LoRAManager.load_lora_adapter_from_tensorsc                 C   sB   t | j| j| j| j| j| j| j| j| j	| j
d
| _| dh dS )zH(Re)initialize the LoRA memory pool based on the current configurations.)
r   r   r!   r$   r%   r&   r'   r   r/   r,   N)r   r   r   r!   r$   r%   r&   r'   r   r/   r,   r\   rx   )r3   r5   r5   r6   r     s   zLoRAManager.init_memory_poolc                 C   s   t || j}t| j|| |S N)r   r#   r   r   )r3   r   r   lora_moduler5   r5   r6   set_lora_module-  s   zLoRAManager.set_lora_modulec           
      C   s  dd t | jjD | _d | _d | _d| jv rYt| jdd }d }| j	 D ]\}}|
dr2|} nq%|d urY|d urY||u rYtd t|j|j|jj|jd}|j|_|| j_| j	 D ]e\}}t| jdd rp| j|spq^d|v rd| jv rt|trt|ts| ||}|| _q^d|v rd| jv rt|trt|ts| ||}|| _q^|dd	 | jv rt|}	| ||| j|	 |< q^d S )
Nc                 S   s   g | ]}i qS r5   r5   )rA   _r5   r5   r6   r   4  s    z1LoRAManager.init_lora_modules.<locals>.<listcomp>r   r   zslm_head is tied with embed_tokens. Creating a separate ParallelLMHead that shares the base weight for LoRA support.)num_embeddingsembedding_dimparams_dtypeorg_num_embeddingsshould_apply_lora.)ranger   num_hidden_layersrq   rv   rw   r'   r_   r   named_modulesendswithr0   r1   r   org_vocab_sizer   weightr!   r   r   r   r	   r   r   splitr   )
r3   r   r   namemoduntied_lm_headr   r   r   r   r5   r5   r6   r   2  sn   



zLoRAManager.init_lora_modules)r   r   r   NNN)r;   )NNNr   )NN)*__name__
__module____qualname__torchnnModuler   rS   r   r!   r   rV   r   r   r   r   r7   r:   boolr   rG   rY   r   rO   rf   r   rl   rx   r   rt   r   r2   r   r   rQ   r   Tensorr   r   r   r   r   r5   r5   r5   r6   r   2   s    		



/


"( 


'




\

"r   )/loggingtypingr   r   r   r   r   sglang.srt.configs.load_configr   sglang.srt.layers.utilsr   *sglang.srt.layers.vocab_parallel_embeddingr   r	   $sglang.srt.lora.backend.base_backendr
   %sglang.srt.lora.backend.lora_registryr   sglang.srt.lora.layersr   r   sglang.srt.lora.lorar   sglang.srt.lora.lora_configr   sglang.srt.lora.lora_registryr   sglang.srt.lora.mem_poolr   sglang.srt.lora.utilsr   r   r   sglang.srt.managers.io_structr   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.server_argsr   sglang.srt.utilsr   &sglang.srt.utils.hf_transformers_utilsr   	getLoggerr   r0   r   r5   r5   r5   r6   <module>   s*   
