o
    id                     @   s  d dl Z d dlmZ d dlmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZ d d
lmZmZ d dlmZmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 ee6Z7edZ8dZ9G dd de3e:e8f Z;G dd dZ<G dd de;e Z=G dd de<Z>e<fd ej?d!e:d"e:d#e:d$ed%e
d&ej@d'eAe< d(e<fd)d*ZBdS )+    N)Callable)TypeVar)nn)
VllmConfig)
LoRAConfig)init_logger)BaseLayerWithLoRAFusedMoE3DWithLoRALoRAMappingLoRAMappingType)	LoRAModel)LoRALayerWeightsPackedLoRALayerWeights)PunicaWrapperBaseget_punica_wrapper)
from_layerfrom_layer_logits_processorget_supported_lora_modulesis_moe_modelprocess_packed_modules_mappingreplace_submodule)FusedMoE)SupportsLoRAsupports_multimodal)is_pooling_model)MultiModelKeys)PPMissingLayer)MULTIMODAL_REGISTRY)MultiModalBudget)LRUCache)is_pin_memory_availableTlanguage_modelc                       sJ   e Zd Zdedeegef f fddZdededB f fdd	Z  Z	S )
AdapterLRUCachecapacitydeactivate_fnc                    s   t  | || _d S N)super__init__r%   )selfr$   r%   	__class__ M/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/lora/model_manager.pyr(   1   s   
zAdapterLRUCache.__init__keyvalueNc                    s$   t d| | | t ||S )NzRemoving adapter int id: %d)loggerdebugr%   r'   
_on_remove)r)   r.   r/   r*   r,   r-   r2   5   s   
zAdapterLRUCache._on_remove)
__name__
__module____qualname__intr   objectr(   r!   r2   __classcell__r,   r,   r*   r-   r#   0   s     "r#   c                   @   s`  e Zd ZdZ	dRdedededededejd	e	dB fd
dZ
ded	e	ddfddZd	e	deddfddZdefddZedefddZedefddZedefddZdedefddZdefddZdefdd Zdedefd!d"Zd#eddfd$d%Zd&d' Zd(d) Zd*ed+d,fd-d.Zed/ee dB  dee dB  fd0d1Z!	dRded2ed3e"eef dB defd4d5Z#d*efd6d7Z$d*ede%dB fd8d9Z&d:eddfd;d<Z'd=eddfd>d?Z(d=ed+e)d*efd@dAZ*d=ed*ede dB fdBdCZ+dDedefdEdFZ,dGedefdHdIZ-d#eddfdJdKZ.dDedefdLdMZ/de"eef fdNdOZ0dDededB fdPdQZ1dS )SLoRAModelManagerz7A manager that manages multiple LoRA-fine-tuned models.Nmodelmax_num_seqsmax_num_batched_tokens
vocab_sizelora_configdevicevllm_configc           	      C   s   || _ t| j | _| jsJ d| j jj di | _i | _d| _|| _|| _	|| _
| j| jks1J t|d d | _dg| j | _|| _t| j | _t| j | _i | _i | _d| _t| j }|od| j j| _|ok| j j| _| || |   | | j _dS )a  Create a LoRAModelManager and adapter for a given model.

        Args:
            model: the model to be adapted.
            max_num_seqs: the maximum number of sequences model can run in a
                single batch.
            max_num_batched_tokens: the maximum number of tokens model can run
                in a single batch.
            vocab_size: the vocab size of the model.
            lora_config: the LoRA configuration.
        z#No supported LoRA modules found in .LoRA   N) r:   r   supported_lora_modulesr+   r3   _registered_adapters_active_adaptersadapter_typer>   r?   r;   r$   
lora_slotsmathceilr<   lora_index_to_idr=   r   packed_modules_mappingr   packed_modulesmodules_last_mappingr   is_3d_moe_weight_is_3d_moe_modelis_non_gated_moe_is_non_gated_moe_init_punica_wrapper_create_lora_moduleslora_manager)	r)   r:   r;   r<   r=   r>   r?   r@   is_moer,   r,   r-   r(   >   s4   
zLoRAModelManager.__init__returnc                 C   sX   t | jo
t| jd| _i | _| jr| || d S t|| j| j| j	d}|| jt
< d S )Nget_mm_mappingmax_batchesr?   r>   )r   r:   hasattrsupports_mmpunica_wrapper_mapping_maybe_init_mmr   r;   r?   r>   DEFAULT_LANGUAGE_WRAPPER_KEY)r)   r<   r@   llm_punica_wrapperr,   r,   r-   rT   s   s   

z%LoRAModelManager._init_punica_wrapperc                 C   sH  t }d| _| j | _t| jjdksJ t|| j| j	| j
d}| jjd }|| j|< | j
jr9| jo7t| jd| _| js>d S td t||}t|j }| j| }t|| j| | j	| j
d}	| jjD ]}
|	| j|
< qh| jjrt| jdr| j|}t|| j| | j	| j
d}| jjD ]}
|| j|
< qd S td d S d S )	NF   rZ   r   get_num_mm_encoder_tokenszLoRA for the tower and connector of multimodal models is experimental and may contain bugs. Please report any related issues on GitHub if you encounter them.get_num_mm_connector_tokenszConnector LoRA support disabled: model does not implement get_num_mm_connector_tokens(). This method is required to determine the connector's token budget for LoRA operations.)r   supports_tower_connector_lorar:   rY   
mm_mappinglenr"   r   r;   r?   r>   r^   enable_tower_connector_lorar]   r\   r0   warningr   maxmm_max_items_per_promptvaluesrc   get_encoder_budgettower_model	connectorrd   warning_once)r)   r@   r<   mm_registryra   	lm_prefix	mm_budgetlimit_per_promptnum_encoder_tokenstower_punica_wrapperprefixconnector_tokensconnector_punica_wrapperr,   r,   r-   r_      sh   

zLoRAModelManager._maybe_init_mmc                 C   
   t | jS r&   )rg   rE   r)   r,   r,   r-   __len__      
zLoRAModelManager.__len__c                 C      | j jS r&   )r>   max_cpu_lorasr{   r,   r,   r-   r$         zLoRAModelManager.capacityc                 C   r~   r&   )r>   	max_lorasr{   r,   r,   r-   rH      r   zLoRAModelManager.lora_slotsc                 C   s   | j S r&   )rH   r{   r,   r,   r-   adapter_slots   s   zLoRAModelManager.adapter_slotslora_idc           	      C   s   || j v rdS tdd t| jD d}|du rtd|\}}d| j |< | j| }td|j| |j| j|< | j	
 D ]\}}| ||}|sO|| q=|||j|j q=dS )z;Move LoRA into a GPU buffer to be used in the forward pass.Fc                 s   s$    | ]\}}|d u r||fV  qd S r&   r,   ).0ir   r,   r,   r-   	<genexpr>   s    z4LoRAModelManager.activate_adapter.<locals>.<genexpr>NzNo free lora slotsz+Activating LoRA. int id: %d, slot index: %dT)rF   next	enumeraterK   
ValueErrorrE   r0   r1   idrN   items_get_lora_layer_weights
reset_loraset_loralora_alora_b)	r)   r   first_free_slotindex_
lora_modelmodule_namemodulemodule_lorar,   r,   r-   activate_adapter   s8   



z!LoRAModelManager.activate_adapterc                 C   s2   z| j |}d | j |< W d S  ty   Y d S w r&   )rK   r   r   )r)   r   r   r,   r,   r-   _deactivate_adapter
  s   z$LoRAModelManager._deactivate_adapterlorac                 C   s   |  | || j|j< d S r&   )_create_merged_loras_inplacerE   r   )r)   r   r,   r,   r-   _add_adapter  s   
zLoRAModelManager._add_adapterc                 C   s   t d)%Pin a LoRAModel in the manager cache.zVPinning is not supported in LoRAModelManager. Use LRUCacheLoRAModelManager for pinning)NotImplementedErrorr)   r   r,   r,   r-   pin_adapter  s   zLoRAModelManager.pin_adaptermappingc                 C   s   | j r| js| j r| jjd nt}n(|jtjkr#| jjr#| jjd }n|jtj	kr4| jj
r4| jj
d }n| jjd }| |}|d usEJ ||| j| jd | j d S )Nr   rb   )r]   re   rf   r"   r`   typer   TOWERrn   	CONNECTORro   _get_punica_wrapperupdate_metadatarK   rH   r=   )r)   r   target_prefixpunica_wrapperr,   r,   r-   _set_adapter_mapping  s$   
z%LoRAModelManager._set_adapter_mappingc                 C   s&   | j   dg| j | _| j  dS )z'Remove all LoRAModels from the manager.N)rE   clearrH   rK   rF   r{   r,   r,   r-   remove_all_adapters5  s   
z$LoRAModelManager.remove_all_adaptersc              
   C   sr  dt dt fdd}| jjddD ]\}}t|trq| |s q| |}|d u r4td| jj	j
| q| jrE|drEtjd	|d
d q|dd }| j|g }t|trb| jr^dgnddg}t| j|t|| j| j|| jj}d|v rd}||}	|	r|	 d| }| j|}
t| j|t|
|| j| j| jj}| jrt|tsq| || | | || qd S )Nr   rX   c                 S   s   |  dd S )NrA   r   )
rpartitionr   r,   r,   r-   _parent_module<  s   z=LoRAModelManager._create_lora_modules.<locals>._parent_moduleF)remove_duplicatez]Regarding %s, vLLM currently only supports adding LoRA to language model, %s will be ignored.z
mixer.gatezHLoRA is not supported for non-gated MoE gate module. %s will be ignored.local)scoperA   w13w1w3lm_headlogits_processor)strr:   named_modules
isinstancer   _match_target_modulesr   r0   ri   r+   r3   rS   endswith
debug_oncesplitrL   getr   rQ   r   r   rH   r>   configget_submoduler   r]   r   register_module_register_packed_modulesset_mapping)r)   r   r   r   r   partspacked_moduled_lst
new_modulelogits_processor_module_nameparent_modulelogits_processor_moduler,   r,   r-   rU   ;  s|   




z%LoRAModelManager._create_lora_modulesr   r   r   c                 C   s0   t |tsJ d| dt| || j|< d S )NzModule z+ must be a BaseLayerWithLoRA instance, got )r   r   r   rN   )r)   r   r   r,   r,   r-   r     s   z LoRAModelManager.register_modulelorasc                 C   sV   t | d dksJ dg }tdt | dD ]}|| ||d   |d q|S )zPad LoRA weight pairs to triplets for non-gated MoE.

        For non-gated MoE, each expert has 2 entries (w1, w2) that need to be
        padded to triplets (w1, w2, None) to match pack_moe expectations.
           r   z1Expected pairs of LoRA weights for non-gated MoE.N)rg   rangeextendappend)r   paddedr   r,   r,   r-   _pad_lora_pairs_to_triplets  s   	z,LoRAModelManager._pad_lora_pairs_to_tripletsrankembedding_modulesc              
   C   s  t ||i }| j D ]A\}}| |r!t|tr!| |du r"q|d}|| jvr|dus2J |d |v r|d dkrO|j	d j
d }|jd j
d }	n"t|jdrY|jjn|jjj
d }t|jd	rj|jjn|jjj
d }	t|||	||j	d jd
}
|
|j|< q|jjdkrt||j|j||jd j
d  |jd jd
}
|
|j|< t||j|j||jd j
d  |jd jd
}
|
|j|d < qt||j	d j
d |jd j
d ||j	d jd
}
|
|j|< q|d}| j|d  }g }t|D ]'\}}t|d | |j	| j
d |j| j
d ||j	| jd
}
||
 q|jjdkrC| j r9t!|dkr9| "|}t#j$||| j d}
nt#%|}
|
|j|< q|S )z-Create zero-initialized LoRAModel for warmup.NrA   r   r   r   org_vocab_sizerb   embedding_dimcpur	   .base_layerFusedMoEWithLoRArR   )&r   r:   r   r   r   r   r   r   rM   lora_a_stackedshapelora_b_stackedr\   
base_layerr   weightr   r   create_dummy_lora_weightsdtyper   r+   r3   w2_input_sizew2_output_sizew2_lora_a_stackedw13_input_sizew13_output_sizew13_lora_a_stackedrL   r   r   rS   rg   r   r   pack_moepack)r)   r   r   r   r:   r   r   r   	input_dim
output_dimr   replacementssublorasr   rr,   r,   r-   create_dummy_lora  s   









	





z"LoRAModelManager.create_dummy_lorac                    s   t  fdd| jD S )Nc                 3   s,    | ]}t d j|d p| kV  qdS )z.*\.{target_module}$)target_moduleN)rematchformat)r   r   r   r,   r-   r     s    
z9LoRAModelManager._match_target_modules.<locals>.<genexpr>)anyrD   )r)   r   r,   r   r-   r     s   z&LoRAModelManager._match_target_modulesc                 C   sF   | j s| jt S t| j tddD ]}||r | j|   S qdS )zW
        Determine whether this module supports LoRA and which wrapper to use.
        T)r.   reverseN)r]   r^   r`   sortedkeysrg   
startswith)r)   r   rw   r,   r,   r-   r   &  s   

z$LoRAModelManager._get_punica_wrappermodule_full_namec                    s^   | d}|d }| j|g }t|dkrd S d|d d   fdd|D | j|< d S )NrA   r   rb   c                    s    g | ]} r d  | n|qS )rA   r,   )r   r   rw   r,   r-   
<listcomp>@  s    z=LoRAModelManager._register_packed_modules.<locals>.<listcomp>)r   rL   r   rg   joinrM   )r)   r   r   r   r   r,   r   r-   r   7  s   

z)LoRAModelManager._register_packed_modulesr   c                 C   s<  | j  D ]\}}g }t }d}|D ]}| ||}|| |r(d}|| q|s,qtt|D ]}	||	 r9q2d ||	< q2| jrS|	|sS|
dd}
|	|rS|
}|drs| jrft|dkrf| |}tj||| jd|j|< nt||j|< |D ]	}|j|d  q}q|j D ]}|  q| j D ]\}}t|tr| ||| qtt|j }|jd usJ t|jtrtt|j}n|jj}t|dkot }|r|j D ]>}t|jtrtt|jD ]}|j| d u rq|j|   |j|< |j!|   |j!|< qq|j  |_|j!  |_!qd S d S )	NFTmodel. z.expertsr   r   r   )"rM   r   setr   r   addr   rg   r   check_lora_namereplacer   rS   r   r   r   r   r   poprl   optimizerN   r   r	   _stack_moe_lora_weightsr   iterr   listr?   r   r    
pin_memoryr   )r)   r   r   new_module_namesreplacement_lorasreplaced_modulehas_replacementr   r   r   replaced_module_namer   
first_loralora_devicer  r   r,   r,   r-   r   D  s|   







z-LoRAModelManager._create_merged_loras_inplacec                 C   s&  |  ||}|rt|jr|  ||d }|}|d us J |d us&J | jr|jd jd }|j|d|jjd |_|j|d|jjd |_|j|jjd d||_|j|jjd d||_|j	ddd
 |_|j	ddd
 |_|j|jg|_|j|jg|_d S |jjd |j }|jj|dd}|jj|dd}	|jd d ddf j|dd}
|jdd ddf j|dd}|jj|dd}|jj|dd}g }g }t|D ],}|||  |||  ||	|  ||
|  |||  |||  q||_||_d S d S d S )Nr   r   rb   r   r   )dim.)r   torch	is_tensorr   rQ   r   r   reshaper   permute
contiguousr   chunkr   r   )r)   r   r   r   r   gate_up_proj_loradown_proj_loranum_expertsgate_proj_a	up_proj_agate_proj_b	up_proj_bdown_proj_adown_proj_br   r   r   r,   r,   r-   r    sz   

z(LoRAModelManager._stack_moe_lora_weightsc                 C   sB   |}| j r||s|dd}||r|}td ||S )Nr   r   z\For the pool model, successfully loaded the LoRA weights after removing the prefix 'model.'.)r   r  r  r0   	info_onceget_lora)r)   r   r   org_module_namer,   r,   r-   r     s   

z(LoRAModelManager._get_lora_layer_weights
adapter_idc                 C   s*   || j vrdS | | | j |d  dS NFT)rF   r   r  r)   r#  r,   r,   r-   deactivate_adapter  s
   

z#LoRAModelManager.deactivate_adapteradapterc                 C   sH   t d|j|j |j| jv rdS t| j| jkrtd| | dS )N%Adding lora. Model id: %d, int id: %dFzNo free adapter slots.T)r0   r1   r   rE   rg   r$   RuntimeErrorr   )r)   r'  r,   r,   r-   add_adapter  s   
zLoRAModelManager.add_adapterc                 C   s"   | j |kr| | || _ d S d S r&   )rO   r   )r)   r   r,   r,   r-   set_adapter_mapping  s   


z$LoRAModelManager.set_adapter_mappingc                 C   s*   |  | || jvrdS | j|d  dS r$  )r&  rE   r  r%  r,   r,   r-   remove_adapter  s
   

zLoRAModelManager.remove_adapterc                 C   rz   r&   )dictrE   r{   r,   r,   r-   list_adapters  r}   zLoRAModelManager.list_adaptersc                 C   s   | j |S r&   )rE   r   r%  r,   r,   r-   get_adapter  s   zLoRAModelManager.get_adapterr&   )2r3   r4   r5   __doc__r   r6   r   r  r?   r   r(   rT   r_   r|   propertyr$   rH   r   boolr   r   r   r   r   r
   r   r   rU   r   r   staticmethodr  r   r   r-  r   r   r   r   r   r   r	   r  r   r&  r*  r+  r,  r.  r/  r,   r,   r,   r-   r9   ;   s    

5

I
&b


i	I
U
	r9   c                       s0   e Zd Zdedeegef f fddZ  ZS )LoRALRUCacher$   deactivate_lora_fnc                    s   t  || d S r&   )r'   r(   )r)   r$   r5  r*   r,   r-   r(     s   zLoRALRUCache.__init__)r3   r4   r5   r6   r   r2  r(   r8   r,   r,   r*   r-   r4    s    (r4  c                       s   e Zd ZdZ	ddejdededededej	d	e
dB f fd
dZdeeef fddZdedefddZdedef fddZdefddZdedefddZdefddZdefddZ  ZS )LRUCacheLoRAModelManagerz;A model manager that manages multiple LoRAs with LRU cache.Nr:   r;   r<   r=   r>   r?   r@   c              	      s<   t  ||||||| t| j| j| _t| j| j| _d S r&   )	r'   r(   r4  r$   r&  rE   rH   r   rF   )r)   r:   r;   r<   r=   r>   r?   r@   r*   r,   r-   r(     s   
	
z!LRUCacheLoRAModelManager.__init__rX   c                 C   s   t | jjS )zList all registered LoRAModels.)r-  rE   cacher{   r,   r,   r-   r.  7  s   z&LRUCacheLoRAModelManager.list_adaptersr   c                 C   sF   t d|j|j |j| jvr| | d}|S | j|j d}|S )zAdd a LoRAModel to the manager.r(  TF)r0   r1   r   rE   r   touch)r)   r   	was_addedr,   r,   r-   r*  ;  s   
z$LRUCacheLoRAModelManager.add_adapterr   c                    s@   || j vrt| j | jkr| j   t |}| j | |S r&   )rF   rg   rH   remove_oldestr'   r   r8  )r)   r   resultr*   r,   r-   r   G  s   

z)LRUCacheLoRAModelManager.activate_adapterc                 C   s    t | jdkr| j  dS dS )Nr   TF)rg   rE   r:  r{   r,   r,   r-   remove_oldest_adapterU  s   
z.LRUCacheLoRAModelManager.remove_oldest_adapterc                 C   s   |  | | | dS )r   T)_pin_lora_in_cpu_cache_pin_lora_in_gpu_cacher   r,   r,   r-   r   [  s   

z$LRUCacheLoRAModelManager.pin_adapterc              
   C   s>   z	| j | W d S  ty } z	td| d|d }~ww )NzPinning failed. LoRA z is not registered.)rE   pinr   )r)   r   errr,   r,   r-   r=  a  s   
z/LRUCacheLoRAModelManager._pin_lora_in_cpu_cachec                 C   s$   || j vr
| | | j | d S r&   )rF   r   r?  r   r,   r,   r-   r>  i  s   

z/LRUCacheLoRAModelManager._pin_lora_in_gpu_cacher&   )r3   r4   r5   r0  r   Moduler6   r   r  r?   r   r(   r-  r   r.  r2  r*  r   r<  r   r=  r>  r8   r,   r,   r*   r-   r6    s:    
r6  r:   r;   r<   r=   r>   r@   r?   lora_manager_clsrX   c           
   
   K   s@   t | tstdt|  d|d| ||||||d|}	|	S )z(Create a LoRA adapter for a given model.zModel z is not supported for LoRA.)r:   r;   r<   r=   r>   r@   r?   Nr,   )r   r   r   r   )
r:   r;   r<   r=   r>   r@   r?   rB  kwargsrV   r,   r,   r-   create_lora_managerq  s   

rD  )CrI   collections.abcr   typingr   regexr   r  r   vllm.configr   vllm.config.lorar   vllm.loggerr   vllm.lora.layersr   r	   r
   r   vllm.lora.lora_modelr   vllm.lora.lora_weightsr   r   vllm.lora.punica_wrapperr   r   vllm.lora.utilsr   r   r   r   r   r   $vllm.model_executor.layers.fused_moer   vllm.model_executor.modelsr   r   %vllm.model_executor.models.interfacesr   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.utilsr   vllm.multimodalr   vllm.multimodal.encoder_budgetr   vllm.utils.cacher   vllm.utils.platform_utilsr    r3   r0   r!   r`   r6   r#   r9   r4  r6  rA  r?   r   rD  r,   r,   r,   r-   <module>   sn         __
