# vllm/lora/model_manager.py
#
# Recovered from a CPython 3.10 compiled-bytecode (__pycache__) dump of this
# module. Import targets, class/method signatures, docstrings, and log/error
# strings are taken from the bytecode's string tables; bodies marked "sketch"
# are best-effort readings of the residue, not the authoritative vLLM source.

import math
from collections.abc import Callable
from typing import TypeVar

import regex as re
import torch
from torch import nn

from vllm.config import VllmConfig
from vllm.config.lora import LoRAConfig, ModelConfig
from vllm.logger import init_logger
from vllm.lora.layers import (BaseLayerWithLoRA, FusedMoE3DWithLoRA,
                              LoRAMapping, LoRAMappingType)
from vllm.lora.lora_model import LoRAModel
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper
from vllm.lora.utils import (from_layer, from_layer_logits_processor,
                             get_supported_lora_modules, is_moe_model,
                             process_packed_modules_mapping, replace_submodule)
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
from vllm.model_executor.models.interfaces import is_pooling_model
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.utils import PPMissingLayer
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.utils.cache import LRUCache
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.worker.utils import MultiModalBudget

logger = init_logger(__name__)

T = TypeVar("T")

DEFAULT_LANGUAGE_WRAPPER_KEY = "language_model"


class AdapterLRUCache(LRUCache[int, T]):
    def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
        super().__init__(capacity)
        self.deactivate_fn = deactivate_fn

    def _on_remove(self, key: int, value: T | None):
        logger.debug("Removing adapter int id: %d", key)
        self.deactivate_fn(key)
        return super()._on_remove(key, value)


class LoRAModelManager:
    """A manager that manages multiple LoRA-fine-tuned models."""

    def __init__(self, model: SupportsLoRA, max_num_seqs: int,
                 max_num_batched_tokens: int, vocab_size: int,
                 lora_config: LoRAConfig, device: torch.device,
                 vllm_config: VllmConfig | None = None):
        """Create a LoRAModelManager and adapter for a given model.

        Args:
            model: the model to be adapted.
            max_num_seqs: the maximum number of sequences model can run in a
                single batch.
            max_num_batched_tokens: the maximum number of tokens model can run
                in a single batch.
            vocab_size: the vocab size of the model.
            lora_config: the LoRA configuration.
        """
        self.model = model
        self.supported_lora_modules = get_supported_lora_modules(self.model)
        assert self.supported_lora_modules, (
            f"No supported LoRA modules found in "
            f"{self.model.__class__.__name__}.")
        self._registered_adapters: dict[int, LoRAModel] = {}
        self._active_adapters: dict[int, None] = {}
        self.adapter_type = "LoRA"
        self.lora_config = lora_config
        self.device = device
        self.max_num_seqs = max_num_seqs
        assert self.capacity >= self.lora_slots
        # Sketch: the exact rounding divisor is an assumption.
        self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8
        self.lora_index_to_id: list[int | None] = [None] * self.lora_slots
        self.vocab_size = vocab_size
        self.packed_modules_mapping = process_packed_modules_mapping(self.model)
        self.packed_modules: dict[str, list[str]] = {}
        self.modules: dict[str, BaseLayerWithLoRA] = {}
        self._last_mapping: LoRAMapping | None = None
        self.is_pooling_model = is_pooling_model(self.model)
        is_moe = is_moe_model(self.model)
        # Sketch: MoE flag wiring recovered from attribute names only.
        self.is_3d_moe_model = is_moe and getattr(self.model,
                                                  "is_3d_moe_weight", False)
        self.is_non_gated_moe = is_moe and getattr(self.model,
                                                   "is_non_gated_moe", False)
        self._init_punica_wrapper(max_num_batched_tokens, vllm_config)
        self._create_lora_modules()
        self.model.lora_manager = self

    def _init_punica_wrapper(self, max_num_batched_tokens: int,
                             vllm_config: VllmConfig | None) -> None:
        self.supports_mm = supports_multimodal(self.model) and hasattr(
            self.model, "get_mm_mapping")
        self.punica_wrapper_mapping: dict[str, PunicaWrapperBase] = {}
        if self.supports_mm:
            self._maybe_init_mm(vllm_config, max_num_batched_tokens)
            return
        llm_punica_wrapper = get_punica_wrapper(
            max_num_batched_tokens, max_batches=self.max_num_seqs,
            device=self.device, lora_config=self.lora_config)
        self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY] = (
            llm_punica_wrapper)

    def _maybe_init_mm(self, vllm_config: VllmConfig,
                       max_num_batched_tokens: int) -> None:
        model_config: ModelConfig = vllm_config.model_config
        mm_registry = MULTIMODAL_REGISTRY
        self.supports_tower_connector_lora = False
        self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping()
        assert len(self.mm_mapping.language_model) == 1
        llm_punica_wrapper = get_punica_wrapper(
            max_num_batched_tokens, max_batches=self.max_num_seqs,
            device=self.device, lora_config=self.lora_config)
        self.punica_wrapper_mapping[
            self.mm_mapping.language_model[0]] = llm_punica_wrapper
        if self.lora_config.enable_tower_connector_lora:
            # Sketch: the processor-info plumbing is only partially visible.
            self.mm_processor_info = mm_registry.create_processor(
                model_config).info
            self.supports_tower_connector_lora = hasattr(
                self.model, "get_num_mm_encoder_tokens")
        if not self.supports_tower_connector_lora:
            return
        logger.warning(
            "LoRA for the tower and connector of multimodal models is "
            "experimental and may contain bugs. Please report any related "
            "issues on GitHub if you encounter them.")
        # Sketch: per-modality wrappers are sized from the multimodal token
        # budget (get_allowed_mm_limits/get_encoder_budget appear in the
        # residue); the exact arithmetic is an assumption.
        mm_budget = MultiModalBudget(model_config, mm_registry)
        limit_per_prompt = max(mm_budget.get_allowed_mm_limits().values())
        num_encoder_tokens = (
            self.model.get_num_mm_encoder_tokens(mm_budget) * limit_per_prompt)
        tower_punica_wrapper = get_punica_wrapper(
            num_encoder_tokens, max_batches=self.max_num_seqs,
            device=self.device, lora_config=self.lora_config)
        for prefix in self.mm_mapping.tower_model:
            self.punica_wrapper_mapping[prefix] = tower_punica_wrapper
        if len(self.mm_mapping.connector) and hasattr(
                self.model, "get_num_mm_connector_tokens"):
            connector_tokens = (
                self.model.get_num_mm_connector_tokens(mm_budget)
                * limit_per_prompt)
            connector_punica_wrapper = get_punica_wrapper(
                connector_tokens, max_batches=self.max_num_seqs,
                device=self.device, lora_config=self.lora_config)
            for prefix in self.mm_mapping.connector:
                self.punica_wrapper_mapping[prefix] = connector_punica_wrapper
        else:
            logger.warning_once(
                "Connector LoRA support disabled: model does not implement "
                "get_num_mm_connector_tokens(). This method is required to "
                "determine the connector's token budget for LoRA operations.")

    def __len__(self) -> int:
        return len(self._registered_adapters)

    @property
    def capacity(self) -> int:
        return self.lora_config.max_cpu_loras

    @property
    def lora_slots(self) -> int:
        return self.lora_config.max_loras

    @property
    def adapter_slots(self) -> int:
        return self.lora_slots

    def activate_adapter(self, lora_id: int) -> bool:
        """Move LoRA into a GPU buffer to be used in the forward pass."""
        if lora_id in self._active_adapters:
            return False
        first_free_slot = next(
            ((i, lora_id) for i, lora_id in enumerate(self.lora_index_to_id)
             if lora_id is None), None)
        if first_free_slot is None:
            raise ValueError("No free lora slots")
        index, _ = first_free_slot
        self._active_adapters[lora_id] = None
        lora_model = self._registered_adapters[lora_id]
        logger.debug("Activating LoRA. int id: %d, slot index: %d",
                     lora_model.id, index)
        self.lora_index_to_id[index] = lora_model.id
        for module_name, module in self.modules.items():
            module_lora = self._get_lora_layer_weights(lora_model, module_name)
            if not module_lora:
                module.reset_lora(index)
                continue
            module.set_lora(index, module_lora.lora_a, module_lora.lora_b)
        return True

    def _deactivate_adapter(self, lora_id: int):
        try:
            index = self.lora_index_to_id.index(lora_id)
            self.lora_index_to_id[index] = None
        except ValueError:
            pass

    def _add_adapter(self, lora: LoRAModel):
        self._create_merged_loras_inplace(lora)
        self._registered_adapters[lora.id] = lora

    def pin_adapter(self, lora_id: int) -> bool:
        """Pin a LoRAModel in the manager cache."""
        raise NotImplementedError(
            "Pinning is not supported in LoRAModelManager. "
            "Use LRUCacheLoRAModelManager for pinning")

    def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
        # Route the mapping to the punica wrapper owning the target modality.
        if not (self.supports_mm and self.supports_tower_connector_lora):
            target_prefix = (self.mm_mapping.language_model[0]
                             if self.supports_mm
                             else DEFAULT_LANGUAGE_WRAPPER_KEY)
        elif (mapping.type == LoRAMappingType.TOWER
              and self.mm_mapping.tower_model):
            target_prefix = self.mm_mapping.tower_model[0]
        elif (mapping.type == LoRAMappingType.CONNECTOR
              and self.mm_mapping.connector):
            target_prefix = self.mm_mapping.connector[0]
        else:
            target_prefix = self.mm_mapping.language_model[0]
        punica_wrapper = self._get_punica_wrapper(target_prefix)
        assert punica_wrapper is not None
        punica_wrapper.update_metadata(mapping, self.lora_index_to_id,
                                       self.lora_slots + 1, self.vocab_size)

    def remove_all_adapters(self):
        """Remove all LoRAModels from the manager."""
        self._registered_adapters.clear()
        self.lora_index_to_id = [None] * self.lora_slots
        self._active_adapters.clear()

    def _create_lora_modules(self):
        def _parent_module(module_name: str) -> str:
            # "x.y.z" -> "x.y"; "x" -> ""
            return module_name.rpartition(".")[0]

        for module_name, module in self.model.named_modules(
                remove_duplicate=False):
            if isinstance(module, PPMissingLayer):
                continue
            if not self._match_target_modules(module_name):
                continue
            punica_wrapper = self._get_punica_wrapper(module_name)
            if punica_wrapper is None:
                logger.warning(
                    "Regarding %s, vLLM currently only supports adding LoRA "
                    "to language model, %s will be ignored.",
                    self.model.__class__.__name__, module_name)
                continue
            if self.is_non_gated_moe and module_name.endswith("mixer.gate"):
                logger.debug_once(
                    "LoRA is not supported for non-gated MoE gate module. "
                    "%s will be ignored.", module_name, scope="local")
                continue
            parts = module_name.split(".")[-1]
            packed_moduled_lst = self.packed_modules_mapping.get(parts, [])
            if isinstance(module, FusedMoE):
                # Sketch: fused experts pack "w13" as [w1] for non-gated MoE
                # and as [w1, w3] otherwise.
                packed_moduled_lst = (["w1"] if self.is_non_gated_moe
                                      else ["w1", "w3"])
            new_module = replace_submodule(
                self.model, module_name,
                from_layer(module, self.lora_slots, self.lora_config,
                           packed_moduled_lst, self.model.config))
            if "lm_head" in module_name:
                # lm_head shares its LoRA with the logits processor.
                logits_processor_module_name = "logits_processor"
                parent_module = _parent_module(module_name)
                if parent_module:
                    logits_processor_module_name = (
                        f"{parent_module}.{logits_processor_module_name}")
                logits_processor_module = self.model.get_submodule(
                    logits_processor_module_name)
                new_module = replace_submodule(
                    self.model, logits_processor_module_name,
                    from_layer_logits_processor(
                        logits_processor_module, module, self.lora_slots,
                        self.lora_config, self.model.config))
            if self.supports_mm and not isinstance(new_module,
                                                   BaseLayerWithLoRA):
                continue
            self.register_module(module_name, new_module)
            self._register_packed_modules(module_name)
            new_module.set_mapping(punica_wrapper)

    def register_module(self, module_name: str, module: "BaseLayerWithLoRA"):
        assert isinstance(module, BaseLayerWithLoRA), (
            f"Module {module_name} must be a BaseLayerWithLoRA instance, "
            f"got {type(module)}")
        self.modules[module_name] = module

    @staticmethod
    def _pad_lora_pairs_to_triplets(
            loras: list[LoRALayerWeights | None]
    ) -> list[LoRALayerWeights | None]:
        """Pad LoRA weight pairs to triplets for non-gated MoE.

        For non-gated MoE, each expert has 2 entries (w1, w2) that need to be
        padded to triplets (w1, w2, None) to match pack_moe expectations.
        """
        assert len(loras) % 2 == 0, (
            "Expected pairs of LoRA weights for non-gated MoE.")
        padded: list[LoRALayerWeights | None] = []
        for i in range(0, len(loras), 2):
            padded.extend(loras[i:i + 2])
            padded.append(None)
        return padded

    def create_dummy_lora(self, lora_id: int, rank: int,
                          embedding_modules: dict[str, str] | None = None
                          ) -> LoRAModel:
        """Create zero-initialized LoRAModel for warmup."""
        model = LoRAModel(lora_id, rank, {})
        for module_name, module in self.modules.items():
            if (not self._match_target_modules(module_name)
                    or not isinstance(module, BaseLayerWithLoRA)
                    or self._get_punica_wrapper(module_name) is None):
                continue
            parts = module_name.split(".")
            if module_name not in self.packed_modules:
                # Sketch: when parts[-1] is in embedding_modules, the dummy
                # weights are sized from org_vocab_size/embedding_dim (falling
                # back to weight.shape), and FusedMoEWithLoRA layers build two
                # dummies from the stacked w2_*/w13_* sizes; only the generic
                # case is shown here.
                lora = LoRALayerWeights.create_dummy_lora_weights(
                    module_name, module.lora_a_stacked[0].shape[-1],
                    module.lora_b_stacked[0].shape[-2], rank,
                    module.lora_a_stacked[0].dtype, "cpu")
                model.loras[module_name] = lora
            else:
                replacements = self.packed_modules_mapping[parts[-1]]
                subloras: list[LoRALayerWeights | None] = []
                for i, r in enumerate(replacements):
                    subloras.append(
                        LoRALayerWeights.create_dummy_lora_weights(
                            module_name + "." + r,
                            module.lora_a_stacked[i].shape[-1],
                            module.lora_b_stacked[i].shape[-2], rank,
                            module.lora_a_stacked[i].dtype, "cpu"))
                if module.__class__.__name__ == "FusedMoEWithLoRA":
                    if self.is_non_gated_moe and len(subloras) % 2 == 0:
                        subloras = self._pad_lora_pairs_to_triplets(subloras)
                    lora = PackedLoRALayerWeights.pack_moe(
                        subloras, is_non_gated_moe=self.is_non_gated_moe)
                else:
                    lora = PackedLoRALayerWeights.pack(subloras)
                model.loras[module_name] = lora
        return model

    def _match_target_modules(self, module_name: str) -> bool:
        return any(
            re.match(
                r".*\.{target_module}$".format(target_module=target_module),
                module_name) or target_module == module_name
            for target_module in self.supported_lora_modules)

    def _get_punica_wrapper(
            self, module_name: str) -> PunicaWrapperBase | None:
        """
        Determine whether this module supports LoRA and which wrapper to use.
        """
        if not self.supports_mm:
            return self.punica_wrapper_mapping[DEFAULT_LANGUAGE_WRAPPER_KEY]
        for prefix in sorted(self.punica_wrapper_mapping.keys(), key=len,
                             reverse=True):
            if module_name.startswith(prefix):
                return self.punica_wrapper_mapping[prefix]
        return None

    def _register_packed_modules(self, module_full_name: str) -> None:
        parts = module_full_name.split(".")
        module_name = parts[-1]
        replacements = self.packed_modules_mapping.get(module_name, [])
        # A module with at most one replacement is not a packed module.
        if len(replacements) <= 1:
            return
        prefix = ".".join(parts[:-1])
        self.packed_modules[module_full_name] = [
            prefix + "." + r if prefix else r for r in replacements]

    def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
        for module_name, new_module_names in self.packed_modules.items():
            replacement_loras: list[LoRALayerWeights | None] = []
            replaced_module: set[str] = set()
            has_replacement = False
            for r in new_module_names:
                lora = self._get_lora_layer_weights(lora_model, r)
                replacement_loras.append(lora)
                if lora:
                    has_replacement = True
                    replaced_module.add(r)
            if not has_replacement:
                continue
            # Pooling models may store weights without the "model." prefix.
            if (self.is_pooling_model
                    and not lora_model.check_lora_name(module_name)):
                replaced_name = module_name.replace("model.", "")
                if lora_model.check_lora_name(replaced_name):
                    module_name = replaced_name
            if module_name.endswith(".experts"):
                if self.is_non_gated_moe and len(replacement_loras) % 2 == 0:
                    replacement_loras = self._pad_lora_pairs_to_triplets(
                        replacement_loras)
                lora_model.loras[module_name] = (
                    PackedLoRALayerWeights.pack_moe(
                        replacement_loras,
                        is_non_gated_moe=self.is_non_gated_moe))
            else:
                lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
                    replacement_loras)
            for r in replaced_module:
                lora_model.loras.pop(r, None)
        for lora in lora_model.loras.values():
            lora.optimize()
        for module_name, module in self.modules.items():
            if isinstance(module, FusedMoE3DWithLoRA):
                self._stack_moe_lora_weights(lora_model, module_name)
        # Pin CPU-resident weights when pinned host memory is available.
        first_lora = next(iter(lora_model.loras.values()))
        lora_device = (first_lora.lora_a[0].device
                       if isinstance(first_lora.lora_a, list)
                       else first_lora.lora_a.device)
        if str(lora_device) != "cpu" or not is_pin_memory_available():
            return
        for lora in lora_model.loras.values():
            if isinstance(lora.lora_a, list):
                for i in range(len(lora.lora_a)):
                    if lora.lora_a[i] is None:
                        continue
                    lora.lora_a[i] = lora.lora_a[i].pin_memory()
                    lora.lora_b[i] = lora.lora_b[i].pin_memory()
            else:
                lora.lora_a = lora.lora_a.pin_memory()
                lora.lora_b = lora.lora_b.pin_memory()

    def _stack_moe_lora_weights(self, lora_model: LoRAModel,
                                module_name: str) -> None:
        # Sketch: the full body is not recoverable from the bytecode. The
        # visible logic fetches the packed gate_up_proj/down_proj LoRA
        # weights for this module and, for 3D-MoE weights, reshapes/permutes
        # them per expert (num_experts taken from the stacked shape) into
        # two-element lora_a/lora_b lists; otherwise it chunks the
        # gate_proj/up_proj/down_proj tensors along the expert dimension and
        # interleaves the chunks into per-expert lora_a/lora_b lists.
        ...

    def _get_lora_layer_weights(
            self, lora_model: LoRAModel,
            module_name: str) -> LoRALayerWeights | None:
        org_module_name = module_name
        if (self.is_pooling_model
                and not lora_model.check_lora_name(module_name)):
            # Pooling models may save LoRA weights without the "model." prefix.
            module_name = module_name.replace("model.", "")
            if lora_model.check_lora_name(module_name):
                org_module_name = module_name
                logger.info_once(
                    "For the pool model, successfully loaded the LoRA "
                    "weights after removing the prefix 'model.'.")
        return lora_model.get_lora(org_module_name)

    def deactivate_adapter(self, adapter_id: int) -> bool:
        if adapter_id not in self._active_adapters:
            return False
        self._deactivate_adapter(adapter_id)
        self._active_adapters.pop(adapter_id, None)
        return True

    def add_adapter(self, adapter: LoRAModel) -> bool:
        logger.debug("Adding lora. Model id: %d, int id: %d",
                     adapter.id, adapter.id)
        if adapter.id in self._registered_adapters:
            return False
        if len(self._registered_adapters) >= self.capacity:
            raise RuntimeError("No free adapter slots.")
        self._add_adapter(adapter)
        return True

    def set_adapter_mapping(self, mapping: LoRAMapping) -> None:
        if self._last_mapping != mapping:
            self._set_adapter_mapping(mapping)
            self._last_mapping = mapping

    def remove_adapter(self, adapter_id: int) -> bool:
        self.deactivate_adapter(adapter_id)
        if adapter_id not in self._registered_adapters:
            return False
        self._registered_adapters.pop(adapter_id, None)
        return True

    def list_adapters(self) -> dict[int, LoRAModel]:
        return dict(self._registered_adapters)

    def get_adapter(self, adapter_id: int) -> LoRAModel | None:
        return self._registered_adapters.get(adapter_id)


class LoRALRUCache(AdapterLRUCache[LoRAModel]):
    def __init__(self, capacity: int,
                 deactivate_lora_fn: Callable[[int], bool]):
        super().__init__(capacity, deactivate_lora_fn)


class LRUCacheLoRAModelManager(LoRAModelManager):
    """A model manager that manages multiple LoRAs with LRU cache."""

    def __init__(self, model: nn.Module, max_num_seqs: int,
                 max_num_batched_tokens: int, vocab_size: int,
                 lora_config: LoRAConfig, device: torch.device,
                 vllm_config: VllmConfig | None = None):
        super().__init__(model, max_num_seqs, max_num_batched_tokens,
                         vocab_size, lora_config, device, vllm_config)
        self._registered_adapters = LoRALRUCache(self.capacity,
                                                 self.deactivate_adapter)
        self._active_adapters = LoRALRUCache(self.lora_slots,
                                             self._deactivate_adapter)

    def list_adapters(self) -> dict[int, LoRAModel]:
        """List all registered LoRAModels."""
        return dict(self._registered_adapters.cache)

    def add_adapter(self, lora: LoRAModel) -> bool:
        """Add a LoRAModel to the manager."""
        logger.debug("Adding lora. Model id: %d, int id: %d",
                     lora.id, lora.id)
        if lora.id not in self._registered_adapters:
            self._add_adapter(lora)
            was_added = True
        else:
            # Refresh the entry's position in the LRU cache.
            self._registered_adapters.touch(lora.id)
            was_added = False
        return was_added

    def activate_adapter(self, lora_id: int) -> bool:
        if (lora_id not in self._active_adapters
                and len(self._active_adapters) >= self.lora_slots):
            self._active_adapters.remove_oldest()
        result = super().activate_adapter(lora_id)
        self._active_adapters.touch(lora_id)
        return result

    def remove_oldest_adapter(self) -> bool:
        if len(self._registered_adapters) > 0:
            self._registered_adapters.remove_oldest()
            return True
        return False

    def pin_adapter(self, lora_id: int) -> bool:
        """Pin a LoRAModel in the manager cache."""
        self._pin_lora_in_cpu_cache(lora_id)
        self._pin_lora_in_gpu_cache(lora_id)
        return True

    def _pin_lora_in_cpu_cache(self, lora_id: int):
        try:
            self._registered_adapters.pin(lora_id)
        except ValueError as err:
            raise ValueError(
                f"Pinning failed. LoRA {lora_id} is not registered.") from err

    def _pin_lora_in_gpu_cache(self, lora_id: int):
        if lora_id not in self._active_adapters:
            self.activate_adapter(lora_id)
        self._active_adapters.pin(lora_id)


def create_lora_manager(
        model: nn.Module,
        max_num_seqs: int,
        max_num_batched_tokens: int,
        vocab_size: int,
        lora_config: LoRAConfig,
        vllm_config: VllmConfig,
        device: torch.device,
        lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
        **kwargs) -> LoRAModelManager:
    """Create a LoRA adapter for a given model."""
    if not isinstance(model, SupportsLoRA):
        raise ValueError(f"Model {type(model)} is not supported for LoRA.")
    lora_manager = lora_manager_cls(
        model=model,
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        vocab_size=vocab_size,
        lora_config=lora_config,
        vllm_config=vllm_config,
        device=device,
        **kwargs)
    return lora_manager
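# Usage sketch: one plausible way a worker wires the pieces above together,
# assuming placeholder scheduler_config/model_config/vllm_config objects and
# an already-loaded `lora_model` plus a per-batch `mapping` (none of these
# names come from the recovered module; vLLM's real call sites live in its
# worker/runner code).
#
#   manager = create_lora_manager(
#       model,
#       max_num_seqs=scheduler_config.max_num_seqs,
#       max_num_batched_tokens=scheduler_config.max_num_batched_tokens,
#       vocab_size=model_config.get_vocab_size(),
#       lora_config=lora_config,
#       vllm_config=vllm_config,
#       device=torch.device("cuda:0"),
#       lora_manager_cls=LRUCacheLoRAModelManager,
#   )
#   manager.add_adapter(lora_model)          # register in the LRU cache
#   manager.activate_adapter(lora_model.id)  # copy weights into a free GPU slot
#   manager.set_adapter_mapping(mapping)     # route per-batch token metadata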
