from contextlib import contextmanager
from typing import Any, Literal

import torch

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.lora_model import LoRAModel
from vllm.lora.model_manager import (LoRAModelManager,
                                     LRUCacheLoRAModelManager,
                                     create_lora_manager)
from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path

logger = init_logger(__name__)
   @   sH  e Zd ZU dZeZee ed< efde	de
jdeeef dee fddZed	d
 ZedefddZ	d-de
jjde	dB defddZdedefddZdededefddZdedefddZdee dedB ddfddZdefdd Zd!ee ddfd"d#Z d$edefd%d&Z!dedefd'd(Z"d)d* Z#dee fd+d,Z$dS ).WorkerLoRAManagerzWorkerLoRAManager that manages LoRA models on the worker side.

    Every request, the requested LoRAs will be loaded (unless they are already
    loaded), and every other LoRA will be unloaded."""

    _manager_cls: type[LoRAModelManager] = LoRAModelManager

    def __init__(
        self,
        vllm_config: VllmConfig,
        device: torch.device,
        embedding_modules: dict[str, str],
        lora_model_cls: type[LoRAModel] = LoRAModel,
    ):
        self._lora_model_cls = lora_model_cls
        self.embedding_modules = embedding_modules
        self._cached_dummy_lora: LoRAModel | Literal[False] | None = False
        self.max_num_seqs = vllm_config.scheduler_config.max_num_seqs
        self.max_num_batched_tokens = (
            vllm_config.scheduler_config.max_num_batched_tokens)
        self.vocab_size = vllm_config.model_config.get_vocab_size()
        self.lora_config = vllm_config.lora_config
        # Use get_text_config() so multimodal wrappers resolve to the
        # underlying language-model config.
        text_config = vllm_config.model_config.hf_config.get_text_config()
        self.max_position_embeddings = text_config.max_position_embeddings
        self.device = device
        # Lazily initialized by create_lora_manager.
        self._adapter_manager: LoRAModelManager
zWorkerLoRAManager.__init__c                 c   s    d| _ dV  d| _ dS )z_Use this context manager to reuse the dummy lora model
        to avoid creating it repeatedly.NF)r   r!   r#   r#   r$   dummy_lora_cache9   s   
z"WorkerLoRAManager.dummy_lora_cachereturnc                 C   s   dS )NTr#   r&   r#   r#   r$   

    def create_lora_manager(
        self,
        model: torch.nn.Module,
        vllm_config: VllmConfig | None = None,
    ) -> Any:
        lora_manager = create_lora_manager(
            model,
            max_num_seqs=self.max_num_seqs,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            device=self.device,
            lora_manager_cls=self._manager_cls,
            vllm_config=vllm_config,
        )
        self._adapter_manager = lora_manager
        return lora_manager.model
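
    # Illustrative startup wiring (hypothetical driver code): the worker
    # wraps its model once, then routes every adapter request through the
    # manager.
    #
    #   manager = WorkerLoRAManager(vllm_config, torch.device("cuda:0"),
    #                               embedding_modules={})
    #   model = manager.create_lora_manager(model)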

    def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
        try:
            supported_lora_modules = (
                self._adapter_manager.supported_lora_modules)
            packed_modules_mapping = (
                self._adapter_manager.packed_modules_mapping)
            # Packed modules expand to their unpacked HF names (e.g., a
            # hypothetical "qkv_proj" expands to "q_proj"/"k_proj"/"v_proj"),
            # so checkpoints saved against HF layer names still match.
            expected_lora_lst: list[str] = []
            for module in supported_lora_modules:
                if module in packed_modules_mapping:
                    expected_lora_lst.extend(packed_modules_mapping[module])
                else:
                    expected_lora_lst.append(module)
                # MoE expert weights are fused, so the packed name itself
                # is also expected.
                if module == "experts":
                    expected_lora_lst.append(module)
            expected_lora_modules = set(expected_lora_lst)

            lora_path = get_adapter_absolute_path(lora_request.lora_path)
            peft_helper = PEFTHelper.from_local_dir(
                lora_path, self.max_position_embeddings,
                lora_request.tensorizer_config_dict)

            # Validate the LoRA configuration against the server's limits
            # before loading any weights.
            peft_helper.validate_legal(self.lora_config)

            model = self._adapter_manager.model
            hf_to_vllm_mapper = getattr(model, "hf_to_vllm_mapper", None)
            lora = self._lora_model_cls.from_local_checkpoint(
                lora_path,
                expected_lora_modules,
                peft_helper=peft_helper,
                lora_model_id=lora_request.lora_int_id,
                device="cpu",
                dtype=self.lora_config.lora_dtype,
                model_vocab_size=self.vocab_size,
                tensorizer_config_dict=lora_request.tensorizer_config_dict,
                weights_mapper=hf_to_vllm_mapper,
            )
        except FileNotFoundError as e:
            # Raised when no adapter can be downloaded or found locally at
            # the requested path.
            raise ValueError(
                f"Loading lora {lora_request.lora_name} failed: No adapter "
                f"found for {lora_path}") from e
        except Exception as e:
            raise e
        return lora
zWorkerLoRAManager._load_adapterrankc                 C   s^   |j |  v r	dS t| jtr| j|j }n| j|j || j}| jd u r)|| _| j	|S r   )
rE   list_adapters
isinstancer   r   cloner-   create_dummy_lorar   add_adapter)r!   r1   rQ   
dummy_lorar#   r#   r$   add_dummy_lora   s   

z WorkerLoRAManager.add_dummy_lora

    def pin_adapter(self, adapter_id: int) -> bool:
        return self._adapter_manager.pin_adapter(adapter_id)

    def set_active_adapters(self, requests: set[LoRARequest],
                            mapping: Any | None) -> None:
        self._apply_adapters(requests)
        if mapping is not None:
            self._adapter_manager.set_adapter_mapping(mapping)

    def supports_tower_connector_lora(self) -> bool:
        return (self._adapter_manager.supports_mm
                and self._adapter_manager.supports_tower_connector_lora)

    def _apply_adapters(self, adapter_requests: set[LoRARequest]) -> None:
        existing_adapters = self.list_adapters()
        models_map = {
            adapter_request.adapter_id: adapter_request
            for adapter_request in adapter_requests if adapter_request
        }
        if len(models_map) > self._adapter_manager.adapter_slots:
            raise RuntimeError(
                f"Number of requested models ({len(models_map)}) is greater "
                "than the number of GPU model slots "
                f"({self._adapter_manager.adapter_slots}).")
        requested_ids = set(models_map)
        # Evict everything the batch no longer references, then load what
        # is missing.
        for adapter_id in existing_adapters - requested_ids:
            self.remove_adapter(adapter_id)
        for adapter_id in requested_ids - existing_adapters:
            self.add_adapter(models_map[adapter_id])

    def add_adapter(self, adapter_request: LoRARequest) -> bool:
        if adapter_request.adapter_id in self.list_adapters():
            return False
        loaded_adapter = self._load_adapter(adapter_request)
        loaded = self._adapter_manager.add_adapter(loaded_adapter)
        self._adapter_manager.activate_adapter(loaded_adapter.id)
        return loaded

    def remove_adapter(self, adapter_id: int) -> bool:
        return self._adapter_manager.remove_adapter(adapter_id)

    def remove_all_adapters(self):
        self._adapter_manager.remove_all_adapters()

    def list_adapters(self) -> set[int]:
        return set(self._adapter_manager.list_adapters())
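
# Illustrative per-batch flow (hypothetical driver code): the scheduler
# hands the worker the set of LoRAs referenced by the current batch, and
# every adapter outside that set is unloaded.
#
#   manager.set_active_adapters({req_a, req_b}, lora_mapping)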


class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
    """WorkerLoRAManager that manages LoRA models on the worker side.

    Uses an LRU Cache. Every request, the requested LoRAs will be loaded
    (unless they are already loaded) and least recently used LoRAs will
    be unloaded if the cache is above capacity."""

    _manager_cls: type[LRUCacheLoRAModelManager] = LRUCacheLoRAModelManager

    def create_lora_manager(
        self,
        model: torch.nn.Module,
        vllm_config: VllmConfig | None = None,
    ) -> Any:
        lora_manager = create_lora_manager(
            model,
            lora_manager_cls=self._manager_cls,
            max_num_seqs=self.max_num_seqs,
            vocab_size=self.vocab_size,
            lora_config=self.lora_config,
            device=self.device,
            max_num_batched_tokens=self.max_num_batched_tokens,
            vllm_config=vllm_config,
        )
        self._adapter_manager = lora_manager
        return lora_manager.model

    def _apply_adapters(self, lora_requests: set[LoRARequest]) -> None:
        loras_map = {
            lora_request.lora_int_id: lora_request
            for lora_request in lora_requests if lora_request
        }
        if len(loras_map) > self._adapter_manager.lora_slots:
            raise RuntimeError(
                f"Number of requested LoRAs ({len(loras_map)}) is greater "
                "than the number of GPU LoRA slots "
                f"({self._adapter_manager.lora_slots}).")
        for lora in loras_map.values():
            self.add_adapter(lora)
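
    # Unlike the base class, unreferenced adapters are not eagerly removed
    # here; eviction is deferred to add_adapter below, which drops the
    # least recently used entry only once the cache is at capacity.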

    def add_adapter(self, lora_request: LoRARequest) -> bool:
        if (lora_request.lora_int_id not in self.list_adapters()
                or lora_request.load_inplace):
            # Load the new adapter first so a broken checkpoint cannot
            # evict anything that is already resident.
            lora = self._load_adapter(lora_request)
            # Drop any resident copy so an in-place reload replaces it.
            self._adapter_manager.remove_adapter(lora.id)
            # Make room in the LRU cache if adding would exceed capacity.
            if len(self._adapter_manager) + 1 > self._adapter_manager.capacity:
                assert isinstance(self._adapter_manager,
                                  LRUCacheLoRAModelManager)
                self._adapter_manager.remove_oldest_adapter()
            loaded = self._adapter_manager.add_adapter(lora)
        else:
            # Already resident: touching it via get_adapter refreshes its
            # LRU position.
            loaded = self._adapter_manager.get_adapter(
                lora_request.lora_int_id) is not None
        self._adapter_manager.activate_adapter(lora_request.lora_int_id)
        return loaded
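

# Illustrative LRU behavior (hypothetical driver code): with two GPU slots,
# activating a third adapter evicts the least recently used one instead of
# raising, unlike the base WorkerLoRAManager.
#
#   lru = LRUCacheWorkerLoRAManager(vllm_config, device,
#                                   embedding_modules={})
#   lru.create_lora_manager(model)
#   lru.add_adapter(req_1)
#   lru.add_adapter(req_2)
#   lru.add_adapter(req_3)   # req_1 (oldest) is unloaded first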