o
    
۾ih+                     @   sv  U d dl Z d dlmZ d dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z" d d
l#m$Z$ d dl%m&Z& d dl'm(Z(m)Z) erd dl*m+Z+ d dl,m-Z- d dl.m/Z/ ee0Z1de2de3de4e2 fddZ5d a6dd Z7e"eeeee eeeeeeee!eehZ8e9e:e  e;d< dej<de3fddZ=	d8dej<de2dede4de
dB dej<fdd Z>	d8dd!d"d#de2dede
dB defd$d%Z?dej<d&e@d'ej<dej<fd(d)ZA	d8d*e@d+d,deBe@e3f fd-d.ZCd*e@de3fd/d0ZDdej<de4e@ fd1d2ZEd3e@de@fd4d5ZFdej<deGe@e4e@ f fd6d7ZHdS )9    N)TYPE_CHECKING)HfHubHTTPErrorHFValidationError)nn)PretrainedConfig)envs)
LoRAConfig)init_logger)BaseLayerWithLoRAColumnParallelLinearWithLoRA#ColumnParallelLinearWithShardedLoRAFusedMoE3DWithLoRAFusedMoEWithLoRALogitsProcessorWithLoRA/MergedColumnParallelLinearVariableSliceWithLoRA"MergedColumnParallelLinearWithLoRA)MergedColumnParallelLinearWithShardedLoRAMergedQKVParallelLinearWithLoRA&MergedQKVParallelLinearWithShardedLoRAQKVParallelLinearWithLoRA QKVParallelLinearWithShardedLoRAReplicatedLinearWithLoRARowParallelLinearWithLoRA RowParallelLinearWithShardedLoRAVocabParallelEmbeddingWithLoRA)FusedMoE)
LinearBase)get_moe_expert_mappingget_packed_modules_mapping)LogitsProcessor)ParallelLMHead)WeightsMapper	max_loras
specializereturnc                    s*   |s d gS  fddt d d D S )a8  
    Returns num_active_loras values for cudagraph capture.

    When specialize=True: powers of 2 up to max_loras, plus max_loras + 1.
    When specialize=False: just [max_loras + 1].

    This is the single source of truth for LoRA capture cases, used by both
    CudagraphDispatcher and PunicaWrapperGPU.
       c                    s,   g | ]}||d  @ dks| d  kr|qS )r%   r    ).0nr"   r&   C/home/ubuntu/.local/lib/python3.10/site-packages/vllm/lora/utils.py
<listcomp>=   s    &z,get_captured_lora_counts.<locals>.<listcomp>   )range)r"   r#   r&   r)   r*   get_captured_lora_counts0   s
   


r.   c                   C   s   t d7 a t S )Nr%   )_GLOBAL_LORA_IDr&   r&   r&   r*   get_lora_idE   s   r0   _all_lora_classesmodelc                 C   s(   t dd |  D rtd dS dS )z@Checks if the model contains FusedMoE layers and warns the user.c                 s   s    | ]}t |tV  qd S N)
isinstancer   )r'   moduler&   r&   r*   	<genexpr>a   s    zis_moe_model.<locals>.<genexpr>z8MoE model detected. Using fused MoE LoRA implementation.TF)anymoduleslogger	info_once)r2   r&   r&   r*   is_moe_model_   s   
r;   layerlora_configpacked_modules_listmodel_configc                 C   s>   t D ]}|j| |||dr|| }|||| |  S q| S )N)source_layerr=   r>   r?   )r1   can_replace_layercreate_lora_weights)r<   r"   r=   r>   r?   lora_clsinstance_layerr&   r&   r*   
from_layerg   s   	rE   r   lm_headr    c                 C   s0   t | |j|jj|jj| }|||| |S r3   )r   embedding_dimweightdtypedeviceget_sharded_to_full_mappingrB   )r<   rF   r"   r=   r?   retr&   r&   r*   from_layer_logits_processor|   s   rM   module_name
new_modulec                 C   s<   |  d|ddd }|dd }t||| |S )z1Replace a submodule in a model with a new module..N)get_submodulejoinsplitsetattr)r2   rN   rO   parenttarget_namer&   r&   r*   replace_submodule   s   rX   nameweights_mapperzWeightsMapper | Nonec                 C   s   |  dr| dd} |r|| n| } d|  } n	|r || n| } |  dr)dnd}| d}|d dkrS|d d	ksB|d d
krSd||d }||d d	kfS |d dks_|d dkrpd||d }||d dkfS t|  d)a  Parse the name of lora weights.

    args:
        name: the name of the fine-tuned LoRA, e.g.
            base_model.model.dense1.weight
        weights_mapper: maps the name of weight, e.g.
            `model.` -> `language_model.model.`,
    return:
        tuple(module_name, is_lora_a):
            module_name: the name of the module, e.g. model.dense1,
            is_lora_a whether the tensor is lora_a or lora_b.
    zbase_model.model. r,   r   rP   rQ   rH   lora_Alora_Blora_embedding_Alora_embedding_Bz is unsupported LoRA weight)
startswithreplace	_map_namerT   rS   
ValueError)rY   rZ   start_indexpartsnew_namer&   r&   r*   parse_fine_tuned_lora_name   s   


$rh   c                 C   s   d}|  |S )N)z.embed_tokens.base_layer.weightz.lm_head.base_layer.weight)endswith)rY   embedding_suffixesr&   r&   r*   is_base_embeddding_weights   s   
rk   c                 C   s   t  }|  D ]8\}}t|dd}|dur|D ]}|| qt|tfr/||dd  t|tfr?||dd  qt|S )z2
    In vLLM, all linear layers support LoRA.
    embedding_modulesNrP   rQ   )	setnamed_modulesgetattraddr4   r   rT   r   list)r2   supported_lora_modulesrY   r5   rl   r&   r&   r*   get_supported_lora_modules   s   rs   	lora_pathc                    s   t j r S  drt j S t j rt j S tjr>ddl	m
}m ddlm}  fdd}||f}d}n fdd}ttf}d	}z| }W |S  |y`   t|   Y S w )
a'  
    Resolves the given lora_path to an absolute local path.

    If the lora_path is identified as a Hugging Face model identifier,
    it will download the model and return the local snapshot path.
    Otherwise, it treats the lora_path as a local file path and
    converts it to an absolute path.

    Parameters:
    lora_path (str): The path to the lora model, which can be an absolute path,
                     a relative path, or a Hugging Face model identifier.

    Returns:
    str: The resolved absolute local path to the lora model.
    ~r   )InvalidParametersnapshot_download)	HTTPErrorc                      s
    dS )N)model_idr&   r&   rt   rw   r&   r*   <lambda>  s   
 z+get_adapter_absolute_path.<locals>.<lambda>z&Error downloading the ModelScope modelc                      s   t j dS )N)repo_id)huggingface_hubrw   r&   )rt   r&   r*   r{     s    z'Error downloading the HuggingFace model)ospathisabsra   
expanduserexistsabspathr   VLLM_USE_MODELSCOPE modelscope.hub.snapshot_downloadrv   rw   requestsrx   r   r   r9   	exception)rt   rv   rx   download_fndownload_exceptions	error_loglocal_snapshot_pathr&   rz   r*   get_adapter_absolute_path   s,   

r   c                 C   sH   t | r t|  }rt| }| jsdd |D |d< |S tdt| S )Nc                 S   s&   g | ]\}}}}d |vr| dqS )z..rP   )rstrip)r'   _weight_namer&   r&   r*   r+   *  s
    
z2process_packed_modules_mapping.<locals>.<listcomp>expertszGTo support LoRA for MoE model, 'get_expert_mapping' must be implemented)r;   r   r   is_3d_moe_weightAttributeError)r2   moe_packed_mappingpacked_modules_mappingr&   r&   r*   process_packed_modules_mapping  s   
r   r3   )Ir~   typingr   r}   huggingface_hub.utilsr   r   torchr   transformersr   vllmr   vllm.config.lorar   vllm.loggerr	   vllm.lora.layersr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   $vllm.model_executor.layers.fused_moer   !vllm.model_executor.layers.linearr   vllm.model_executor.utilsr   r   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.vocab_parallel_embeddingr     vllm.model_executor.models.utilsr!   __name__r9   intboolrq   r.   r/   r0   r1   rm   type__annotations__Moduler;   rE   rM   strrX   tuplerh   rk   rs   r   dictr   r&   r&   r&   r*   <module>   s   
L




,	$7