import torch
import torch.nn as nn
from transformers import PretrainedConfig

from vllm.config.lora import LoRAConfig
from vllm.distributed import (get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
from vllm.platforms import current_platform

from .base import BaseLayerWithLoRA


class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
    """
    LoRA wrapper for LogitsProcessor, with extra logic to handle the
    application of the LoRA adapter and added LoRA vocabulary.

    Args:
        base_layer: LogitsProcessor layer
        hidden_size: hidden size of the model
        dtype: data type of the model
        device: device of the model
        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
            received from base_layer.get_sharded_to_full_mapping(). If None,
            no reindexing will be done.
    """

    def __init__(
        self,
        base_layer: LogitsProcessor,
        hidden_size: int,
        dtype: torch.dtype,
        device: torch.device,
        sharded_to_full_mapping: list[int] | None,
    ) -> None:
        super().__init__()
        self.base_layer = base_layer
        self.hidden_size = hidden_size
        self.dtype = dtype
        self.device = device
        self.tp_size = get_tensor_model_parallel_world_size()
        self.tp_rank = get_tensor_model_parallel_rank()
        self.sharded_to_full_mapping = sharded_to_full_mapping

    @property
    def logits_as_input(self):
        return self.base_layer.logits_as_input

    @property
    def vocab_size(self):
        return self.base_layer.vocab_size

    @property
    def scale(self):
        return self.base_layer.scale

    @property
    def soft_cap(self):
        return self.base_layer.soft_cap

    @property
    def use_all_gather(self):
        return self.base_layer.use_all_gather

    @property
    def org_vocab_size(self):
        return self.base_layer.org_vocab_size

    @property
    def include_gpu_probs_tensor(self):
        return self.base_layer.include_gpu_probs_tensor

    @property
    def should_modify_greedy_probs_inplace(self):
        return self.base_layer.should_modify_greedy_probs_inplace

    def create_lora_weights(
        self,
        max_loras: int,
        lora_config: LoRAConfig,
        model_config: PretrainedConfig | None = None,
    ) -> None:
        # The stacked LoRA buffers below are sized against the vocab, so
        # guard the supported range first.
        if not (32000 <= self.base_layer.vocab_size <= 257024):
            raise ValueError("When using LoRA, vocab size must satisfy "
                             "32000 <= vocab_size <= 257024")
        self.lora_a_stacked = torch.zeros(
            (max_loras, 1, lora_config.max_lora_rank, self.hidden_size),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        self.lora_b_stacked = torch.zeros(
            (max_loras, 1, self.base_layer.vocab_size,
             lora_config.max_lora_rank),
            dtype=lora_config.lora_dtype,
            device=self.device,
        )
        if self.sharded_to_full_mapping is not None:
            self.sharded_to_full_mapping_gpu = torch.tensor(
                self.sharded_to_full_mapping,
                device=self.device,
                dtype=torch.long)
        else:
            self.sharded_to_full_mapping_gpu = None

    def reset_lora(self, index: int):
        self.lora_a_stacked[index] = 0
        self.lora_b_stacked[index] = 0
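
    # Shape note, assuming the buffers allocated above: adapter slot i holds
    #   lora_a_stacked[i, 0] with shape (max_lora_rank, hidden_size)
    #   lora_b_stacked[i, 0] with shape (vocab_size, max_lora_rank)
    # Zero-filled slots behave as no-op adapters until set_lora() fills them.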

    def set_lora(
        self,
        index: int,
        lora_a: torch.Tensor | list[torch.Tensor],
        lora_b: torch.Tensor | list[torch.Tensor],
    ):
        assert isinstance(lora_a, torch.Tensor)
        assert isinstance(lora_b, torch.Tensor)
        self.reset_lora(index)
        self.lora_a_stacked[index,
                            0, :lora_a.shape[0], :lora_a.shape[1]].copy_(
                                lora_a, non_blocking=True)
        self.lora_b_stacked[index,
                            0, :lora_b.shape[0], :lora_b.shape[1]].copy_(
                                lora_b, non_blocking=True)

    def _get_logits(
        self,
        hidden_states: torch.Tensor,
        lm_head: VocabParallelEmbedding,
        embedding_bias: torch.Tensor | None = None,
    ) -> torch.Tensor | None:
        # Get the logits for the next tokens, unwrapping the lm_head if it
        # is itself LoRA-wrapped.
        if hasattr(lm_head, "base_layer"):
            actual_lm_head = lm_head.base_layer
        else:
            actual_lm_head = lm_head
        logits = actual_lm_head.quant_method.apply(actual_lm_head,
                                                   hidden_states)
        if embedding_bias is not None:
            logits += embedding_bias

        # Gather logits across tensor-parallel ranks.
        logits = self.base_layer._gather_logits(logits)

        if logits is None:
            return None

        if self.sharded_to_full_mapping_gpu is not None:
            # Reindex the sharded logits so that index == token_id.
            logits = logits[:, self.sharded_to_full_mapping_gpu]

        lora_output = self.punica_wrapper.add_lora_logits(
            logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked,
            1.0)

        if not current_platform.can_update_inplace():
            logits = lora_output

        # Remove paddings in vocab (if any).
        logits = logits[:, :self.base_layer.vocab_size]
        return logits
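
    # For intuition only (a sketch of the semantics, not code that runs
    # here): ignoring punica's batching over adapters, the add_lora_logits
    # call above computes, per active adapter,
    #   delta = (hidden_states @ lora_a.T) @ lora_b.T   # (tokens, vocab)
    # and adds it to the logits with scale 1.0.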

    def forward(self, *args, **kwargs):
        return type(self.base_layer).forward(self, *args, **kwargs)

    @classmethod
    def can_replace_layer(
        cls,
        source_layer: nn.Module,
        lora_config: LoRAConfig,
        packed_modules_list: list,
        model_config: PretrainedConfig | None,
    ) -> bool:
        # This layer is wired up explicitly rather than via layer
        # replacement, so it never matches here.
        return False
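

# Usage sketch (illustrative only; `logits_processor`, `lora_config`,
# `lm_head`, and the sizes are placeholders supplied by the surrounding
# engine, not defined in this module):
#
#   wrapper = LogitsProcessorWithLoRA(
#       base_layer=logits_processor,
#       hidden_size=hidden_size,
#       dtype=torch.float16,
#       device=torch.device("cuda"),
#       sharded_to_full_mapping=lm_head.get_sharded_to_full_mapping(),
#   )
#   wrapper.create_lora_weights(max_loras=4, lora_config=lora_config)
#   wrapper.set_lora(0, lora_a=a, lora_b=b)  # a: (rank, hidden), b: (vocab, rank)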