o
    iR                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ ddlmZ G d	d
 d
ejZG dd dejZdS )    )IterableN)
VllmConfig)LogitsProcessor)ParallelLMHead)default_weight_loader   )maybe_prefixc                       sD   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )ResidualBlockconfighidden_size
num_layersreturnNc                    s8   t    t fddt|D | _t | _d S )Nc              	      s$   g | ]}t jt d ddqS )medusa_fc_biasF)bias)nnLineargetattr.0_r
   r    W/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/medusa.py
<listcomp>   s    
z*ResidualBlock.__init__.<locals>.<listcomp>)super__init__r   
ModuleListrangelayersSiLUact)selfr
   r   r   	__class__r   r   r      s   

zResidualBlock.__init__xc                 C   s"   | j D ]}|| || }q|S )N)r   r    )r!   r$   layerr   r   r   forward#   s   
zResidualBlock.forward)
__name__
__module____qualname__r   intr   torchTensorr&   __classcell__r   r   r"   r   r	      s    r	   c                       s   e Zd ZdZdddededdf fdd	Zd
ejde	ej fddZ
d
e	ej de	ej fddZdeeeejf  dee fddZ  ZS )Medusaaq  This class implements the Medusa draft model from the paper: https://arxiv.org/abs/2401.10774
    Reference implementation: https://github.com/FasterDecoding/Medusa

    Differences from reference implementation:
    1. Currently this only supports generating proposals from top-1 tokens.
    2. We have an optional token_map which reduces draft vocab to most
       frequently used tokens to give some additional speed-up by reducing
       sampling overhead. This is disabled unless the checkpoint file has
       explicit token_map tensor and config has an optional attribute
       truncated_vocab_size < vocab_size. To use this technique, one has to find
       the top-k most frequent tokens in target dataset and add that as a tensor
       in the draft checkpoint (using key token_map). Also, the draft config
       needs to have truncated_vocab_size (=k) as an attribute. prefixvllm_configr1   r   Nc                   s   |j jj t    _t fddtjj	D _
 j_ j_t ddrItj jtdd_fddtjj	D _nt fddtjj	D _t d	d
}t jj|_d _d S )Nc                    s"   g | ]}t  jjjjd qS ))r
   r   r   )r	   r
   r   num_hidden_layersr   )r
   r!   r   r   r   =   s    z#Medusa.__init__.<locals>.<listcomp>original_lm_headFlm_headr0   c                    s   g | ]} j qS r   )r5   r   )r!   r   r   r   O   s    c              	      s*   g | ]}t  j jtd | dqS )z	lm_heads.r0   )r   
vocab_sizer   r   )r   i)r
   r1   r   r   r   R   s    logit_scaleg      ?)speculative_configdraft_model_config	hf_configr   r   r
   r   r   r   	num_headsblocksr6   orig_vocab_sizetruncated_vocab_sizer   r   r   r   r5   lm_headsr   logits_processor	token_map)r!   r2   r1   r8   r"   )r
   r1   r!   r   r   8   s6   







zMedusa.__init__hidden_statesc                    s    fdd| j D S )Nc                    s   g | ]}| qS r   r   )r   blockrC   r   r   r   j   s    z"Medusa.forward.<locals>.<listcomp>)r=   )r!   rC   r   rE   r   r&   i   s   zMedusa.forwardc              	   C   s   g }t || jD ]G\}}| ||}|d u rt|dksJ q| jd u r*|| q|tj tjg |j	d d | j
R |j|jd  ||d d| jf< q|S )Nr   )sizedevicedtype.)zipr@   rA   lenrB   appendr+   infonesshaper>   rH   rI   )r!   rC   
logits_lsthsr5   _logitsr   r   r   compute_logitsl   s&   
	zMedusa.compute_logitsweightsc           	      C   s2  t |  }t }i }|D ]5\}}|dd}|dkr*| j| jk r)tj|dd| _q||v r3|||< qt	| j
ddrB|dkrB||d< q| D ]1\}}d	|v rd| jd urd|jd
 | jjd
 krd|| j }|| }t	|dt}||| || qG| jd ur| jj| jd
 jjd | j| jks| jd usJ |S )Nzmedusa_heads.r/   rB   F)requires_gradr4   zlm_heads.0.weightzlm_head.weightr5   r   weight_loader)rH   )dictnamed_parameterssetreplacer?   r>   r   	ParameterrB   r   r
   itemsrO   r   addtor@   weightrH   )	r!   rT   params_dictloaded_paramsweights_mapnameloaded_weightparamrV   r   r   r   load_weights   s<   




zMedusa.load_weights)r'   r(   r)   __doc__r   strr   r+   r,   listr&   rS   r   tuplerY   rf   r-   r   r   r"   r   r.   )   s     1
,r.   )collections.abcr   r+   torch.nnr   vllm.configr   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.vocab_parallel_embeddingr   -vllm.model_executor.model_loader.weight_utilsr   utilsr   Moduler	   r.   r   r   r   r   <module>   s   