o
    
۾i(                     @   s   d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZmZ ddlmZmZmZmZ ee Z!G dd deZeG dd dej"Z#G dd deZ$dS )    )IterableN)LlamaConfig)support_torch_compile)
VllmConfig)init_logger)ReplicatedLinear)LogitsProcessor)QuantizationConfig)VocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)LlamaDecoderLayerLlamaForCausalLM   )AutoWeightsLoaderget_draft_quant_configmaybe_prefixprocess_eagle_weightc                       sR   e Zd Z		ddededededB ddf
 fdd	ZdededB fd
dZ	  Z
S )r    Nvllm_configdisable_input_layernormprefixconfigreturnc                    s,   t  j|||d |r| `t | _d S d S )Nr   r   )super__init__input_layernormnnIdentity)selfr   r   r   r   	__class__ Z/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/llama_eagle.pyr   "   s
   zLlamaDecoderLayer.__init__c                 C   s   t |S )z8Use drafter's quantization config instead of verifier's.)r   )r    r   r#   r#   r$   get_quant_config1   s   z"LlamaDecoderLayer.get_quant_config)r   N)__name__
__module____qualname__r   boolstrr   r   r	   r%   __classcell__r#   r#   r!   r$   r   !   s    r   c                
       s   e Zd Zddddedededdf fd	d
ZdejdejfddZ	dejdejdejde
ejejf fddZdee
eejf  dee fddZ  ZS )
LlamaModelr   r   )r   start_layer_idr   r   r-   r   Nc             	      s   t    jjj_jj_t_t	jjjj
t dd_t fddtjjD _tjj
d jj
djjjt ddd_d S )	Nembed_tokensr   c              
      s2   g | ]}t |d kt d|  jdqS )r   zlayers.r   )r   r   r   ).0ir   r    r-   r   r#   r$   
<listcomp>M   s    z'LlamaModel.__init__.<locals>.<listcomp>   Ffc)
input_sizeoutput_sizebiasparams_dtypequant_configr   return_bias)r   r   speculative_configdraft_model_config	hf_configr   
vocab_sizer   r:   r
   hidden_sizer   r.   r   
ModuleListrangenum_hidden_layerslayersr   model_configdtyper5   )r    r   r   r-   r!   r2   r$   r   8   s.   




zLlamaModel.__init__	input_idsc                 C   s
   |  |S N)r.   r    rG   r#   r#   r$   embed_input_idsa   s   
zLlamaModel.embed_input_ids	positionshidden_statesc                 C   sR   |  |}| tj||fdd}d }| jD ]
}||||\}}q|| }||fS )N)dim)r.   r5   torchcatrD   )r    rG   rK   rL   input_embedsresiduallayerr#   r#   r$   forwardd   s   


zLlamaModel.forwardweightsc                 C   s  g d}t |  }t }|D ]y\}}| jd urA| j| }rA|| }t|dt}	| dkr2|n|d }|	|| || qd|v sId|v rSt	||}|d u rSq|D ]\}
}}||vr_qU|
||
}|| }|j}	|	|||  n|| }t|dt}	|	|| || q|S )N))	.qkv_projz.q_projq)rV   z.k_projk)rV   z.v_projv).gate_up_projz
.gate_projr   )rZ   z.up_projr   weight_loaderr   scale
zero_point)dictnamed_parameterssetr:   get_cache_scalegetattrr   rN   addr   replacer[   )r    rU   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr[   
param_nameweight_nameshard_idr#   r#   r$   load_weightsv   s>   




zLlamaModel.load_weights)r&   r'   r(   r   r*   intr   rO   TensorrJ   tuplerT   r   r`   ro   r+   r#   r#   r!   r$   r,   6   s.    )
,r,   c                   @   s   e Zd ZdddedefddZdejdejfd	d
Z	ddejdejdejdejdB de	ejejf f
ddZ
dee	eejf  fddZdS )EagleLlamaForCausalLMr   r/   r   r   c                C   s   t j|  |jjj| _t| jdd d u r t| jdd }|| j_|j	
|j}t|d|d| _t| jdd}t| jj|d| _d S )Ndraft_vocab_sizer?   model)r   r   r-   logit_scaleg      ?)r\   )r   Moduler   r<   r=   r>   r   rb   rt   rE   get_num_layersparallel_configr,   ru   r   r?   logits_processor)r    r   r   base_vocab_sizetarget_layer_numrv   r#   r#   r$   r      s   zEagleLlamaForCausalLM.__init__rG   r   c                 C   s   | j |S rH   )ru   rJ   rI   r#   r#   r$   rJ      s   z%EagleLlamaForCausalLM.embed_input_idsNrK   rL   inputs_embedsc                 C   s*   |d urt t| j d| |||S )Nz( does not support multimodal inputs yet.)NotImplementedErrortyper&   ru   )r    rG   rK   rL   r}   r#   r#   r$   rT      s
   zEagleLlamaForCausalLM.forwardrU   c                    s,    fdd}t  d d}|t|| d S )Nc                    s*   | \}}d|vrd| }t  | ||fS )Nlm_headzmodel.)r   )inputsrh   ri   r    r#   r$   	transform   s
   
z5EagleLlamaForCausalLM.load_weights.<locals>.transform)skip_prefixes)r   ro   map)r    rU   r   loaderr#   r   r$   ro      s   z"EagleLlamaForCausalLM.load_weightsrH   )r&   r'   r(   r   r*   r   rO   rq   rJ   rr   rT   r   ro   r#   r#   r#   r$   rs      s     
 rs   )%collections.abcr   rO   torch.nnr   transformersr   vllm.compilation.decoratorsr   vllm.configr   vllm.loggerr   !vllm.model_executor.layers.linearr   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.quantization.base_configr	   3vllm.model_executor.layers.vocab_parallel_embeddingr
   -vllm.model_executor.model_loader.weight_utilsr   r    vllm.model_executor.models.llamar   r   utilsr   r   r   r   r&   loggerrw   r,   rs   r#   r#   r#   r$   <module>   s&   m