from collections.abc import Iterable

import torch
import torch.nn as nn
from transformers import LlamaConfig

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                               ReplicatedLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.models.llama import (LlamaDecoderLayer,
                                              LlamaForCausalLM)
from vllm.multimodal.inputs import NestedTensors

from .utils import (AutoWeightsLoader, get_draft_quant_config, maybe_prefix,
                    process_eagle_weight)

logger = init_logger(__name__)


class LlamaDecoderLayer(LlamaDecoderLayer):
    def __init__(
        self,
        vllm_config: VllmConfig,
        prefix: str = "",
        config: LlamaConfig | None = None,
        layer_idx: int = 0,
    ) -> None:
        super().__init__(vllm_config, prefix=prefix, config=config)

        config = config or vllm_config.model_config.hf_config
        quant_config = self.get_quant_config(vllm_config)

        # The first draft layer attends over the token embedding concatenated
        # with the target model's hidden state, so its QKV projection takes a
        # 2 * hidden_size input; later layers are plain Llama layers.
        qkv_input_size = (2 * self.hidden_size
                          if layer_idx == 0 else self.hidden_size)
        qkv_bias = getattr(config, "attention_bias", False)
        self.self_attn.qkv_proj = QKVParallelLinear(
            qkv_input_size,
            self.self_attn.head_dim,
            self.self_attn.total_num_heads,
            self.self_attn.total_num_kv_heads,
            bias=qkv_bias,
            quant_config=quant_config,
            prefix=maybe_prefix(prefix, "qkv_proj"),
        )

        self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layer_idx = layer_idx

        if getattr(config, "norm_before_residual", False):
            self._residual_norm = self._norm_before_residual
        else:
            self._residual_norm = self._norm_after_residual

    def get_quant_config(
            self, vllm_config: VllmConfig) -> QuantizationConfig | None:
        """Use drafter's quantization config instead of verifier's."""
        return get_draft_quant_config(vllm_config)

    def _norm_before_residual(
        self,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        hidden_states = self.hidden_norm(hidden_states)
        residual = hidden_states
        return hidden_states, residual

    def _norm_after_residual(
        self,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        residual = hidden_states
        hidden_states = self.hidden_norm(hidden_states)
        return hidden_states, residual

    def forward(
        self,
        positions: torch.Tensor,
        embeds: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: torch.Tensor | None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if self.layer_idx == 0:
            embeds = self.input_layernorm(embeds)
            hidden_states, residual = self._residual_norm(
                hidden_states=hidden_states)
            hidden_states = torch.cat([embeds, hidden_states], dim=-1)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)

        hidden_states = self.self_attn(positions=positions,
                                       hidden_states=hidden_states)
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        return hidden_states, residual


@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        "positions": 0,
        "hidden_states": 0,
        "input_embeds": 0,
    })
class LlamaModel(nn.Module):
    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        start_layer_id: int = 0,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = (
            vllm_config.speculative_config.draft_model_config.hf_config)
        self.vocab_size = self.config.vocab_size
        self.quant_config = get_draft_quant_config(vllm_config)

        # EAGLE-3 checkpoints may ship an "eagle_config" dict controlling
        # whether the drafter consumes auxiliary hidden states collected from
        # several target layers (the default) or a single hidden state.
        eagle_config = getattr(self.config, "eagle_config", None)
        if eagle_config is not None and "use_aux_hidden_state" in eagle_config:
            self.use_aux_hidden_state = eagle_config["use_aux_hidden_state"]
        else:
            self.use_aux_hidden_state = True

        current_vllm_config = get_current_vllm_config()

        self.embed_tokens = VocabParallelEmbedding(
            self.config.vocab_size,
            self.config.hidden_size,
            prefix=maybe_prefix(prefix, "embed_tokens"),
        )
        self.layers = nn.ModuleList([
            LlamaDecoderLayer(
                current_vllm_config,
                prefix=maybe_prefix(prefix,
                                    f"layers.{layer_idx + start_layer_id}"),
                config=self.config,
                layer_idx=layer_idx,
            ) for layer_idx in range(self.config.num_hidden_layers)
        ])
        if self.use_aux_hidden_state:
            # Project the concatenation of the three auxiliary hidden states
            # taken from the target model back down to hidden_size.
            if hasattr(self.config, "target_hidden_size"):
                fc_input_size = self.config.target_hidden_size * 3
            else:
                fc_input_size = self.config.hidden_size * 3
            self.fc = ReplicatedLinear(
                input_size=fc_input_size,
                output_size=self.config.hidden_size,
                bias=False,
                params_dtype=vllm_config.model_config.dtype,
                quant_config=self.quant_config,
                prefix=maybe_prefix(prefix, "fc"),
                return_bias=False,
            )
        self.norm = RMSNorm(self.config.hidden_size,
                            eps=self.config.rms_norm_eps)

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        input_embeds: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if input_embeds is None:
            input_embeds = self.embed_input_ids(input_ids)
        assert hidden_states.shape[-1] == input_embeds.shape[-1]

        residual = None
        for layer in self.layers:
            hidden_states, residual = layer(
                positions=positions,
                embeds=input_embeds,
                hidden_states=hidden_states,
                residual=residual,
            )
        hidden_states, hidden_prenorm = self.norm(hidden_states, residual)
        return hidden_states, hidden_prenorm

    def load_weights(
            self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
            (".gate_up_proj", ".gate_proj", 0),
            (".gate_up_proj", ".up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            # Some EAGLE-3 checkpoints name the single draft layer
            # "midlayer"; map it onto layer 0.
            if "midlayer." in name:
                name = name.replace("midlayer.", "layers.0.")
            if self.quant_config is not None and (
                    scale_name := self.quant_config.get_cache_scale(name)):
                # Loading KV-cache quantization scales.
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                loaded_weight = (loaded_weight if loaded_weight.dim() == 0
                                 else loaded_weight[0])
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue
            if "scale" in name or "zero_point" in name:
                # Remap KV-scale names for checkpoints that store them under
                # their unfused names.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params


class Eagle3LlamaForCausalLM(LlamaForCausalLM):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)
        self.config = (
            vllm_config.speculative_config.draft_model_config.hf_config)

        # Checkpoints without an explicit draft vocabulary fall back to the
        # full target vocabulary (no draft-to-target id remapping needed).
        if getattr(self.config, "draft_vocab_size", None) is None:
            base_vocab_size = getattr(self.config, "vocab_size", None)
            self.config.draft_vocab_size = base_vocab_size

        # Number the drafter's layers after the target model's layers so
        # their KV-cache entries do not collide with the verifier's.
        target_layer_num = vllm_config.model_config.get_num_layers(
            vllm_config.parallel_config)
        self.model = LlamaModel(vllm_config=vllm_config,
                                prefix="model",
                                start_layer_id=target_layer_num)

        logit_scale = getattr(self.config, "logit_scale", 1.0)
        self.lm_head = ParallelLMHead(
            self.config.draft_vocab_size,
            self.config.hidden_size,
            prefix=maybe_prefix(prefix, "lm_head"),
        )
        self.logits_processor = LogitsProcessor(self.config.draft_vocab_size,
                                                scale=logit_scale)
        # Maps each draft-vocabulary id to its offset into the target
        # vocabulary; loaded from the checkpoint's "d2t" tensor.
        self.draft_id_to_target_id = nn.Parameter(
            torch.zeros(self.config.draft_vocab_size, dtype=torch.long),
            requires_grad=False,
        )

        self.use_parallel_drafting = (
            vllm_config.speculative_config.parallel_drafting)
        if self.use_parallel_drafting:
            # Placeholder hidden state substituted for positions whose hidden
            # states are not yet available when several tokens are drafted in
            # parallel: a single row spanning the (possibly concatenated
            # auxiliary) hidden size.
            self.register_buffer(
                "mask_hidden",
                torch.zeros(
                    1, (3 if self.model.use_aux_hidden_state else 1) *
                    self.config.hidden_size),
                persistent=False,
            )

    def embed_input_ids(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: NestedTensors | None = None,
        is_multimodal: torch.Tensor | None = None,
    ) -> torch.Tensor:
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        return self.model(input_ids, positions, hidden_states, inputs_embeds)

    def compute_logits(self,
                       hidden_states: torch.Tensor) -> torch.Tensor | None:
        logits = self.logits_processor(self.lm_head, hidden_states)
        if self.draft_id_to_target_id is None:
            assert logits.shape[-1] == self.config.vocab_size, (
                "Expected logits to have shape "
                f"(*, {self.config.vocab_size}), but got {logits.shape}")
            return logits

        # Scatter the draft logits into their target-vocabulary positions;
        # target ids outside the draft vocabulary can never be proposed.
        base = torch.arange(self.config.draft_vocab_size,
                            device=logits.device)
        targets = base + self.draft_id_to_target_id
        logits_new = logits.new_full(
            (logits.shape[0], self.config.vocab_size),
            float("-inf"),
        )
        logits_new[:, targets] = logits
        return logits_new

    def combine_hidden_states(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        # Combine the auxiliary hidden states captured from several target
        # layers into a single drafter-sized hidden state.
        if not self.model.use_aux_hidden_state:
            return hidden_states
        return self.model.fc(hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        model_weights = {}
        includes_draft_id_mapping = False
        includes_embed_tokens = False
        includes_mask_hidden = False
        for name, loaded_weight in weights:
            if "t2d" in name:
                continue
            if "d2t" in name:
                name = name.replace("d2t", "draft_id_to_target_id")
                includes_draft_id_mapping = True
            elif "mask_hidden" in name:
                if not self.use_parallel_drafting:
                    logger.warning(
                        "mask_hidden found in weights but model is not "
                        "configured for parallel drafting. Skipping loading "
                        "mask_hidden.")
                    continue
                self.mask_hidden.copy_(loaded_weight.view(1, -1))
                includes_mask_hidden = True
                continue
            elif "lm_head" not in name:
                name = "model." + name
            if "embed_tokens" in name:
                includes_embed_tokens = True
            model_weights[name] = loaded_weight
            # EAGLE-specific per-weight bookkeeping.
            process_eagle_weight(self, name)

        if not includes_mask_hidden and self.use_parallel_drafting:
            raise ValueError(
                "mask_hidden not found in weights but model is configured "
                "for parallel drafting. Please provide mask_hidden in the "
                "weights.")

        skip_substrs = []
        if not includes_draft_id_mapping:
            skip_substrs.append("d2t")
        if not includes_embed_tokens:
            skip_substrs.append("embed_tokens")
        if not self.model.use_aux_hidden_state:
            skip_substrs.append("fc.")
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=None,
            skip_substrs=skip_substrs,
        )
        loader.load_weights(model_weights.items())