o
    
۾i )                     @   s   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ G dd dejZG dd dejZG dd dejZdS )zInference-only Ernie-MTP model.    )IterableN)PretrainedConfig)
VllmConfig)RMSNorm)LogitsProcessor)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )LlamaDecoderLayer)is_pp_missing_parametermaybe_prefixc                       sT   e Zd Zdededdf fddZ	ddejd	ejd
ejdedejf
ddZ	  Z
S )ErnieMultiTokenPredictorLayervllm_configprefixreturnNc                    s`   t    |jj}t|j|jd| _t|j|jd| _t	j
|jd |jdd| _t||| _d S )N)eps   F)bias)super__init__model_config	hf_configr   hidden_sizerms_norm_epsmtp_emb_normmtp_hidden_normnnLinearmtp_linear_projr   	mtp_blockselfr   r   config	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/ernie_mtp.pyr   /   s   
z&ErnieMultiTokenPredictorLayer.__init__r   inputs_embeds	positionsprevious_hidden_statesspec_step_indexc                 C   sd   |d usJ d||dk< |  |}| |}| tj||gdd}| j||d d\}}|| }|S )Nr   )dim)r*   hidden_statesresidual)r   r   r    torchcatr!   )r#   r)   r*   r+   r,   r/   r0   r'   r'   r(   forward>   s   


z%ErnieMultiTokenPredictorLayer.forwardr   )__name__
__module____qualname__r   strr   r1   Tensorintr3   __classcell__r'   r'   r%   r(   r   .   s(    r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejdejdejdB de	dejfddZ
	ddejdede	dejfddZ  ZS )ErnieMultiTokenPredictor r   r   r   c                   sp   t    jj}|j| _|j| _tj	
 fddt| j| j| j D | _t|j|j| _t|j| _d S )Nc                    s&   i | ]}t |t  d | qS )z.layers.)r8   r   ).0idxr   r   r'   r(   
<dictcomp>a   s    z5ErnieMultiTokenPredictor.__init__.<locals>.<dictcomp>)r   r   r   r   num_hidden_layersmtp_start_layer_idxnum_nextn_predict_layersnum_mtp_layersr1   r   
ModuleDictrangelayersr   
vocab_sizer   embed_tokensr   logits_processorr"   r%   rA   r(   r   Y   s"   

z!ErnieMultiTokenPredictor.__init__	input_idsr   c                 C   s
   |  |S N)rK   r#   rM   r'   r'   r(   embed_input_idsr   s   
z(ErnieMultiTokenPredictor.embed_input_idsNr   r*   r+   r)   spec_step_idxc                 C   s0   |d u r	|  |}| jt| j|  ||||S rN   )rK   rI   r8   rD   )r#   rM   r*   r+   r)   rQ   r'   r'   r(   r3   u   s   
z ErnieMultiTokenPredictor.forwardr/   lm_headc                 C   s$   | j t| j|   | ||}|S rN   )rI   r8   rD   rL   )r#   r/   rR   rQ   logitsr'   r'   r(   compute_logits   s   z'ErnieMultiTokenPredictor.compute_logits)Nr   r4   )r5   r6   r7   r   r8   r   r1   r9   rP   r:   r3   r   rT   r;   r'   r'   r%   r(   r<   X   s8    
r<   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z			ddejdB dejdejde	dB dejdB de
dejfddZ	ddejde
dejdB fddZdeeeejf  dee fddZdededefddZ  ZS )ErnieMTPr=   r>   r   r   c                   sd   t    |jj| _t|t|dd| _t| jj	| jj
t|dd| _| jjr0| jjj| j_d S d S )Nmodel)r   r   rR   r>   )r   r   r   r   r$   r<   r   rV   r   rJ   r   rR   tie_word_embeddingsrK   weight)r#   r   r   r%   r'   r(   r      s   


zErnieMTP.__init__rM   r   c                 C   s   | j |S rN   )rV   rP   rO   r'   r'   r(   rP      s   zErnieMTP.embed_input_idsNr   r*   r/   intermediate_tensorsr)   rQ   c                 C   s&   |dksJ d|  |||||}|S )Nr   z(ernie_mtp only support predict one token)rV   )r#   rM   r*   r/   rY   r)   rQ   r'   r'   r(   r3      s
   	
zErnieMTP.forwardc                 C   s   | j || j|S rN   )rV   rT   rR   )r#   r/   rQ   r'   r'   r(   rT      s   zErnieMTP.compute_logitsweightsc                 C   sT  g d}t |  }t }|D ]\}}| jjr|drqd|v r"qd|v r-| | j|}|D ]A\}}}	||vr9q/d|vr>q/d|v rG||vrGq/|||}|dsW|dr\||vr\q/t|| rbq/|| }
|
j	}||
||	  n1|ds{|dr||vrqt|| rqd|vrd	|vrd
|vrq|| }
t
|
dt}||
| || q|S )N))qkv_projq_projq)r[   k_projk)r[   v_projv)gate_up_proj	gate_projr   )rb   up_projr   zlm_head.weightzrotary_emb.inv_freqmtpzmlp.experts.z.bias_biasmtp_rK   rR   weight_loader)dictnamed_parameterssetr$   rW   endswith_rewrite_spec_layer_namereplacer   rh   getattrr	   add)r#   rZ   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrh   r'   r'   r(   load_weights   s\   


zErnieMTP.load_weightsr$   rt   c              	   C   sb   g d}|j }|D ]}||v r$|d| dd| d| d}|  S q	|dd| d}|S )zT
        Rewrite the weight name to match the format of the original model.
        )rK   r   r   r    zmodel.z.0.zmodel.layers..zmodel.mtp_block.0.z.mtp_block.)rC   rn   )r#   r$   rt   spec_layer_weight_names	layer_idxrw   r'   r'   r(   rm     s   
z!ErnieMTP._rewrite_spec_layer_name)NNr   r4   )r5   r6   r7   r   r8   r   r1   r9   rP   r
   r:   r3   rT   r   tuplerk   rz   r   rm   r;   r'   r'   r%   r(   rU      s>    

$FrU   )__doc__collections.abcr   r1   torch.nnr   transformersr   vllm.configr   $vllm.model_executor.layers.layernormr   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr	   vllm.sequencer
   llamar   utilsr   r   Moduler   r<   rU   r'   r'   r'   r(   <module>   s    *9