o
    
۾i G                     @   sN  d dl Z d dlmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZmZ d dlmZmZ d dlm Z  d dl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ee*Z+G dd dej,Z-G dd dej,Z.G dd dej,Z/eG dd dej,e%Z0dS )    N)CallableIterable)PretrainedConfig)rocm_aiter_ops)support_torch_compile)
VllmConfig)init_logger)SharedFusedMoE)RMSNorm)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)current_platform)IntermediateTensors   )DeepseekV2DecoderLayerDeepseekV2MixtureOfExpertsDeepseekV2MoE#get_spec_layer_idx_from_weight_name)maybe_prefixc                	       sL   e Zd Z	ddedededB ddf fddZdejdejfd	d
Z	  Z
S )
SharedHeadNconfigprefixquant_configreturnc                    s<   t    t|j|jd| _t|j|j|t|dd| _	d S )Nepshead)r   r   )
super__init__r
   hidden_sizerms_norm_epsnormr   
vocab_sizer   r    )selfr   r   r   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_mtp.pyr"   )   s   
zSharedHead.__init__hidden_statesc                 C   
   |  |S N)r%   )r'   r,   r*   r*   r+   forward8      
zSharedHead.forwardr.   )__name__
__module____qualname__r   strr   r"   torchTensorr/   __classcell__r*   r*   r(   r+   r   (   s    r   c                       s`   e Zd Zdededdf fddZ		ddejd	ejd
ejdejdB dedejfddZ	  Z
S ) DeepSeekMultiTokenPredictorLayervllm_configr   r   Nc                    s   t    |jjj}|| _|j}t|j|j	d| _
t|j|j	d| _tj|jd |jdd| _tj| _t|d| _| jrM|j}tj|jj|tj| jd}nd }t|||d| _t||| j|d| _d S )	Nr      F)bias
index_topk)dtypedevice)r   r   r   )r   topk_indices_buffer)r!   r"   speculative_configdraft_model_config	hf_configr   r   r
   r#   r$   enormhnormnnLineareh_projr   device_typer>   hasattris_v32r<   r5   emptyscheduler_configmax_num_batched_tokensint32r   shared_headr   	mtp_block)r'   r9   r   r   r   topk_tokensr?   r(   r*   r+   r"   =   s6   

z)DeepSeekMultiTokenPredictorLayer.__init__r   	input_ids	positionsprevious_hidden_statesinputs_embedsspec_step_indexc                 C   sp   |d usJ t |ddkd|}| |}| |}| t j||gdd}| j||d d\}}|| }|S )Nr   )dim)rS   r,   residual)r5   where	unsqueezerC   rD   rG   catrP   )r'   rR   rS   rT   rU   rV   r,   rY   r*   r*   r+   r/   `   s   


z(DeepSeekMultiTokenPredictorLayer.forwardNr   )r1   r2   r3   r   r4   r"   r5   r6   intr/   r7   r*   r*   r(   r+   r8   <   s"    (r8   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejdejdejdB de	dejfddZ
	ddejde	dejfddZ  ZS )DeepSeekMultiTokenPredictor r   r9   r   c                   sz   t    jj}|j| _|j| _tj	
 fddt| j| j| j D | _t|j|jt dd| _t|j| _d S )Nc                    s&   i | ]}t |t  d | qS )z.layers.)r4   r8   ).0idxr   r9   r*   r+   
<dictcomp>   s    z8DeepSeekMultiTokenPredictor.__init__.<locals>.<dictcomp>embed_tokensra   )r!   r"   model_configrB   num_hidden_layersmtp_start_layer_idxnum_nextn_predict_layersnum_mtp_layersr5   rE   
ModuleDictrangelayersr   r&   r#   r   rf   r   logits_processor)r'   r9   r   r   r(   rd   r+   r"   z   s$   

z$DeepSeekMultiTokenPredictor.__init__rR   r   c                 C   r-   r.   )rf   r'   rR   r*   r*   r+   embed_input_ids   r0   z+DeepSeekMultiTokenPredictor.embed_input_idsNr   rS   rT   rU   spec_step_idxc                 C   s<   |d u r	|  |}|| j }| jt| j|  |||||S r.   )rf   rk   rn   r4   ri   )r'   rR   rS   rT   rU   rr   current_step_idxr*   r*   r+   r/      s   

z#DeepSeekMultiTokenPredictor.forwardr,   c                 C   s8   || j  }| jt| j|  }| |jj||}|S r.   )rk   rn   r4   ri   ro   rO   r    )r'   r,   rr   rs   	mtp_layerlogitsr*   r*   r+   compute_logits   s   
z*DeepSeekMultiTokenPredictor.compute_logitsr]   r   )r1   r2   r3   r   r4   r"   r5   r6   rq   r^   r/   rv   r7   r*   r*   r(   r+   r_   y   s4    
r_   c                       s   e Zd Zdddedef fddZdd Zd	ejd
ejfddZ				dd	ejdB dejdejde
dB dejdB ded
ejfddZ	d dejded
ejdB fddZdeeeejf  d
ee fddZdeded
efddZ  ZS )!DeepSeekMTPr`   ra   r9   r   c                   s4   t    |jj| _t|t|dd| _|   d S )Nmodel)r9   r   )	r!   r"   rg   rB   r   r_   r   ry   set_moe_parameters)r'   r9   r   r(   r*   r+   r"      s   


zDeepSeekMTP.__init__c                 C   s   g | _ | jj| _| jj| _g | _g | _d }| jj	
 D ]+}t|ts$J |j}t|ts.J t|jtrF|j}| j|j | j|jj q| | d S r.   )expert_weightsr   rj   num_moe_layersn_groupnum_expert_groups
moe_layersmoe_mlp_layersry   rn   values
isinstancer8   rP   r   mlpr   appendexpertsextract_moe_parameters)r'   example_moelayerr*   r*   r+   rz      s    

zDeepSeekMTP.set_moe_parametersrR   r   c                 C   s   | j |S r.   )ry   rq   rp   r*   r*   r+   rq      s   zDeepSeekMTP.embed_input_idsNr   rS   r,   intermediate_tensorsrU   rr   c                 C   s   |  |||||}|S r.   )ry   )r'   rR   rS   r,   r   rU   rr   r*   r*   r+   r/      s   	
zDeepSeekMTP.forwardc                 C   s   | j ||S r.   )ry   rv   )r'   r,   rr   r*   r*   r+   rv      s   zDeepSeekMTP.compute_logitsweightsc                 C   s  t  }g d}tj| ddd| jj|r| jjnd d}t|  }t	 }|D ]_\}}d|v r1q't
| j|}	|	d u r<q'|oAd|v }
| |	|}|D ]?\}}}||vrTqJd	|v r]||vr]qJ|
r`qJ|||}|d
kro||vroqJ|}|dr{||vr{qJ|| }|j}||||  nd}|
rt| jddpd}d|v r|jdkrdnd}|j| }|| dksJ d| d| || }t|D ]}|}|}|
rt|| |d | }|jdkr|| }n|dkr||d d f }n|d d |f }|dd	| jj|  }d}|D ]A}|\}}}}||vrqd}|||}|| }ttdtf |j}||||||dd}|rE|
s>|}n||  n9q|rKq|drW||vrWqt||}|d u rbq|	| jjkrod|vroq|| }t|dt}||| q|
s|| q'|S )N))gate_up_proj	gate_projr   )r   up_projr   )fused_qkv_a_projq_a_projr   )r   kv_a_proj_with_mqar   r   	down_projr   r   )ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namenum_expertszrotary_emb.inv_freqzmlp.shared_expertszmlp.experts.r   z.biasr   n_shared_expertszdown_proj.weightzShared expert weight dim z not divisible by num_chunks FT.)shard_id	expert_idreturn_successz.layersweight_loader)r   $is_fusion_moe_shared_experts_enabledr	   make_expert_params_mappingr   n_routed_expertsr   dictnamed_parameterssetr   _rewrite_spec_layer_namereplaceendswithr   getattrndimshaperm   slicetypingcastr   booladdr   ry   ri   r   )r'   r   $rocm_aiter_moe_shared_expert_enabledstacked_params_mappingexpert_params_mappingparams_dictloaded_paramsnameloaded_weight
spec_layer"is_fusion_moe_shared_experts_layer
param_nameweight_namer   name_mappedparamr   
num_chunks	split_dimtotal
chunk_sizej
chunk_nameweight_to_loadchunk_sliceis_expert_weightmappingr   successr*   r*   r+   load_weights   s   












zDeepSeekMTP.load_weightsr   r   c                 C   s|   g d}dg}d}d}|D ]}||v rd}||v rd} nq|s0| d| dd| d}|S |r<| d| dd}|S )	z
        Rewrite the weight name to match the format of the original model.
        Add .mtp_block for modules in transformer layer block for spec layer
        and rename shared layer weights to be top level.
        )rf   rC   rD   rG   rO   rf   FTzmodel.layers..z.mtp_block.zmodel.)r   )r'   r   r   spec_layer_weight_namesshared_weight_namesspec_layer_weightshared_weightr   r*   r*   r+   r     s&   z$DeepSeekMTP._rewrite_spec_layer_name)NNr   rw   )r1   r2   r3   r   r4   r"   rz   r5   r6   rq   r   r^   r/   rv   r   tupler   r   r   r7   r*   r*   r(   r+   rx      sB    	

$ :rx   )1r   collections.abcr   r   r5   torch.nnrE   transformersr   vllm._aiter_opsr   vllm.compilation.decoratorsr   vllm.configr   vllm.loggerr   $vllm.model_executor.layers.fused_moer	   $vllm.model_executor.layers.layernormr
   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.platformsr   vllm.sequencer   deepseek_v2r   r   r   r   utilsr   r1   loggerModuler   r8   r_   rx   r*   r*   r*   r+   <module>   s2   ==