o
    
۾i<+                     @   s   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ G dd dejZG dd dejZG dd dejZ dS )zInference-only MiMo-MTP model.    )IterableN)PretrainedConfig)CacheConfigModelConfig
VllmConfig)RMSNorm)LogitsProcessor)QuantizationConfig)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)Qwen2DecoderLayer)IntermediateTensors   )maybe_prefixc                       sn   e Zd Z		ddededededB dedB ddf fdd	Z	
dde	j
de	j
de	j
dede	j
f
ddZ  ZS )MiMoMultiTokenPredictorLayerNconfigprefixmodel_configcache_configquant_configreturnc                    sp   t    t|j|jd| _t|j|jd| _tj|jd |jdd| _	t
||||d| _t|j|jd| _d S )N)eps   F)bias)r   r   r   r   )super__init__r   hidden_sizerms_norm_epstoken_layernormhidden_layernormnnLinear
input_projr   	mtp_blockfinal_layernorm)selfr   r   r   r   r   	__class__ W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/mimo_mtp.pyr   -   s   
z%MiMoMultiTokenPredictorLayer.__init__r   inputs_embeds	positionsprevious_hidden_statesspec_step_indexc                 C   sj   |d usJ d||dk< |  |}| |}| tj||gdd}| j||d d\}}|| }| |S )Nr   )dim)r,   hidden_statesresidual)r   r    r#   torchcatr$   r%   )r&   r+   r,   r-   r.   r1   r2   r)   r)   r*   forwardD   s   



z$MiMoMultiTokenPredictorLayer.forward)NNr   )__name__
__module____qualname__r   strr   r   r	   r   r3   Tensorintr5   __classcell__r)   r)   r'   r*   r   ,   s8    r   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z		ddejdejdejdejdB de	dejfddZ
	ddejdede	dejfddZ  ZS )MiMoMultiTokenPredictor r   vllm_configr   c                   sr   t    jj  j| _ j| _t j	 j
| _tj fddt| j| j| j D | _t j	| _d S )Nc              
      s4   i | ]}t |t  d | jjjdqS )z.layers.)r   r   r   )r:   r   r   r   r   ).0idxr   r   rA   r)   r*   
<dictcomp>j   s    z4MiMoMultiTokenPredictor.__init__.<locals>.<dictcomp>)r   r   r   	hf_confignum_hidden_layersmtp_start_layer_idxnum_nextn_predict_layersnum_mtp_layersr   
vocab_sizer   embed_tokensr3   r!   
ModuleDictrange
mtp_layersr   logits_processorr&   rA   r   r'   rD   r*   r   ]   s"   

z MiMoMultiTokenPredictor.__init__	input_idsr   c                 C   s
   |  |S N)rL   r&   rR   r)   r)   r*   embed_input_ids{   s   
z'MiMoMultiTokenPredictor.embed_input_idsNr   r,   r-   r+   spec_step_idxc                 C   s0   |d u r	|  |}| jt| j|  ||||S rS   )rL   rO   r:   rH   )r&   rR   r,   r-   r+   rV   r)   r)   r*   r5   ~   s   
zMiMoMultiTokenPredictor.forwardr1   lm_headc                 C   s$   | j t| j|   | ||}|S rS   )rO   r:   rH   rP   )r&   r1   rW   rV   logitsr)   r)   r*   compute_logits   s   z&MiMoMultiTokenPredictor.compute_logits)Nr   r6   )r7   r8   r9   r   r:   r   r3   r;   rU   r<   r5   r
   rY   r=   r)   r)   r'   r*   r>   \   s8    
r>   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z			ddejdB dejdejde	dB dejdB de
dejfddZ	d dejde
dejdB fddZdeeeejf  dee fddZdedefddZde
dedefddZ  ZS )!MiMoMTPr?   r@   rA   r   c                   sJ   t    |jj| _t|t|dd| _t| jj	| jj
t|dd| _d S )Nmodel)rA   r   rW   r@   )r   r   r   rF   r   r>   r   r[   r
   rK   r   rW   rQ   r'   r)   r*   r      s   


zMiMoMTP.__init__rR   r   c                 C   s   | j |S rS   )r[   rU   rT   r)   r)   r*   rU      s   zMiMoMTP.embed_input_idsNr   r,   r1   intermediate_tensorsr+   rV   c                 C   s&   |dksJ d|  |||||}|S )Nr   z+mimo_mtp only support predict one token now)r[   )r&   rR   r,   r1   r\   r+   rV   r)   r)   r*   r5      s
   	
zMiMoMTP.forwardc                 C   s   | j || j|S rS   )r[   rY   rW   )r&   r1   rV   r)   r)   r*   rY      s   zMiMoMTP.compute_logitsweightsc                 C   s
  g d}t |  }t }|D ]s\}}d|v rq| |}|D ]7\}}}	||vr)qd|vr/ nNd|v r8||vr8q|||}|drH||vrHq|| }
|
j}||
||	  n&|dra||vraqd|vrnd|vrnd|vrnq|| }
t|
dt}||
| |	| q|S )	N))qkv_projq_projq)r^   k_projk)r^   v_projv)gate_up_proj	gate_projr   )re   up_projr   zrotary_emb.inv_freqrO   zmlp.experts.z.biasrL   rW   weight_loader)
dictnamed_parametersset map_model_name_to_mtp_param_namereplaceendswithrh   getattrr   add)r&   r]   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamrh   r)   r)   r*   load_weights   s>   

zMiMoMTP.load_weightsrt   c           	      C   s   dd l }d}|||}|r+t|d}|| jj }|| |d | d}g d}|D ]
}||v r;|  S q1d}|||}|rR|| | d }|S )	Nr   z(model\.mtp_layers\.)(\d+)(\.)r   r   .)r   r    r#   r%   z(model\.mtp_layers\.\d+\.)z
mtp_block.)regexmatchr<   groupr   rG   rm   )	r&   rt   repatternr}   original_numnew_numname_without_prefixsub_namer)   r)   r*   rl      s"    z(MiMoMTP.map_model_name_to_mtp_param_name
spec_layerc                 C   sJ   g d}d}|D ]
}||v rd} nq|s#| d| dd| d}|S )z
        Rewrite the weight name to match the format of the original model.
        Add .mtp_block for modules in transformer layer block for spec layer
        )rL   enormhnormeh_projshared_headFTzmodel.layers.r{   z.mtp_block.)rm   )r&   r   rt   spec_layer_weight_namesspec_layer_weightrw   r)   r)   r*   _rewrite_spec_layer_name  s   z MiMoMTP._rewrite_spec_layer_name)NNr   r6   )r7   r8   r9   r   r:   r   r3   r;   rU   r   r<   r5   rY   r   tuplerk   rz   rl   r   r=   r)   r)   r'   r*   rZ      s@    

$5rZ   )!__doc__collections.abcr   r3   torch.nnr!   transformersr   vllm.configr   r   r   $vllm.model_executor.layers.layernormr   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr	   3vllm.model_executor.layers.vocab_parallel_embeddingr
   r   -vllm.model_executor.model_loader.weight_utilsr    vllm.model_executor.models.qwen2r   vllm.sequencer   utilsr   Moduler   r>   rZ   r)   r)   r)   r*   <module>   s"   0>