o
    -iH                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5 ee6Z7G dd dej8Z9G dd dej8Z:G dd dej8Z;eG dd  d ej8Z<G d!d" d"ej8e/e.Z=dS )#z?Inference-only OLMoE model compatible with HuggingFace weights.    )Iterable)partial)isliceN)nn)	Attention)support_torch_compile)
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)split_tensor_along_last_dim)init_logger)FusedMoE)RMSNorm)QKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sp   e Zd ZdZ				ddededededejdB d	edB d
edB def fddZ	dej
dej
fddZ  ZS )OlmoeMoEa  A tensor-parallel MoE implementation for Olmoe that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    N num_expertstop_khidden_sizeintermediate_sizeparams_dtypequant_configtp_sizeprefixc	           	         sP   t    || _t||dd | dd| _t||||dd||| dd	| _d S )NFz.gatebiasr*   r,   Tz.experts)	r%   r&   r'   r(   reduce_resultsrenormalizer*   r+   r,   )super__init__r'   r   gater   experts)	selfr%   r&   r'   r(   r)   r*   r+   r,   	__class__ ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/olmoe.pyr2   I   s(   
zOlmoeMoE.__init__hidden_statesreturnc                 C   sB   |j }|j d }|d|}| |\}}| j||d}||S )N)r:   router_logits)shapeviewr3   r4   )r5   r:   
orig_shape
hidden_dimr=   _final_hidden_statesr8   r8   r9   forwardl   s   

zOlmoeMoE.forward)NNNr$   )__name__
__module____qualname____doc__inttorchdtyper   strr2   TensorrD   __classcell__r8   r8   r6   r9   r#   @   s0    	#r#   c                       sr   e Zd Zdddededdf fddZd	ejd
ejdeejejf fddZ	dejdejdejfddZ
  ZS )OlmoeAttentionr$   r,   vllm_configr,   r;   Nc          
   	      s  t    |jj}|j}|j}|j| _t|dd}|j}|j	}t
 }	|| _| j|	 dks.J | j|	 | _|| _| j|	krF| j|	 dksEJ n	|	| j dksOJ td| j|	 | _| j| j | _| j| j | _| j| j | _| jd | _|| _t| j| j| j| jd|| dd| _|	| _t | _t| j| j d	d
| _t| j| j d	d
| _t| j| j | jd|| dd| _t| j||j dd| _!t"| j| j| j| j||| dd| _#d S )Nmax_position_embeddingsi   r   r   g      Fz	.qkv_projr-   h㈵>epsz.o_projT)max_positionrope_parametersis_neox_stylez.attn)num_kv_headscache_configr*   r,   )$r1   r2   model_config	hf_configrZ   r*   r'   getattrnum_attention_headsnum_key_value_headsr   total_num_heads	num_headstotal_num_kv_headsmaxrY   head_dimq_sizekv_sizescalingrR   r   qkv_projr+   r
   tp_rankr   q_normk_normr   o_projr   rW   
rotary_embr   attn)
r5   rQ   r,   configrZ   r*   rR   ra   rY   r+   r6   r8   r9   r2   z   sr   

	
zOlmoeAttention.__init__qkc                 C   sr   | j dkrt| }t| }| |}| |}| j dkr5tt| j d}||| j }||| j }||fS )Nr   )num_partitions)r+   r   
contiguousrj   rk   r   r   ri   )r5   rp   rq   splitterr8   r8   r9   _apply_qk_norm   s   



zOlmoeAttention._apply_qk_norm	positionsr:   c           
      C   sp   |  |\}}|j| j| j| jgdd\}}}| ||\}}| |||\}}| |||}| |\}	}|	S )Nr<   )dim)rh   splitre   rf   ru   rm   rn   rl   )
r5   rv   r:   qkvrB   rp   rq   vattn_outputoutputr8   r8   r9   rD      s    zOlmoeAttention.forward)rE   rF   rG   r   rL   r2   rJ   rM   tupleru   rD   rN   r8   r8   r6   r9   rO   y   s      F
rO   c                       sV   e Zd Zdddededdf fddZd	ejd
ejdejdB dejfddZ  Z	S )OlmoeDecoderLayerr$   rP   rQ   r,   r;   Nc                   s|   t    |jj}|j}|j| _t|| dd| _t|j	|j
|j|j|| dd| _t|jdd| _t|jdd| _d S )Nz
.self_attnrQ   r,   z.mlp)r%   r&   r'   r(   r*   r,   rS   rT   )r1   r2   r[   r\   r*   r'   rO   	self_attnr#   r%   num_experts_per_tokr(   mlpr   input_layernormpost_attention_layernorm)r5   rQ   r,   ro   r*   r6   r8   r9   r2      s$   
zOlmoeDecoderLayer.__init__rv   r:   residualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rv   r:   )r   r   r   r   )r5   rv   r:   r   r8   r8   r9   rD      s   
zOlmoeDecoderLayer.forward)
rE   rF   rG   r   rL   r2   rJ   rM   rD   rN   r8   r8   r6   r9   r~      s     r~   c                       s   e Zd Zdeddededeej f fddZ	de
jd	e
jfd
dZ	dde
jde
jdedB de
jdB d	e
jeB f
ddZd	eeeeeef  fddZdeeee
jf  d	ee fddZ  ZS )
OlmoeModelr$   r,   
layer_typerQ   r,   r   c                   s   t    jj}|j| _|| _t|j|j| _t	|j
 fdd| dd\| _| _| _t|jdd| _tddg|j| _d S )	Nc                    s    | dS )Nr   r8   rP   r   rQ   r8   r9   <lambda>!  s    z%OlmoeModel.__init__.<locals>.<lambda>z.layersrP   rS   rT   r:   r   )r1   r2   r[   r\   
vocab_sizero   r   r'   embed_tokensr!   num_hidden_layersstart_layer	end_layerlayersr   normr    make_empty_intermediate_tensors)r5   rQ   r,   r   ro   r6   r   r9   r2     s"   


zOlmoeModel.__init__	input_idsr;   c                 C   s
   |  |S N)r   r5   r   r8   r8   r9   embed_input_ids*     
zOlmoeModel.embed_input_idsNrv   intermediate_tensorsinputs_embedsc           	      C   s   t  jr|d ur|}n| |}d }n|d usJ |d }|d }t| j| j| jD ]
}||||\}}q*t  js@t||dS |d urN| 	||\}}|S | 	|}|S )Nr:   r   )r:   r   )
r	   is_first_rankr   r   r   r   r   is_last_rankr   r   )	r5   r   rv   r   r   r:   r   layerrB   r8   r8   r9   rD   -  s.   


zOlmoeModel.forwardc                 C   s   t j| ddd| jjdS )N	gate_proj	down_projup_proj)ckpt_gate_proj_nameckpt_down_proj_nameckpt_up_proj_namer%   )r   make_expert_params_mappingro   r%   r5   r8   r8   r9   get_expert_mappingQ  s   zOlmoeModel.get_expert_mappingweightsc              	   C   s~  g d}t |  }t }|  }|D ]\}}|D ]8\}}	}
|	|vr#qd|v r(q||	|}|dr8||vr8qt|| r>q||vrCq|| }|j}||||
  ne|D ]*}|\}}	}}
|	|vraqT||	|}t|| rmqT|| }|j}|||||
|d  n8|dr||vrqt|| rq|dr|dd}||vrt	d|| q|}|| }t
|d	t}||| || q|S )
N))rh   q_projrp   )rh   k_projrq   )rh   v_projrz   zmlp.expertsz.bias)shard_id	expert_idkv_scalez	.kv_scalez.attn.kv_scalez{Found kv scale in the checkpoint (e.g. %s), but not found the expected name in the model (e.g. %s). kv-scale is not loaded.weight_loader)dictnamed_parameterssetr   replaceendswithr   r   loggerwarning_oncer]   r   add)r5   r   stacked_params_mappingparams_dictloaded_paramsexpert_params_mappingnameloaded_weight
param_nameweight_namer   paramr   mappingr   remapped_kv_scale_namer8   r8   r9   load_weights\  sz   




zOlmoeModel.load_weightsr   )rE   rF   rG   r~   r   rL   typer   Moduler2   rJ   rM   r   r   rD   listr}   rI   r   r   r   r   rN   r8   r8   r6   r9   r     s2    
$,r   c                       s   e Zd Zdg diZdeddededeej	 f fdd	Z
d
ejdejfddZ		dd
ejdejdedB dejdB dejeB f
ddZdejdejfddZdeeeejf  dee fddZdeeeeeef  fddZ  ZS )OlmoeForCausalLMrh   )r   r   r   r$   r   rQ   r,   r   c                   sp   t    |jj}|j}|| _|| _t|t|d|d| _t	|j
|j|t|dd| _t|j
| _| jj| _d S )Nmodel)rQ   r,   r   lm_head)r*   r,   )r1   r2   r[   r\   r*   ro   r   r"   r   r   r   r'   r   r   logits_processorr   )r5   rQ   r,   r   ro   r*   r6   r8   r9   r2     s&   
zOlmoeForCausalLM.__init__r   r;   c                 C   s   | j |S r   )r   r   r   r8   r8   r9   r     s   z OlmoeForCausalLM.embed_input_idsNrv   r   r   c                 C   s   |  ||||}|S r   )r   )r5   r   rv   r   r   r:   r8   r8   r9   rD     s   zOlmoeForCausalLM.forwardr:   c                 C   s   |  | j|}|S r   )r   r   )r5   r:   logitsr8   r8   r9   compute_logits  s   zOlmoeForCausalLM.compute_logitsr   c                 C   s   t | }||S r   )r   r   )r5   r   loaderr8   r8   r9   r     s   
zOlmoeForCausalLM.load_weightsc                 C   s
   | j  S r   )r   r   r   r8   r8   r9   r     r   z#OlmoeForCausalLM.get_expert_mapping)NN)rE   rF   rG   packed_modules_mappingr~   r   rL   r   r   r   r2   rJ   rM   r   r   rD   r   r   r}   r   r   r   rI   r   rN   r8   r8   r6   r9   r     s:    
$&r   )>rH   collections.abcr   	functoolsr   	itertoolsr   rJ   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   vllm.distributedr	   r
   r   r   vllm.distributed.utilsr   vllm.loggerr   $vllm.model_executor.layers.fused_moer   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r    r!   r"   rE   r   r   r#   rO   r~   r   r   r8   r8   r8   r9   <module>   s>   9c0 (