o
    -iU                     @   s  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z< ee=Z>G dd dej?Z@G dd dej?ZAG dd  d ej?ZBG d!d" d"ej?ZCe
G d#d$ d$ej?ZDG d%d& d&ej?e5e6ZEdS )'z&Inference-only Snowflake Arctic model.    )Iterable)isliceN)nn)	Attention)support_torch_compile)CacheConfig
VllmConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size tensor_model_parallel_all_reduce)init_logger)
SiluAndMul)fused_experts
fused_topk)RMSNorm)MergedColumnParallelLinearQKVParallelLinearReplicatedLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)set_weight_attrs)current_platform)IntermediateTensors)ArcticConfig   )
SupportsPPSupportsQuant)extract_layer_indexis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                       sN   e Zd Z					ddededed	edB d
edef fddZdd Z	  Z
S )	ArcticMLPFNT config	expert_idis_residual_mlpquant_configreduce_resultsprefixc                    s   t    |j| _|| _|s|jn| j| _t| j| jgd d|| dd| _t| j| jd||| dd| _	|j
dkrEtd|j
 d	t | _d S )
N   Fz.w13biasr.   r0   z.w2r3   r/   r.   r0   siluzUnsupported activation: z!. Only silu is supported for now.)super__init__hidden_sizer,   intermediate_sizeffn_dimr   w13r   w2
hidden_act
ValueErrorr   act_fn)selfr+   r,   r-   r.   r/   r0   	__class__ ^/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/arctic.pyr7   8   s2   
	

zArcticMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r;   r?   r<   )r@   hidden_statesgate_up_rC   rC   rD   forward_   s   
zArcticMLP.forward)r)   FNTr*   )__name__
__module____qualname__r   intboolr   strr7   rI   __classcell__rC   rC   rA   rD   r(   7   s(    'r(   c                       s   e Zd ZdZ					ddededB dejdB dedB d	e	d
e
f fddZdejdejde
defddZdejdejfddZdejfddZ  ZS )	ArcticMoEz<
    Model-parallel implementation of Arctic MoE Layer.
    NTr*   r+   tp_sizeparams_dtyper.   r/   r0   c              	      s6  t    t|}|pt | _|j| _|j| _|| _|j	| _
|j| j | _|d |j dk| _|| _|d u r:t }|| _| jsNt|||| dd| _d S t| j| jd| j|| dd| _ttj| jd| j | jtj| jd	| _ttj| j| j| jtj| jd	| _t| jd
| ji t| jd
| ji d S )Nr    r   z.mlpr.   r/   r0   Fz.gate)r3   rS   r.   r0   r1   )devicedtypeweight_loader)r6   r7   r#   r   rR   r8   num_local_expertsnum_expertslayer_idnum_experts_per_toktop_kr9   moe_layer_frequencyis_moe_layerr/   torchget_default_dtyperS   r(   mlpr   gater   	Parameteremptyr   device_typewsw2sr   rW   )r@   r+   rR   rS   r.   r/   r0   rZ   rA   rC   rD   r7   k   sp   
			zArcticMoE.__init__paramloaded_weightweight_namer,   c           	      C   s   t  }|j}| j}t|| |d | }|dr*||d d f ||d|d d f< |drB||d d f |||d| d d f< |drZ|d d |f ||d d d d f< d S d S )Nr    z	w1.weightr   z	w3.weightr1   z	w2.weight)r
   datar9   sliceendswith)	r@   rh   ri   rj   r,   tp_rank
param_data
shard_sizeshardrC   rC   rD   rW      s   
"


&zArcticMoE.weight_loaderrF   returnc                 C   s   |j \}}|d| j}| |\}}| jdk}t||| j|d\}}}	t|| j| j||dd}
| j	r<| j
dkr<t|
}
|
||S )Nr)   r    )renormalizeT)inplace)shapeviewr8   rb   r\   r   r   rf   rg   r/   rR   r   )r@   rF   
num_tokensr8   router_logitsrH   do_normalizetopk_weightstopk_idstoken_expert_indicesfinal_hidden_statesrC   rC   rD   local_moe_fused   s$   


zArcticMoE.local_moe_fusedc                 C   s"   | j r
| |}|S | |}|S rE   )r^   r~   ra   )r@   rF   r}   rC   rC   rD   rI      s
   

zArcticMoE.forward)NNNTr*   )rJ   rK   rL   __doc__r   rM   r_   rV   r   rN   rO   r7   r   rc   TensorrW   r~   rI   rP   rC   rC   rA   rD   rQ   f   s>    I
rQ   c                	       sZ   e Zd Z			ddededB dedB def fddZd	ej	d
ej	dej	fddZ
  ZS )ArcticAttentionNr*   r+   cache_configr.   r0   c              	      sZ  t    || _|j| _t }|j| _| j| dksJ | j| | _|j| _	| j	|kr5| j	| dks4J n	|| j	 dks>J t
d| j	| | _| j| j | _| j| j | _| j| j | _|j| _| jd | _t| j| j| j| j	d|| dd| _t| j| j | jdd|| dd	| _t| j| j|jdd
| _t| j| j| j| j||| dd| _d S )Nr   r    g      Fz	.qkv_projr2   Tz.o_projr4   )max_positionrope_parametersis_neox_stylez.attn)num_kv_headsr   r.   r0   )r6   r7   r+   r8   r   num_attention_headstotal_num_heads	num_headsnum_key_value_headstotal_num_kv_headsmaxr   head_dimq_sizekv_sizemax_position_embeddingsscalingr   qkv_projr   o_projr   r   
rotary_embr   attn)r@   r+   r   r.   r0   rR   rA   rC   rD   r7      sb   

	
	zArcticAttention.__init__	positionsrF   rr   c           
      C   s`   |  |\}}|j| j| j| jgdd\}}}| |||\}}| |||}| |\}	}|	S )Nr)   )dim)r   splitr   r   r   r   r   )
r@   r   rF   qkvrH   qkvattn_outputoutputrC   rC   rD   rI   %  s    zArcticAttention.forwardNNr*   rJ   rK   rL   r   r   r   rO   r7   r_   r   rI   rP   rC   rC   rA   rD   r      s(    ?r   c                       s^   e Zd Z			ddededB dedB deddf
 fdd	Zd
ej	dej	dej	fddZ
  ZS )ArcticDecoderLayerNr*   r+   r   r.   r0   rr   c                    s   t    |j| _t|}|d |j dk}|jo|| _t|||| dd| _t||| j | dd| _	t
|j|jd| _t
|j|jd| _| jrbt
|j|jd| _t|dd	| d
d| _d S d S )Nr    r   z
.self_attnr.   r0   z.block_sparse_moerT   epsTFz.residual_mlp)r-   r/   r0   )r6   r7   r8   r#   r]   use_residualr   	self_attnrQ   block_sparse_moer   rms_norm_epsinput_layernormpost_attention_layernormresidual_layernormr(   residual_mlp)r@   r+   r   r.   r0   	layer_idxr^   rA   rC   rD   r7   3  s@   
zArcticDecoderLayer.__init__r   rF   c                 C   s   |}|  |}| j||d}|| }|}| jr;| |}| |}|}| |}| |}|| }t|}|| }|S | |}| |}|| }|S )N)r   rF   )r   r   r   r   r   r   r   r   )r@   r   rF   residual_inputresidual_attnr   rC   rC   rD   rI   \  s,   






zArcticDecoderLayer.forwardr   r   rC   rC   rA   rD   r   2  s,    )r   c                       sx   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdejde	dB dejdB deje	B f
ddZ
  ZS )ArcticModelr*   r0   vllm_configr0   c                   s   t    |jj|j |jj| _t| jj| jd| _	t
j fdd| dd\| _| _| _j| _tjjd| _tdgj| _d S )N)org_num_embeddingsc                    s   t  | dS )Nr   )r   r   r   r+   r.   rC   rD   <lambda>  s    z&ArcticModel.__init__.<locals>.<lambda>z.layersr   r   rF   )r6   r7   model_config	hf_configr   r.   
vocab_sizer   r8   embed_tokensr&   num_hidden_layersstart_layer	end_layerlayers_attn_implementationr   r   normr%   make_empty_intermediate_tensors)r@   r   r0   rA   r   rD   r7   |  s$   

zArcticModel.__init__	input_idsrr   c                 C   s
   |  |S rE   )r   r@   r   rC   rC   rD   embed_input_ids  s   
zArcticModel.embed_input_idsNr   intermediate_tensorsinputs_embedsc                 C   sz   t  jr|d ur|}n| |}n
|d usJ |d }t| j| j| jD ]}|||}q$t  js6td|iS | 	|}|S )NrF   )
r	   is_first_rankr   r   r   r   r   is_last_rankr   r   )r@   r   r   r   r   rF   layerrC   rC   rD   rI     s   
zArcticModel.forwardrE   )rJ   rK   rL   r   rO   r7   r_   r   r   r   rI   rP   rC   rC   rA   rD   r   z  s    r   c                       s   e Zd Zdg diZdddedef fddZd	ejd
ejfddZ			dd	ejdejde
dB dejdB d
eje
B f
ddZdejd
ejdB fddZdeeeejf  d
ee fddZ  ZS )ArcticForCausalLMr   )q_projk_projv_projr*   r   r   r0   c                   s   t    |jj}|j}|| _t|t|dd| _|j	| _	t
| j	|j|t|dd| _| jjr6| jjj| j_|j| _|j| _t|j	| _| jj| _d S )Nmodel)r   r0   lm_headr   )r6   r7   r   r   r.   r+   r   r'   r   r   r   r8   r   tie_word_embeddingsr   weightrX   rY   r[   r   logits_processorr   )r@   r   r0   r+   r.   rA   rC   rD   r7     s*   

zArcticForCausalLM.__init__r   rr   c                 C   s   | j |S rE   )r   r   r   rC   rC   rD   r     s   z!ArcticForCausalLM.embed_input_idsNr   r   r   c                 C   s   |  ||||}|S rE   )r   )r@   r   r   r   r   rF   rC   rC   rD   rI     s   zArcticForCausalLM.forwardrF   c                 C   s   |  | j|}|S rE   )r   r   )r@   rF   logitsrC   rC   rD   compute_logits  s   z ArcticForCausalLM.compute_logitsweightsc                 C   s  g d}g }g }| j j}t|D ]v}|d| dd| ddf |d| dd| ddf |d dkrY|d| d	d| d
df |d| d	d| ddf qt| j jD ]&}|dd| d|f |dd| d|f |dd| d|f q_qt|  }t }	t	d |D ]\}
}|D ].\}}}||
vrq|

||}
|
dr|
|vrqt|
| rq||
 }|j}||||  ns|D ]$\}}}||
vrq|

||}
t|
| rq||
 }|j}||||  nL|D ]'\}}}||
vrq|

||}
t|
| rq||
 }|j}|||||d  n"|
dr)|
|vr)qt|
| r0q||
 }t|dt}||| |	|
 q|	S )N))r   r   r   )r   r   r   )r   r   r   zlayers.z.residual_mlp.w13.weightz.residual_mlp.w1.weightr   z.residual_mlp.w3.weightr    r1   z .block_sparse_moe.mlp.w13.weightz.block_sparse_moe.mlp.w1.weightz.block_sparse_moe.mlp.w3.weightrf   zexperts.z
.w1.weightrg   z
.w2.weightz
.w3.weightzIt will take ~10 minutes loading from the 16-bit weights. Alternatively, use the prequantized 8-bit weights of arctic and set load-format to `sharded_state` will accelerate loading.z.bias)r,   rW   )r+   r   rangeappendrX   dictnamed_parameterssetloggerinforeplacerm   r$   rW   getattrr   add)r@   r   stacked_params_mappingmlp_params_mappingexpert_params_mapping
num_layersr   r,   params_dictloaded_paramsnameri   
param_namerj   shard_idrh   rW   rC   rC   rD   load_weights  s   







	


zArcticForCausalLM.load_weights)NN)rJ   rK   rL   packed_modules_mappingr   rO   r7   r_   r   r   r   rI   r   r   tupler   r   rP   rC   rC   rA   rD   r     s.    

,r   )Fr   collections.abcr   	itertoolsr   r_   r   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.fused_moer   r   $vllm.model_executor.layers.layernormr   !vllm.model_executor.layers.linearr   r   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.model_executor.utilsr   vllm.platformsr   vllm.sequencer   &vllm.transformers_utils.configs.arcticr   
interfacesr!   r"   utilsr#   r$   r%   r&   r'   rJ   r   Moduler(   rQ   r   r   r   r   rC   rC   rC   rD   <module>   sB   /MH3