o
    پiJ                     @   s  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 e8e9Z:G dd dej;Z<G dd dej;Z=G dd dej;Z>G dd dej;Z?G dd dej;Z@e@gZAdS )zRInference-only Arcee Foundational Model (AFM) compatible with HuggingFace weights.    N)AnyDictIterableListOptionalTupleUnion)nn)LlamaConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)
get_act_fn)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loaderkv_cache_scales_loadermaybe_remap_kv_scale_name)get_global_server_args)
add_prefixmake_layersc                       sT   e Zd ZdZ			ddedededee d	ed
eddf fddZ	dddZ
  ZS )ArceeMLPz
    MLP block for the Arcee model, using a ReLU-squared activation function.
    This differs from the Llama SwiGLU activation.
    N Thidden_sizeintermediate_size
hidden_actquant_configprefixreduce_resultsreturnc                    sf   t    t||d|td|d| _t||d|td||d| _|dkr,td| dtd| _	d S )	NFup_projbiasr+   r,   	down_proj)r1   r+   r,   r-   relu2zUnsupported activation: z.. Arcee model in SGLang only supports 'relu2'.)
super__init__r   r$   r/   r   r2   
ValueErrorr   act_fn)selfr(   r)   r*   r+   r,   r-   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/arcee.pyr5   ?   s*   
	
zArceeMLP.__init__c                 C   s*   |  |\}}| |}| |\}}|S N)r/   r7   r2   )r8   xforward_batch_r;   r;   r<   forwarda   s   
zArceeMLP.forward)Nr'   Tr=   )__name__
__module____qualname____doc__intstrr   r   boolr5   rA   __classcell__r;   r;   r9   r<   r&   9   s*    
"r&   c                       s   e Zd Z								dded	ed
ededededeeee	f  de
dedee dede
ddf fddZdejdejdedejfddZ  ZS )ArceeAttentionr   '  NT    r'   Fconfigr(   	num_headsnum_kv_headslayer_id
rope_thetarope_scalingrope_is_neox_stylemax_position_embeddingsr+   r,   r1   r.   c              
      s  t    || _t }|| _| j| dksJ | j| | _|| _| j|kr/| j| dks.J n	|| j dks8J td| j| | _t	|dd | _
| j
d u rT| j| j | _
t	|dd| _t| j| j
 | _| j| j
 | _| j| j
 | _| j
d | _|| _|	| _t|| j
| j| j||
td|d| _t| j| j
 |||
td|d| _t| j
| j|	|||d	| _t| j| j
| j| j||
td
|d| _d S )Nr      head_dimpartial_rotary_factorg      qkv_projr0   o_proj)
rotary_dimmax_positionbaserR   is_neox_styleattn)rO   rP   r+   r,   )r4   r5   r(   r   total_num_headsrN   total_num_kv_headsmaxrO   getattrrV   rW   rF   rZ   q_sizekv_sizescalingrQ   rT   r   r$   rX   r   rY   r   
rotary_embr   r^   )r8   rM   r(   rN   rO   rP   rQ   rR   rS   rT   r+   r,   r1   tp_sizer9   r;   r<   r5   i   sl   


	
zArceeAttention.__init__	positionshidden_statesr?   c                 C   sb   |  |\}}|j| j| j| jgdd\}}}| |||\}}| ||||}	| |	\}
}|
S )N)dim)rX   splitrc   rd   rf   r^   rY   )r8   rh   ri   r?   qkvr@   qkvattn_outputoutputr;   r;   r<   rA      s    zArceeAttention.forward)r   rK   NTrL   Nr'   F)rB   rC   rD   r
   rF   floatr   r   rG   r   rH   r   r5   torchTensorr   rA   rI   r;   r;   r9   r<   rJ   h   sZ    	
IrJ   c                       sr   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )ArceeDecoderLayerr   Nr'   rM   rP   r+   r,   r.   c           
         s   t    |j| _t|dd}t|dd }|d ur$t|dd r$|j|d< t|dd}t|dd}t|d	d
p;t|dd
}	t|| j|j|j||||||td||	d| _	t
| j|j|j|td|d| _t|j|jd| _t|j|jd| _d S )NrQ   rK   rR    original_max_position_embeddingsrS   TrT   rL   attention_biasFr1   	self_attn)rM   r(   rN   rO   rP   rQ   rR   rS   rT   r+   r,   r1   mlp)r(   r)   r*   r+   r,   eps)r4   r5   r(   rb   rw   rJ   num_attention_headsnum_key_value_headsr$   ry   r&   r)   r*   rz   r   rms_norm_epsinput_layernormpost_attention_layernorm)
r8   rM   rP   r+   r,   rQ   rR   rS   rT   rx   r9   r;   r<   r5      sN   

zArceeDecoderLayer.__init__rh   ri   r?   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)rh   ri   r?   )r   ry   r   rz   )r8   rh   ri   r?   r   r;   r;   r<   rA      s   
zArceeDecoderLayer.forward)r   Nr'   )rB   rC   rD   r
   rF   r   r   rG   r5   rt   ru   r   r   rA   rI   r;   r;   r9   r<   rv      s4    0rv   c                       s   e Zd Z		ddedee deddf fddZ		dd	ej	d
ej	de
dej	dee deej	eej	eej	 f ef fddZdeddfddZ  ZS )
ArceeModelNr'   rM   r+   r,   r.   c                    s   t     | _ j| _ j| _t | _| jjr't	 j j
td|d| _nt | _t j fdd| jj| jjdd\| _| _| _| jjrRt j
 jd| _ntdd	| _g | _d S )
Nembed_tokensr+   r,   c                    s   t  | |dS )N)rM   r+   rP   r,   )rv   )idxr,   rM   r+   r;   r<   <lambda>"  s    z%ArceeModel.__init__.<locals>.<lambda>zmodel.layers)pp_rankpp_sizer,   r{   T)return_tuple)r4   r5   rM   pad_token_idpadding_idx
vocab_sizer   pp_groupis_first_rankr   r(   r$   r   r   r%   num_hidden_layersrank_in_group
world_sizelayersstart_layer	end_layeris_last_rankr   r   normlayers_to_capturer8   rM   r+   r,   r9   r   r<   r5     s0   



zArceeModel.__init__	input_idsrh   r?   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }g }t| j| jD ]}	|	| jv r8|||  | j|	 }
|
||||\}}q*| j j	sRt
||dS | ||\}}t|dkrb|S ||fS )Nri   r   )ri   r   r   )r   r   r   ranger   r   r   appendr   r   r   r   len)r8   r   rh   r?   r   r   ri   r   aux_hidden_statesilayerr@   r;   r;   r<   rA   0  s:   


zArceeModel.forwardquantization_param_pathc                 C   sv   t  }t }t|||| jj| jjjD ]%\}}t| j| t	j
s&| j| j}t|jdr5||j_||j_qtdd S )Nk_scalez8Self attention has no KV cache scaling factor attribute!)r   r   r!   rM   r   r:   
model_type
isinstancer   r	   Identityry   hasattrr^   r   v_scaleRuntimeError)r8   r   rg   tp_rank	layer_idxscaling_factorlayer_self_attnr;   r;   r<   load_kv_cache_scales^  s$   
zArceeModel.load_kv_cache_scalesNr'   )NN)rB   rC   rD   r
   r   r   rG   r5   rt   ru   r   r   r   r   r   rA   r   rI   r;   r;   r9   r<   r   
  s8    *
.r   c                       s
  e Zd Zg dZddgZddddZ			d'd
edee de	ddf fddZ
			d'd
edee de	fddZe 			d(dejdejdedejdedee defddZedd Zedd Zdejfdd Zd!eee	ejf  fd"d#Zd$e	ddfd%d&Z  ZS ))ArceeForCausalLM).down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj..o_proj.r   r   )	.qkv_projr   )r   rU   )r      ).q_proj.k_proj.v_projNr'   rM   r+   r,   r.   c                    s   t    t | _|| _|| _| ||td|| _t	|j
|j|td|t jd| _t|| _ttjdd| _g d| _d| _d S )Nmodellm_head)r+   r,   use_attn_tp_groupT)pooling_type	normalize))r   r   rn   )r   r   ro   )r   r   rp   F)r4   r5   r   r   rM   r+   _init_modelr$   r   r   r   r(   r#   enable_dp_lm_headr   r   logits_processorr   r   LASTpoolerstacked_params_mappingcapture_aux_hidden_statesr   r9   r;   r<   r5     s    



zArceeForCausalLM.__init__c                 C   s   t |||dS )Nr   )r   r   r;   r;   r<   r     s   zArceeForCausalLM._init_modelFr   rh   r?   r   get_embeddingr   c           	      C   sV   | j |||||d}d }| jr|\}}| jjr)|s#| ||| j||S | ||S |S )N)r   )r   r   r   r   r   r   r   )	r8   r   rh   r?   r   r   r   ri   r   r;   r;   r<   rA     s*   
zArceeForCausalLM.forwardc                 C      | j jS r=   )r   r   r8   r;   r;   r<   r        zArceeForCausalLM.start_layerc                 C   r   r=   )r   r   r   r;   r;   r<   r     r   zArceeForCausalLM.end_layerc                 C   r   r=   )r   r   r   r;   r;   r<   get_input_embeddings  s   z%ArceeForCausalLM.get_input_embeddingsweightsc                 C   s&  t |  }|D ]\}}t|}|d ur't| jdr'|| jjk s&|| jjkr'qd|v s/d|v r0qd|v s8d|v r9qd|v rGt||}|d u rGqd}| jD ]$\}}}	||vrVqL|	||}||vraqL|| }
|
j
}||
||	 d} |s||v r|| }
t|
d	t}||
| qtd
| d qd S )Nr   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedscaleFTweight_loaderz
Parameter z not found in model.)dictnamed_parametersr   r   r   r   r   r"   r   replacer   rb   r    loggerwarning)r8   r   params_dictnameloaded_weightrP   
is_stacked
param_nameweight_nameshard_idparamr   r;   r;   r<   load_weights  sN   

zArceeForCausalLM.load_weightsr   c                 C   s   | j | d S r=   )r   r   )r8   r   r;   r;   r<   r     s   z%ArceeForCausalLM.load_kv_cache_scalesr   )NFN)rB   rC   rD   #default_bitsandbytes_target_modulescolumn_parallel_weights_modules#bitsandbytes_stacked_params_mappingr
   r   r   rG   r5   r   rt   no_gradru   r   rH   r   r   rA   propertyr   r   r	   	Embeddingr   r   r   r   r   rI   r;   r;   r9   r<   r   t  sj    
!
#

2r   )BrE   loggingtypingr   r   r   r   r   r   r   rt   r	   transformersr
   sglang.srt.distributedr   r   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   r   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr    r!   r"   sglang.srt.server_argsr#   sglang.srt.utilsr$   r%   	getLoggerrB   r   Moduler&   rJ   rv   r   r   
EntryClassr;   r;   r;   r<   <module>   s:   $
/XJj 
!