o
    پiQe                     @   s  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 e8e9Z:G dd dej;Z<G dd dej;Z=G dd dej;Z>G dd dej;Z?G dd dej;Z@e@gZAdS )zAInference-only Apertus model compatible with HuggingFace weights.    N)AnyDictIterableListOptionalTupleUnion)nn)ApertusConfig)get_pp_groupget_tensor_model_parallel_rank$get_tensor_model_parallel_world_size)XIELU)RMSNorm)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessorLogitsProcessorOutput)PoolerPoolingType)QuantizationConfig)RadixAttention)get_rope)PPMissingLayerget_layer_id)ParallelLMHeadVocabParallelEmbedding)ForwardBatchPPProxyTensors)default_weight_loaderkv_cache_scales_loadermaybe_remap_kv_scale_name)get_global_server_args)
add_prefixmake_layersc                       s`   e Zd Z				ddedededee d	ed
ededdf fddZ		ddefddZ	  Z
S )
ApertusMLPNF Thidden_sizeintermediate_size
hidden_actquant_configbiasprefixreduce_resultsreturnc                    sd   t    t||||td|d| _t||||td||d| _|dkr,td| dt | _	d S )Nup_projr,   r+   r-   	down_proj)r,   r+   r-   r.   xieluzUnsupported activation: z". Only xIELU is supported for now.)
super__init__r   r$   r0   r   r2   
ValueErrorr   act_fn)selfr(   r)   r*   r+   r,   r-   r.   	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/models/apertus.pyr5   >   s*   


zApertusMLP.__init__use_reduce_scatterc                 C   s.   |  |\}}| |}| j||d\}}|S )N)skip_all_reduce)r0   r7   r2   )r8   xforward_batchr=   _r;   r;   r<   forward_   s   

zApertusMLP.forward)NFr'   T)NF)__name__
__module____qualname__intstrr   r   boolr5   rB   __classcell__r;   r;   r9   r<   r&   =   s6    	$r&   c                       s   e Zd Z									dded	ed
ededededeeee	f  de
dedee dede
de
ddf fddZdejdejdedejfddZ  ZS )ApertusAttentionr   '  NT    r'   Fconfigr(   	num_headsnum_kv_headslayer_id
rope_thetarope_scalingrope_is_neox_stylemax_position_embeddingsr+   r-   r,   bias_o_projr/   c              
      s  t    || _|| _t }|| _| j| dksJ | j| | _|| _| j|kr2| j| dks1J n	|| j dks;J td| j| | _	t
|d| j| j | _t
|dd}t|| j | _| j| j | _| j	| j | _| jd | _|| _|	| _t|| j| j| j||
td|d| _t| j| j |||
td|d| _t| j| j|	|||d	| _t| j| j| j| j	||
td
|d| _t| j|jd| _t| j|jd| _d S )Nr      head_dimpartial_rotary_factorg      qkv_projr1   o_proj)
rotary_dimmax_positionbaserR   is_neox_styleattn)rO   rP   r+   r-   eps) r4   r5   rP   r(   r   total_num_headsrN   total_num_kv_headsmaxrO   getattrrW   rF   r[   q_sizekv_sizescalingrQ   rT   r   r$   rY   r   rZ   r   
rotary_embr   r_   r   rms_norm_epsq_normk_norm)r8   rM   r(   rN   rO   rP   rQ   rR   rS   rT   r+   r-   r,   rU   tp_sizerX   r9   r;   r<   r5   p   sr   

	
	zApertusAttention.__init__	positionshidden_statesr@   c                 C   s   |  |\}}|j| j| j| jgdd\}}}| | d| j|}| 	| d| j|}| 
|||\}}| ||||}	| |	\}
}|
S )N)dim)rY   splitrf   rg   rk   
contiguousviewrW   view_asrl   ri   r_   rZ   )r8   rn   ro   r@   qkvrA   qkvattn_outputoutputr;   r;   r<   rB      s    zApertusAttention.forward)	r   rK   NTrL   Nr'   FF)rC   rD   rE   r
   rF   floatr   r   rG   r   rH   r   r5   torchTensorr   rB   rI   r;   r;   r9   r<   rJ   o   s`    	
RrJ   c                       sr   e Zd Z			ddededee deddf
 fd	d
Zde	j
de	j
dedee	j
 dee	j
e	j
f f
ddZ  ZS )ApertusDecoderLayerr   Nr'   rM   rP   r+   r-   r/   c                    s  t    |j| _t|dd}t|dd }|d ur$t|dd r$|j|d< t|dd}t|dd}t|d	d
p;t|dd
}	|	}
t|drF|j}	t|| j|j|j	||||||t
d||	|
d| _t| j|j|j|t|dd
t
d|d| _t|j|jd| _t|j|jd| _d S )NrQ   rK   rR    original_max_position_embeddingsrS   TrT   rL   attention_biasFr,   qkv_bias	self_attn)rM   r(   rN   rO   rP   rQ   rR   rS   rT   r+   r-   r,   rU   mlp_biasmlp)r(   r)   r*   r+   r,   r-   r`   )r4   r5   r(   re   r   hasattrr   rJ   num_attention_headsnum_key_value_headsr$   r   r&   r)   r*   r   r   rj   attention_layernormfeedforward_layernorm)r8   rM   rP   r+   r-   rQ   rR   rS   rT   r   rU   r9   r;   r<   r5      sX   



zApertusDecoderLayer.__init__rn   ro   r@   residualc                 C   sZ   |d u r|}|  |}n|  ||\}}| j|||d}| ||\}}| |}||fS )N)rn   ro   r@   )r   r   r   r   )r8   rn   ro   r@   r   r;   r;   r<   rB     s   
zApertusDecoderLayer.forward)r   Nr'   )rC   rD   rE   r
   rF   r   r   rG   r5   r}   r~   r   r   rB   rI   r;   r;   r9   r<   r      s4    8r   c                       s   e Zd Z		ddedee deddf fddZ		dd	ej	d
ej	de
dej	dee deej	eej	eej	 f ef fddZdeddfddZ  ZS )ApertusModelNr'   rM   r+   r-   r/   c                    s   t    | _ | _ j| _ j| _ j| _t | _	| j	j
r.t j jtd|d| _nt | _t j fdd| j	j| j	jdd\| _| _| _| j	jrYt j jd| _ntdd	| _g | _d S )
Nembed_tokensr+   r-   c                    s   t  | |dS )N)rM   r+   rP   r-   )r   )idxr-   rM   r+   r;   r<   <lambda>>  s    z'ApertusModel.__init__.<locals>.<lambda>zmodel.layers)pp_rankpp_sizer-   r`   T)return_tuple)r4   r5   r+   rM   pad_token_idpadding_idx
vocab_sizeorg_vocab_sizer   pp_groupis_first_rankr   r(   r$   r   r   r%   num_hidden_layersrank_in_group
world_sizelayersstart_layer	end_layeris_last_rankr   rj   normlayers_to_capturer8   rM   r+   r-   r9   r   r<   r5   %  s4   



zApertusModel.__init__	input_idsrn   r@   input_embedspp_proxy_tensorsc                 C   s   | j jr|d u r| |}n|}d }n|d usJ |d }|d }d }g }	t| j| jD ]}
|
| jv r:|	||  | j|
 }|||||\}}q,| j j	sTt
||dS | ||\}}t|	dkrd|S ||	fS )Nro   r   )ro   r   r   )r   r   r   ranger   r   r   appendr   r   r   r   len)r8   r   rn   r@   r   r   ro   r   deferred_normaux_hidden_statesilayerrA   r;   r;   r<   rB   L  s<   


zApertusModel.forwardquantization_param_pathc                 C   sv   t  }t }t|||| jj| jjjD ]%\}}t| j| t	j
s&| j| j}t|jdr5||j_||j_qtdd S )Nk_scalez8Self attention has no KV cache scaling factor attribute!)r   r   r!   rM   r   r:   
model_type
isinstancer   r	   Identityr   r   r_   r   v_scaleRuntimeError)r8   r   rm   tp_rank	layer_idxscaling_factorlayer_self_attnr;   r;   r<   load_kv_cache_scales  s$   
z!ApertusModel.load_kv_cache_scalesNr'   )NN)rC   rD   rE   r
   r   r   rG   r5   r}   r~   r   r   r   r   r   rB   r   rI   r;   r;   r9   r<   r   $  s8    ,
3r   c                       s  e Zd ZdddZdgZg dZddgZdd	d
dZ		d=dede	e
 deddf fddZ		d=dede	e
 defddZe 			d>dejdejdedejdede	e defddZe 	d?dejdejdedeeef dejde	e fd d!Zed"d# Zed$d% Zdejfd&d'Zd(d) Zd*d+ Z d,e!eeejf  fd-d.Z"d/d0 Z#d1d2 Z$d3d4 Z%d5d6 Z&d7eddfd8d9Z'd?d:e	e(e  fd;d<Z)  Z*S )@ApertusForCausalLMinput_embeddingsoutput_embeddings)r   lm_headr   ).down_proj.z	.up_proj.z.q_proj.z.k_proj.z.v_proj..o_proj.r   r   )	.qkv_projr   )r   rV   )r      ).q_proj.k_proj.v_projNr'   rM   r+   r-   r/   c                    s   t    t | _|| _|| _| ||td|| _| jj	r$| jj
| _nt|j|j|td|t jd| _t|| _ttjdd| _g d| _d| _d S )Nmodelr   )r+   r-   use_attn_tp_groupT)pooling_type	normalize)r   r   rw   )r   r   rx   )r   r   ry   F)r4   r5   r   r   rM   r+   _init_modelr$   r   tie_word_embeddingsr   r   r   r   r(   r#   enable_dp_lm_headr   logits_processorr   r   LASTpoolerstacked_params_mappingcapture_aux_hidden_statesr   r9   r;   r<   r5     s$   



zApertusForCausalLM.__init__c                 C   s   t |||dS )Nr   )r   r   r;   r;   r<   r     s   zApertusForCausalLM._init_modelFr   rn   r@   r   get_embeddingr   c           	      C   sV   | j |||||d}d }| jr|\}}| jjr)|s#| ||| j||S | ||S |S )N)r   )r   r   r   r   r   r   r   )	r8   r   rn   r@   r   r   r   ro   r   r;   r;   r<   rB     s*   
zApertusForCausalLM.forwardsplit_intervalc                 C   s   |\}}|dkr|d u r| j ||_n||_t||D ]}| j j| }	|	||j||j\|_|_q|| j jjkrS| j |j|j\}
}|
|_| 	||j| j
|}|S d }|S )Nr   )r   r   ro   r   r   r   rM   r   r   r   r   )r8   r   rn   r@   r   r   startendr   r   ro   rA   resultr;   r;   r<   forward_split_prefill  s0   	z(ApertusForCausalLM.forward_split_prefillc                 C      | j jS N)r   r   r8   r;   r;   r<   r   #     zApertusForCausalLM.start_layerc                 C   r   r   )r   r   r   r;   r;   r<   r   '  r   zApertusForCausalLM.end_layerc                 C   r   r   )r   r   r   r;   r;   r<   get_input_embeddings+  s   z'ApertusForCausalLM.get_input_embeddingsc                 C   sT   | j D ]\}}}}||v r|||d td  |f  S q|d td  dfS )Nz.weightrV   )r   replacer   )r8   name
param_nameweight_nameshard_id	num_shardr;   r;   r<    get_module_name_from_weight_name.  s   z3ApertusForCausalLM.get_module_name_from_weight_namec                 C   s   t |  }t|S r   )dictnamed_parametersr   )r8   params_dictr;   r;   r<   get_num_params7  s   z!ApertusForCausalLM.get_num_paramsweightsc                 C   s  g d}t |  }|  D ]\}}|ds|dr |||< q|D ]\}}t|}|d urBt| jdrB|| jjk sA|| jjkrBq#d|v sJd|v rKq#d|v sSd|v rTq#|	d	r^||vr^q#| j
jrgd
|v rgq#d|v rut||}|d u ruq#|D ]-\}}	}
|	|vrqw||	|}|dr||vrqw||vrqw|| }|j}||||
  n3|dr||vrq#|dr||vrq#|| v r|| }t|dt}||| q#td| d q#d S )Nr   z.betaz.epsr   zrotary_emb.inv_freq	projectorzrotary_emb.cos_cachedzrotary_emb.sin_cachedzmodel.vision_towerzlm_head.weightscalez.biasz	.kv_scaleweight_loaderz
Parameter z not found in params_dict)r   r   named_buffersendswithr   r   r   r   r   
startswithrM   r   r"   r   r   keysre   r    loggerwarning)r8   r   r   r   r   bufferloaded_weightrP   r   r   r   paramr   r;   r;   r<   load_weights;  sf   

zApertusForCausalLM.load_weightsc                 C   s   | j jj| jjfS r   )r   r   weightr   r   r;   r;   r<   get_embed_and_head  s   z%ApertusForCausalLM.get_embed_and_headc                 C   s8   | j j`| j`|| j j_|| j_tj  tj  d S r   )r   r   r   r   r}   cudaempty_cachesynchronize)r8   embedheadr;   r;   r<   set_embed_and_head  s   

z%ApertusForCausalLM.set_embed_and_headc                 C   s
   | j jjS r   )r   r   r   r   r;   r;   r<   	get_embed  s   
zApertusForCausalLM.get_embedc                 C   sJ   t | jdr| jj| jjkrd S | jj`|| jj_tj	  tj
  d S )Ntarget_hidden_size)r   rM   r  r(   r   r   r   r}   r   r   r   )r8   r   r;   r;   r<   	set_embed  s   


zApertusForCausalLM.set_embedr   c                 C   s   | j | d S r   )r   r   )r8   r   r;   r;   r<   r     s   z'ApertusForCausalLM.load_kv_cache_scales	layer_idsc                 C   sX   | j jsd S |d u rd| _| jj}d|d |d g| j_d S d| _dd |D | j_d S )NTr      c                 S   s   g | ]}|d  qS )rV   r;   ).0valr;   r;   r<   
<listcomp>  s    zCApertusForCausalLM.set_eagle3_layers_to_capture.<locals>.<listcomp>)r   r   r   rM   r   r   r   )r8   r  
num_layersr;   r;   r<   set_eagle3_layers_to_capture  s   z/ApertusForCausalLM.set_eagle3_layers_to_capturer   )NFNr   )+rC   rD   rE   embedding_modulesembedding_padding_modules#default_bitsandbytes_target_modulescolumn_parallel_weights_modules#bitsandbytes_stacked_params_mappingr
   r   r   rG   r5   r   r}   no_gradr~   r   rH   r   r   rB   r   rF   r   propertyr   r   r	   	Embeddingr   r   r   r   r   r   r   r   r  r   r   r	  rI   r;   r;   r9   r<   r     s    	
#
#
(

	F r   )B__doc__loggingtypingr   r   r   r   r   r   r   r}   r	   transformersr
   sglang.srt.distributedr   r   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   r   sglang.srt.layers.poolerr   r   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   sglang.srt.layers.utilsr   r   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   r   $sglang.srt.model_loader.weight_utilsr    r!   r"   sglang.srt.server_argsr#   sglang.srt.utilsr$   r%   	getLoggerrC   r   Moduler&   rJ   r   r   r   
EntryClassr;   r;   r;   r<   <module>   s<   $
2cRq  
