o
    ˜à·ió>  ã                   @   s<  d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' G dd„ dej(ƒZ)G dd„ dej(ƒZ*eG dd„ dej(ƒƒZ+G dd„ dej(e e!ƒZ,dS )é    )ÚIterable)Úislice)ÚAnyN)Únn)ÚLlamaConfig)Úsupport_torch_compile)Úget_pp_group)ÚReLUSquaredActivation)ÚRMSNorm)ÚColumnParallelLinearÚRowParallelLinear)ÚLogitsProcessor)ÚParallelLMHeadÚVocabParallelEmbedding)Údefault_weight_loaderÚmaybe_remap_kv_scale_name)ÚIntermediateTensorsé   )ÚSupportsLoRAÚ
SupportsPP)ÚAutoWeightsLoaderÚPPMissingLayerÚis_pp_missing_parameterÚ'make_empty_intermediate_tensors_factoryÚmake_layersc                       sf   e Zd ZdZ				ddededed	edB d
edededdf‡ fdd„Zde	j
de	j
fdd„Z‡  ZS )ÚArceeMLPzQFeed-forward layer for Arcee using ReLU^2 activation
    (no gating as in LLaMA).NFÚ TÚhidden_sizeÚintermediate_sizeÚ
hidden_actÚquant_configÚbiasÚprefixÚreduce_resultsÚreturnc                    sd   t ƒ  ¡  t|||||› dd| _t||||||› dd| _|dkr,td|› dƒ‚tƒ | _d S )Nz.up_proj)Ú
input_sizeÚoutput_sizer!   r    r"   z
.down_proj)r%   r&   r!   r    r#   r"   Úrelu2zUnsupported activation: z$. Only 'relu2' is supported for AFM.)	ÚsuperÚ__init__r   Úup_projr   Ú	down_projÚ
ValueErrorr	   Úact_fn)Úselfr   r   r   r    r!   r"   r#   ©Ú	__class__© úV/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/arcee.pyr)   1   s*   

ûú
ÿzArceeMLP.__init__Úxc                 C   s*   |   |¡\}}|  |¡}|  |¡\}}|S ©N)r*   r-   r+   )r.   r3   Ú_r1   r1   r2   ÚforwardV   s   
zArceeMLP.forward)NFr   T)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚintÚstrr   Úboolr)   ÚtorchÚTensorr6   Ú__classcell__r1   r1   r/   r2   r   -   s0    øþýüûúùø	÷%r   c                       sv   e Zd ZdZ			ddededB dedB deddf
‡ fd	d
„Zdej	dej	dej	dB de
ej	ej	f fdd„Z‡  ZS )ÚArceeDecoderLayerzLTransformer decoder block for Arcee, with self-attention and
    ReLU^2 MLP.Nr   ÚconfigÚcache_configr    r"   r$   c           	         sä   t ƒ  ¡  |j| _t|ddƒ}t|ddƒpt|ddƒ}|}t|dƒr%|j}ddlm} ||| j|jt|d	|jƒ||||||› d
t|ddƒd| _	t
| j|j|j|t|ddƒ|› dd| _t|j|jd| _t|j|jd| _d S )NÚmax_position_embeddingsi    Úattention_biasFr!   Úqkv_biasr   )ÚLlamaAttentionÚnum_key_value_headsz
.self_attnÚ	attn_typeÚdecoder)rB   r   Ú	num_headsÚnum_kv_headsrD   r    r!   Úbias_o_projrC   r"   rI   Úmlp_biasz.mlp)r   r   r   r    r!   r"   ©Úeps)r(   r)   r   ÚgetattrÚhasattrrF   Ú vllm.model_executor.models.llamarG   Únum_attention_headsÚ	self_attnr   r   r   Úmlpr
   Úrms_norm_epsÚinput_layernormÚpost_attention_layernorm)	r.   rB   rC   r    r"   rD   rE   rM   rG   r/   r1   r2   r)   a   sN   
ÿ
ÿÿó
ú	ÿzArceeDecoderLayer.__init__Ú	positionsÚhidden_statesÚresidualc                 C   sX   |d u r|}|   |¡}n|   ||¡\}}| j||d}|  ||¡\}}|  |¡}||fS )N)rZ   r[   )rX   rU   rY   rV   )r.   rZ   r[   r\   r1   r1   r2   r6   ˜   s   
zArceeDecoderLayer.forward)NNr   )r7   r8   r9   r:   r   r   r<   r)   r>   r?   Útupler6   r@   r1   r1   r/   r2   rA   ]   s2    ûþýüûú7þýüûrA   c                       sÄ   e Zd ZdZdedœdedeej ddf‡ fdd	„Z	d
e
jde
jfdd„Z	dd
e
jdB de
jdedB de
jdB de
jeB ee
jee
j f B f
dd„Zdeeee
jf  dee fdd„Z‡  ZS )Ú
ArceeModelzeThe transformer model backbone for Arcee (embedding layer + stacked
    decoder blocks + final norm).r   )r"   Ú
layer_typer"   r_   r$   Nc                   sØ   t ƒ  ¡  |jj‰|j‰ |j‰ˆ| _ˆ| _ˆj| _tƒ j	s$ˆj
r/tƒ jr/t| jˆjˆd| _ntƒ | _tˆj‡ ‡‡‡fdd„|› dd\| _| _| _tƒ jrYtˆjˆjd| _ntƒ | _tƒ | _tddgˆjƒ| _d S )	N)r    c                    s   ˆˆˆ ˆ| dS )N)rB   rC   r    r"   r1   ©r"   ©rC   rB   r_   r    r1   r2   Ú<lambda>Ï   s    üz%ArceeModel.__init__.<locals>.<lambda>z.layersr`   rO   r[   r\   )r(   r)   Úmodel_configÚ	hf_configrC   r    rB   Ú
vocab_sizer   Úis_first_rankÚtie_word_embeddingsÚis_last_rankr   r   Úembed_tokensr   r   Únum_hidden_layersÚstart_layerÚ	end_layerÚlayersr
   rW   Únormr]   Úaux_hidden_state_layersr   Úmake_empty_intermediate_tensors)r.   Úvllm_configr"   r_   r/   ra   r2   r)   ±   s<   
ÿÿ
ýø

ÿzArceeModel.__init__Ú	input_idsc                 C   s
   |   |¡S r4   )ri   ©r.   rr   r1   r1   r2   Úembed_input_idsç   s   
zArceeModel.embed_input_idsrZ   Úintermediate_tensorsÚinputs_embedsc                 C   sÎ   t ƒ jr|d ur
|n|  |¡}d }n|d usJ dƒ‚|d }|d }g }tt| j| j| jƒƒD ]\}}	|| jv r?| 	|| ¡ |	|||ƒ\}}q/t ƒ j
sSt||dœƒS |  ||¡\}}
t|ƒdkre||fS |S )NzAIntermediateTensors must be provided for non-first pipeline ranksr[   r\   )r[   r\   r   )r   rf   rt   Ú	enumerater   rm   rk   rl   ro   Úappendrh   r   rn   Úlen)r.   rr   rZ   ru   rv   r[   r\   Úaux_hidden_statesÚidxÚlayerr5   r1   r1   r2   r6   ê   s8   ÿý
ÿÿ
ÿÿzArceeModel.forwardÚweightsc                 C   s˜  g d¢}t |  ¡ ƒ}tƒ }|D ]º\}}d|v rqd|v s d|v r!q| jdurO| j |¡ }rO|| }t|dtƒ}	| ¡ dkr@|n|d }|	||ƒ | |¡ qd|v sWd	|v rct	||ƒ}
|
du raq|
}d
}|D ]:\}}}||vrqqg| 
||¡}| d¡r„||vr„d} nt|| ƒrd} n|| }|j}	|	|||ƒ | |¡ d} |r¥q| d¡r¯||vr¯qt|| ƒrµq|| }t|dtƒ}	|	||ƒ | |¡ q|S )z:Load weights, mapping q/k/v projections to fused qkv_proj.))ú	.qkv_projz.q_projÚq)r~   z.k_projÚk)r~   z.v_projÚvzrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedNÚweight_loaderr   ÚscaleÚ
zero_pointFz.biasT)ÚdictÚnamed_parametersÚsetr    Úget_cache_scalerQ   r   ÚdimÚaddr   ÚreplaceÚendswithr   r‚   )r.   r}   Ústacked_params_mappingÚparams_dictÚloaded_paramsÚnameÚloaded_weightÚ
scale_nameÚparamr‚   Úremapped_nameÚmappedÚ
param_nameÚweight_nameÚshard_idr1   r1   r2   Úload_weights  sf   
ÿÿ






zArceeModel.load_weightsr4   )r7   r8   r9   r:   rA   r<   Útyper   ÚModuler)   r>   r?   rt   r   r]   Úlistr6   r   r‡   r™   r@   r1   r1   r/   r2   r^   ¬   s2    ûüûú6ûþýüû
ú,+r^   c                       sÌ   e Zd ZdZdg d¢iZddœdeddf‡ fd	d
„Z		ddejdB dejde	dB dejdB deje	B f
dd„Z
dejdejdB fdd„Zdejdejfdd„Zdeeeejf  dee fdd„Z‡  ZS )ÚArceeForCausalLMzKArcee Model for causal language modeling, integrated with vLLM
    runtime.Úqkv_proj)Úq_projÚk_projÚv_projr   r`   r"   r$   Nc                   s¨   t ƒ  ¡  |jj}|| _t||› dd| _tƒ jrIt	|j
|j|jt|ddƒ|› dd| _|jr:| j | jj¡| _t|ddƒ}t|j
|d	| _ntƒ | _| jj| _d S )
Nz.model)rq   r"   Úlm_head_biasFz.lm_head)r    r!   r"   Úlogit_scaleg      ð?)rƒ   )r(   r)   rc   rd   rB   r^   Úmodelr   rh   r   re   r   r    rQ   Úlm_headrg   Útie_weightsri   r   Úlogits_processorr   rp   )r.   rq   r"   rB   r£   r/   r1   r2   r)   i  s*   

û
ÿÿzArceeForCausalLM.__init__rr   rZ   ru   rv   c                 C   s   | j ||||d}|S )N)rr   rZ   ru   rv   )r¤   )r.   rr   rZ   ru   rv   Úmodel_outputr1   r1   r2   r6   ‹  s   üzArceeForCausalLM.forwardr[   c                 C   s   |   | j|¡}|S r4   )r§   r¥   )r.   r[   Úlogitsr1   r1   r2   Úcompute_logitsš  s   zArceeForCausalLM.compute_logitsc                 C   s   | j  |¡S r4   )r¤   rt   rs   r1   r1   r2   rt   Ÿ  s   z ArceeForCausalLM.embed_input_idsr}   c                 C   s(   t | | jjr	dgnddgd}| |¡S )z[Load weights into the model (delegates to inner model and handles
        tied embeddings).zlm_head.NÚ	gate_proj)Úskip_prefixesÚskip_substrs)r   rB   rg   r™   )r.   r}   Úloaderr1   r1   r2   r™   ¢  s   ý
zArceeForCausalLM.load_weights)NN)r7   r8   r9   r:   Úpacked_modules_mappingr<   r)   r>   r?   r   r6   rª   rt   r   r]   r‡   r™   r@   r1   r1   r/   r2   r   _  s*    ÿ&ûþýüû
ú,r   )-Úcollections.abcr   Ú	itertoolsr   Útypingr   r>   r   Útransformersr   Úvllm.compilation.decoratorsr   Úvllm.distributedr   Ú%vllm.model_executor.layers.activationr	   Ú$vllm.model_executor.layers.layernormr
   Ú!vllm.model_executor.layers.linearr   r   Ú+vllm.model_executor.layers.logits_processorr   Ú3vllm.model_executor.layers.vocab_parallel_embeddingr   r   Ú-vllm.model_executor.model_loader.weight_utilsr   r   Úvllm.sequencer   Ú
interfacesr   r   Úutilsr   r   r   r   r   r›   r   rA   r^   r   r1   r1   r1   r2   Ú<module>   s.   
	0O 3