o
    i>                     @   s<  d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZmZ d dlmZ ddlm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' G dd dej(Z)G dd dej(Z*eG dd dej(Z+G dd dej(e e!Z,dS )    )Iterable)islice)AnyN)nn)LlamaConfig)support_torch_compile)get_pp_group)ReLUSquaredActivation)RMSNorm)ColumnParallelLinearRowParallelLinear)LogitsProcessor)ParallelLMHeadVocabParallelEmbedding)default_weight_loadermaybe_remap_kv_scale_name)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderPPMissingLayeris_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersc                       sf   e Zd ZdZ				ddededed	edB d
edededdf fddZde	j
de	j
fddZ  ZS )ArceeMLPzQFeed-forward layer for Arcee using ReLU^2 activation
    (no gating as in LLaMA).NF Thidden_sizeintermediate_size
hidden_actquant_configbiasprefixreduce_resultsreturnc                    sd   t    t||||| dd| _t|||||| dd| _|dkr,td| dt | _d S )Nz.up_proj)
input_sizeoutput_sizer!   r    r"   z
.down_proj)r%   r&   r!   r    r#   r"   relu2zUnsupported activation: z$. Only 'relu2' is supported for AFM.)	super__init__r   up_projr   	down_proj
ValueErrorr	   act_fn)selfr   r   r   r    r!   r"   r#   	__class__ V/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/arcee.pyr)   1   s*   


zArceeMLP.__init__xc                 C   s*   |  |\}}| |}| |\}}|S N)r*   r-   r+   )r.   r3   _r1   r1   r2   forwardV   s   
zArceeMLP.forward)NFr   T)__name__
__module____qualname____doc__intstrr   boolr)   torchTensorr6   __classcell__r1   r1   r/   r2   r   -   s0    	%r   c                       sv   e Zd ZdZ			ddededB dedB deddf
 fd	d
Zdej	dej	dej	dB de
ej	ej	f fddZ  ZS )ArceeDecoderLayerzLTransformer decoder block for Arcee, with self-attention and
    ReLU^2 MLP.Nr   configcache_configr    r"   r$   c           	         s   t    |j| _t|dd}t|ddpt|dd}|}t|dr%|j}ddlm} ||| j|jt|d	|j|||||| d
t|ddd| _	t
| j|j|j|t|dd| dd| _t|j|jd| _t|j|jd| _d S )Nmax_position_embeddingsi    attention_biasFr!   qkv_biasr   )LlamaAttentionnum_key_value_headsz
.self_attn	attn_typedecoder)rB   r   	num_headsnum_kv_headsrD   r    r!   bias_o_projrC   r"   rI   mlp_biasz.mlp)r   r   r   r    r!   r"   eps)r(   r)   r   getattrhasattrrF    vllm.model_executor.models.llamarG   num_attention_heads	self_attnr   r   r   mlpr
   rms_norm_epsinput_layernormpost_attention_layernorm)	r.   rB   rC   r    r"   rD   rE   rM   rG   r/   r1   r2   r)   a   sN   


	zArceeDecoderLayer.__init__	positionshidden_statesresidualc                 C   sX   |d u r|}|  |}n|  ||\}}| j||d}| ||\}}| |}||fS )N)rZ   r[   )rX   rU   rY   rV   )r.   rZ   r[   r\   r1   r1   r2   r6      s   
zArceeDecoderLayer.forward)NNr   )r7   r8   r9   r:   r   r   r<   r)   r>   r?   tupler6   r@   r1   r1   r/   r2   rA   ]   s2    7rA   c                       s   e Zd ZdZdeddedeej ddf fdd	Z	d
e
jde
jfddZ	dd
e
jdB de
jdedB de
jdB de
jeB ee
jee
j f B f
ddZdeeee
jf  dee fddZ  ZS )
ArceeModelzeThe transformer model backbone for Arcee (embedding layer + stacked
    decoder blocks + final norm).r   )r"   
layer_typer"   r_   r$   Nc                   s   t    |jj|j |j| _| _j| _t j	s$j
r/t jr/t| jjd| _nt | _tj fdd| dd\| _| _| _t jrYtjjd| _nt | _t | _tddgj| _d S )	N)r    c                    s    | dS )N)rB   rC   r    r"   r1   r"   rC   rB   r_   r    r1   r2   <lambda>   s    z%ArceeModel.__init__.<locals>.<lambda>z.layersr`   rO   r[   r\   )r(   r)   model_config	hf_configrC   r    rB   
vocab_sizer   is_first_ranktie_word_embeddingsis_last_rankr   r   embed_tokensr   r   num_hidden_layersstart_layer	end_layerlayersr
   rW   normr]   aux_hidden_state_layersr   make_empty_intermediate_tensors)r.   vllm_configr"   r_   r/   ra   r2   r)      s<   



zArceeModel.__init__	input_idsc                 C   s
   |  |S r4   )ri   r.   rr   r1   r1   r2   embed_input_ids   s   
zArceeModel.embed_input_idsrZ   intermediate_tensorsinputs_embedsc                 C   s   t  jr|d ur
|n| |}d }n|d usJ d|d }|d }g }tt| j| j| jD ]\}}	|| jv r?|	||  |	|||\}}q/t  j
sSt||dS | ||\}}
t|dkre||fS |S )NzAIntermediateTensors must be provided for non-first pipeline ranksr[   r\   )r[   r\   r   )r   rf   rt   	enumerater   rm   rk   rl   ro   appendrh   r   rn   len)r.   rr   rZ   ru   rv   r[   r\   aux_hidden_statesidxlayerr5   r1   r1   r2   r6      s8   

zArceeModel.forwardweightsc                 C   s  g d}t |  }t }|D ]\}}d|v rqd|v s d|v r!q| jdurO| j| }rO|| }t|dt}	| dkr@|n|d }|	|| || qd|v sWd	|v rct	||}
|
du raq|
}d
}|D ]:\}}}||vrqqg|
||}|dr||vrd} nt|| rd} n|| }|j}	|	||| || d} |rq|dr||vrqt|| rq|| }t|dt}	|	|| || q|S )z:Load weights, mapping q/k/v projections to fused qkv_proj.))	.qkv_projz.q_projq)r~   z.k_projk)r~   z.v_projvzrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedNweight_loaderr   scale
zero_pointFz.biasT)dictnamed_parameterssetr    get_cache_scalerQ   r   dimaddr   replaceendswithr   r   )r.   r}   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
scale_nameparamr   remapped_namemapped
param_nameweight_nameshard_idr1   r1   r2   load_weights  sf   







zArceeModel.load_weightsr4   )r7   r8   r9   r:   rA   r<   typer   Moduler)   r>   r?   rt   r   r]   listr6   r   r   r   r@   r1   r1   r/   r2   r^      s2    6
,+r^   c                       s   e Zd ZdZdg diZdddeddf fd	d
Z		ddejdB dejde	dB dejdB deje	B f
ddZ
dejdejdB fddZdejdejfddZdeeeejf  dee fddZ  ZS )ArceeForCausalLMzKArcee Model for causal language modeling, integrated with vLLM
    runtime.qkv_proj)q_projk_projv_projr   r`   r"   r$   Nc                   s   t    |jj}|| _t|| dd| _t jrIt	|j
|j|jt|dd| dd| _|jr:| j| jj| _t|dd}t|j
|d	| _nt | _| jj| _d S )
Nz.model)rq   r"   lm_head_biasFz.lm_head)r    r!   r"   logit_scaleg      ?)r   )r(   r)   rc   rd   rB   r^   modelr   rh   r   re   r   r    rQ   lm_headrg   tie_weightsri   r   logits_processorr   rp   )r.   rq   r"   rB   r   r/   r1   r2   r)   i  s*   


zArceeForCausalLM.__init__rr   rZ   ru   rv   c                 C   s   | j ||||d}|S )N)rr   rZ   ru   rv   )r   )r.   rr   rZ   ru   rv   model_outputr1   r1   r2   r6     s   zArceeForCausalLM.forwardr[   c                 C   s   |  | j|}|S r4   )r   r   )r.   r[   logitsr1   r1   r2   compute_logits  s   zArceeForCausalLM.compute_logitsc                 C   s   | j |S r4   )r   rt   rs   r1   r1   r2   rt     s   z ArceeForCausalLM.embed_input_idsr}   c                 C   s(   t | | jjr	dgnddgd}||S )z[Load weights into the model (delegates to inner model and handles
        tied embeddings).zlm_head.N	gate_proj)skip_prefixesskip_substrs)r   rB   rg   r   )r.   r}   loaderr1   r1   r2   r     s   
zArceeForCausalLM.load_weights)NN)r7   r8   r9   r:   packed_modules_mappingr<   r)   r>   r?   r   r6   r   rt   r   r]   r   r   r@   r1   r1   r/   r2   r   _  s*    &
,r   )-collections.abcr   	itertoolsr   typingr   r>   r   transformersr   vllm.compilation.decoratorsr   vllm.distributedr   %vllm.model_executor.layers.activationr	   $vllm.model_executor.layers.layernormr
   !vllm.model_executor.layers.linearr   r   +vllm.model_executor.layers.logits_processorr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   r   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r   r   r   rA   r^   r   r1   r1   r1   r2   <module>   s.   
	0O 3