"""Inference-only OLMo2 model compatible with HuggingFace weights."""

from collections.abc import Iterable
from functools import partial
from itertools import islice

import torch
from torch import nn
from transformers import Olmo2Config

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
from vllm.distributed.communication_op import tensor_model_parallel_all_gather
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
from vllm.distributed.utils import split_tensor_along_last_dim
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
from vllm.model_executor.models.utils import (
    AutoWeightsLoader,
    extract_layer_index,
    is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory,
    make_layers,
    maybe_prefix,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import Olmo3Config


class Olmo2Attention(nn.Module):
    """
    This is the attention block where the output is computed as
    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config
        assert isinstance(self.config, (Olmo2Config, Olmo3Config))

        hidden_size = self.config.hidden_size
        self.tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = self.config.num_attention_heads

        assert hidden_size % self.total_num_heads == 0
        assert self.total_num_heads % self.tp_size == 0

        self.num_heads = self.total_num_heads // self.tp_size
        self.total_num_kv_heads = (
            self.config.num_key_value_heads or self.total_num_heads
        )
        if self.total_num_kv_heads >= self.tp_size:
            # Number of KV heads is greater than TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % self.tp_size == 0
        else:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert self.tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size)

        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.max_position_embeddings = self.config.max_position_embeddings

        # Attention input projection. Projects x -> (q, k, v).
        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=False,
            quant_config=vllm_config.quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.tp_rank = get_tensor_model_parallel_rank()
        self.k_norm = RMSNorm(
            self.total_num_kv_heads * self.head_dim,
            eps=self.config.rms_norm_eps,
        )
        self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
        self.scaling = self.head_dim**-0.5

        layer_idx = extract_layer_index(prefix)
        sliding_window = None
        if (
            (layer_types := getattr(self.config, "layer_types", None)) is not None
            and layer_types[layer_idx] == "sliding_attention"
        ):
            sliding_window = self.config.sliding_window

        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=vllm_config.cache_config,
            quant_config=vllm_config.quant_config,
            per_layer_sliding_window=sliding_window,
            prefix=f"{prefix}.attn",
        )

        # Rotary embeddings. Sliding-window layers fall back to the default
        # (unscaled) rope parameters.
        if sliding_window is None:
            rope_parameters = self.config.rope_parameters
        else:
            rope_theta = self.config.rope_parameters["rope_theta"]
            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
        self.rotary_emb = get_rope(
            self.head_dim,
            max_position=self.max_position_embeddings,
            rope_parameters=rope_parameters,
        )

        # Attention output projection.
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=False,
            quant_config=vllm_config.quant_config,
            prefix=f"{prefix}.o_proj",
        )

    def _apply_qk_norm(
        self, q: torch.Tensor, k: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # The Q/K norms operate over the full (unsharded) hidden dimension,
        # so under tensor parallelism the shards are gathered, normalized,
        # and split back.
        if self.tp_size > 1:
            q = tensor_model_parallel_all_gather(q.contiguous())
            k = tensor_model_parallel_all_gather(k.contiguous())
        q = self.q_norm(q)
        k = self.k_norm(k)
        if self.tp_size > 1:
            splitter = partial(
                split_tensor_along_last_dim, num_partitions=self.tp_size
            )
            q = splitter(q)[self.tp_rank]
            k = splitter(k)[self.tp_rank]
        return q, k

    def forward(
        self, positions: torch.Tensor, hidden_states: torch.Tensor
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self._apply_qk_norm(q, k)
        q, k = self.rotary_emb(positions, q, k)
        attn_output = self.attn(q, k, v)
        output, _ = self.o_proj(attn_output)
        return output


class Olmo2MLP(nn.Module):
    """
    This is the MLP block where the output is computed as
    `MLP(x)` in `LN(MLP(x + LN(Attention(x))))`
    (plus another skip connection).
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        assert isinstance(config, (Olmo2Config, Olmo3Config))
        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size

        # Feed-forward input projection.
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size] * 2,
            bias=False,
            quant_config=vllm_config.quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )

        # Activation function.
        self.act_fn = SiluAndMul()

        # Feed-forward output projection.
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=vllm_config.quant_config,
            prefix=f"{prefix}.down_proj",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x


class Olmo2DecoderLayer(nn.Module):
    """
    This is a typical transformer block where the output is
    computed as `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        assert isinstance(config, (Olmo2Config, Olmo3Config))

        # Attention block.
        self.self_attn = Olmo2Attention(
            vllm_config=vllm_config, prefix=f"{prefix}.self_attn"
        )

        # MLP block.
        self.mlp = Olmo2MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp")

        # LayerNorms. OLMo2 normalizes *after* each sublayer, before the
        # residual add, rather than pre-norm as in Llama-style blocks.
        self.post_attention_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_feedforward_layernorm = RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self, positions: torch.Tensor, hidden_states: torch.Tensor
    ) -> torch.Tensor:
        # Attention block.
        residual = hidden_states
        hidden_states = self.self_attn(positions, hidden_states)
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = hidden_states + residual

        # MLP block.
        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states


@support_torch_compile
class Olmo2Model(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config
        assert isinstance(self.config, (Olmo2Config, Olmo3Config))

        self.embed_tokens = VocabParallelEmbedding(
            self.config.vocab_size,
            self.config.hidden_size,
            prefix=f"{prefix}.embed_tokens",
        )
        self.start_layer, self.end_layer, self.layers = make_layers(
            self.config.num_hidden_layers,
            lambda prefix: Olmo2DecoderLayer(vllm_config=vllm_config, prefix=prefix),
            prefix=f"{prefix}.layers",
        )
        self.norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states"], self.config.hidden_size
            )
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        """
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.embed_input_ids(input_ids)
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            assert isinstance(hidden_states, torch.Tensor)

        # Apply blocks one-by-one.
        for layer in islice(self.layers, self.start_layer, self.end_layer):
            # shape: (batch_size, seq_len, d_model)
            hidden_states = layer(positions, hidden_states)

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({"hidden_states": hidden_states})

        # Apply final layer norm.
        hidden_states = self.norm(hidden_states)
        return hidden_states

    def load_weights(
        self, weights: Iterable[tuple[str, torch.Tensor]]
    ) -> set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if is_pp_missing_parameter(name, self):
                continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params


class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
    """
    Extremely barebones HF model wrapper.
    """

    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        assert isinstance(config, (Olmo2Config, Olmo3Config))
        self.config = config
        self.model = Olmo2Model(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )
        if config.tie_word_embeddings:
            self.lm_head = self.model.embed_tokens
        else:
            self.lm_head = ParallelLMHead(
                config.vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
                quant_config=vllm_config.quant_config,
                prefix=maybe_prefix(prefix, "lm_head"),
            )
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )

    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.embed_input_ids(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits

    def load_weights(
        self, weights: Iterable[tuple[str, torch.Tensor]]
    ) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(
                ["lm_head.weight"] if self.config.tie_word_embeddings else None
            ),
        )
        return loader.load_weights(weights)