o
    i8                     @   sr  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. G dd dej/Z0G dd dej/Z1G dd dej/Z2e
G dd dej/Z3G dd dej/e(e'Z4dS ) z>Inference-only OLMo model compatible with HuggingFace weights.    )Iterable)isliceN)nn)
OlmoConfig)support_torch_compile)CacheConfig
VllmConfig)get_pp_group$get_tensor_model_parallel_world_size)
SiluAndMul)	Attention)MergedColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)get_rope)ParallelLMHeadVocabParallelEmbedding)default_weight_loader)IntermediateTensors   )SupportsLoRA
SupportsPP)AutoWeightsLoaderis_pp_missing_parameter'make_empty_intermediate_tensors_factorymake_layersmaybe_prefixc                	       s^   e Zd ZdZ			ddededB dedB def fdd	Zd
e	j
de	j
de	j
fddZ  ZS )OlmoAttentionz
    This is the attention block where the output is computed as
    `Attention(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    N configcache_configquant_configprefixc                    s  t    || _|j| _t }|j| _| j| j dksJ | j| dks&J | j| | _| j| j | _|j	| _	|j
| _
t| j| j| j|j|| dd| _t| j| j	|jd| _| jd | _t| j| j| j||| dd| _t| j| j|j|| dd| _d S )	Nr   z	.qkv_projbiasr#   r$   )max_positionrope_parametersg      z.attn)scaler"   r#   r$   z.o_proj)super__init__r!   hidden_sizer
   num_attention_headstotal_num_heads	num_headshead_dimmax_position_embeddingsclip_qkvr   attention_biasqkv_projr   r(   
rotary_embscalingr   attnr   o_proj)selfr!   r"   r#   r$    tensor_model_parallel_world_size	__class__ U/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/olmo.pyr+   G   sP   


zOlmoAttention.__init__	positionshidden_statesreturnc           
      C   sr   |  |\}}| jd ur|j| j | jd |jddd\}}}| |||\}}| |||}| |\}	}|	S )N)minmax   )chunksdim)r4   r2   clamp_chunkr5   r7   r8   )
r9   r?   r@   qkv_qkvattn_outputoutputr=   r=   r>   forward   s   
zOlmoAttention.forwardNNr    )__name__
__module____qualname____doc__r   r   r   strr+   torchTensorrQ   __classcell__r=   r=   r;   r>   r   @   s*    	8r   c                       sN   e Zd ZdZ		ddededB def fddZd	ej	d
ej	fddZ
  ZS )OlmoMLPz
    This is the MLP block where the output is computed as
    `MLP(LN(x))` in `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    Nr    r!   r#   r$   c                    sn   t    || _|j| _|j| _t| j| jgd d|| dd| _t | _t	| j| jd|| dd| _
d S )N   Fz.gate_up_projr%   z
.down_proj)r*   r+   r!   r,   intermediate_sizer   gate_up_projr   act_fnr   	down_proj)r9   r!   r#   r$   r;   r=   r>   r+      s&   

	zOlmoMLP.__init__xrA   c                 C   s*   |  |\}}| |}| |\}}|S N)r^   r_   r`   )r9   ra   gate_uprK   r=   r=   r>   rQ      s   
zOlmoMLP.forward)Nr    )rS   rT   rU   rV   r   r   rW   r+   rX   rY   rQ   rZ   r=   r=   r;   r>   r[      s     	 r[   c                
       sv   e Zd ZdZ			ddededB dedB def fdd	Zd
e	j
de	j
dee	j
ee	j
e	j
f dB f fddZ  ZS )OlmoDecoderLayerz
    This is a typical transformer block where the output is
    computed as `MLP(LN(x + Attention(LN(x))))`
    (plus another skip connection).
    Nr    r!   r"   r#   r$   c                    sd   t    t|||| dd| _t||| dd| _tj|jddd| _	tj|jddd| _
d S )Nz
.self_attnr$   z.mlpFelementwise_affiner&   )r*   r+   r   	self_attnr[   mlpr   	LayerNormr,   input_layernormpost_attention_layernorm)r9   r!   r"   r#   r$   r;   r=   r>   r+      s   
zOlmoDecoderLayer.__init__r?   r@   rA   c                 C   sF   |}|  |}| ||}|| }|}| |}| |}|| }|S rb   )rk   rh   rl   ri   )r9   r?   r@   residualr=   r=   r>   rQ      s   


zOlmoDecoderLayer.forwardrR   )rS   rT   rU   rV   r   r   r   rW   r+   rX   rY   tuplerQ   rZ   r=   r=   r;   r>   rd      s*    	rd   c                       s   e Zd Zdddedef fddZdejdejfd	d
Z	ddejdB dejde	dB dejdB deje	B f
ddZ
deeeejf  dee fddZ  ZS )	OlmoModelr    re   vllm_configr$   c                   s   t    |jj|j |j| _tjj	| _
tj fdd| dd\| _| _| _tjj	ddd| _tdgj	| _d S )Nc                    s   t  | dS )Nre   )rd   re   r"   r!   r#   r=   r>   <lambda>  s    z$OlmoModel.__init__.<locals>.<lambda>z.layersre   Frf   r@   )r*   r+   model_config	hf_configr"   r#   r!   r   
vocab_sizer,   embed_tokensr   num_hidden_layersstart_layer	end_layerlayersr   rj   normr   make_empty_intermediate_tensors)r9   rp   r$   r;   rq   r>   r+      s&   

zOlmoModel.__init__	input_idsrA   c                 C   s
   |  |S rb   )rv   r9   r}   r=   r=   r>   embed_input_ids  s   
zOlmoModel.embed_input_idsNr?   intermediate_tensorsinputs_embedsc                 C   sz   t  jr|dur|}n| |}n
|dusJ |d }t| j| j| jD ]}|||}q$t  js6td|iS | 	|}|S )zN
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        Nr@   )
r	   is_first_rankr   r   rz   rx   ry   is_last_rankr   r{   )r9   r}   r?   r   r   r@   layerr=   r=   r>   rQ     s   

zOlmoModel.forwardweightsc                 C   s   g d}t | jdd}t }|D ]Y\}}|D ].\}}}	||vr!q|||}|dr1||vr1qt|| r7q|| }
|
j}||
||	  n|drP||vrPqt|| rVq|| }
t|
dt}||
| |	| q|S )N))r4   q_projrL   )r4   k_projrM   )r4   v_projrN   )r^   	gate_projr   )r^   up_projr   F)remove_duplicatez.biasweight_loader)
dictnamed_parameterssetreplaceendswithr   r   getattrr   add)r9   r   stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr   r=   r=   r>   load_weights/  s2   


zOlmoModel.load_weightsrb   )rS   rT   rU   r   rW   r+   rX   rY   r   r   rQ   r   rn   r   r   rZ   r=   r=   r;   r>   ro      s     
,ro   c                       s   e Zd ZdZg dddgdZddded	ef fd
dZdej	dej	fddZ
		ddej	dB dej	dedB dej	dB dej	eB f
ddZdej	dej	dB fddZdeeeej	f  dee fddZ  ZS )OlmoForCausalLMz/
    Extremely barebones HF model wrapper.
    )r   r   r   r   r   )r4   r^   r    re   rp   r$   c                   sz   t    |jj}|j}|| _t|t|dd| _|j	r"| jj
| _nt|j|j|t|dd| _t|j| _| jj| _d S )Nmodel)rp   r$   lm_head)r#   r$   )r*   r+   rs   rt   r#   r!   ro   r   r   tie_word_embeddingsrv   r   r   ru   r,   r   logits_processorr|   )r9   rp   r$   r!   r#   r;   r=   r>   r+   f  s$   

zOlmoForCausalLM.__init__r}   rA   c                 C   s   | j |S rb   )r   r   r~   r=   r=   r>   r   |  s   zOlmoForCausalLM.embed_input_idsNr?   r   r   c                 C   s   | j ||||d}|S )N)r}   r?   r   r   )r   )r9   r}   r?   r   r   r@   r=   r=   r>   rQ     s   zOlmoForCausalLM.forwardr@   c                 C   s   |  | j|}|S rb   )r   r   )r9   r@   logitsr=   r=   r>   compute_logits  s   zOlmoForCausalLM.compute_logitsr   c                 C   s$   t | | jjr	dgnd d}||S )Nzlm_head.weight)skip_prefixes)r   r!   r   r   )r9   r   loaderr=   r=   r>   r     s
   
zOlmoForCausalLM.load_weights)NN)rS   rT   rU   rV   packed_modules_mappingr   rW   r+   rX   rY   r   r   rQ   r   r   rn   r   r   rZ   r=   r=   r;   r>   r   U  s8    

,r   )5rV   collections.abcr   	itertoolsr   rX   r   transformersr   vllm.compilation.decoratorsr   vllm.configr   r   vllm.distributedr	   r
   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   +vllm.model_executor.layers.rotary_embeddingr   3vllm.model_executor.layers.vocab_parallel_embeddingr   r   -vllm.model_executor.model_loader.weight_utilsr   vllm.sequencer   
interfacesr   r   utilsr   r   r   r   r   Moduler   r[   rd   ro   r   r=   r=   r=   r>   <module>   s4   	N12c