"""Inference-only OLMo model compatible with HuggingFace weights."""

from typing import Iterable, Optional, Tuple

import torch
from torch import nn
from transformers import OlmoConfig

from sglang.srt.distributed import get_tensor_model_parallel_world_size
from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix, make_layers


class OlmoAttention(nn.Module):
    """
    This is the attention block where the output is computed as
    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
    (plus another skip connection).
    """

    def __init__(
        self,
        config: OlmoConfig,
        layer_id: int = 0,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = config.num_attention_heads

        assert self.hidden_size % self.total_num_heads == 0
        assert self.total_num_heads % tensor_model_parallel_world_size == 0

        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
        self.head_dim = self.hidden_size // self.total_num_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.clip_qkv = config.clip_qkv

        # Attention input projection. Projects x -> (q, k, v).
        self.qkv_proj = QKVParallelLinear(
            self.hidden_size,
            self.head_dim,
            self.total_num_heads,
            bias=config.attention_bias,
            quant_config=quant_config,
            prefix=add_prefix("qkv_proj", prefix),
        )

        # Rotary embeddings.
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=self.rope_theta,
        )
        self.scaling = self.head_dim**-0.5
        self.attn = RadixAttention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_heads,
            layer_id=layer_id,
            quant_config=quant_config,
            prefix=add_prefix("attn", prefix),
        )

        # Attention output projection.
        self.o_proj = RowParallelLinear(
            self.hidden_size,
            self.hidden_size,
            bias=config.attention_bias,
            quant_config=quant_config,
            prefix=add_prefix("o_proj", prefix),
        )
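    # A minimal standalone sketch of the ``clip_qkv`` clamping applied in
    # ``forward`` below (the 8.0 threshold is illustrative; real OLMo
    # checkpoints set ``clip_qkv`` in their config):
    #
    #     import torch
    #     qkv = torch.tensor([-12.0, 0.5, 9.0])
    #     clip_qkv = 8.0
    #     qkv.clamp_(min=-clip_qkv, max=clip_qkv)  # -> tensor([-8., 0.5, 8.])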
	zOlmoAttention.__init__	positionshidden_statesforward_batchreturnc                 C   st   |  |\}}| jd ur|j| j | jd |jddd\}}}| |||\}}| ||||}	| |	\}
}|
S )N)minmax   )chunksdim)r   r-   clamp_chunkr/   r!   r#   )r1   r7   r8   r9   qkv_qkvattn_outputoutputr5   r5   r6   forwardm   s   
zOlmoAttention.forwardr   Nr   )__name__
__module____qualname____doc__r   intr   r   strr%   torchTensorr   rJ   __classcell__r5   r5   r3   r6   r   -   s.    	9r   c                       sN   e Zd ZdZ		ddedee def fddZd	e	j
d
e	j
fddZ  ZS )OlmoMLPz
    This is the MLP block where the output is computed as
    ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
    (plus another skip connection).
    """

    def __init__(
        self,
        config: OlmoConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Feed-forward input projection: x -> (gate, up), fused in one GEMM.
        self.gate_up_proj = MergedColumnParallelLinear(
            self.hidden_size,
            [self.intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("gate_up_proj", prefix),
        )

        # Activation function.
        self.act_fn = SiluAndMul()

        # Feed-forward output projection.
        self.down_proj = RowParallelLinear(
            self.intermediate_size,
            self.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("down_proj", prefix),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x
zOlmoMLP.forwardNr   )rL   rM   rN   rO   r   r   r   rQ   r%   rR   rS   rJ   rT   r5   r5   r3   r6   rU   }   s     	 rU   c                       sv   e Zd ZdZ			ddededee def fd	d
Z	de
jde
jdedee
jeee
je
jf  f fddZ  ZS )OlmoDecoderLayerz
    This is a typical transformer block where the output is
    computed as ``MLP(LN(x + Attention(LN(x))))``
    (plus another skip connection).
    """

    def __init__(
        self,
        config: OlmoConfig,
        layer_id: int = 0,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        # Attention block.
        self.self_attn = OlmoAttention(
            config, layer_id, quant_config, prefix=add_prefix("self_attn", prefix)
        )

        # MLP block.
        self.mlp = OlmoMLP(config, quant_config, prefix=add_prefix("mlp", prefix))

        # Non-parametric LayerNorm: no learnable scale or bias.
        self.input_layernorm = nn.LayerNorm(
            config.hidden_size, elementwise_affine=False, bias=False
        )
        self.post_attention_layernorm = nn.LayerNorm(
            config.hidden_size, elementwise_affine=False, bias=False
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        forward_batch: ForwardBatch,
    ) -> torch.Tensor:
        # Attention block.
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
        hidden_states = hidden_states + residual

        # MLP block.
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual
        return hidden_states


class OlmoModel(nn.Module):

    def __init__(
        self,
        config: OlmoConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            prefix=add_prefix("embed_tokens", prefix),
        )
        self.layers = make_layers(
            config.num_hidden_layers,
            lambda idx, prefix: OlmoDecoderLayer(
                layer_id=idx,
                config=config,
                quant_config=quant_config,
                prefix=prefix,
            ),
            prefix=add_prefix("layers", prefix),
        )
        self.norm = nn.LayerNorm(
            config.hidden_size, elementwise_affine=False, bias=False
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        """
        if input_embeds is None:
            hidden_states = self.embed_tokens(input_ids)
        else:
            hidden_states = input_embeds

        # Apply transformer blocks one by one.
        for layer_id, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer(positions, hidden_states, forward_batch)

        # Apply the final (non-parametric) layer norm.
        hidden_states = self.norm(hidden_states)
        return hidden_states
zOlmoModel.forwardr_   r]   )rL   rM   rN   r   r   r   rQ   r%   rR   rS   r   rJ   rT   r5   r5   r3   r6   rj      s,    !rj   c                       s   e Zd ZdZ		ddedee def fddZe	
 	dd	e	jd
e	jdede	jde	jf
ddZdeeee	jf  fddZ  ZS )OlmoForCausalLMz/
    Extremely barebones HF model wrapper.
    """

    def __init__(
        self,
        config: OlmoConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.config = config
        self.model = OlmoModel(config, quant_config, prefix=add_prefix("model", prefix))
        if config.tie_word_embeddings:
            # Reuse the input embedding matrix as the output projection.
            self.lm_head = self.model.embed_tokens
        else:
            self.unpadded_vocab_size = config.vocab_size
            self.lm_head = ParallelLMHead(
                self.unpadded_vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
                quant_config=quant_config,
                prefix=add_prefix("lm_head", prefix),
            )
        self.logits_processor = LogitsProcessor(config)

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
    ) -> torch.Tensor:
        hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            forward_batch=forward_batch,
            input_embeds=input_embeds,
        )
        return self.logits_processor(
            input_ids, hidden_states, self.lm_head, forward_batch
        )
zOlmoForCausalLM.forwardweightsc                 C   s   g d}t | jdd}|D ]_\}}d|v rqd|v sd|v r q| jjr)d|v r)q|D ](\}}}||vr5q+|||}|drE||vrEq+|| }	|	j}
|
|	||  n|dr^||vr^q|| }	t|	d	t}
|
|	| qd S )
N))r   q_projrE   )r   k_projrF   )r   v_projrG   )rW   	gate_projr   )rW   up_proj   F)remove_duplicatezrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightz.biasweight_loader)	dictnamed_parametersr   r{   replaceendswithr   getattrr   )r1   r~   stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idparamr   r5   r5   r6   load_weightsT  s4   
zOlmoForCausalLM.load_weightsr_   r]   )rL   rM   rN   rO   r   r   r   rQ   r%   rR   no_gradrS   r   rJ   r   r   r   rT   r5   r5   r3   r6   rw   '  s2    $rw   )*rO   typingr   r   r   rR   r   transformersr   sglang.srt.distributedr   sglang.srt.layers.activationr   sglang.srt.layers.linearr	   r
   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r   Moduler   rU   r`   rj   rw   
EntryClassr5   r5   r5   r6   <module>   s,   P1:?T
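# A minimal usage sketch (illustrative): sglang's model loader discovers this
# wrapper through ``EntryClass`` when pointed at an OLMo checkpoint, so serving
# one typically needs no direct import of this module, e.g.:
#
#     import sglang as sgl
#     llm = sgl.Engine(model_path="allenai/OLMo-1B-hf")
#     print(llm.generate("The capital of France is"))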