"""Inference-only OLMo2 model compatible with HuggingFace weights."""

from functools import partial
from typing import Iterable, Optional, Tuple

import torch
from torch import nn
from transformers import PretrainedConfig

from sglang.srt.distributed import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
    split_tensor_along_last_dim,
    tensor_model_parallel_all_gather,
)
from sglang.srt.layers.activation import SiluAndMul
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.layers.rotary_embedding import get_rope
from sglang.srt.layers.vocab_parallel_embedding import (
    ParallelLMHead,
    VocabParallelEmbedding,
)
from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.utils import add_prefix, is_cuda, make_layers

_is_cuda = is_cuda()


def get_attention_sliding_window_size(config):
    if hasattr(config, "sliding_window"):
        return config.sliding_window - 1
    return None
j f
 fd
dZde	jde	jdee	je	jf fddZde	jde	jdede	jfddZ  ZS )Olmo2Attentionz
    This is the attention block where the output is computed as
    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
    (plus another skip connection).
    r   N r    layer_idquant_configprefix
alt_streamc                    s  t    || _|j| _t | _|j| _| j| j dksJ | j| j dks(J | j| j | _| jj	| _
| j
| jkrE| j
| j dksDJ n
| j| j
 dksOJ td| j
| j | _| j| j | _| j| j | _| j| j | _|j| _|j| _t| j| j| j| j
|j|td|d| _t | _|| _t| j
| j | jjd| _t| jj| jjd| _d }t| jdd  }d ur|| dkrt| j}|d u r| jjndd	i| _t | j| j| j| j| jd
| _!| jd | _"t#| j| j| j"| j|||td|d| _$t%| j| j | j|j|td|d| _&d S )Nr   r   qkv_proj)total_num_kv_headsbiasr'   r(   epslayer_typessliding_attention	rope_typedefault)
rotary_dimmax_positionbaserope_scalingg      attn)num_kv_headsr&   sliding_window_sizer'   r(   o_projr,   r'   r(   )'super__init__r    hidden_sizer	   tp_sizenum_attention_headstotal_num_heads	num_headsnum_key_value_headsr+   maxr8   head_dimq_sizekv_sizemax_position_embeddings
rope_thetar   attention_biasr   r*   r   tp_rankr)   r   rms_norm_epsk_normq_normgetattrr#   r6   r   
rotary_embscalingr   r7   r   r:   )selfr    r&   r'   r(   r)   r   r/   	__class__r!   r"   r=   D   s   

	


zOlmo2Attention.__init__qkreturnc           	      C   s,  | j dkrt| }t| }| jd urlt rltj }| j| |j	}|j	}|
d|d }| |}tj| j |
d|d }| |}W d    n1 sVw   Y  || j ||}||}n| j|}| j|}| j dkrtt| j d}||| j }||| j }||fS )Nr   )num_partitions)r?   r   
contiguousr)   r   torchcudacurrent_streamwait_streamshapereshaperN   streamrM   viewforward_nativer   r
   rK   )	rR   rU   rV   r]   q_shapek_shape	q_by_last	k_by_lastsplitterr!   r!   r"   _apply_qk_norm   s0   




zOlmo2Attention._apply_qk_norm	positionshidden_statesforward_batchc                 C   sr   |  |\}}|j| j| j| jgdd\}}}| ||\}}| |||\}}| ||||}	| |	\}
}|
S )NrX   )dim)r*   splitrF   rG   ri   rP   r7   r:   )rR   rj   rk   rl   qkv_rU   rV   vattn_outputoutputr!   r!   r"   forward   s    zOlmo2Attention.forwardr   Nr%   N)__name__
__module____qualname____doc__r   intr   r   strr[   r\   Streamr=   Tensorr   ri   r   rt   __classcell__r!   r!   rS   r"   r$   =   sB    	
`
#r$   c                       sN   e Zd ZdZ		ddedee def fddZd	e	j
class Olmo2MLP(nn.Module):
    """
    This is the MLP block where the output is computed as
    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
    (plus another skip connection).
    """

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Feed-forward input projection (gate and up fused into one GEMM).
        self.gate_up_proj = MergedColumnParallelLinear(
            self.hidden_size,
            [self.intermediate_size] * 2,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("gate_up_proj", prefix),
        )

        # Activation function.
        self.act_fn = SiluAndMul()

        # Feed-forward output projection.
        self.down_proj = RowParallelLinear(
            self.intermediate_size,
            self.hidden_size,
            bias=False,
            quant_config=quant_config,
            prefix=add_prefix("down_proj", prefix),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate_up, _ = self.gate_up_proj(x)
        x = self.act_fn(gate_up)
        x, _ = self.down_proj(x)
        return x
j f
 fd
dZde	jde	jdede	jfddZ  ZS )Olmo2DecoderLayerz
    This is a typical transformer block where the output is
    computed as ``MLP(LN(x + Attention(LN(x))))``
    (plus another skip connection).
    r   Nr%   r    r&   r'   r(   r)   c                    sn   t    || _|| _t|||td||d| _t||td|d| _t	|j
|jd| _t	|j
|jd| _d S )N	self_attnr(   r)   mlpr(   r-   )r<   r=   r&   r)   r$   r   r   r   r   r   r>   rL   post_attention_layernormpost_feedforward_layernorm)rR   r    r&   r'   r(   r)   rS   r!   r"   r=     s"   
	zOlmo2DecoderLayer.__init__rj   rk   rl   rW   c                 C   sH   |}|  |||}| |}|| }|}| |}| |}|| }|S r   )r   r   r   r   )rR   rj   rk   rl   residualr!   r!   r"   rt   .  s   


zOlmo2DecoderLayer.forwardru   )rv   rw   rx   ry   r   rz   r   r   r{   r[   r\   r|   r=   r}   r   rt   r~   r!   r!   rS   r"   r     s4    	
 r   c                       sp   e Zd Z			ddedee dedeejj	 f fddZ
	dd	ejd
ejdedeej dejf
ddZ  ZS )
class Olmo2Model(nn.Module):

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
        alt_stream: Optional[torch.cuda.Stream] = None,
    ) -> None:
        super().__init__()
        self.config = config

        # Share one secondary CUDA stream across all layers for QK-norm overlap.
        if alt_stream is None and _is_cuda:
            alt_stream = torch.cuda.Stream()
        self.alt_stream = alt_stream

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            prefix=add_prefix("embed_tokens", prefix),
        )
        self.layers = make_layers(
            config.num_hidden_layers,
            lambda idx, prefix: Olmo2DecoderLayer(
                config=config,
                layer_id=idx,
                quant_config=quant_config,
                prefix=prefix,
                alt_stream=self.alt_stream,
            ),
            prefix=add_prefix("layers", prefix),
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
        input_embeds: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
        """
        if input_embeds is None:
            hidden_states = self.embed_tokens(input_ids)
        else:
            hidden_states = input_embeds

        # Apply the decoder layers one by one.
        for layer_id, decoder_layer in enumerate(self.layers):
            hidden_states = decoder_layer(positions, hidden_states, forward_batch)

        # Apply the final layer norm.
        hidden_states = self.norm(hidden_states)
        return hidden_states
$r   c                       s   e Zd ZdZ			ddedee dedeej	j
 f fdd	Zd
d Ze 	ddejdejdedejdejf
ddZdeeeejf  fddZ  ZS )Olmo2ForCausalLMz/
    Extremely barebones HF model wrapper.
    Nr%   r    r'   r(   r)   c                    sp   t    || _t||td||d| _|jr| jj| _n|j	| _
t| j
|j|j	|td|d| _t|| _d S )Nmodelr   lm_head)org_num_embeddingsr'   r(   )r<   r=   r    r   r   r   tie_word_embeddingsr   r   r   unpadded_vocab_sizer   r>   r   logits_processorr   rS   r!   r"   r=     s&   
zOlmo2ForCausalLM.__init__c                 C   s
   t | jS r   )r#   r    )rR   r!   r!   r"   r#     s   
z2Olmo2ForCausalLM.get_attention_sliding_window_sizer   rj   rl   r   rW   c                 C   s$   | j ||||d}| ||| j|S )N)r   rj   rl   r   )r   r   r   )rR   r   rj   rl   r   rk   r!   r!   r"   rt     s   
zOlmo2ForCausalLM.forwardweightsc                 C   s   g d}t | jdd}|D ]_\}}d|v rqd|v sd|v r q| jjr)d|v r)q|D ](\}}}||vr5q+|||}|drE||vrEq+|| }	|	j}
|
|	||  n|dr^||vr^q|| }	t|	d	t}
|
|	| qd S )
N))r*   q_projrU   )r*   k_projrV   )r*   v_projrq   )r   	gate_projr   )r   up_projr   F)remove_duplicatezrotary_emb.inv_freqzrotary_emb.cos_cachedzrotary_emb.sin_cachedzlm_head.weightz.biasweight_loader)	dictnamed_parametersr    r   replaceendswithr   rO   r   )rR   r   stacked_params_mappingparams_dictnameloaded_weight
param_nameweight_nameshard_idparamr   r!   r!   r"   load_weights  s4   
zOlmo2ForCausalLM.load_weightsr   r   )rv   rw   rx   ry   r   r   r   r{   r[   r\   r|   r=   r#   no_gradr}   r   rt   r   r   r   r~   r!   r!   rS   r"   r     s:    
$r   )6ry   	functoolsr   typingr   r   r   r[   r   transformersr   sglang.srt.distributedr   r	   r
   r   sglang.srt.layers.activationr   sglang.srt.layers.layernormr   sglang.srt.layers.linearr   r   r   "sglang.srt.layers.logits_processorr   *sglang.srt.layers.quantization.base_configr   !sglang.srt.layers.radix_attentionr   "sglang.srt.layers.rotary_embeddingr   *sglang.srt.layers.vocab_parallel_embeddingr   r   +sglang.srt.model_executor.cuda_graph_runnerr   ,sglang.srt.model_executor.forward_batch_infor   $sglang.srt.model_loader.weight_utilsr   sglang.srt.utilsr   r   r   r   r#   Moduler$   r   r   r   r   
EntryClassr!   r!   r!   r"   <module>   s8    1;B`