o
    eiX                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, edG dd dej-Z.dej/de0dej/fddZ1	d;dej-dej/d ej/d!ej/d"ej/dB d#e2d$e2d%e"e fd&d'Z3d<d(d)Z4d*d+ Z5ee4G d,d- d-ej-Z6G d.d/ d/ej-Z7G d0d1 d1eZ8G d2d3 d3ej-Z9e$G d4d5 d5e Z:e$G d6d7 d7e:Z;e$G d8d9 d9e:eZ<g d:Z=dS )=    )Callable)OptionalN)TransformersKwargs   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )Olmo3ConfigRMSNormc                       s@   e Zd Zddeddf fddZdejfddZd	d
 Z  Z	S )Olmo3RMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z;
        Olmo3RMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer    	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/olmo3/modeling_olmo3.pyr#   /   s   

zOlmo3RMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j| |S )N   T)keepdim)	dtypetor&   float32powmeanrsqrtr)   r(   )r*   hidden_statesinput_dtypevariancer.   r.   r/   forward7   s
   zOlmo3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler(   shaper)   )r*   r.   r.   r/   
extra_repr>   s   zOlmo3RMSNorm.extra_repr)r   )
__name__
__module____qualname__floatr#   r&   Tensorr<   r?   __classcell__r.   r.   r,   r/   r   -   s    r   r9   n_repr!   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r>   expandreshape)r9   rF   batchnum_key_value_headsslenhead_dimr.   r.   r/   	repeat_kvB   s
   0rM           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr0   r   r1   )dimr3   )ptrainingr   )rM   num_key_value_groupsr&   matmul	transposer$   
functionalsoftmaxr5   r4   r3   rU   rY   
contiguous)rO   rP   rQ   rR   rS   rT   rU   rV   
key_statesvalue_statesattn_weightsattn_outputr.   r.   r/   eager_attention_forwardN   s   
rd   c           	      C   s^   | j |j }}||}||}| | t| |  }|| t||  }||||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r3   	unsqueezerotate_halfr4   )	qkcossinunsqueeze_dimq_typek_typeq_embedk_embedr.   r.   r/   apply_rotary_pos_embg   s   

rp   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr1   r0   rW   )r>   r&   cat)xx1x2r.   r.   r/   rf      s   rf   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB f fddZ  ZS )Olmo3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s2  t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _t|j| j |j| _t|j| j |j| _|jd usJ |j| | _| jdkr|j| _d S d | _d S )NrL   g      Tbiassliding_attention)r"   r#   rw   rx   getattrr+   num_attention_headsrL   rJ   rZ   rT   attention_dropout	is_causalr$   Linearattention_biasq_projk_projv_projo_projr   rms_norm_epsq_normk_normlayer_typesattention_typesliding_windowr*   rw   rx   r,   r.   r/   r#      s2   
 zOlmo3Attention.__init__Nr9   position_embeddingsrS   past_key_valuescache_positionrV   r!   c                 K   s4  |j d d }g |d| jR }| | |}	| | |}
| |}|	|dd}	|
|dd}
||dd}|\}}t	|	|
||\}	}
|d urc|||d}|
|
|| j|\}
}t| jjt}|| |	|
||f| jswdn| j| j| jd|\}}|jg |dR   }| |}||fS )Nr1   r   r0   )rj   ri   r   rN   )rU   rT   r   )r>   rL   r   r   r   r   r   viewr\   rp   updaterx   r   get_interfacerw   _attn_implementationrd   rY   r~   rT   r   rH   r_   r   )r*   r9   r   rS   r   r   rV   input_shapehidden_shapequery_statesr`   ra   ri   rj   cache_kwargsattention_interfacerc   rb   r.   r.   r/   r<      s@   	
	

zOlmo3Attention.forward)NN)r@   rA   rB   __doc__r   intr#   r&   rD   r=   r   
LongTensorr   r   r<   rE   r.   r.   r,   r/   rv      s(    !rv   c                       s$   e Zd Z fddZdd Z  ZS )Olmo3MLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFry   )r"   r#   rw   r+   intermediate_sizer$   r   	gate_projup_proj	down_projr   
hidden_actact_fnr*   rw   r,   r.   r/   r#      s   
zOlmo3MLP.__init__c                 C   s$   |  | | || | }|S N)r   r   r   r   )r*   rs   r   r.   r.   r/   r<      s    zOlmo3MLP.forward)r@   rA   rB   r#   r<   rE   r.   r.   r,   r/   r      s    
r   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )Olmo3DecoderLayerrw   rx   c                    sR   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )N)rw   rx   r    )r"   r#   r+   rv   	self_attnr   mlpr   r   post_attention_layernormpost_feedforward_layernormr   r,   r.   r/   r#      s   

zOlmo3DecoderLayer.__init__NFr9   rS   position_idsr   	use_cacher   r   rV   r!   c              
   K   s^   |}	| j d|||||||d|\}}
| |}|	| }|}	| |}| |}|	| }|S )N)r9   rS   r   r   r   r   r   r.   )r   r   r   r   )r*   r9   rS   r   r   r   r   r   rV   residual_r.   r.   r/   r<      s&   




zOlmo3DecoderLayer.forward)NNNFNN)r@   rA   rB   r   r   r#   r&   rD   r   r   boolr=   r   r   r<   rE   r.   r.   r,   r/   r      s6    	
r   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Olmo3RotaryEmbeddinginv_freqNrw   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r"   r#   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrw   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r*   rw   devicerope_init_fnr   r,   r.   r/   r#     s   


zOlmo3RotaryEmbedding.__init__r   ztorch.deviceseq_lenr!   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarL   Ng      ?r   r0   r3   )r   r3   )	r   r|   r+   r}   r&   arangeint64r4   rC   )rw   r   r   baserW   attention_factorr   r.   r.   r/   r   '  s   
&z4Olmo3RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r1   r   mpscpuF)device_typeenabledr0   rq   r   )r   rC   rG   r>   r4   r   
isinstancetypestrr   r\   r&   rr   ri   r   rj   r3   )
r*   rs   r   inv_freq_expandedposition_ids_expandedr   freqsembri   rj   r.   r.   r/   r<   E  s   0&zOlmo3RotaryEmbedding.forwardr   )NNN)r@   rA   rB   r&   rD   __annotations__r   r#   staticmethodr   r   r=   rC   r   no_gradr   r<   rE   r.   r.   r,   r/   r     s&   
 

r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )Olmo3PreTrainedModelrw   modelTr   r   )r9   
attentionsN)r@   rA   rB   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   rv   _can_record_outputsr.   r.   r.   r/   r   U  s   
 
r   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
ej	dB dedB dee defddZ  ZS )
Olmo3Modelrw   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r.   )r   ).0rx   rw   r.   r/   
<listcomp>q  s    z'Olmo3Model.__init__.<locals>.<listcomp>r   r   F)r"   r#   pad_token_idpadding_idx
vocab_sizer$   	Embeddingr+   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normr   
rotary_embgradient_checkpointing	post_initr   r,   r   r/   r#   j  s   zOlmo3Model.__init__N	input_idsrS   r   r   inputs_embedsr   r   rV   r!   c              	   K   s,  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r<|d ur-| nd}	tj|jd |jd|	 }|d u rE|	d}t
| }
tse| j|||||d}td
i |td
i |d}
|}| ||}| jd | jj D ]}||f|
|jj ||||d|}qv| |}t||d	S )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )rw   r   rS   r   r   r   )full_attentionr{   )rS   r   r   r   r   )last_hidden_stater   r.   )
ValueErrorr   r   rw   get_seq_lengthr&   r   r>   r   re   r   dictr   r   r   r   r   r   r   r   r   )r*   r   rS   r   r   r   r   r   rV   past_seen_tokenscausal_mask_mappingmask_kwargsr9   r   decoder_layerr.   r.   r/   r<   z  sT   





zOlmo3Model.forward)NNNNNNN)r@   rA   rB   r   r#   r   r   r   r&   r   rD   r   FloatTensorr   r   r   r   r<   rE   r.   r.   r,   r/   r   h  s>    	
r   c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defddZ  ZS )Olmo3ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr9   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r   )
r"   r#   r   r   r   r$   r   r+   r  r   r   r,   r.   r/   r#     s
   
zOlmo3ForCausalLM.__init__Nr   r   rS   r   r   r   labelsr   r   logits_to_keeprV   r!   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Olmo3ForCausalLM

        >>> model = Olmo3ForCausalLM.from_pretrained("meta-olmo3/Olmo3-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-olmo3/Olmo3-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r   rS   r   r   r   r   r   N)r  r  r   )lossr  r   r9   r   r.   )r   r   r   r   slicer  loss_functionrw   r   r   r   r9   r   )r*   r   rS   r   r   r   r  r   r   r  rV   outputsr9   slice_indicesr  r  r.   r.   r/   r<     s0    zOlmo3ForCausalLM.forward)	NNNNNNNNr   )r@   rA   rB   _tied_weights_keys_tp_plan_pp_planr#   r   r   r&   r   rD   r   r   r   r   r   r   r   r<   rE   r.   r.   r,   r/   r     sN    		
r   )r   r   r   )rN   )r   )>collections.abcr   typingr   r&   torch.nnr$   transformers.utils.genericr   activationsr   cache_utilsr   r   
generationr	   integrationsr
   r   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   utils.output_capturingr   configuration_olmo3r   Moduler   rD   r   rM   rC   rd   rp   rf   rv   r   r   r   r   r   r   __all__r.   r.   r.   r/   <module>   sl   

P+AWK