o
    eiV                     @   s  d dl mZ d dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+ edG dd dej,Z-G dd dej,Z.dej/de0dej/fddZ1	d;d ej,d!ej/d"ej/d#ej/d$ej/dB d%e2d&e2d'e!e fd(d)Z3d<d*d+Z4d,d- Z5ee4G d.d/ d/ej,Z6G d0d1 d1ej,Z7G d2d3 d3eZ8e#G d4d5 d5eZ9e#G d6d7 d7e9Z:e#G d8d9 d9e9eZ;g d:Z<dS )=    )Callable)OptionalN)TransformersKwargs   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )Olmo2ConfigRMSNormc                       s@   e Zd Zddeddf fddZdejfddZd	d
 Z  Z	S )Olmo2RMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z;
        Olmo2RMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer   	__class__ f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/olmo2/modeling_olmo2.pyr"   4   s   

zOlmo2RMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j| |S )N   T)keepdim)	dtypetor%   float32powmeanrsqrtr(   r'   )r)   hidden_statesinput_dtypevariancer-   r-   r.   forward<   s
   zOlmo2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler'   shaper(   )r)   r-   r-   r.   
extra_reprC   s   zOlmo2RMSNorm.extra_repr)r   )
__name__
__module____qualname__floatr"   r%   Tensorr;   r>   __classcell__r-   r-   r+   r.   r   2   s    r   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Olmo2RotaryEmbeddinginv_freqNconfigc                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrF   F)
persistentoriginal_inv_freq)r!   r"   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrG   rope_parametersrH   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r)   rG   devicerope_init_fnrF   r+   r-   r.   r"   J   s   


zOlmo2RotaryEmbedding.__init__rT   ztorch.deviceseq_lenr    ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r/   )r2   )rT   r2   )	rO   getattrr*   num_attention_headsr%   arangeint64r3   rB   )rG   rT   rV   basedimattention_factorrF   r-   r-   r.   rP   Z   s   
&z4Olmo2RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd. | |  
dd}tj||fdd	}| | j }| | j }	W d    ||	fS 1 sow   Y  ||	fS )
Nr   r0   r   mpscpuF)device_typeenabledr/   r^   )rF   rB   expandr=   r3   rT   
isinstancetypestrr   	transposer%   catcosrQ   sin)
r)   xposition_idsinv_freq_expandedposition_ids_expandedrb   freqsembrk   rl   r-   r-   r.   r;   x   s   0&
zOlmo2RotaryEmbedding.forwardN)NNN)r?   r@   rA   r%   rC   __annotations__r   r"   staticmethodr   intr<   rB   rP   no_gradr   r;   rD   r-   r-   r+   r.   rE   G   s&   
 

rE   r8   n_repr    c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r=   re   reshape)r8   rx   batchnum_key_value_headsslenrX   r-   r-   r.   	repeat_kv   s
   0r}           modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d ur |
| }
tjj|
dtjd	|j
}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr/   r   r0   )r^   r2   )ptrainingr   )r}   num_key_value_groupsr%   matmulri   r#   
functionalsoftmaxr4   r3   r2   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputr-   r-   r.   eager_attention_forward   s   
r   c           	      C   s^   | j |j }}||}||}| | t| |  }|| t||  }||||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r2   	unsqueezerotate_halfr3   )	qkrk   rl   unsqueeze_dimq_typek_typeq_embedk_embedr-   r-   r.   apply_rotary_pos_emb   s   

r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr0   r/   rd   )r=   r%   rj   )rm   x1x2r-   r-   r.   r      s   r   c                       s   e Zd ZdZddededB f fddZ		ddejde	ejejf d	ejdB d
e
dB dejdB dee de	ejejdB f fddZ  ZS )Olmo2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperNrG   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _t|j| j |j| _t|j| j |j| _d S )NrX   g      Tbias)r!   r"   rG   r   rY   r*   rZ   rX   r{   r   r   attention_dropout	is_causalr#   Linearattention_biasq_projk_projv_projo_projr   rms_norm_epsq_normk_normr)   rG   r   r+   r-   r.   r"      s,   
zOlmo2Attention.__init__r8   position_embeddingsr   past_key_valuescache_positionr   r    c                 K   s0  |j d d }g |d| jR }| | |}	| | |}
| |}|	|dd}	|
|dd}
||dd}|\}}t	|	|
||\}	}
|d urc|||d}|
|
|| j|\}
}t| jjt}|| |	|
||f| jswdn| j| jd|\}}|jg |dR   }| |}||fS )Nr0   r   r/   )rl   rk   r   r~   )r   r   )r=   rX   r   r   r   r   r   viewri   r   updater   r   get_interfacerG   _attn_implementationr   r   r   r   ry   r   r   )r)   r8   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rk   rl   cache_kwargsattention_interfacer   r   r-   r-   r.   r;      s>   	


zOlmo2Attention.forwardrs   )NN)r?   r@   rA   __doc__r   rv   r"   r%   rC   r<   r   
LongTensorr   r   r;   rD   r-   r-   r+   r.   r      s(    r   c                       s$   e Zd Z fddZdd Z  ZS )Olmo2MLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFr   )r!   r"   rG   r*   intermediate_sizer#   r   	gate_projup_proj	down_projr   
hidden_actact_fnr)   rG   r+   r-   r.   r"     s   
zOlmo2MLP.__init__c                 C   s$   |  | | || | }|S rs   )r   r   r   r   )r)   rm   r   r-   r-   r.   r;   %  s    zOlmo2MLP.forward)r?   r@   rA   r"   r;   rD   r-   r-   r+   r.   r     s    
r   c                       s   e Zd Zdedef fddZ						ddejdejdB d	ejdB d
e	dB de
dB dejdB deejejf dB dee dejfddZ  ZS )Olmo2DecoderLayerrG   r   c                    sR   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )N)rG   r   r   )r!   r"   r*   r   	self_attnr   mlpr   r   post_attention_layernormpost_feedforward_layernormr   r+   r-   r.   r"   +  s   

zOlmo2DecoderLayer.__init__NFr8   r   rn   r   	use_cacher   r   r   r    c              
   K   s^   |}	| j d|||||||d|\}}
| |}|	| }|}	| |}| |}|	| }|S )N)r8   r   rn   r   r   r   r   r-   )r   r   r   r   )r)   r8   r   rn   r   r   r   r   r   residual_r-   r-   r.   r;   4  s&   




zOlmo2DecoderLayer.forward)NNNFNN)r?   r@   rA   r   rv   r"   r%   rC   r   r   boolr<   r   r   r;   rD   r-   r-   r+   r.   r   *  s6    	
r   c                   @   sH   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZdS )Olmo2PreTrainedModelrG   modelTr   r   )r8   
attentionsN)r?   r@   rA   r   rt   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr-   r-   r-   r.   r   U  s   
 
r   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
ej	dB dedB dee defddZ  ZS )
Olmo2ModelrG   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t d| _d| _|   d S )Nc                    s   g | ]}t  |qS r-   )r   ).0r   rG   r-   r.   
<listcomp>q  s    z'Olmo2Model.__init__.<locals>.<listcomp>r   r   F)r!   r"   pad_token_idpadding_idx
vocab_sizer#   	Embeddingr*   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normrE   
rotary_embgradient_checkpointing	post_initr   r+   r   r.   r"   j  s   zOlmo2Model.__init__N	input_idsr   rn   r   inputs_embedsr   r   r   r    c              
   K   s   |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r<|d ur-| nd}	tj|jd |jd|	 }|d u rE|	d}t
| j|||||d}
|}| j||d}| jd | jj D ]}||f|
|||||d|}qb| |}t||d	S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rT   )rG   r   r   r   r   rn   )rn   )r   r   rn   r   r   r   )last_hidden_stater   )
ValueErrorr   r   rG   get_seq_lengthr%   r[   r=   rT   r   r   r   r   r   r   r   )r)   r   r   rn   r   r   r   r   r   past_seen_tokenscausal_maskr8   r   decoder_layerr-   r-   r.   r;   z  sP   

	
zOlmo2Model.forward)NNNNNNN)r?   r@   rA   r   r"   r   r   r   r%   r   rC   r   FloatTensorr   r   r   r   r;   rD   r-   r-   r+   r.   r   h  s>    	
r   c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defddZ  ZS )Olmo2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr8   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r   )
r!   r"   r   r   r   r#   r   r*   r   r   r   r+   r-   r.   r"     s
   
zOlmo2ForCausalLM.__init__Nr   r   r   rn   r   r   labelsr   r   logits_to_keepr   r    c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }d}|durB| jd||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Olmo2ForCausalLM

        >>> model = Olmo2ForCausalLM.from_pretrained("meta-olmo2/Olmo2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-olmo2/Olmo2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r   r   rn   r   r   r   r   N)r   r   r   )lossr   r   r8   r   r-   )r   r   rf   rv   slicer   loss_functionrG   r   r   r   r8   r   )r)   r   r   rn   r   r   r   r   r   r   r   outputsr8   slice_indicesr   r   r-   r-   r.   r;     s0    zOlmo2ForCausalLM.forward)	NNNNNNNNr   )r?   r@   rA   _tied_weights_keys_tp_plan_pp_planr"   r   r   r%   r   rC   r   r   r   rv   r   r   r   r;   rD   r-   r-   r+   r.   r     sN    		
r   )r   r   r   )r~   )r   )=collections.abcr   typingr   r%   torch.nnr#   transformers.utils.genericr   activationsr   cache_utilsr   r   
generationr	   integrationsr
   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.genericr   r   utils.output_capturingr   configuration_olmo2r   Moduler   rE   rC   rv   r}   rB   r   r   r   r   r   r   r   r   r   __all__r-   r-   r-   r.   <module>   sl   @

L+PK