o
    wi                     @   sP  d dl Z d dlmZ d dlZd dlmZ d dlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZmZmZ d	d
lmZ eeZG dd dejZG dd deZG dd deZdd Zd$ddZG dd deZ G dd deZ!G dd deZ"G dd de"eZ#G dd de
Z$G dd  d eZ%G d!d" d"eZ&g d#Z'dS )%    N)Optional   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	HeliumRMSNormư>c                    s&   t    tt|| _|| _d S N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ f/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/helium/modular_helium.pyr   "   s   

zHeliumRMSNorm.__init__c                 C   sR   |j }|tj}|djddd}|t|| j  }| jtj| |S )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr   r   )r   hidden_statesinput_dtypevariancer!   r!   r"   forward'   s
   zHeliumRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler   shaper   )r   r!   r!   r"   
extra_repr.   s   zHeliumRMSNorm.extra_repr)r   )__name__
__module____qualname__r   r.   r1   __classcell__r!   r!   r   r"   r   !   s    r   c                   @      e Zd ZdS )HeliumRotaryEmbeddingNr2   r3   r4   r!   r!   r!   r"   r7   2       r7   c                   @   r6   )	HeliumMLPNr8   r!   r!   r!   r"   r:   6   r9   r:   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   Nr   r   r#   dim)r   stackflatten)xx1x2r!   r!   r"   rotate_half:   s   rC   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr#   r   r;   )	unsqueezer0   repeat_interleaverC   )qkcossinposition_idsunsqueeze_dimq_embedk_embedr!   r!   r"   apply_rotary_pos_embA   s   

$$rN   c                       ,   e Zd Zddedee f fddZ  ZS )HeliumAttentionNconfig	layer_idxc                    s:   t  || tj|j|jdd| _dt| j | _	d S )NF)biasr   )
r   r   r   Linearr   o_projmathsqrthead_dimscalingr   rQ   rR   r   r!   r"   r   c   s   zHeliumAttention.__init__r   r2   r3   r4   r   r   intr   r5   r!   r!   r   r"   rP   b       $rP   c                       rO   )HeliumDecoderLayerNrQ   rR   c                    s<   t    t|| _t|j|jd| _t|j|jd| _d S )Nr   )	r   r   r:   mlpr   r   rms_norm_epsinput_layernormpost_attention_layernormrZ   r   r!   r"   r   j   s   

zHeliumDecoderLayer.__init__r   r[   r!   r!   r   r"   r^   i   r]   r^   c                   @   r6   )HeliumPreTrainedModelNr8   r!   r!   r!   r"   rd   r   r9   rd   c                       "   e Zd Zdef fddZ  ZS )HeliumModelrQ   c                    sZ   t    t fddt jD | _t j j	d| _
t | _d| _|   d S )Nc                    s   g | ]}t  |qS r!   )r^   ).0rR   rQ   r!   r"   
<listcomp>z   s    z(HeliumModel.__init__.<locals>.<listcomp>r_   F)r   r   r   
ModuleListrangenum_hidden_layerslayersr   r   ra   normr7   
rotary_embgradient_checkpointing	post_initr   rQ   r   rh   r"   r   w   s   
zHeliumModel.__init__r2   r3   r4   r   r   r5   r!   r!   r   r"   rf   v       rf   c                       re   )HeliumForCausalLMrQ   c                    "   t  | t|| _|   d S r   r   r   rf   modelrq   rr   r   r!   r"   r         
zHeliumForCausalLM.__init__rs   r!   r!   r   r"   ru      rt   ru   c                       re   )HeliumForSequenceClassificationrQ   c                    rv   r   rw   rr   r   r!   r"   r      ry   z(HeliumForSequenceClassification.__init__rs   r!   r!   r   r"   rz      rt   rz   c                       re   )HeliumForTokenClassificationrQ   c                    rv   r   rw   rr   r   r!   r"   r      ry   z%HeliumForTokenClassification.__init__rs   r!   r!   r   r"   r{      rt   r{   )rd   rf   ru   rz   r{   )Nr   )(rV   typingr   r   torch.nnr   torch.utils.checkpointutilsr   gemma.modeling_gemmar   r   r   granite.modeling_graniter	   llama.modeling_llamar
   r   r   r   r   configuration_heliumr   
get_loggerr2   loggerModuler   r7   r:   rC   rN   rP   r^   rd   rf   ru   rz   r{   __all__r!   r!   r!   r"   <module>   s0   

!	