o
    i_                     @   sH  d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZmZ d	d
lmZ eeZG dd dejZG dd deZG dd deZdd Zd$ddZG dd deZG dd deZ G dd deZ!G dd de!eZ"G dd de	Z#G dd  d e
Z$G d!d" d"eZ%g d#Z&dS )%    N)Optional   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	HeliumRMSNormư>c                    s&   t    tt|| _|| _d S N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/helium/modular_helium.pyr   !   s   

zHeliumRMSNorm.__init__c                 C   sR   |j }|tj}|djddd}|t|| j  }| jtj| |S )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr   r   )r   hidden_statesinput_dtypevariancer!   r!   r"   forward&   s
   zHeliumRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler   shaper   )r   r!   r!   r"   
extra_repr-   s   zHeliumRMSNorm.extra_repr)r   )__name__
__module____qualname__r   r.   r1   __classcell__r!   r!   r   r"   r       s    r   c                   @      e Zd ZdS )HeliumRotaryEmbeddingNr2   r3   r4   r!   r!   r!   r"   r7   1       r7   c                   @   r6   )	HeliumMLPNr8   r!   r!   r!   r"   r:   5   r9   r:   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   Nr   r   r#   dim)r   stackflatten)xx1x2r!   r!   r"   rotate_half9   s   rC   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr#   r   r;   )	unsqueezer0   repeat_interleaverC   )qkcossinposition_idsunsqueeze_dimq_embedk_embedr!   r!   r"   apply_rotary_pos_emb@   s   

$$rN   c                       ,   e Zd Zddedee f fddZ  ZS )HeliumAttentionNconfig	layer_idxc                    s:   t  || tj|j|jdd| _dt| j | _	d S )NF)biasr   )
r   r   r   Linearr   o_projmathsqrthead_dimscalingr   rQ   rR   r   r!   r"   r   b   s   zHeliumAttention.__init__r   r2   r3   r4   r   r   intr   r5   r!   r!   r   r"   rP   a       $rP   c                       rO   )HeliumDecoderLayerNrQ   rR   c                    s@   t  || t|| _t|j|jd| _t|j|jd| _d S )Nr   )	r   r   r:   mlpr   r   rms_norm_epsinput_layernormpost_attention_layernormrZ   r   r!   r"   r   i   s   
zHeliumDecoderLayer.__init__r   r[   r!   r!   r   r"   r^   h   r]   r^   c                   @   r6   )HeliumPreTrainedModelNr8   r!   r!   r!   r"   rd   q   r9   rd   c                       s"   e Zd Zdef fddZ  ZS )HeliumModelrQ   c                    sZ   t    t fddt jD | _t j j	d| _
t | _d| _|   d S )Nc                    s   g | ]}t  |qS r!   )r^   ).0rR   rQ   r!   r"   
<listcomp>y   s    z(HeliumModel.__init__.<locals>.<listcomp>r_   F)r   r   r   
ModuleListrangenum_hidden_layerslayersr   r   ra   normr7   
rotary_embgradient_checkpointing	post_init)r   rQ   r   rg   r"   r   v   s   
zHeliumModel.__init__)r2   r3   r4   r   r   r5   r!   r!   r   r"   re   u   s    re   c                   @   r6   )HeliumForCausalLMNr8   r!   r!   r!   r"   rq      r9   rq   c                   @   r6   )HeliumForSequenceClassificationNr8   r!   r!   r!   r"   rr      r9   rr   c                   @   r6   )HeliumForTokenClassificationNr8   r!   r!   r!   r"   rs      r9   rs   )rd   re   rq   rr   rs   )Nr   )'rV   typingr   r   torch.nnr   utilsr   gemma.modeling_gemmar   r   r   granite.modeling_graniter	   llama.modeling_llamar
   r   r   r   r   configuration_heliumr   
get_loggerr2   loggerModuler   r7   r:   rC   rN   rP   r^   rd   re   rq   rr   rs   __all__r!   r!   r!   r"   <module>   s.   

!	