o
    ei                     @   s<  d dl Z d dlZd dlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZmZmZmZ dd	lmZ eeZG d
d dejZG dd deZG dd deZdd Zd#ddZG dd deZG dd deZG dd deZG dd deeZ G dd deZ!G dd deZ"G d d! d!e	Z#g d"Z$dS )$    N   )logging   )GemmaForCausalLMGemmaForSequenceClassificationGemmaForTokenClassification)GraniteAttention)LlamaDecoderLayerLlamaMLP
LlamaModelLlamaPreTrainedModelLlamaRotaryEmbedding   )HeliumConfigc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	HeliumRMSNormư>c                    s&   t    tt|| _|| _d S N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ g/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/helium/modular_helium.pyr      s   

zHeliumRMSNorm.__init__c                 C   sR   |j }|tj}|djddd}|t|| j  }| jtj| |S )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr   r   )r   hidden_statesinput_dtypevariancer    r    r!   forward$   s
   zHeliumRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler   shaper   )r   r    r    r!   
extra_repr+   s   zHeliumRMSNorm.extra_repr)r   )__name__
__module____qualname__r   r-   r0   __classcell__r    r    r   r!   r      s    r   c                   @      e Zd ZdS )HeliumRotaryEmbeddingNr1   r2   r3   r    r    r    r!   r6   /       r6   c                   @   r5   )	HeliumMLPNr7   r    r    r    r!   r9   3   r8   r9   c                 C   s>   | ddddf }| ddddf }t j| |fdddS )	z*Rotates half the hidden dims of the input..r   Nr   r   r"   dim)r   stackflatten)xx1x2r    r    r!   rotate_half7   s   rB   c                 C   s   | |}| |}|dd|jd d f jddd}|dd|jd d f jddd}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr"   r   r:   )	unsqueezer/   repeat_interleaverB   )qkcossinunsqueeze_dimq_embedk_embedr    r    r!   apply_rotary_pos_emb>   s   

$$rL   c                       ,   e Zd ZddededB f fddZ  ZS )HeliumAttentionNconfig	layer_idxc                    s:   t  || tj|j|jdd| _dt| j | _	d S )NF)biasr   )
r   r   r   Linearr   o_projmathsqrthead_dimscalingr   rO   rP   r   r    r!   r   ^   s   zHeliumAttention.__init__r   r1   r2   r3   r   intr   r4   r    r    r   r!   rN   ]       $rN   c                       rM   )HeliumDecoderLayerNrO   rP   c                    s@   t  || t|| _t|j|jd| _t|j|jd| _d S )Nr   )	r   r   r9   mlpr   r   rms_norm_epsinput_layernormpost_attention_layernormrX   r   r    r!   r   e   s   
zHeliumDecoderLayer.__init__r   rY   r    r    r   r!   r\   d   r[   r\   c                   @   r5   )HeliumPreTrainedModelNr7   r    r    r    r!   rb   m   r8   rb   c                       s"   e Zd Zdef fddZ  ZS )HeliumModelrO   c                    sP   t    t fddt jD | _t j j	d| _
d| _|   d S )Nc                    s   g | ]}t  |qS r    )r\   ).0rP   rO   r    r!   
<listcomp>u   s    z(HeliumModel.__init__.<locals>.<listcomp>r]   F)r   r   r   
ModuleListrangenum_hidden_layerslayersr   r   r_   normgradient_checkpointing	post_init)r   rO   r   re   r!   r   r   s   zHeliumModel.__init__)r1   r2   r3   r   r   r4   r    r    r   r!   rc   q   s    rc   c                   @   r5   )HeliumForCausalLMNr7   r    r    r    r!   rn   ~   r8   rn   c                   @   r5   )HeliumForSequenceClassificationNr7   r    r    r    r!   ro      r8   ro   c                   @   r5   )HeliumForTokenClassificationNr7   r    r    r    r!   rp      r8   rp   )rb   rc   rn   ro   rp   )r   )%rT   r   torch.nnr   utilsr   gemma.modeling_gemmar   r   r   granite.modeling_graniter   llama.modeling_llamar	   r
   r   r   r   configuration_heliumr   
get_loggerr1   loggerModuler   r6   r9   rB   rL   rN   r\   rb   rc   rn   ro   rp   __all__r    r    r    r!   <module>   s,   

	