o
    ¾e¦i  ã                   @   s<  d dl Z d dlZd dlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZmZmZmZ dd	lmZ e e¡ZG d
d„ dejƒZG dd„ deƒZG dd„ deƒZdd„ Zd#dd„ZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deeƒZ G dd„ deƒZ!G dd„ deƒZ"G d d!„ d!e	ƒZ#g d"¢Z$dS )$é    Né   )Úloggingé   )ÚGemmaForCausalLMÚGemmaForSequenceClassificationÚGemmaForTokenClassification)ÚGraniteAttention)ÚLlamaDecoderLayerÚLlamaMLPÚ
LlamaModelÚLlamaPreTrainedModelÚLlamaRotaryEmbeddingé   )ÚHeliumConfigc                       s.   e Zd Zd‡ fdd„	Zdd„ Zdd„ Z‡  ZS )	ÚHeliumRMSNormçíµ ÷Æ°>c                    s&   t ƒ  ¡  t t |¡¡| _|| _d S ©N)ÚsuperÚ__init__ÚnnÚ	ParameterÚtorchÚonesÚweightÚvariance_epsilon)ÚselfÚhidden_sizeÚeps©Ú	__class__© úg/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/helium/modular_helium.pyr      s   

zHeliumRMSNorm.__init__c                 C   sR   |j }| tj¡}| d¡jddd}|t || j ¡ }| j tj¡|  |¡S )Nr   éÿÿÿÿT)Úkeepdim)	ÚdtypeÚtor   Úfloat32ÚpowÚmeanÚrsqrtr   r   )r   Úhidden_statesÚinput_dtypeÚvariancer    r    r!   Úforward$   s
   zHeliumRMSNorm.forwardc                 C   s   t | jjƒ› d| j› S )Nz, eps=)Útupler   Úshaper   )r   r    r    r!   Ú
extra_repr+   s   zHeliumRMSNorm.extra_repr)r   )Ú__name__Ú
__module__Ú__qualname__r   r-   r0   Ú__classcell__r    r    r   r!   r      s    r   c                   @   ó   e Zd ZdS )ÚHeliumRotaryEmbeddingN©r1   r2   r3   r    r    r    r!   r6   /   ó    r6   c                   @   r5   )Ú	HeliumMLPNr7   r    r    r    r!   r9   3   r8   r9   c                 C   s>   | dddd…f }| dddd…f }t j| |fdd d¡S )	z*Rotates half the hidden dims of the input..r   Nr   r   r"   ©Údiméþÿÿÿ)r   ÚstackÚflatten)ÚxÚx1Úx2r    r    r!   Úrotate_half7   s   rB   c                 C   sŒ   |  |¡}|  |¡}|dd|jd d …f jddd}|dd|jd d …f jddd}| | t| ƒ|  }|| t|ƒ|  }||fS )a…  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr"   r   r:   )Ú	unsqueezer/   Úrepeat_interleaverB   )ÚqÚkÚcosÚsinÚunsqueeze_dimÚq_embedÚk_embedr    r    r!   Úapply_rotary_pos_emb>   s   

$$rL   c                       ó,   e Zd ZddededB f‡ fdd„Z‡  ZS )ÚHeliumAttentionNÚconfigÚ	layer_idxc                    s:   t ƒ  ||¡ tj|j|jdd| _dt | j¡ | _	d S )NF)Úbiasr   )
r   r   r   ÚLinearr   Úo_projÚmathÚsqrtÚhead_dimÚscaling©r   rO   rP   r   r    r!   r   ^   s   zHeliumAttention.__init__r   ©r1   r2   r3   r   Úintr   r4   r    r    r   r!   rN   ]   ó    $rN   c                       rM   )ÚHeliumDecoderLayerNrO   rP   c                    s@   t ƒ  ||¡ t|ƒ| _t|j|jd| _t|j|jd| _d S )N©r   )	r   r   r9   Úmlpr   r   Úrms_norm_epsÚinput_layernormÚpost_attention_layernormrX   r   r    r!   r   e   s   
zHeliumDecoderLayer.__init__r   rY   r    r    r   r!   r\   d   r[   r\   c                   @   r5   )ÚHeliumPreTrainedModelNr7   r    r    r    r!   rb   m   r8   rb   c                       s"   e Zd Zdef‡ fdd„Z‡  ZS )ÚHeliumModelrO   c                    sP   t ƒ  ˆ ¡ t ‡ fdd„tˆ jƒD ƒ¡| _tˆ jˆ j	d| _
d| _|  ¡  d S )Nc                    s   g | ]}t ˆ |ƒ‘qS r    )r\   )Ú.0rP   ©rO   r    r!   Ú
<listcomp>u   s    z(HeliumModel.__init__.<locals>.<listcomp>r]   F)r   r   r   Ú
ModuleListÚrangeÚnum_hidden_layersÚlayersr   r   r_   ÚnormÚgradient_checkpointingÚ	post_init)r   rO   r   re   r!   r   r   s   ÿzHeliumModel.__init__)r1   r2   r3   r   r   r4   r    r    r   r!   rc   q   s    rc   c                   @   r5   )ÚHeliumForCausalLMNr7   r    r    r    r!   rn   ~   r8   rn   c                   @   r5   )ÚHeliumForSequenceClassificationNr7   r    r    r    r!   ro   ‚   r8   ro   c                   @   r5   )ÚHeliumForTokenClassificationNr7   r    r    r    r!   rp   †   r8   rp   )rb   rc   rn   ro   rp   )r   )%rT   r   Útorch.nnr   Úutilsr   Úgemma.modeling_gemmar   r   r   Úgranite.modeling_graniter   Úllama.modeling_llamar	   r
   r   r   r   Úconfiguration_heliumr   Ú
get_loggerr1   ÚloggerÚModuler   r6   r9   rB   rL   rN   r\   rb   rc   rn   ro   rp   Ú__all__r    r    r    r!   Ú<module>   s,   

	