o
    ei]                     @   sF  d dl mZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 G dd dej1Z2G dd dej1Z3dd Z4edd<ddZ5dej6d e7d!ej6fd"d#Z8	$		d=d%ej1d&ej6d'ej6d(ej6d)ej6dB d*e9d+e9dB d,e9dB d!e:ej6ej6f fd-d.Z;ee5G d/d0 d0ej1Z<G d1d2 d2eZ=G d3d4 d4ej1Z>e(G d5d6 d6e#Z?e(G d7d8 d8e?Z@e(G d9d: d:e?eZAg d;ZBdS )>    )Callable)OptionalN   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )VaultGemmaConfigc                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )VaultGemmaRMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r"   nn	Parametertorchzerosweight)selfr!   r"   	__class__ p/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vaultgemma/modeling_vaultgemma.pyr%   /   s   
zVaultGemmaRMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r(   rsqrtpowmeanr"   )r+   xr.   r.   r/   _norm4   s   $zVaultGemmaRMSNorm._normc                 C   s*   |  | }|d| j   }||S )N      ?)r7   floatr*   type_as)r+   r6   outputr.   r.   r/   forward7   s   
zVaultGemmaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler*   shaper"   )r+   r.   r.   r/   
extra_repr>   s   zVaultGemmaRMSNorm.extra_repr)r    )
__name__
__module____qualname__intr9   r%   r7   r<   r?   __classcell__r.   r.   r,   r/   r   .   s
    r   c                       s$   e Zd Z fddZdd Z  ZS )VaultGemmaMLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)r$   r%   confighidden_sizeintermediate_sizer&   Linear	gate_projup_proj	down_projr   hidden_activationact_fnr+   rI   r,   r.   r/   r%   C   s   
zVaultGemmaMLP.__init__c                 C   s$   |  | | || | }|S r#   )rO   rQ   rM   rN   )r+   r6   rO   r.   r.   r/   r<   M   s    zVaultGemmaMLP.forward)r@   rA   rB   r%   r<   rD   r.   r.   r,   r/   rE   B   s    
rE   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr1   r0   r!   )r>   r(   cat)r6   x1x2r.   r.   r/   rotate_halfR   s   rW   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerW   )qkcossinunsqueeze_dimq_embedk_embedr.   r.   r/   apply_rotary_pos_embY   s
   

ra   hidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r>   expandreshape)rb   rc   batchnum_key_value_headsslenhead_dimr.   r.   r/   	repeat_kvs   s
   0rk           modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d ur:|| }tjj	|dtj
d|j}tjj||| jd}t||
}|dd }||fS )N      r0   r   r1   )r!   dtype)ptrainingr   )rj   rk   num_key_value_groupsr(   matmul	transposetanhr&   
functionalsoftmaxfloat32torv   rr   rx   
contiguous)rm   rn   ro   rp   rq   rr   rs   rt   kwargs
key_statesvalue_statesattn_weightsattn_outputr.   r.   r/   eager_attention_forward   s    

r   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )VaultGemmaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrI   	layer_idxc                    s  t    t|dr|j| nd | _|| _|| _t|d|j|j	 | _
|j	|j | _|jd | _| jj| _d| _tj|j|j	| j
 |jd| _tj|j|j| j
 |jd| _tj|j|j| j
 |jd| _tj|j	| j
 |j|jd| _| jj| _| jdkr|j| _d S d | _d S )Nlayer_typesrj   ru   TrG   sliding_attention)r$   r%   hasattrr   
layer_typerI   r   getattrrJ   num_attention_headsrj   rh   ry   query_pre_attn_scalarrs   attention_dropout	is_causalr&   rL   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowr+   rI   r   r,   r.   r/   r%      s.   


 zVaultGemmaAttention.__init__Nrb   position_embeddingsrq   past_key_valuescache_positionr   rd   c                 K   s   |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jrl| jnd| j| j| jd|\}}|jg |dR   }| |}||fS )Nr1   r   r0   )r]   r\   r   rl   )rr   rs   r   rt   )r>   rj   r   viewr{   r   r   ra   updater   r   get_interfacerI   _attn_implementationr   rx   r   rs   r   r   rf   r   r   )r+   rb   r   rq   r   r   r   input_shapehidden_shapequery_statesr   r   r\   r]   cache_kwargsattention_interfacer   r   r.   r.   r/   r<      s<   	


zVaultGemmaAttention.forwardNNNN)r@   rA   rB   __doc__r   rC   r%   r(   Tensorr=   r   
LongTensorr   r   r<   rD   r.   r.   r,   r/   r      s,    r   c                       s   e Zd Zdedef fddZ				ddejdeejejf dejdB d	ej	dB d
e
dB dej	dB deejeejejf dB f fddZ  ZS )VaultGemmaDecoderLayerrI   r   c                    sd   t    |j| _|| _|j| | _t||d| _t|| _	t
|j|jd| _t
|j|jd| _d S )N)rI   r   r"   )r$   r%   rJ   rI   r   attention_typer   	self_attnrE   mlpr   rms_norm_epsinput_layernormpre_feedforward_layernormr   r,   r.   r/   r%      s   

zVaultGemmaDecoderLayer.__init__Nrb   r   rq   position_idsr   r   rd   c           
   	   K   s\   |}|  |}| jd||||||d|\}}	|| }|}| |}| |}|| }|S )N)rb   r   rq   r   r   r   r.   )r   r   r   r   )
r+   rb   r   rq   r   r   r   r   residual_r.   r.   r/   r<      s$   


	

zVaultGemmaDecoderLayer.forwardr   )r@   rA   rB   r   rC   r%   r(   r   r=   r   r   FloatTensorr<   rD   r.   r.   r,   r/   r      s*    	r   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )VaultGemmaRotaryEmbeddinginv_freqNrI   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)r$   r%   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrI   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r+   rI   devicerope_init_fnr   r,   r.   r/   r%     s   


z"VaultGemmaRotaryEmbedding.__init__r   ztorch.deviceseq_lenrd   ztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetarj   Nr8   r   r0   rv   )r   rv   )	r   r   rJ   r   r(   arangeint64r   r9   )rI   r   r   baser!   attention_factorr   r.   r.   r/   r   ,  s   
&z9VaultGemmaRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r1   r   mpscpuF)device_typeenabledr0   rS   r   )r   r9   re   r>   r   r   
isinstancetypestrr   r{   r(   rT   r\   r   r]   rv   )
r+   r6   r   inv_freq_expandedposition_ids_expandedr   freqsembr\   r]   r.   r.   r/   r<   J  s   0&z!VaultGemmaRotaryEmbedding.forwardr#   )NNN)r@   rA   rB   r(   r   __annotations__r   r%   staticmethodr   rC   r=   r9   r   no_gradr   r<   rD   r.   r.   r,   r/   r     s&   
 

r   c                       s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe  fddZ  ZS )	VaultGemmaPreTrainedModelrI   modelTr   r   )rb   
attentionsc                    s,   t  | d|jjv rt|j d S d S )NRMSNorm)r$   _init_weightsr-   r@   initzeros_r*   )r+   rm   r,   r.   r/   r   l  s   z'VaultGemmaPreTrainedModel._init_weights)r@   rA   rB   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr(   r   r   rD   r.   r.   r,   r/   r   Z  s    
 r   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )VaultGemmaModelrI   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t | _d| _|   d S )Nc                    s   g | ]}t  |qS r.   )r   ).0r   rI   r.   r/   
<listcomp>}  s    z,VaultGemmaModel.__init__.<locals>.<listcomp>r   F)r$   r%   pad_token_idpadding_idx
vocab_sizer&   	EmbeddingrJ   embed_tokens
ModuleListrangenum_hidden_layerslayersr   r   normr   
rotary_embgradient_checkpointing	post_initrR   r,   r   r/   r%   v  s   
zVaultGemmaModel.__init__N	input_idsrq   r   r   inputs_embeds	use_cacher   r   rd   c              	   K   sL  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
tsf| j|||||d}tdi |tdi |d}
|}| ||}tj| jjd |jd	}|| }| jd | jj D ]}||f|
|j ||||d
|}q| |}t||dS )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r   )rI   r   rq   r   r   r   )full_attentionr   g      ?r   )rq   r   r   r   r   )last_hidden_stater   r.   )
ValueErrorr   r   rI   get_seq_lengthr(   r   r>   r   rY   r   dictr   r   r   tensorrJ   rv   r   r   r   r   r   )r+   r   rq   r   r   r   r   r   r   past_seen_tokenscausal_mask_mappingmask_kwargsrb   r   
normalizerdecoder_layerr.   r.   r/   r<     sZ   




zVaultGemmaModel.forward)NNNNNNN)r@   rA   rB   r   r%   r   r   r   r(   r   r   r   r   boolr   r   r   r<   rD   r.   r.   r,   r/   r   t  s>    	
r   c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defddZ  ZS )VaultGemmaForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrb   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rF   )
r$   r%   r   r   r   r&   rL   rJ   r  r   rR   r,   r.   r/   r%     s
   
zVaultGemmaForCausalLM.__init__Nr   r   rq   r   r   r   labelsr   r   logits_to_keepr   rd   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }| jjdurE|| jj }t	|}|| jj }d}|durW| j
||| jfi |
}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, VaultGemmaForCausalLM

        >>> model = VaultGemmaForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)r   rq   r   r   r   r   r   N)lossr  r   rb   r   r.   )r   r  r   rC   slicer  rI   final_logit_softcappingr(   r|   loss_functionr   r   r   rb   r   )r+   r   rq   r   r   r   r  r   r   r  r   outputsrb   slice_indicesr  r  r.   r.   r/   r<     s8   !
zVaultGemmaForCausalLM.forward)	NNNNNNNNr   )r@   rA   rB   _tied_weights_keys_tp_plan_pp_planr%   r   r   r(   r   r   r   r   r  rC   r   r   r   r<   rD   r.   r.   r,   r/   r    sN    		
r  )r  r   r   )r   )rl   NN)Ccollections.abcr   typingr   r(   torch.nnr&    r   r   activationsr   cache_utilsr   r   
generationr	   integrationsr
   r   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_vaultgemmar   Moduler   rE   rW   ra   r   rC   rk   r9   r=   r   r   r   r   r   r   r  __all__r.   r.   r.   r/   <module>   sv   

"K,A_P