o
    ei_                     @   sr  d dl mZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 G dd dej3Z4G dd dej3Z5G dd dej3Z6dd Z7edd@dd Z8d!ej9d"e:d#ej9fd$d%Z;	&		dAd'ej3d(ej9d)ej9d*ej9d+ej9dB d,e<d-e<dB d.e<dB d#e=ej9ej9f fd/d0Z>ee8G d1d2 d2ej3Z?G d3d4 d4eZ@e*G d5d6 d6e%ZAe*G d7d8 d8eAZBe*G d9d: d:eAeZCG d;d< d<eeAZDG d=d> d>eeAZEg d?ZFdS )B    )Callable)OptionalN   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGenericForTokenClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)capture_outputs   )Gemma2Configc                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )Gemma2RMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r$   nn	Parametertorchzerosweight)selfr#   r$   	__class__ h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/gemma2/modeling_gemma2.pyr'   2   s   
zGemma2RMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r*   rsqrtpowmeanr$   )r-   xr0   r0   r1   _norm7   s   $zGemma2RMSNorm._normc                 C   s*   |  | }|d| j   }||S )N      ?)r9   floatr,   type_as)r-   r8   outputr0   r0   r1   forward:   s   
zGemma2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler,   shaper$   )r-   r0   r0   r1   
extra_reprA   s   zGemma2RMSNorm.extra_repr)r"   )
__name__
__module____qualname__intr;   r'   r9   r>   rA   __classcell__r0   r0   r.   r1   r!   1   s
    r!   c                       s$   e Zd Z fddZdd Z  ZS )	Gemma2MLPc                    sr   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _d S NFbias)r&   r'   confighidden_sizeintermediate_sizer(   Linear	gate_projup_proj	down_projr   hidden_activationact_fnr-   rK   r.   r0   r1   r'   F   s   
zGemma2MLP.__init__c                 C   s$   |  | | || | }|S r%   )rQ   rS   rO   rP   )r-   r8   rQ   r0   r0   r1   r>   P   s    zGemma2MLP.forward)rB   rC   rD   r'   r>   rF   r0   r0   r.   r1   rG   E   s    
rG   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )Gemma2RotaryEmbeddinginv_freqNrK   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultrV   F)
persistentoriginal_inv_freq)r&   r'   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrK   rope_parametersrW   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r-   rK   devicerope_init_fnrV   r.   r0   r1   r'   X   s   


zGemma2RotaryEmbedding.__init__rc   ztorch.deviceseq_lenreturnztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNr:   r   r2   dtype)rc   rj   )	r^   getattrrL   num_attention_headsr*   arangeint64tor;   )rK   rc   re   baser#   attention_factorrV   r0   r0   r1   r_   h   s   
&z5Gemma2RotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r3   r   mpscpuF)device_typeenabledr2   r#   ri   )rV   r;   expandr@   ro   rc   
isinstancetypestrr   	transposer*   catcosr`   sinrj   )
r-   r8   position_idsinv_freq_expandedposition_ids_expandedrt   freqsembr}   r~   r0   r0   r1   r>      s   0&zGemma2RotaryEmbedding.forwardr%   )NNN)rB   rC   rD   r*   Tensor__annotations__r    r'   staticmethodr   rE   r?   r;   r_   no_gradr   r>   rF   r0   r0   r.   r1   rU   U   s&   
 

rU   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr3   r2   rv   )r@   r*   r|   )r8   x1x2r0   r0   r1   rotate_half   s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr}   r~   unsqueeze_dimq_embedk_embedr0   r0   r1   apply_rotary_pos_emb   s
   

r   hidden_statesn_reprf   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r@   rw   reshape)r   r   batchnum_key_value_headsslenrh   r0   r0   r1   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d ur:|| }tjj	|dtj
d|j}tjj||| jd}t||
}|dd }||fS )N      r2   r   r3   )r#   rj   )ptrainingr   )rh   r   num_key_value_groupsr*   matmulr{   tanhr(   
functionalsoftmaxfloat32ro   rj   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputr0   r0   r1   eager_attention_forward   s    

r   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )Gemma2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrK   	layer_idxc                    s  t    t|dr|j| nd | _|| _|| _t|d|j|j	 | _
|j	|j | _|jd | _| jj| _t|dd | _tj|j|j	| j
 |jd| _tj|j|j| j
 |jd| _tj|j|j| j
 |jd| _tj|j	| j
 |j|jd| _| jj| _| jdkr|j| _d S d | _d S )Nlayer_typesrh   r   use_bidirectional_attentionFrI   sliding_attention)r&   r'   hasattrr   
layer_typerK   r   rk   rL   rl   rh   r   r   query_pre_attn_scalarr   attention_dropout	is_causalr(   rN   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowr-   rK   r   r.   r0   r1   r'      s.   


 zGemma2Attention.__init__Nr   position_embeddingsr   past_key_valuescache_positionr   rf   c                 K   s   |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jrl| jnd| j| j| jd|\}}|jg |dR   }| |}||fS )Nr3   r   r2   )r~   r}   r   r   )r   r   r   r   )r@   rh   r   viewr{   r   r   r   updater   r   get_interfacerK   _attn_implementationr   r   r   r   r   r   r   r   r   )r-   r   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r}   r~   cache_kwargsattention_interfacer   r   r0   r0   r1   r>     s<   	


zGemma2Attention.forward)NNNN)rB   rC   rD   __doc__r    rE   r'   r*   r   r?   r   
LongTensorr   r   r>   rF   r0   r0   r.   r1   r      s,    r   c                       s   e Zd Zdedef fddZ					ddejdeejejf dB dejdB d	ej	dB d
e
dB dej	dB deejeejejf dB f fddZ  ZS )Gemma2DecoderLayerrK   r   c                    s   t    |j| _|| _|j| | _t||d| _t|| _	t
|j|jd| _t
|j|jd| _t
|j|jd| _t
|j|jd| _d S )N)rK   r   r$   )r&   r'   rL   rK   r   attention_typer   	self_attnrG   mlpr!   rms_norm_epsinput_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   r.   r0   r1   r'   2  s   

zGemma2DecoderLayer.__init__Nr   r   r   r   r   r   rf   c           
   	   K   sp   |}|  |}| jd||||||d|\}}	| |}|| }|}| |}| |}| |}|| }|S )N)r   r   r   r   r   r   r0   )r   r   r   r   r   r   )
r-   r   r   r   r   r   r   r   residual_r0   r0   r1   r>   ?  s(   



	


zGemma2DecoderLayer.forward)NNNNN)rB   rC   rD   r    rE   r'   r*   r   r?   r   r   FloatTensorr>   rF   r0   r0   r.   r1   r   1  s,    	r   c                       s`   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZdZeedZe  fddZ  ZS )	Gemma2PreTrainedModelrK   modelTr   r   )r   
attentionsc                    s,   t  | d|jjv rt|j d S d S )NRMSNorm)r&   _init_weightsr/   rB   initzeros_r,   )r-   r   r.   r0   r1   r   u  s   z#Gemma2PreTrainedModel._init_weights)rB   rC   rD   r    r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r   _can_record_outputsr*   r   r   rF   r0   r0   r.   r1   r   c  s    
 r   c                       s   e Zd Zdef fddZeee							ddej	dB dej
dB dej	dB dedB d	ejdB d
edB dej	dB dee defddZ  ZS )Gemma2ModelrK   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _t | _d| _|   d S )Nc                    s   g | ]}t  |qS r0   )r   ).0r   rK   r0   r1   
<listcomp>  s    z(Gemma2Model.__init__.<locals>.<listcomp>r   F)r&   r'   pad_token_idpadding_idx
vocab_sizer(   	EmbeddingrL   embed_tokens
ModuleListrangenum_hidden_layerslayersr!   r   normrU   
rotary_embgradient_checkpointing	post_initrT   r.   r   r1   r'     s   
zGemma2Model.__init__N	input_idsr   r   r   inputs_embeds	use_cacher   r   rf   c              	   K   sL  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
tsf| j|||||d}tdi |tdi |d}
|}| ||}tj| jjd |jd	}|| }| jd | jj D ]}||f|
|j ||||d
|}q| |}t||dS )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )rc   )rK   r  r   r   r   r   )full_attentionr   g      ?ri   )r   r   r   r   r   )last_hidden_stater   r0   )
ValueErrorr   r   rK   get_seq_lengthr*   rm   r@   rc   r   rx   dictr   r   r   tensorrL   rj   r   r   r   r   r   )r-   r  r   r   r   r  r  r   r   past_seen_tokenscausal_mask_mappingmask_kwargsr   r   
normalizerdecoder_layerr0   r0   r1   r>     sZ   




zGemma2Model.forward)NNNNNNN)rB   rC   rD   r    r'   r   r   r   r*   r   r   r   r   boolr   r   r   r>   rF   r0   r0   r.   r1   r   }  s>    	
r   c                       s   e Zd ZddiZddiZddgdgfiZ fddZee																	
dde	j
d	B de	jd	B de	j
d	B ded	B de	jd	B de	j
d	B ded	B de	j
d	B dee	jB dee defddZ  ZS )Gemma2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S rH   )
r&   r'   r   r   r   r(   rN   rL   r  r  rT   r.   r0   r1   r'     s
   
zGemma2ForCausalLM.__init__Nr   r  r   r   r   r  labelsr  r   logits_to_keepr   rf   c
              
   K   s   | j d|||||||d|
}|j}t|	trt|	 dn|	}| |dd|ddf }| jjdurE|| jj }t	|}|| jj }d}|durW| j
||| jfi |
}t|||j|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)r  r   r   r   r  r  r   N)lossr  r   r   r   r0   )r   r  rx   rE   slicer  rK   final_logit_softcappingr*   r   loss_functionr   r   r   r   r   )r-   r  r   r   r   r  r  r  r   r  r   outputsr   slice_indicesr  r  r0   r0   r1   r>     s8   !
zGemma2ForCausalLM.forward)	NNNNNNNNr   )rB   rC   rD   _tied_weights_keys_tp_plan_pp_planr'   r   r   r*   r   r   r   r   r  rE   r   r   r   r>   rF   r0   r0   r.   r1   r    sN    		
r  c                   @      e Zd ZdS )Gemma2ForSequenceClassificationNrB   rC   rD   r0   r0   r0   r1   r!  .      r!  c                   @   r   )Gemma2ForTokenClassificationNr"  r0   r0   r0   r1   r$  2  r#  r$  )r  r   r   r!  r$  )r   )r   NN)Gcollections.abcr   typingr   r*   torch.nnr(    r   r   activationsr   cache_utilsr   r   
generationr	   integrationsr
   r   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   configuration_gemma2r    Moduler!   rG   rU   r   r   r   rE   r   r;   r?   r   r   r   r   r   r  r!  r$  __all__r0   r0   r0   r1   <module>   sz   A

"K2_P