o
    eiD                     @   s  d dl mZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 dZ9e(:e;Z<G dd de/Z=G dd deZ>G dd de4Z?G dd de2Z@G dd de5ZAG d d! d!e1ZBG d"d# d#e1ZCd$ejDdB d%efd&d'ZEd(eFd%efd)d*ZGG d+d, d,eZHG d-d. d.eZIG d/d0 d0ejJZKG d1d2 d2ejJZLe&G d3d4 d4e3ZMd5ejNdB d6ejDd7eFdB d%ejDfd8d9ZOG d:d; d;eMZPG d<d= d=eMZQe&G d>d? d?eMZRe&G d@dA dAeMZSG dBdC dCeMeZTe&G dDdE dEeMZUe&G dFdG dGeMZVg dHZWdS )I    )Callable)AnyN   )initialization)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forwardz google/t5gemma-2b-2b-prefixlm-itc                4       s.  e Zd ZdZ										
																d2dedB dedB dedB dedB dedB dedB dedB dedB dedB d edB d!edB d"edB d#edB d$edB d%edB d&edB d'ee	eef B dB d(edB d)edB d*edB d+edB d,e
e dB d-edB d.edB d/edB f2 fd0d1Z  ZS )3T5GemmaModuleConfiga  
    This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the T5GemmaModule-7B.
    e.g. [google/t5_gemma_module-7b](https://huggingface.co/google/t5_gemma_module-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5GemmaModule model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5GemmaModuleModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            in T5GemmaModule, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
            scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
            scaling factor when applying tanh softcapping on the attention scores.
        is_decoder (`bool`, *optional*, defaults to `False`):
            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
            decoder-only or encoder-only architectures.

    ```python
    >>> from transformers import T5GemmaModuleModel, T5GemmaModuleConfig
    >>> # Initializing a T5GemmaModule t5_gemma_module-7b style configuration
    >>> configuration = T5GemmaModuleConfig()
    >>> # Initializing a model from the t5_gemma_module-7b style configuration
    >>> model = T5GemmaModuleModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```   	   $              gelu_pytorch_tanh    {Gz?ư>Tr      r   NF                 >@      I@
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idtie_word_embeddingsrope_parametersattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcapping
is_decoderc                    s   || _ t jdi d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|d|d|| | `d S )Nr9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP    )rQ   super__init__use_bidirectional_attention)selfr9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   kwargs	__class__rR   i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/t5gemma/modular_t5gemma.pyrT      sh   	
zT5GemmaModuleConfig.__init__)r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   Tr   r4   r   TNFr5   r/   r6   Nr7   r8   F)__name__
__module____qualname____doc__intstrfloatboolr   dictlistrT   __classcell__rR   rR   rX   rZ   r(   A   s    T	

r(   c                       s   e Zd ZdZdZdgZeedZ								dd	eee	e	f B dB d
eee	e	f B dB de
dB dedB dedB dedB de
dB dedB f fddZ  ZS )T5GemmaConfiga  
    This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
    >>> model = T5GemmaModel(t5gemma_config)
    ```
    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PreTrainedConfig] for more information.
    Args:
        encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether tie input and output embeddings.
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5Gemma model (the same as Gemma 2).
        kwargs (additional keyword arguments, optional, *optional*):
            Will be passed to the PreTrainedConfig base class.
    t5gemmapast_key_values)encoderdecoderNTr5   r)   ri   rj   is_encoder_decoderdropout_rateclassifier_dropout_raterK   rH   r9   c	                    sJ  t |trtdi |}n|d u rt }nt |ts#J t| dt |tr0tdi |}n|d u r7|}nt |tsEJ t| dtdi | }tdi | }d|_||_||_|| _d|_d|_	||_||_|j
|_|| _dD ]}
|
|	vrt||
|	|
< qxt jdi |	 || _|	d|j| _|| _|| _|| _d S )Nz is not supported.FT)rG   rE   rF   rB   rR   )
isinstancerc   r(   typeto_dictrQ   rl   rK   ri   rD   r:   cross_attention_hidden_sizerj   getattrrS   rT   rk   getrB   rm   rH   r9   )rV   ri   rj   rk   rl   rm   rK   rH   r9   rW   special_token_keyrX   rR   rZ   rT      s@   


zT5GemmaConfig.__init__)NNTr5   r5   r5   Tr)   )r[   r\   r]   r^   
model_typekeys_to_ignore_at_inferencer(   sub_configsrc   r   rb   ra   r_   rT   re   rR   rR   rX   rZ   rf      s<    !
	rf   c                   @      e Zd ZdS )T5GemmaRMSNormNr[   r\   r]   rR   rR   rR   rZ   ry   /      ry   c                       s$   e Zd Z fddZdd Z  ZS )
T5GemmaMLPc                    s   t  | t|j| _d S N)rS   rT   nnDropoutrl   dropoutrV   configrX   rR   rZ   rT   4  s   zT5GemmaMLP.__init__c                 C   s2   |  | || | }| |}| |}|S r}   )act_fn	gate_projup_projr   	down_proj)rV   xhidden_statesr   rR   rR   rZ   forward8  s   

zT5GemmaMLP.forward)r[   r\   r]   rT   r   re   rR   rR   rX   rZ   r|   3  s    r|   c                   @   rx   )T5GemmaRotaryEmbeddingNrz   rR   rR   rR   rZ   r   ?  r{   r   c                       s&   e Zd Zdedef fddZ  ZS )T5GemmaSelfAttentionr   	layer_idxc                    s   t  || |j| _d S r}   )rS   rT   rQ   	is_causalrV   r   r   rX   rR   rZ   rT   D  s   zT5GemmaSelfAttention.__init__)r[   r\   r]   r(   r_   rT   re   rR   rR   rX   rZ   r   C  s    r   c                       s~   e Zd Zdedef fddZ	ddejdejdB dejdB d	edB d
e	e
 deejejdB eej dB f fddZ  ZS )T5GemmaCrossAttentionr   r   c                    sn   t  || | `| `d| _|jd u rtdtj|j|j	| j
 |jd| _tj|j|j	| j
 |jd| _d S )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rS   rT   rM   
layer_typer   rq   
ValueErrorr~   Linearr>   r?   rJ   k_projv_projr   rX   rR   rZ   rT   K  s   
zT5GemmaCrossAttention.__init__Nr   attention_maskencoder_hidden_statesrh   rW   returnc                 K   sz  |d u rt d|jd d }g |d| jR }| ||dd}|d ur3|j| j}	|j	}
|d u s9|	sw|jd d }g |d| jR }| 
||dd}| ||dd}|d urv|
||| j\}}d|j| j< n|
j| j j}|
j| j j}t| jjt}|| ||||f| jr| jnd| jd | jd|\}}|jg |dR   }| |}||fS )Nz5Encoder hidden state is required for cross attention.r4   r   Tr5   )r   scalingrM   softcap)r   shaper?   q_projview	transpose
is_updatedrs   r   cross_attention_cacher   r   updatelayerskeysvaluesr   get_interfacer   _attn_implementationr'   trainingrK   r   rP   reshape
contiguouso_proj)rV   r   r   r   rh   rW   input_shapehidden_shapequery_statesr   curr_past_key_valuesencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightsrR   rR   rZ   r   [  sN   


zT5GemmaCrossAttention.forwardr}   )r[   r\   r]   r(   r_   rT   torchTensorr   r   r   tupler   re   rR   rR   rX   rZ   r   J  s     r   r   r   c              
      &   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxr   c                    s,    d u rt jdt jdS  | |f t jS )NrR   dtype)r   onesrb   tor   r   r   r   r   rR   rZ   
inner_mask  s   z/bidirectional_mask_function.<locals>.inner_maskr_   rb   )r   r   rR   r   rZ   bidirectional_mask_function  s   "r   rM   c              
      r   )zH
    This creates bidirectional attention mask with sliding window.
    r   r   r   r   r   c                    s   |  |k ||  k @ S r}   rR   r   rM   rR   rZ   r     s   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskr   )rM   r   rR   r   rZ   *sliding_window_bidirectional_mask_function  s   "r   c                       sr   e Zd ZdZdef fddZ			ddejdeejejf dB dejdB d	ej	dB d
eej
f f
ddZ  ZS )T5GemmaEncoderLayerzEncoder sub-layer.r   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)r   r   eps)rS   rT   r:   r   r   rN   attention_typer   	self_attnry   rC   pre_self_attn_layernormpost_self_attn_layernormr|   mlppre_feedforward_layernormpost_feedforward_layernormr~   r   rl   r   r   rX   rR   rZ   rT     s   

zT5GemmaEncoderLayer.__init__Nr   position_embeddingsr   position_idsr   c                 K   sz   |}|  |}| jd||||d d|\}}| |}|| | }|}| |}| |}| |}|| | }|S )N)r   r   r   r   rh   rR   )r   r   r   r   r   r   r   )rV   r   r   r   r   rW   residual_rR   rR   rZ   r     s&   





zT5GemmaEncoderLayer.forward)NNN)r[   r\   r]   r^   r_   rT   r   r   r   
LongTensorFloatTensorr   re   rR   rR   rX   rZ   r     s"    
r   c                       s   e Zd ZdZdef fddZ								ddejdeejejf dB d	ejdB d
ej	dB de
dB dedB dej	dB dejdB dejdB dejfddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _t||d| _t	|j|j
d| _t	|j|j
d| _d S r   )rS   rT   r:   r   r   rN   r   r   r   ry   rC   r   r   r|   r   r   r   r~   r   rl   r   r   
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr   rX   rR   rZ   rT     s$   

zT5GemmaDecoderLayer.__init__NFr   r   r   r   rh   rD   cache_positionr   encoder_attention_maskr   c
              
   K   s   |}|  |}| jd|||||d ur|jnd ||d|
\}}| |}|| | }|}| |}| jd|||	||d|
\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|S )N)r   r   r   r   rh   rD   r   )r   r   r   rh   rD   rR   )r   r   self_attention_cacher   r   r   r   r   r   r   r   )rV   r   r   r   r   rh   rD   r   r   r   rW   r   r   rR   rR   rZ   r     sD   









zT5GemmaDecoderLayer.forward)NNNNFNNN)r[   r\   r]   r^   r_   rT   r   r   r   r   r   rb   r   r   re   rR   rR   rX   rZ   r     s@    	
r   c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r5   r:   
num_labelsrm   c                    s*   t    tj|d| _t||| _d S )N)p)rS   rT   r~   r   r   r   out_proj)rV   r:   r   rm   rX   rR   rZ   rT   +  s   
z"T5GemmaClassificationHead.__init__r   r   c                 C   s   |  |}| |}|S r}   )r   r   )rV   r   rR   rR   rZ   r   0  s   

z!T5GemmaClassificationHead.forward)r5   )r[   r\   r]   r^   r_   ra   rT   r   r   r   re   rR   rR   rX   rZ   r   (  s    r   c                       r   )T5GemmaLMHeadz.Head for language modeling (generation) tasks.Fr:   r9   r   c                    s    t    tj|||d| _d S )Nr   )rS   rT   r~   r   r   )rV   r:   r9   r   rX   rR   rZ   rT   9  s   
zT5GemmaLMHead.__init__r   r   c                 C   s   |  |}|S r}   )r   )rV   r   logitsrR   rR   rZ   r   =  s   
zT5GemmaLMHead.forward)F)r[   r\   r]   r^   r_   rb   rT   r   r   r   re   rR   rR   rX   rZ   r   6  s    r   c                   @   sl   e Zd ZU eed< dZdZddgZee	e
ddde	e
dd	de	edd	dgd
Ze dd Zdd ZdS )T5GemmaPreTrainedModelr   modelTr   r   r4   r   )index
layer_namer   )r   
attentionsc                 C   s   t | | | jj}t|tr=|jjjd d }t	j
|jjd|| d t|jdr9|jjd ur;t	|jj d S d S d S t|tr_| jjs]|jjjd d }t	j
|jjd|| d d S d S d|jjv rmt	|j d S d S )Nr   g      r5   )meanstdr   RMSNorm)r   _init_weightsr   rB   rn   r   r   weightr   initnormal_hasattrr   zeros_r   rH   rY   r[   )rV   moduler   scalerR   rR   rZ   r   Q  s    

z$T5GemmaPreTrainedModel._init_weightsc                 C   s|   | j jj}| j jj}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r4   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r   rj   rG   rE   r   	new_zerosr   clonemasked_fill_)rV   	input_idsdecoder_start_token_idrE   shifted_input_idsrR   rR   rZ   _shift_rightc  s   

 z#T5GemmaPreTrainedModel._shift_rightN)r[   r\   r]   rf   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r   r   r   _can_record_outputsr   no_gradr   r   rR   rR   rR   rZ   r   B  s   
 	
r   	token_idsr   rE   c                 C   sX   | dur|du rt d| |k|jtj}|S tj|jd |jd f|jtjd}|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r4   devicer   )r   r   r	  r   longr   r   )r  r   rE   r   rR   rR   rZ   make_default_2d_attention_mask~  s   r  c                       sz   e Zd ZeedZ fddZee				dde	j
dB de	jdB de	j
dB de	jdB d	ee d
eeB fddZ  ZS )T5GemmaEncoder)r   r   c                       t     j| _ j| _t j j| j| _t	 j j
d| _d| _t fddt jD | _t j| _t d| _|   d S )Nr   Fc                       g | ]}t  |qS rR   )r   .0r   r   rR   rZ   
<listcomp>      z+T5GemmaEncoder.__init__.<locals>.<listcomp>r  rS   rT   rE   padding_idxr9   r~   	Embeddingr:   embed_tokensry   rC   normgradient_checkpointing
ModuleListranger<   r   r   rl   r   r   
rotary_emb	post_initr   rX   r  rZ   rT        zT5GemmaEncoder.__init__Nr   r   r   inputs_embedsrW   r   c                 K   sf  |d u |d uA rt d|dd  |d u r| |}tjd|jd |jd}|d u r0|d}|d u r<t||| j	j
}t| }tsm| j	|||d |d}tdi |dt|itdi |t| j	jt|dd	}|}	tj| j	jd
 |	jd}
|	|
 }	| |	}	| |	|}| jd | j	j D ]}||	|||j |fi |}	q| |	}	| |	}	t|	dS )N:You must specify exactly one of input_ids or inputs_embedsrh   r   r4   r	  r   r  r   r   rh   r   or_mask_function)r#  and_mask_functionfull_attentionsliding_attention      ?r   )last_hidden_staterR   )r   popr  r   aranger   r	  	unsqueezer  r   rE   rn   rc   r%   r   r&   r   rM   tensorr:   r   r   r  r   r<   r   r  r   )rV   r   r   r   r  rW   r   self_attn_mask_mappingmask_kwargsr   
normalizerr   layer_modulerR   rR   rZ   r     sb   


	



zT5GemmaEncoder.forwardNNNN)r[   r\   r]   r   r   r  rT   r   r   r   r   r   r   r   r   r   r   r   re   rR   rR   rX   rZ   r    s0    r  c                       s   e Zd ZeeddeeddedZ fddZe	e
									ddejdB dejdB d	ejdB d
edB dejdB dedB dejdB dejdB dejdB dee deeB fddZ  ZS )T5GemmaDecoderr4   )r   )r   cross_attentionsr   c                    r  )Nr   Fc                    r  rR   )r   r  r  rR   rZ   r    r  z+T5GemmaDecoder.__init__.<locals>.<listcomp>r  r  r   rX   r  rZ   rT     r  zT5GemmaDecoder.__init__Nr   r   r   rh   r  rD   r   r   r   rW   r   c
                 K   s  |d u |d uA rt d|d u rt d|d u r| |}| js0|r0|d u r0tt| jdt }|d u rL|d ur<| nd}tj|||j	d  |j
d}|d u rU|d}|d u re|d u ret||| jj}t| }ts| j||||d urx|jnd |d}tdi |tdi |d}t|	 }ts| j||	|d d d}d	tdi |d
t|	ii}|}tj| jjd |jd}|| }| |}| ||}| jd | jj D ]}|||||j ||||||d	 f	i |
}q| |}| |}t||dS )Nr   z0`encoder_hidden_states` must be given in decoderr  r   r4   r!  r"  r%  r&  r#  r(  r   )r)  rh   rR   )r   r  r   r   r   r   get_seq_lengthr   r+  r   r	  r,  r  rE   rn   rc   r   r%   r&   r   r-  r:   r   r   r  r   r<   r   r  r   )rV   r   r   r   rh   r  rD   r   r   r   rW   past_seen_tokensr.  r/  cross_attn_mask_mappingr   r0  r   r1  rR   rR   rZ   r     s   

		



zT5GemmaDecoder.forward)	NNNNNNNNN)r[   r\   r]   r   r   r   r   r  rT   r   r   r   r   r   r   r   rb   r   r   r   r   r   re   rR   rR   rX   rZ   r3    sP    

	
r3  c                       s   e Zd Zdef fddZdd Zdd Zee												dd	e	j
dB d
e	jdB de	j
dB de	j
dB de	jdB de	j
dB dedB dedB de	jdB de	jdB dedB de	j
dB dee defddZ  ZS )T5GemmaModelr   c                    s>   t  | |jstdt|j| _t|j| _|   d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rS   rT   rk   r   r  ri   r3  rj   r  r   rX   rR   rZ   rT   i  s   zT5GemmaModel.__init__c                 C   
   | j  S r}   ri   get_input_embeddingsrV   rR   rR   rZ   r;  t     
z!T5GemmaModel.get_input_embeddingsc                 C      | j |S r}   ri   set_input_embeddingsrV   new_embeddingsrR   rR   rZ   r@  w     z!T5GemmaModel.set_input_embeddingsNr   r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsrh   r  decoder_inputs_embedsrD   r   rW   r   c                 K   s   |du r| j d||||	d|}|j}| jd||||
|||||d	|}t|j|j|ddr4|jn|jf|j|j|j|j|jdS )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        Nr   r   r   r  )	r   r   r   r  rh   r   r   rD   r   output_hidden_statesF)r)  rh   decoder_hidden_statesdecoder_attentionsr4  encoder_last_hidden_stater   encoder_attentionsrR   )	ri   r)  rj   r   rh   rs   r   r   r4  )rV   r   r   r   rD  rE  rF  rG  rh   r  rH  rD   r   rW   r   decoder_outputsrR   rR   rZ   r   z  sF   

zT5GemmaModel.forward)NNNNNNNNNNNN)r[   r\   r]   rf   rT   r;  r@  r   r   r   r   r   
BoolTensorr   r   r   rb   r   r   r   r   re   rR   rR   rX   rZ   r8  g  s^    	
r8  c                       s   e Zd Zdef fddZdd Zdd Zee				dd	e	j
dB d
e	jdB de	j
dB de	jdB dee defddZ  ZS )T5GemmaEncoderModelr   c                    s2   t  | |jrtdt|j| _|   d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rS   rT   rk   r   r  ri   r  r   rX   rR   rZ   rT     s
   zT5GemmaEncoderModel.__init__c                 C   r9  r}   r:  r<  rR   rR   rZ   r;    r=  z(T5GemmaEncoderModel.get_input_embeddingsc                 C   r>  r}   r?  rA  rR   rR   rZ   r@    rC  z(T5GemmaEncoderModel.set_input_embeddingsNr   r   r   r  rW   r   c                 K   s   | j d||||d|}|S )NrI  rR   )ri   )rV   r   r   r   r  rW   rG  rR   rR   rZ   r     s   
zT5GemmaEncoderModel.forwardr2  )r[   r\   r]   rf   rT   r;  r@  r   r   r   r   r   r   r   r   r   r   re   rR   rR   rX   rZ   rQ    s.    	rQ  c                $       s.  e Zd ZddiZddiZddgdgfiZdef fdd	Zd
d Zdd Z	e
e														d$dejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dejdB dejdB dejdB dedB dejdB deejB dee deej eB f d d!Zdejfd"d#Z  ZS )%T5GemmaForConditionalGenerationzlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr   r   r   c                    sJ   d|_ t | t|| _|jj| _t|jj| j| _	d| _
|   d S )NTForMaskedLM)rk   rS   rT   r8  r   rj   r9   r   r:   lm_head	loss_typer  r   rX   rR   rZ   rT     s   

z(T5GemmaForConditionalGeneration.__init__c                 C   s   || j _d S r}   rU  r   rA  rR   rR   rZ   set_output_embeddings  rC  z5T5GemmaForConditionalGeneration.set_output_embeddingsc                 C   s   | j jS r}   rW  r<  rR   rR   rZ   get_output_embeddings  s   z5T5GemmaForConditionalGeneration.get_output_embeddingsNr   r   r   r   rD  rE  rF  rG  rh   r  rH  labelsrD   r   logits_to_keeprW   r   c                 K   s  |dur|du r|
du r|  |}| jd|||||||||	|
||d|}|j}t|tr4t| dn|}| |dd|ddf }|  j}|j	dur]||j	 }t
|}||j	 }d}|duro| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r   r   r   rD  rE  rF  rG  rh   r  rH  rD   r   )	lossr   rh   rK  rL  r4  rM  r   rN  rR   )r   r   r)  rn   r_   slicerU  get_decoderr   rO   r   tanhloss_functionr9   r   rh   rK  rL  r4  rM  r   rN  )rV   r   r   r   rD  rE  rF  rG  rh   r  rH  rZ  rD   r   r[  rW   rO  r   slice_indicesr   decoder_configr\  rR   rR   rZ   r     sP   





z'T5GemmaForConditionalGeneration.forwardc                 C   s
   |  |S r}   )r   )rV   rZ  rR   rR   rZ   %prepare_decoder_input_ids_from_labels?  r=  zET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNr   )r[   r\   r]   _tied_weights_keys_tp_plan_pp_planrf   rT   rX  rY  r   r   r   r   r   rP  r   r   rb   r_   r   r   r   r   r   r   rc  re   rR   rR   rX   rZ   rR    sr    	
KrR  c                          e Zd ZddededB f fddZdd Zdd	 Zee											dd
e
jdB de
jdB de
jdB de
jdB de
jdB de
jdB dedB de
jdB de
jdB de
jdB dee defddZ  ZS ) T5GemmaForSequenceClassificationNr   rk   c                    |   |dur||_ t | |j| _|j rt|| _nt|| _|jj}|j r*|j	j}t
|dd}t|| j|| _|   dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        Nrm   皙?rk   rS   rT   r   r8  r   rQ  ri   r:   rj   rr   r   scorer  rV   r   rk   r:   classifier_dropoutrX   rR   rZ   rT   E  s   
z)T5GemmaForSequenceClassification.__init__c                 C   r9  r}   r   r;  r<  rR   rR   rZ   r;  \  r=  z5T5GemmaForSequenceClassification.get_input_embeddingsc                 C      | j | d S r}   r   r@  rV   valuerR   rR   rZ   r@  _     z5T5GemmaForSequenceClassification.set_input_embeddingsr   r   r   rD  rE  rF  rG  r  rH  rZ  rW   r   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}|duru|jd }n|jd }| j jdu r|d	krtd
| j jdu rd}nE|dur|| j jk|jtj}tj|jd |jtjd}|| d}| j jr|d	7 }tj||jd d	 d}nd}t| jj d |tj||jd|f }d}|
dur| j||
|| j d}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   rD  rE  rF  rG  r  rH  rD   r   r   r  r   r4   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r!  )r   rZ  pooled_logitsr   r\  r   r   r   )r   rk   NotImplementedErrorrY   r[   r   r   r   r)  rK  rL  r   r   rl  r   rE   r   r	  r   int32r+  argmaxclamploggerwarning_oncer`  r   )rV   r   r   r   rD  rE  rF  rG  r  rH  rZ  rW   outputsr)  r   r   r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr|  r\  rR   rR   rZ   r   b  s   


z(T5GemmaForSequenceClassification.forwardr}   
NNNNNNNNNN)r[   r\   r]   rf   rb   rT   r;  r@  r   r   r   r   r   r   r   r   r   r   r   re   rR   rR   rX   rZ   rh  C  sR    	
rh  c                       rg  )T5GemmaForTokenClassificationNr   rk   c                    ri  )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        Nrm   rj  rk  rm  rX   rR   rZ   rT     s   
z&T5GemmaForTokenClassification.__init__c                 C   r9  r}   ro  r<  rR   rR   rZ   r;    r=  z2T5GemmaForTokenClassification.get_input_embeddingsc                 C   rp  r}   rq  rr  rR   rR   rZ   r@    rt  z2T5GemmaForTokenClassification.set_input_embeddingsr   r   r   rD  rE  rF  rG  r  rH  rZ  rW   r   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}d}|
dury| ||
| j }t||||dS )	ru  Nrv  rw  rx  Fry  rz  r}  )r   rk   r~  rY   r[   r   r   r   r)  rK  rL  r   r   rl  r`  r   )rV   r   r   r   rD  rE  rF  rG  r  rH  rZ  rW   r  r)  r   r   r   r\  rR   rR   rZ   r     sf   

z%T5GemmaForTokenClassification.forwardr}   r  )r[   r\   r]   rf   rb   rT   r;  r@  r   r   r   r   r   r   r   r   r   r   r   re   rR   rR   rX   rZ   r    sR    	
r  )rf   r(   rR  r8  rQ  r   rh  r  )Xcollections.abcr   typingr   r   torch.nnr~    r   r   cache_utilsr   r   r   configuration_utilsr	   
generationr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r    r!   r"   r#   r$   r%   r&   r'   _CHECKPOINT_FOR_DOC
get_loggerr[   r  r(   rf   ry   r|   r   r   r   r   r   r_   r   r   r   Moduler   r   r   r   r  r  r3  r8  rQ  rR  rh  r  __all__rR   rR   rR   rZ   <module>   sv    (
 ^G4K;
^zO$g r