o
    wiV                     @   s  d dl mZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. dZ/e" r	 e#0e1Z2G dd de%Z3G dd deZ4G dd de*Z5G dd de(Z6G dd de+Z7G dd de'Z8G dd de'Z9deej: d efd!d"Z;d#e<d efd$d%Z=G d&d' d'eZ>G d(d) d)e>Z?G d*d+ d+ej@ZAG d,d- d-ej@ZBe G d.d/ d/e)ZCd0eejD d1ej:d2ee< d ej:fd3d4ZEG d5d6 d6eCZFG d7d8 d8eFZGe G d9d: d:eCZHe G d;d< d<eCZIG d=d> d>eCeZJe G d?d@ d@eCZKe G dAdB dBeCZLg dCZMdS )D    )AnyCallableOptionalUnionN   )CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forwardzgoogle/t5gemma-placeholderc                       s    e Zd ZdZ fddZ  ZS )T5GemmaModuleConfigz=Module config (encoder or decoder): the same as Gemma2Config.c                    s   t  jdi | d S )N super__init__)selfsuper_kwargs	__class__r%   h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr(   E   s   zT5GemmaModuleConfig.__init__)__name__
__module____qualname____doc__r(   __classcell__r%   r%   r+   r-   r$   B   s    r$   c                       sP  e Zd ZdZdZdgZi dddddddd	d
ddddd	dddddddd	dddddddd	dddddd	iZdgdgfddgdgfdgdgfdgdgfddgdgfdgdgfdZ							d0d ee	e
eeef f  d!ee	e
eeef f  d"ed#ed$ed%ed&ef fd'd(Z fd)d*Zd1d2d.d/Z  ZS )3T5GemmaConfiga6  
    This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
    e.g. [google/t5gemma-placeholder](https://huggingface.co/google/t5gemma-placeholder)
    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-placeholder")
    >>> model = T5GemmaModel(t5gemma_config)
    ```
    Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PretrainedConfig] for more information.
    Args:
        encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether tie input and output embeddings.
        kwargs (additional keyword arguments, optional, *optional*):
            Will be passed to the PretrainedConfig base class.
    t5gemmapast_key_valuesz!encoder.layers.*.self_attn.q_projcolwisez!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.o_projrowwisezencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_projz!decoder.layers.*.self_attn.q_projz!decoder.layers.*.self_attn.k_projz!decoder.layers.*.self_attn.v_projz!decoder.layers.*.self_attn.o_projz"decoder.layers.*.cross_attn.q_projz"decoder.layers.*.cross_attn.k_projz"decoder.layers.*.cross_attn.v_projz"decoder.layers.*.cross_attn.o_projzdecoder.layers.*.mlp.gate_projzdecoder.layers.*.mlp.up_projzdecoder.layers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)zencoder.embed_tokenszencoder.layerszencoder.normzdecoder.embed_tokenszdecoder.layerszdecoder.normNT        encoderdecoderis_encoder_decoderdropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddingsc           
         s`  t |trtdi |}n|d u rt }nt |ts#J t| dt |tr0tdi |}n|d u r7|}nt |tsEJ t| dtdi | }tdi | }d|_||_||_|| _d|_d|_	||_||_|j
|_|| _dD ]}	|	|vrt||	||	< qxt jdi | || _|d|j	| _	|d|j| _|| _|| _|| _|| _d S )Nz is not supported.FT)bos_token_idpad_token_ideos_token_id	use_cacheinitializer_ranger%   )
isinstancedictr$   typeto_dict
is_decoderr@   rB   r=   rG   hidden_sizecross_attention_hidden_sizer>   getattrr'   r(   r?   getrH   rA   rC   )
r)   r=   r>   r?   r@   rA   rB   rC   kwargsspecial_token_keyr+   r%   r-   r(      sD   


zT5GemmaConfig.__init__c                    s>   g d}||v rt | j|| t | j|| t || d S )N)output_hidden_statesoutput_attentions_attn_implementationr@   rB   )setattrr=   r>   r'   __setattr__)r)   keyvalueshared_attr_with_submodulesr+   r%   r-   rX      s
   zT5GemmaConfig.__setattr__Freturnr
   c                 C   s   ~| S Nr%   )r)   r>   r%   r%   r-   get_text_config   s   zT5GemmaConfig.get_text_config)NNTr<   r<   r<   TF)r\   r
   )r.   r/   r0   r1   
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   r   r$   rJ   r   boolfloatr(   rX   r^   r2   r%   r%   r+   r-   r3   I   s    




=r3   c                   @   s   e Zd ZdS )T5GemmaRMSNormN)r.   r/   r0   r%   r%   r%   r-   rf      s    rf   c                       s$   e Zd Z fddZdd Z  ZS )
T5GemmaMLPc                    s   t  | t|j| _d S r]   )r'   r(   nnDropoutr@   dropoutr)   configr+   r%   r-   r(      s   zT5GemmaMLP.__init__c                 C   s2   |  | || | }| |}| |}|S r]   )act_fn	gate_projup_projrj   	down_proj)r)   xr:   rp   r%   r%   r-   forward   s   

zT5GemmaMLP.forward)r.   r/   r0   r(   rr   r2   r%   r%   r+   r-   rg      s    rg   c                       s   e Zd Zd fdd	Z  ZS )T5GemmaRotaryEmbeddingNc                    s   t  || d S r]   r&   )r)   rl   devicer+   r%   r-   r(      s   zT5GemmaRotaryEmbedding.__init__r]   )r.   r/   r0   r(   r2   r%   r%   r+   r-   rs      s    rs   c                       s&   e Zd Zdedef fddZ  ZS )T5GemmaSelfAttentionrl   	layer_idxc                    s   t  || |j| _d S r]   )r'   r(   rM   	is_causalr)   rl   rv   r+   r%   r-   r(      s   zT5GemmaSelfAttention.__init__)r.   r/   r0   r$   intr(   r2   r%   r%   r+   r-   ru      s    ru   c                       s~   e Zd Zdedef fddZ	ddejdeej deej d	ee	 d
e
e deejeej eeej  f fddZ  ZS )T5GemmaCrossAttentionrl   rv   c                    sj   t  || | `d| _|jd u rtdtj|j|j| j	 |j
d| _tj|j|j| j	 |j
d| _d S )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)r'   r(   sliding_windowrw   rO   
ValueErrorrh   Linearnum_key_value_headshead_dimattention_biask_projv_projrx   r+   r%   r-   r(      s   
zT5GemmaCrossAttention.__init__Nr:   r;   encoder_hidden_statespast_key_valuerR   r\   c                 K   s  |d u rt d|jd d }g |d| jR }| ||dd}|d ur3|j| j}	|j	}
|d u s9|	sw|jd d }g |d| jR }| 
||dd}| ||dd}|d urv|
||| j\}}d|j| j< n|
j| j }|
j| j }t}| jjdkr| jjdkr|dd	rtd
 nt| jj }|| ||||f| jr| jnd| jd | jd|\}}|jg |dR   }| |}||fS )Nz5Encoder hidden state is required for cross attention.   r   TeagersdparU   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r<   )rj   scalingr}   softcap)r~   shaper   q_projview	transpose
is_updatedrQ   rv   cross_attention_cacher   r   update	key_cachevalue_cacher#   rl   rV   loggerwarning_oncer   trainingrB   r   attn_logit_softcappingreshape
contiguouso_proj)r)   r:   r;   r   r   rR   input_shapehidden_shapequery_statesr   curr_past_key_valueencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightsr%   r%   r-   rr     sZ   


zT5GemmaCrossAttention.forwardr]   )r.   r/   r0   r$   ry   r(   torchTensorr   r   r   r   tuplerr   r2   r%   r%   r+   r-   rz      s     rz   r;   r\   c              
      &   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxr\   c                    s,    d u rt jdt jdS  | |f t jS )Nr%   dtype)r   onesrd   tor   r   r   r   r;   r%   r-   
inner_maskZ  s   z/bidirectional_mask_function.<locals>.inner_maskry   rd   )r;   r   r%   r   r-   bidirectional_mask_functionU  s   "r   r}   c              
      r   )zH
    This creates bidirectional attention mask with sliding window.
    r   r   r   r   r\   c                    s   |  |k ||  k @ S r]   r%   r   r}   r%   r-   r   i  s   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskr   )r}   r   r%   r   r-   *sliding_window_bidirectional_mask_functiond  s   "r   c                       s   e Zd ZdZdef fddZ			ddejdeejejf d	e	ej d
e	ej
 de	e deeje	eejejf  f fddZ  ZS )T5GemmaEncoderLayerzEncoder sub-layer.rv   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)rl   rv   eps)r'   r(   rN   rl   rv   layer_typesattention_typeru   	self_attnrf   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrg   mlppre_feedforward_layernormpost_feedforward_layernormrh   ri   r@   rj   rx   r+   r%   r-   r(   r  s   

zT5GemmaEncoderLayer.__init__NFr:   position_embeddingsr;   position_idsrU   r\   c           
   
   K   s   |}|  |}| jd|||||dd d|\}}| |}|| | }|}| |}| |}| |}|| | }|f}	|rG|	|f7 }	|	S )NF)r:   r   r;   r   rU   rG   r   r%   )r   r   r   rj   r   r   r   )
r)   r:   r   r;   r   rU   rR   residualself_attn_weightsoutputsr%   r%   r-   rr     s0   
	





zT5GemmaEncoderLayer.forward)NNF)r.   r/   r0   r1   ry   r(   r   r   r   r   
LongTensorrd   FloatTensorrr   r2   r%   r%   r+   r-   r   o  s.    r   c                       s   e Zd ZdZdef fddZ								ddejdeejejf d	e	ej d
e	ej
 de	e de	e de	e de	ej
 de	ej de	ej deeje	eejejf  e	eejejf  f fddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.rv   c                    sD   t  || t||d| _t|j|jd| _t|j|jd| _d S r   )	r'   r(   rz   
cross_attnrf   rN   r   pre_cross_attn_layernormpost_cross_attn_layernormrx   r+   r%   r-   r(     s   zT5GemmaDecoderLayer.__init__NFr:   r   r;   r   r   rU   rG   cache_positionr   encoder_attention_maskr\   c                 K   s   |}|  |}| jd|||||d ur|jnd |||d|\}}| |}|| | }|}| |}| jd||	|
|||d|\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|f}|rt|||f7 }|S )N)r:   r   r;   r   r   rU   rG   r   )r:   r   r;   r   rU   rG   r%   )r   r   self_attention_cacher   rj   r   r   r   r   r   r   )r)   r:   r   r;   r   r   rU   rG   r   r   r   rR   r   r   cross_attn_weightsr   r%   r%   r-   rr     sN   
	




	


zT5GemmaDecoderLayer.forward)NNNFFNNN)r.   r/   r0   r1   ry   r(   r   r   r   r   r   r	   rd   r   rr   r2   r%   r%   r+   r-   r     sN    	
r   c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r<   rN   
num_labelsrA   c                    s*   t    tj|d| _t||| _d S )N)p)r'   r(   rh   ri   rj   r   out_proj)r)   rN   r   rA   r+   r%   r-   r(     s   
z"T5GemmaClassificationHead.__init__r:   r\   c                 C   s   |  |}| |}|S r]   )rj   r   )r)   r:   r%   r%   r-   rr     s   

z!T5GemmaClassificationHead.forward)r<   )r.   r/   r0   r1   ry   re   r(   r   r   rr   r2   r%   r%   r+   r-   r      s    r   c                       r   )T5GemmaLMHeadz.Head for language modeling (generation) tasks.FrN   
vocab_sizer|   c                    s    t    tj|||d| _d S )Nr{   )r'   r(   rh   r   r   )r)   rN   r   r|   r+   r%   r-   r(     s   
zT5GemmaLMHead.__init__r:   r\   c                 C   s   |  |}|S r]   )r   )r)   r:   logitsr%   r%   r-   rr     s   
zT5GemmaLMHead.forwardr_   )r.   r/   r0   r1   ry   rd   r(   r   r   rr   r2   r%   r%   r+   r-   r     s    r   c                   @   s.   e Zd ZeZdZdZdgZdd Zdd Z	dS )	T5GemmaPreTrainedModelmodelTT5GemmaBlockc                 C   sP  | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S t|tr|jjjd d }|jjjjd|| d t|jdr|jjd ur|jjj	  d S d S d S t|tr| j js|jjjd d }|jjjjd|| d d S d S d S )Nr<   )meanstdg      ?r   g      r|   )rl   rH   rI   rh   r   weightdatanormal_r|   zero_	Embeddingpadding_idxrf   fill_r   r   r   hasattrr   rC   )r)   moduler   scaler%   r%   r-   _init_weights!  s2   




z$T5GemmaPreTrainedModel._init_weightsc                 C   s|   | j jj}| j jj}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rl   r>   rD   rE   r~   	new_zerosr   clonemasked_fill_)r)   r8   decoder_start_token_idrE   shifted_input_idsr%   r%   r-   _shift_right8  s   

 z#T5GemmaPreTrainedModel._shift_rightN)
r.   r/   r0   r3   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r   r%   r%   r%   r-   r     s    r   	token_idsr:   rE   c                 C   sX   | dur|du rt d| |k|jtj}|S tj|jd |jd f|jtjd}|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r   rt   r   )r~   r   rt   r   longr   r   )r   r:   rE   r;   r%   r%   r-   make_default_2d_attention_maskS  s   r   c                       s   e Zd Z fddZdd Zdd Ze						ddeej	 d	eej
 d
eej	 deej dee dee dee defddZ  ZS )T5GemmaEncoderc                    s   t     j| _ j| _t j j| j| _t	 j j
d| _t d| _d| _t fddt jD | _t j| _|   d S )Nr   rl   Fc                       g | ]}t  |qS r%   )r   .0rv   r   r%   r-   
<listcomp>p      z+T5GemmaEncoder.__init__.<locals>.<listcomp>)r'   r(   rE   r   r   rh   r   rN   embed_tokensrf   r   normrs   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layerslayersri   r@   rj   	post_initrk   r+   r   r-   r(   e  s   zT5GemmaEncoder.__init__c                 C      | j S r]   r  r)   r%   r%   r-   get_input_embeddingsw     z#T5GemmaEncoder.get_input_embeddingsc                 C   s
   || _ d S r]   r  r)   rZ   r%   r%   r-   set_input_embeddingsz     
z#T5GemmaEncoder.set_input_embeddingsNr8   r;   r   r9   rU   rT   flash_attn_kwargsr\   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d u |d uA r td|d u r)| |}tjd|jd |jd}|d u r>|	d}|d u rJt
||| j j}t| }	ts{| j |||d |d}
tdi |
dt|itdi |
t| j jt|dd}	|}| ||}tj| j jd	 |jd
}|| }|rdnd }|rdnd }| |}| jd | j j D ]%}|r||f7 }||||	|j ||fi |}|d }|r||d f7 }q| |}| |}|r||f7 }t|||dS )N:You must specify exactly one of input_ids or inputs_embedsr   r   rt   rl   input_embedsr;   r   r5   r   or_mask_function)r  and_mask_functionfull_attentionsliding_attention      ?r   r%   )last_hidden_stater:   
attentions)rl   rU   rT   r~   r  r   aranger   rt   	unsqueezer   rE   rI   rJ   r!   r   r"   r   r}   r  tensorrN   r   rj   r  r
  r   r  r   )r)   r8   r;   r   r9   rU   rT   r  r   self_attn_mask_mappingmask_kwargsr:   r   
normalizerall_hidden_statesall_self_attnslayer_modulelayer_outputsr%   r%   r-   rr   }  s   





	


zT5GemmaEncoder.forwardNNNNNN)r.   r/   r0   r(   r  r  r   r   r   r   r   r   rd   r   r   r   rr   r2   r%   r%   r+   r-   r   d  s8    	r   c                       s   e Zd Z fddZe											ddeej deej deej dee	 deej
 d	ee d
ee dee deej deej deej dee defddZ  ZS )T5GemmaDecoderc                    s8   t    t fddt jD | _|   d S )Nc                    r   r%   )r   r   r   r%   r-   r    r  z+T5GemmaDecoder.__init__.<locals>.<listcomp>)r'   r(   rh   r  r	  r
  r  r  rk   r+   r   r-   r(     s
   zT5GemmaDecoder.__init__Nr8   r;   r   r5   r9   rG   rU   rT   r   r   r   r  r\   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|
d u rAtd|d u rJ| 	|}| js[|r[|d u r[t
t t d}|	d u rw|d urg| nd}tj|||jd  |jd}	|d u r|	d}|d u r|d u rt||| j j}t| }ts| j |||	|d ur|jnd |d	}tdi |tdi |d
}t| }ts| j |
||	d d d	}dtdi |dt|ii}|}| ||}tj| j jd |jd}|| }|rdnd }|rdnd }|rdnd }| |}| jd | j j  D ]6}|r||f7 }|||||j! |||||	|
|d f
i |}|d }|rD||d f7 }||d f7 }q| "|}| |}|rX||f7 }t#|||||dS )Nr  zX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz0`encoder_hidden_states` must be given in decoder)r   r   r   r   r  r  r  r  r  r  r   r%   r   )r   r5   r:   r!  cross_attentions)$rl   rU   rT   rG   r~   r  r   r   r   r  r	   r   get_seq_lengthr   r"  r   rt   r#  r   rE   rI   rJ   r   r!   r"   r   r  r$  rN   r   rj   r  r
  r   r  r   )r)   r8   r;   r   r5   r9   rG   rU   rT   r   r   r   r  past_seen_tokensr%  r&  cross_attn_mask_mappingr:   r   r'  r(  r)  all_cross_attnsr*  r+  r%   r%   r-   rr     s   


	




zT5GemmaDecoder.forward)NNNNNNNNNNN)r.   r/   r0   r(   r   r   r   r   r   r	   r   rd   r   r   r   rr   r2   r%   r%   r+   r-   r-    sR    
	
r-  c                #       s   e Zd Zdef fddZdd Zdd Zdd	 Zd
d Ze	e
														ddeej deej deej deej deej deej dee dee deej deej dee dee dee deej dee def ddZ  ZS ) T5GemmaModelrl   c                    s>   t  | |jstdt|j| _t|j| _|   d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r'   r(   r?   r~   r   r=   r-  r>   r  rk   r+   r%   r-   r(     s   zT5GemmaModel.__init__c                 C   r  r]   r=   r  r%   r%   r-   get_encoder  r  zT5GemmaModel.get_encoderc                 C   r  r]   )r>   r  r%   r%   r-   get_decoder  r  zT5GemmaModel.get_decoderc                 C   
   | j  S r]   r=   r  r  r%   r%   r-   r    r  z!T5GemmaModel.get_input_embeddingsc                 C      | j |S r]   r=   r  r)   new_embeddingsr%   r%   r-   r       z!T5GemmaModel.set_input_embeddingsNr8   r;   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr5   r9   decoder_inputs_embedsrG   rU   rT   r   r  r\   c                 K   s   |dur|n| j j}|du r| jd||||	||d|}|j}| jd||||
|||||||d|}t|j|j|j|j|j	|j|j|jdS )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

        **flash_attn_kwargs: flash attention related parameters.
        Nr8   r;   r   r9   rU   rT   )r8   r;   r   r9   r5   r   r   rG   rU   rT   r   )r   r5   decoder_hidden_statesdecoder_attentionsr.  encoder_last_hidden_stater   encoder_attentionsr%   )
rl   rG   r=   r   r>   r   r5   r:   r!  r.  )r)   r8   r;   r   r>  r?  r@  rA  r5   r9   rB  rG   rU   rT   r   r  r   decoder_outputsr%   r%   r-   rr     sL   
zT5GemmaModel.forward)NNNNNNNNNNNNNN)r.   r/   r0   r3   r(   r5  r6  r  r  r   r   r   r   r   r   
BoolTensorr   r	   r   rd   r   r   r   rr   r2   r%   r%   r+   r-   r3    sn    	r3  c                       s   e Zd Zdef fddZdd Zdd Zee						dd	e	e
j d
e	e
j de	e
j de	e
j de	e de	e dee defddZ  ZS )T5GemmaEncoderModelrl   c                    s2   t  | |jrtdt|j| _|   d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r'   r(   r?   r~   r   r=   r  rk   r+   r%   r-   r(     s
   zT5GemmaEncoderModel.__init__c                 C   r7  r]   r8  r  r%   r%   r-   r    r  z(T5GemmaEncoderModel.get_input_embeddingsc                 C   r9  r]   r:  r;  r%   r%   r-   r    r=  z(T5GemmaEncoderModel.set_input_embeddingsNr8   r;   r   r9   rU   rT   r  r\   c           	   	   K   s"   | j d||||||d|}|S )zJ
        **flash_attn_kwargs: flash attention related parameters.
        rC  Nr%   r4  )	r)   r8   r;   r   r9   rU   rT   r  rA  r%   r%   r-   rr      s   	zT5GemmaEncoderModel.forwardr,  )r.   r/   r0   r3   r(   r  r  r   r   r   r   r   r   r   rd   r   r   r   rr   r2   r%   r%   r+   r-   rJ    s:    		rJ  c                '       sZ  e Zd ZddgZddiZddgdgfiZdef fdd	Zd
d Zdd Z	dd Z
dd Zdd Zee																d+deej deej deej deej deej deej dee dee deej deej d eej d!ee d"ee d#ee d$eej d%eeejf d&eeej ef f"d'd(Zd ejfd)d*Z  ZS ),T5GemmaForConditionalGenerationz!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_repr:   r   rl   c                    sJ   d|_ t | t|| _|jj| _t|jj| j| _	d| _
|   d S )NTForMaskedLMLoss)r?   r'   r(   r3  r   r>   r   r   rN   lm_head	loss_typer  rk   r+   r%   r-   r(   !  s   

z(T5GemmaForConditionalGeneration.__init__c                 C   s   || j _d S r]   rN  r   r;  r%   r%   r-   set_output_embeddings,  r=  z5T5GemmaForConditionalGeneration.set_output_embeddingsc                 C      | j jS r]   rP  r  r%   r%   r-   get_output_embeddings/     z5T5GemmaForConditionalGeneration.get_output_embeddingsc                 C   s(   | j jr| | jj|    d S d S r]   )rl   rC   _tie_or_clone_weightsrN  r   r6  r  r  r%   r%   r-   _tie_weights2  s   z,T5GemmaForConditionalGeneration._tie_weightsc                 C   rR  r]   )r   r=   r  r%   r%   r-   r5  7  rT  z+T5GemmaForConditionalGeneration.get_encoderc                 C   rR  r]   )r   r>   r  r%   r%   r-   r6  :  rT  z+T5GemmaForConditionalGeneration.get_decoderNr   r8   r;   r   r>  r?  r@  rA  r5   r9   rB  labelsrG   rU   rT   r   logits_to_keepr\   c                 K   s2  | j r| jjdkrtd| jj d |dur%|du r%|
du r%| |}| jd|||||||||	|
||||d|}|j}t|t	rJt
| dn|}| |dd|ddf }|  j}|jdurs||j }t|}||j }d}|dur| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        r   ziIt is strongly recommended to train T5Gemma models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)r8   r;   r   r>  r?  r@  rA  r5   r9   rB  rG   rU   rT   r   )	lossr   r5   rD  rE  r.  rF  r   rG  r%   )r   rl   rV   r   r   r   r   r   rI   ry   slicerN  r6  final_logit_softcappingr   tanhloss_functionr   r   r5   rD  rE  r.  rF  r   rG  )r)   r8   r;   r   r>  r?  r@  rA  r5   r9   rB  rW  rG   rU   rT   r   rX  loss_kwargsrH  r:   slice_indicesr   decoder_configrY  r%   r%   r-   rr   =  s`   #





z'T5GemmaForConditionalGeneration.forwardc                 C   s
   |  |S r]   )r   )r)   rW  r%   r%   r-   %prepare_decoder_input_ids_from_labels  r  zET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNNNr   )r.   r/   r0   _tied_weights_keys_tp_plan_pp_planr3   r(   rQ  rS  rV  r5  r6  r   r   r   r   r   r   rI  r   r	   rd   r   ry   r   r   r   rr   ra  r2   r%   r%   r+   r-   rK    s    	XrK  c                          e Zd Zddedee f fddZdd Zdd	 Ze	e
												dd
eej deej deej deej deej deej dee deej deej deej dee dee defddZ  ZS ) T5GemmaForSequenceClassificationNrl   r?   c                    |   |dur||_ t | |j| _|j rt|| _nt|| _|jj}|j r*|j	j}t
|dd}t|| j|| _|   dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        NrA   皙?r?   r'   r(   r   r3  r   rJ  r=   rN   r>   rP   r   scorer  r)   rl   r?   rN   classifier_dropoutr+   r%   r-   r(     s   
z)T5GemmaForSequenceClassification.__init__c                 C   r7  r]   r   r  r  r%   r%   r-   r    r  z5T5GemmaForSequenceClassification.get_input_embeddingsc                 C      | j | d S r]   r   r  r  r%   r%   r-   r       z5T5GemmaForSequenceClassification.set_input_embeddingsr8   r;   r   r>  r?  r@  rA  r9   rB  rW  rU   rT   r\   c                 C   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrN| j|||||||||	d||d}|j}|j	}|j
}n| j||||||d}|j}|j}|j}| |}|durq|jd }n|jd }| j jdu r|d	krtd
| j jdu rd}nE|dur|| j jk|jtj}tj|jd |jtjd}|| d}| j jr|d	7 }tj||jd d	 d}nd}t| jj d |tj||jd|f }d}|
dur| j||
|| j d}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.Fr;   r   r>  r?  r@  rA  r9   rB  rG   rU   rT   r;   r   r9   rU   rT   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r   rW  pooled_logitsrl   rY  r   r:   r!  )rl   r?   NotImplementedErrorr,   r.   r~   r   r   r   rD  rE  r:   r!  rj  r   rE   r   rt   r   int32r"  argmaxclampr   r   r]  r   )r)   r8   r;   r   r>  r?  r@  rA  r9   rB  rW  rU   rT   r   r   r:   r!  r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesrx  rY  r%   r%   r-   rr     s   


z(T5GemmaForSequenceClassification.forwardr]   NNNNNNNNNNNN)r.   r/   r0   r3   r   rd   r(   r  r  r   r   r   r   r   r   r   r   rr   r2   r%   r%   r+   r-   rf    sZ    	rf  c                       re  )T5GemmaForTokenClassificationNrl   r?   c                    rg  )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        NrA   rh  ri  rk  r+   r%   r-   r(   1  s   
z&T5GemmaForTokenClassification.__init__c                 C   r7  r]   rm  r  r%   r%   r-   r  I  r  z2T5GemmaForTokenClassification.get_input_embeddingsc                 C   rn  r]   ro  r  r%   r%   r-   r  L  rp  z2T5GemmaForTokenClassification.set_input_embeddingsr8   r;   r   r>  r?  r@  rA  r9   rB  rW  rU   rT   r\   c                 C   s   | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrN| j|||||||||	d||d}|j}|j	}|j
}n| j||||||d}|j}|j}|j}| |}d}|
duru| ||
| j }t||||dS )	rq  Nrr  rs  rt  Fru  rv  ry  )rl   r?   rz  r,   r.   r~   r   r   r   rD  rE  r:   r!  rj  r]  r   )r)   r8   r;   r   r>  r?  r@  rA  r9   rB  rW  rU   rT   r   r   r:   r!  r   rY  r%   r%   r-   rr   O  sb   

z%T5GemmaForTokenClassification.forwardr]   r  )r.   r/   r0   r3   r   rd   r(   r  r  r   r   r   r   r   r   r   r   rr   r2   r%   r%   r+   r-   r  /  sZ    	r  )r3   r$   rK  r3  rJ  r   rf  r  )Ntypingr   r   r   r   r   torch.nnrh   cache_utilsr   r   r	   configuration_utilsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r   r   r   r    r!   r"   r#   _CHECKPOINT_FOR_DOC
get_loggerr.   r   r$   r3   rf   rg   rs   ru   rz   r   r   ry   r   r   r   Moduler   r   r   r   r   r   r-  r3  rJ  rK  rf  r  __all__r%   r%   r%   r-   <module>   st    (
 \FK8
  %b, y