o
    i0                     @   s>  d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 e)2e3Z4G dd dej5Z6G dd dej5Z7G dd dej5Z8dd Z9dUddZ:dej;de<dej;fd d!Z=	"		dVd#ej5d$ej;d%ej;d&ej;d'eej; d(e>d)ee> d*ee> de?ej;ej;f fd+d,Z@G d-d. d.ej5ZAG d/d0 d0ej5ZBG d1d2 d2eZCG d3d4 d4eCZDG d5d6 d6ej5ZEG d7d8 d8ej5ZFG d9d: d:ej5ZGe'G d;d< d<e"ZHd'eej; defd=d>ZId?e<defd@dAZJdBeejK dej;dCee< dej;fdDdEZLG dFdG dGeHZMG dHdI dIeMZNe'G dJdK dKeHZOe'G dLdM dMeHZPG dNdO dOeHeZQe'G dPdQ dQeHZRe'G dRdS dSeHZSg dTZTdS )W    )CallableOptionalUnionN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )T5GemmaConfigT5GemmaModuleConfigc                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )T5GemmaRMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r'   nn	Parametertorchzerosweight)selfr&   r'   	__class__ i/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr*   6   s   
zT5GemmaRMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r-   rsqrtpowmeanr'   )r0   xr3   r3   r4   _norm;   s   $zT5GemmaRMSNorm._normc                 C   s*   |  | }|d| j   }||S )Ng      ?)r<   floatr/   type_as)r0   r;   outputr3   r3   r4   forward>   s   
zT5GemmaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler/   shaper'   r0   r3   r3   r4   
extra_reprE      zT5GemmaRMSNorm.extra_repr)r%   )
__name__
__module____qualname__intr=   r*   r<   r@   rD   __classcell__r3   r3   r1   r4   r$   5   s
    r$   c                       s$   e Zd Z fddZdd Z  ZS )
T5GemmaMLPc                    s   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _t|j| _d S )NFbias)r)   r*   confighidden_sizeintermediate_sizer+   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr0   rN   r1   r3   r4   r*   J   s   
zT5GemmaMLP.__init__c                 C   s2   |  | || | }| |}| |}|S r(   )rV   rR   rS   rY   rT   )r0   r;   hidden_statesrT   r3   r3   r4   r@   U   s   

zT5GemmaMLP.forward)rF   rG   rH   r*   r@   rJ   r3   r3   r1   r4   rK   I   s    rK   c                       s>   e Zd ZU ejed< d fdd	Ze edd Z	  Z
S )T5GemmaRotaryEmbeddinginv_freqNc                    s   t    t|drt|jtr|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr]   F)
persistent)r)   r*   hasattr
isinstancer^   dictgetr_   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrN   r   rope_init_fnattention_scalingregister_bufferr]   original_inv_freq)r0   rN   devicer]   r1   r3   r4   r*   _   s   
zT5GemmaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r6   r!   mpscpuF)device_typeenabledr5   r&   dtype)r]   r=   expandrB   torn   rd   r`   strr-   autocast	transposecatcosrk   sinru   )
r0   r;   position_idsinv_freq_expandedposition_ids_expandedrq   freqsembr|   r}   r3   r3   r4   r@   p   s   0&zT5GemmaRotaryEmbedding.forwardr(   )rF   rG   rH   r-   Tensor__annotations__r*   no_gradr   r@   rJ   r3   r3   r1   r4   r\   \   s   
 
r\   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr6   r5   rs   )rB   r-   r{   )r;   x1x2r3   r3   r4   rotate_half   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr|   r}   r~   unsqueeze_dimq_embedk_embedr3   r3   r4   apply_rotary_pos_emb   s
   

r   r[   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rB   rv   reshape)r[   r   batchnum_key_value_headsslenhead_dimr3   r3   r4   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskrY   scalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d urM|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r5   r   r6   )r&   ru   )ptrainingr!   )r   r   num_key_value_groupsr-   matmulrz   tanhrB   r+   
functionalsoftmaxfloat32rw   ru   rY   r   
contiguous)r   r   r   r   r   rY   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr3   r3   r4   eager_attention_forward   s"   

&r   c                          e Zd ZdZdedef fddZedddd		
	
ddej	de
ej	ej	f deej	 dee deej dee de
ej	eej	 ee
ej	  f fddZ  ZS )T5GemmaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperrN   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
d | _| jj| _|j| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jj| _|j| dkrz|j| _d S d | _d S )Nr   r   rL   sliding_attention)r)   r*   rN   r   getattrrO   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr+   rQ   attention_biasq_projk_projv_projo_projattn_logit_softcappinglayer_typessliding_windowr0   rN   r   r1   r3   r4   r*      s,   


$zT5GemmaSelfAttention.__init__past_key_valuepast_key_values4.58new_nameversionNr[   position_embeddingsr   cache_positionr   r   c                 K   ,  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jrr| jnd| j| j| jd|\}}|jg |dR   }| |}||fS Nr6   r!   r5   )r}   r|   r   eagerr   rY   r   r   r   rB   r   r   viewrz   r   r   r   updater   r   rN   _attn_implementationr   r   r   r   r   r   r   r   r   r0   r[   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r|   r}   cache_kwargsattention_interfacer   r   r3   r3   r4   r@      <   



zT5GemmaSelfAttention.forwardNN)rF   rG   rH   __doc__r#   rI   r*   r   r-   r   rA   r   r   
LongTensorr   r   r@   rJ   r3   r3   r1   r4   r      s*    r   c                       s   e Zd ZdZdedef fddZedddd		
ddej	de
ej	 de
ej	 de
e dee deej	e
ej	 e
eej	  f fddZ  ZS )T5GemmaCrossAttentionr   rN   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
d | _| jj| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jj| _|jd u rutdd S )Nr   r   FrL   zBCross-attention needs cross_attention_hidden_size to be specified.)r)   r*   rN   r   r   rO   r   r   r   r   r   r   r   r   r+   rQ   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   r1   r3   r4   r*      s0   



zT5GemmaCrossAttention.__init__r   r   r   r   Nr[   r   encoder_hidden_statesr   r   c                 K   s  |d u rt d|jd d }g |d| jR }| ||dd}|d ur3|j| j}	|j	}
|d u s9|	sw|jd d }g |d| jR }| 
||dd}| ||dd}|d urv|
||| j\}}d|j| j< n|
j| j j}|
j| j j}t}| jjdkrt| jj }|| ||||f| jr| jnd| jd | jd|\}}|jg |dR   }| |}||fS )	Nz5Encoder hidden state is required for cross attention.r6   r!   r5   Tr   r   r   )r   rB   r   r   r   rz   
is_updatedrf   r   cross_attention_cacher   r   r   layerskeysvaluesr   rN   r   r   r   r   r   r   r   r   r   )r0   r[   r   r   r   r   r   r   r   r   curr_past_key_valueencoder_input_shapeencoder_hidden_shaper   r   r   r   r   r3   r3   r4   r@   <  sN   	


zT5GemmaCrossAttention.forwardr(   )rF   rG   rH   r   r#   rI   r*   r   r-   r   r   r   r   r   rA   r@   rJ   r3   r3   r1   r4   r     s$    r   c                       sl   e Zd ZdZdef fddZ		ddejdeejejf de	ej d	e	ej
 d
eejf f
ddZ  ZS )T5GemmaEncoderLayerzEncoder sub-layer.r   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)rN   r   r'   )r)   r*   rO   rN   r   r   attention_typer   	self_attnr$   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrK   mlppre_feedforward_layernormpost_feedforward_layernormr+   rW   rX   rY   r   r1   r3   r4   r*   v  s   

zT5GemmaEncoderLayer.__init__Nr[   r   r   r~   r   c                 K   sz   |}|  |}| jd||||d d|\}}| |}|| | }|}| |}| |}| |}|| | }|S )N)r[   r   r   r~   r   r3   )r   r   r   rY   r   r   r   )r0   r[   r   r   r~   r   residual_r3   r3   r4   r@     s&   





zT5GemmaEncoderLayer.forwardr   )rF   rG   rH   r   rI   r*   r-   r   rA   r   r   FloatTensorr@   rJ   r3   r3   r1   r4   r   s  s     
r   c                       s   e Zd ZdZdef fddZedddd							
						ddejde	ejejf de
ej de
ej de
e de
e de
ej de
ej de
ej dejfddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                    sD   t  || t||d| _t|j|jd| _t|j|jd| _d S r   )	r)   r*   r   
cross_attnr$   rO   r   pre_cross_attn_layernormpost_cross_attn_layernormr   r1   r3   r4   r*     s   zT5GemmaDecoderLayer.__init__r   r   r   r   NFr[   r   r   r~   	use_cacher   r   encoder_attention_maskr   c
              
   K   s   |}|  |}| jd|||||d ur|jnd ||d|
\}}| |}|| | }|}| |}| jd|||	||d|
\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|S )N)r[   r   r   r~   r   r   r   )r[   r   r   r   r   r3   )r   r   self_attention_cacher   rY   r   r   r   r   r   r   )r0   r[   r   r   r~   r   r   r   r   r   r   r   r   r3   r3   r4   r@     sD   









zT5GemmaDecoderLayer.forward)NNNFNNN)rF   rG   rH   r   rI   r*   r   r-   r   rA   r   r   r	   boolr   r@   rJ   r3   r3   r1   r4   r     s@    	
r   c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r   rO   
num_labelsclassifier_dropout_ratec                    s*   t    tj|d| _t||| _d S )N)r   )r)   r*   r+   rW   rY   rQ   out_proj)r0   rO   r  r  r1   r3   r4   r*     s   
z"T5GemmaClassificationHead.__init__r[   r   c                 C   s   |  |}| |}|S r(   )rY   r  )r0   r[   r3   r3   r4   r@     s   

z!T5GemmaClassificationHead.forward)r   )rF   rG   rH   r   rI   r=   r*   r-   r   r@   rJ   r3   r3   r1   r4   r    s    r  c                       r  )T5GemmaLMHeadz.Head for language modeling (generation) tasks.FrO   
vocab_sizerM   c                    s    t    tj|||d| _d S )NrL   )r)   r*   r+   rQ   r  )r0   rO   r  rM   r1   r3   r4   r*     s   
zT5GemmaLMHead.__init__r[   r   c                 C   s   |  |}|S r(   )r  )r0   r[   logitsr3   r3   r4   r@     s   
zT5GemmaLMHead.forward)F)rF   rG   rH   r   rI   r  r*   r-   r   r@   rJ   r3   r3   r1   r4   r    s    r  c                       r   )T5GemmaAttentionr   rN   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
d | _| jj| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jj| _|j| dkry|j| _d S d | _d S )Nr   r   TrL   r   )r)   r*   rN   r   r   rO   r   r   r   r   r   r   r   r   r+   rQ   r   r   r   r   r   r   r   r   r   r1   r3   r4   r*     s,   


$zT5GemmaAttention.__init__r   r   r   r   Nr[   r   r   r   r   r   c                 K   r   r   r   r   r3   r3   r4   r@     r   zT5GemmaAttention.forwardr   )rF   rG   rH   r   r"   rI   r*   r   r-   r   rA   r   r   r   r   r   r@   rJ   r3   r3   r1   r4   r
    s*    r
  c                       sb   e Zd ZU eed< dZdZddgZdgZdZ	dZ
dZdZdZeedZ fdd	Zd
d Z  ZS )T5GemmaPreTrainedModelrN   modelTr   r   r   )r[   
attentionsc                    s   t  | | jj}t|tr=|jjjd d }|jjj	j
d|| d t|jdr9|jjd ur;|jjj	  d S d S d S t|tr_| jjs]|jjjd d }|jjj	j
d|| d d S d S d|jjv rm|jj	  d S d S )Nr   r   r   )r:   stdrM   RMSNorm)r)   _init_weightsrN   initializer_rangerd   r  r  r/   rB   datanormal_rc   rM   zero_r  tie_word_embeddingsr2   rF   )r0   r   r  scaler1   r3   r4   r  Y  s    

z$T5GemmaPreTrainedModel._init_weightsc                 C   s|   | j jj}| j jj}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r6   r!   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rN   decoderbos_token_idpad_token_idr   	new_zerosrB   clonemasked_fill_)r0   	input_idsdecoder_start_token_idr  shifted_input_idsr3   r3   r4   _shift_rightj  s   

 z#T5GemmaPreTrainedModel._shift_right)rF   rG   rH   r"   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r
  _can_record_outputsr  r   rJ   r3   r3   r1   r4   r  G  s    
 r  c              
      &   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxr   c                    s,    d u rt jdt jdS  | |f t jS )Nr3   rt   )r-   onesr  rw   r,  r-  r.  r/  r   r3   r4   
inner_mask  s   z/bidirectional_mask_function.<locals>.inner_maskrI   r  )r   r3  r3   r2  r4   bidirectional_mask_function  s   "r5  r   c              
      r+  )zH
    This creates bidirectional attention mask with sliding window.
    r,  r-  r.  r/  r   c                    s   |  |k ||  k @ S r(   r3   r1  r   r3   r4   r3    rE   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskr4  )r   r3  r3   r6  r4   *sliding_window_bidirectional_mask_function  s   "r7  	token_idsr  c                 C   sX   | dur|du rt d| |k|jtj}|S tj|jd |jd f|jtjd}|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r!   rn   ru   )r   rw   rn   r-   longr0  rB   )r8  r[   r  r   r3   r3   r4   make_default_2d_attention_mask  s   r;  c                       sr   e Zd ZeedZ fddZe				ddee	j
 dee	j dee	j
 dee	j d	ee d
efddZ  ZS )T5GemmaEncoder)r  r[   c                    s   t     j| _ j| _t j j| j| _t	 j j
d| _t d| _d| _t fddt jD | _t j| _|   d S )Nr   rN   Fc                       g | ]}t  |qS r3   )r   .0r   r=  r3   r4   
<listcomp>      z+T5GemmaEncoder.__init__.<locals>.<listcomp>)r)   r*   r  padding_idxr  r+   	EmbeddingrO   embed_tokensr$   r   normr\   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layersr   rW   rX   rY   	post_initrZ   r1   r=  r4   r*     s   zT5GemmaEncoder.__init__Nr  r   r~   inputs_embedsr   r   c                 K   sf  |d u |d uA rt d|dd  |d u r| |}tjd|jd |jd}|d u r0|d}|d u r<t||| j	j
}t| }tsm| j	|||d |d}tdi |dt|itdi |t| j	jt|dd	}|}	| |	|}
tj| j	jd
 |	jd}|	| }	| |	}	| jd | j	j D ]}||	|
||j |fi |}	q| |	}	| |	}	t|	dS )N:You must specify exactly one of input_ids or inputs_embedsr   r   r!   rn   rN   input_embedsr   r   r   r~   or_mask_function)rR  and_mask_functionfull_attentionr         ?rt   )last_hidden_stater3   )r   poprE  r-   arangerB   rn   r   r;  rN   r  rd   re   r   r5  r   r7  r   rG  tensorrO   ru   rY   r   rK  r   rF  r   )r0   r  r   r~   rM  r   r   self_attn_mask_mappingmask_kwargsr[   r   
normalizerlayer_moduler3   r3   r4   r@     sb   	

	



zT5GemmaEncoder.forwardNNNN)rF   rG   rH   r   r   r*  r*   r    r   r-   r   r   r   r   r   r   r@   rJ   r3   r3   r1   r4   r<    s.    r<  c                       s   e Zd ZeeddeeddedZ fddZe										dde
ej de
ej d	e
ej d
e
e de
ej de
e de
ej de
ej de
ej dee defddZ  ZS )T5GemmaDecoderr!   )index)r  cross_attentionsr[   c                    s8   t    t fddt jD | _|   d S )Nc                    r>  r3   )r   r?  r=  r3   r4   rA    rB  z+T5GemmaDecoder.__init__.<locals>.<listcomp>)r)   r*   r+   rI  rJ  rK  r   rL  rZ   r1   r=  r4   r*     s
   zT5GemmaDecoder.__init__Nr  r   r~   r   rM  r   r   r   r   r   r   c
                 K   s  |d u |d uA rt d|d u rt d|d u r| |}| js3|r3|d u r3tt| jdt| jd}|d u rO|d ur?| nd}tj|||j	d  |j
d}|d u rX|d}|d u rh|d u rht||| jj}t| }ts| j||||d ur{|jnd |d}tdi |tdi |d}t|	 }ts| j||	|d d d}d	tdi |d
t|	ii}|}| ||}tj| jjd |jd}|| }| |}| jd | jj D ]}|||||j ||||||d	 f	i |
}q| |}| |}t||dS )NrN  z0`encoder_hidden_states` must be given in decoderr=  r   r!   rO  rP  rT  rU  rR  rV  rt   )rW  r   r3   )r   rE  r   r	   r   rN   get_seq_lengthr-   rY  rB   rn   r   r;  r  rd   re   r   r   r   r5  rG  rZ  rO   ru   rY   r   rK  r   rF  r   )r0   r  r   r~   r   rM  r   r   r   r   r   past_seen_tokensr[  r\  cross_attn_mask_mappingr[   r   r]  r^  r3   r3   r4   r@     s   

		



zT5GemmaDecoder.forward)	NNNNNNNNN)rF   rG   rH   r   r   r   r   r*  r*   r    r   r-   r   r   r	   r   r  r   r   r   r@   rJ   r3   r3   r1   r4   r`    sN    

	
r`  c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
	
	
	
dde
ej de
ej de
ej de
ej de
ej de
ej de
e de
e de
ej de
ej de
e de
ej dee defddZ  ZS )T5GemmaModelrN   c                    s>   t  | |jstdt|j| _t|j| _|   d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r)   r*   is_encoder_decoderr   r<  encoderr`  r  rL  rZ   r1   r3   r4   r*   z  s   zT5GemmaModel.__init__c                 C   s   | j S r(   rh  rC   r3   r3   r4   get_encoder  s   zT5GemmaModel.get_encoderc                 C   
   | j  S r(   rh  get_input_embeddingsrC   r3   r3   r4   rm       
z!T5GemmaModel.get_input_embeddingsc                 C      | j |S r(   rh  set_input_embeddingsr0   new_embeddingsr3   r3   r4   rq       z!T5GemmaModel.set_input_embeddingsNr  r   r~   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   rM  decoder_inputs_embedsr   r   r   r   c                 K   s   |du r| j d||||	d|}|j}| jd||||
|||||d	|}t|j|j|ddr4|jn|jf|j|j|j|j|jdS )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        Nr  r   r~   rM  )	r  r   r~   rM  r   r   r   r   r   output_hidden_statesF)rW  r   decoder_hidden_statesdecoder_attentionsrb  encoder_last_hidden_stater   encoder_attentionsr3   )	rh  rW  r  r   r   rf   r[   r  rb  )r0   r  r   r~   ru  rv  rw  rx  r   rM  ry  r   r   r   r   decoder_outputsr3   r3   r4   r@     sF   

zT5GemmaModel.forward)NNNNNNNNNNNN)rF   rG   rH   r"   r*   rj  rm  rq  r   r   r   r-   r   r   
BoolTensorr   r	   r   r  r   r   r   r@   rJ   r3   r3   r1   r4   rf  x  s`    	
rf  c                       s   e Zd Zdef fddZdd Zdd Zee				dd	e	e
j d
e	e
j de	e
j de	e
j dee defddZ  ZS )T5GemmaEncoderModelrN   c                    s2   t  | |jrtdt|j| _|   d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r)   r*   rg  r   r<  rh  rL  rZ   r1   r3   r4   r*     s
   zT5GemmaEncoderModel.__init__c                 C   rk  r(   rl  rC   r3   r3   r4   rm    rn  z(T5GemmaEncoderModel.get_input_embeddingsc                 C   ro  r(   rp  rr  r3   r3   r4   rq    rt  z(T5GemmaEncoderModel.set_input_embeddingsNr  r   r~   rM  r   r   c                 K   s   | j d||||d|}|S )Nrz  r3   ri  )r0   r  r   r~   rM  r   rx  r3   r3   r4   r@     s   
zT5GemmaEncoderModel.forwardr_  )rF   rG   rH   r"   r*   rm  rq  r   r   r   r-   r   r   r   r   r   r   r@   rJ   r3   r3   r1   r4   r    s.    	r  c                %       sN  e Zd ZddgZddiZddgdgfiZdef fdd	Zd
d Zdd Z	dd Z
dd Zdd Zee														d*deej deej deej deej deej deej dee dee deej deej d eej d!ee d"eej d#eeejf d$ee d%eeej ef f d&d'Zd ejfd(d)Z  Z S )+T5GemmaForConditionalGenerationz!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_repr[   r	  rN   c                    sJ   d|_ t | t|| _|jj| _t|jj| j| _	d| _
|   d S )NTForMaskedLM)rg  r)   r*   rf  r  r  r  r  rO   lm_head	loss_typerL  rZ   r1   r3   r4   r*     s   

z(T5GemmaForConditionalGeneration.__init__c                 C   s   || j _d S r(   r  r  rr  r3   r3   r4   set_output_embeddings   rt  z5T5GemmaForConditionalGeneration.set_output_embeddingsc                 C      | j jS r(   r  rC   r3   r3   r4   get_output_embeddings     z5T5GemmaForConditionalGeneration.get_output_embeddingsc                 C   s(   | j jr| | jj|    d S d S r(   )rN   r  _tie_or_clone_weightsr  r  get_decoderrm  rC   r3   r3   r4   _tie_weights  s   z,T5GemmaForConditionalGeneration._tie_weightsc                 C   r  r(   )r  rh  rC   r3   r3   r4   rj    r  z+T5GemmaForConditionalGeneration.get_encoderc                 C   r  r(   )r  r  rC   r3   r3   r4   r    r  z+T5GemmaForConditionalGeneration.get_decoderNr   r  r   r~   ru  rv  rw  rx  r   rM  ry  labelsr   r   logits_to_keepr   r   c                 K   s  |dur|du r|
du r|  |}| jd|||||||||	|
||d|}|j}t|tr4t| dn|}| |dd|ddf }|  j}|j	dur]||j	 }t
|}||j	 }d}|duro| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r  r   r~   ru  rv  rw  rx  r   rM  ry  r   r   )	lossr	  r   r|  r}  rb  r~  r   r  r3   )r   r  rW  rd   rI   slicer  r  rN   final_logit_softcappingr-   r   loss_functionr  r   r   r|  r}  rb  r~  r   r  )r0   r  r   r~   ru  rv  rw  rx  r   rM  ry  r  r   r   r  r   r  r[   slice_indicesr	  decoder_configr  r3   r3   r4   r@     sP   





z'T5GemmaForConditionalGeneration.forwardc                 C   s
   |  |S r(   )r   )r0   r  r3   r3   r4   %prepare_decoder_input_ids_from_labels^  rn  zET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNr   )!rF   rG   rH   _tied_weights_keys_tp_plan_pp_planr"   r*   r  r  r  rj  r  r   r   r   r-   r   r   r  r   r	   r  r   rI   r   r   r   rA   r   r@   r  rJ   r3   r3   r1   r4   r    sx    	
Kr  c                          e Zd Zddedee f fddZdd Zdd	 Ze	e
										dd
eej deej deej deej deej deej dee deej deej deej dee defddZ  ZS ) T5GemmaForSequenceClassificationNrN   rg  c                    |   |dur||_ t | |j| _|j rt|| _nt|| _|jj}|j r*|j	j}t
|dd}t|| j|| _|   dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        Nr  皙?rg  r)   r*   r  rf  r  r  rh  rO   r  r   r  scorerL  r0   rN   rg  rO   classifier_dropoutr1   r3   r4   r*   d  s   
z)T5GemmaForSequenceClassification.__init__c                 C   rk  r(   r  rm  rC   r3   r3   r4   rm  {  rn  z5T5GemmaForSequenceClassification.get_input_embeddingsc                 C      | j | d S r(   r  rq  r0   r   r3   r3   r4   rq  ~     z5T5GemmaForSequenceClassification.set_input_embeddingsr  r   r~   ru  rv  rw  rx  rM  ry  r  r   r   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}|duru|jd }n|jd }| j jdu r|d	krtd
| j jdu rd}nE|dur|| j jk|jtj}tj|jd |jtjd}|| d}| j jr|d	7 }tj||jd d	 d}nd}t| jj d |tj||jd|f }d}|
dur| j||
|| j d}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r~   ru  rv  rw  rx  rM  ry  r   r   r~   rM  r   r!   z=Cannot handle batch sizes > 1 if no padding token is defined.r6   r9  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rO  )r	  r  pooled_logitsrN   r  r	  r[   r  )rN   rg  NotImplementedErrorr2   rF   r   r   r  rW  r|  r}  r[   r  r  rB   r  rw   rn   r-   int32rY  argmaxclamploggerwarning_oncer  r   )r0   r  r   r~   ru  rv  rw  rx  rM  ry  r  r   outputsrW  r[   r  r	  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  r  r3   r3   r4   r@     s   


z(T5GemmaForSequenceClassification.forwardr(   
NNNNNNNNNN)rF   rG   rH   r"   r   r  r*   rm  rq  r   r   r-   r   r   r   r   r   r   r   r@   rJ   r3   r3   r1   r4   r  b  sR    	
r  c                       r  )T5GemmaForTokenClassificationNrN   rg  c                    r  )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        Nr  r  r  r  r1   r3   r4   r*     s   
z&T5GemmaForTokenClassification.__init__c                 C   rk  r(   r  rC   r3   r3   r4   rm  	  rn  z2T5GemmaForTokenClassification.get_input_embeddingsc                 C   r  r(   r  r  r3   r3   r4   rq    r  z2T5GemmaForTokenClassification.set_input_embeddingsr  r   r~   ru  rv  rw  rx  rM  ry  r  r   r   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}d}|
dury| ||
| j }t||||dS )	r  Nr  r  r  Fr  r  r  )rN   rg  r  r2   rF   r   r   r  rW  r|  r}  r[   r  r  r  r   )r0   r  r   r~   ru  rv  rw  rx  rM  ry  r  r   r  rW  r[   r  r	  r  r3   r3   r4   r@     sf   

z%T5GemmaForTokenClassification.forwardr(   r  )rF   rG   rH   r"   r   r  r*   rm  rq  r   r   r-   r   r   r   r   r   r   r   r@   rJ   r3   r3   r1   r4   r    sR    	
r  )r  rf  r  r  r  r  )Nr!   )r   NN)Utypingr   r   r   r-   torch.nnr+   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r    configuration_t5gemmar"   r#   
get_loggerrF   r  Moduler$   rK   r\   r   r   r   rI   r   r=   rA   r   r   r   r   r   r  r  r
  r  r5  r7  r   r;  r<  r`  rf  r  r  r  r  __all__r3   r3   r3   r4   <module>   s    
$


#LV4;K=
]mR$r r