o
    wi                     @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ e(,e-Z.G dd dej/Z0G dd dej/Z1G dd dej/Z2dd Z3dQddZ4dej5de6dej5fddZ7	 		dRd!ej/d"ej5d#ej5d$ej5d%eej5 d&e8d'ee8 d(ee8 de9ej5ej5f fd)d*Z:G d+d, d,ej/Z;G d-d. d.ej/Z<G d/d0 d0eZ=G d1d2 d2e=Z>G d3d4 d4ej/Z?G d5d6 d6ej/Z@e&G d7d8 d8e"ZAd%eej5 defd9d:ZBd;e6defd<d=ZCd>eejD dej5d?ee6 dej5fd@dAZEG dBdC dCeAZFG dDdE dEeFZGe&G dFdG dGeAZHe&G dHdI dIeAZIG dJdK dKeAeZJe&G dLdM dMeAZKe&G dNdO dOeAZLg dPZMdS )S    )CallableOptionalUnionN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )T5GemmaConfigT5GemmaModuleConfigc                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )T5GemmaRMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r#   nn	Parametertorchzerosweight)selfr"   r#   	__class__ i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr&   4   s   
zT5GemmaRMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r)   rsqrtpowmeanr#   )r,   xr/   r/   r0   _norm9   s   $zT5GemmaRMSNorm._normc                 C   s*   |  | }|d| j   }||S )N      ?)r8   floatr+   type_as)r,   r7   outputr/   r/   r0   forward<   s   
zT5GemmaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler+   shaper#   r,   r/   r/   r0   
extra_reprC      zT5GemmaRMSNorm.extra_repr)r!   )
__name__
__module____qualname__intr:   r&   r8   r=   rA   __classcell__r/   r/   r-   r0   r    3   s
    r    c                       s$   e Zd Z fddZdd Z  ZS )
T5GemmaMLPc                    s   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _t|j| _d S )NFbias)r%   r&   confighidden_sizeintermediate_sizer'   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr,   rK   r-   r/   r0   r&   H   s   
zT5GemmaMLP.__init__c                 C   s2   |  | || | }| |}| |}|S r$   )rS   rO   rP   rV   rQ   )r,   r7   hidden_statesrQ   r/   r/   r0   r=   S   s   

zT5GemmaMLP.forward)rC   rD   rE   r&   r=   rG   r/   r/   r-   r0   rH   G   s    rH   c                       s2   e Zd Zd fdd	Ze edd Z  ZS )T5GemmaRotaryEmbeddingNc                    s   t    t|dr|jd ur|jd|jd| _nd| _|j| _|j| _|| _	t
| j | _| | j	|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r%   r&   hasattrrZ   getr[   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrK   r   rope_init_fnattention_scalingregister_bufferr^   original_inv_freq)r,   rK   devicer^   r-   r/   r0   r&   [   s   
zT5GemmaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	j
|dd+ | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 smw   Y  |j|jd
|	j|jd
fS )Nr   r2   r   mpscpuF)device_typeenabledr1   r"   dtype)r^   r:   expandr?   tori   
isinstancer\   strr)   autocast	transposecatcosrf   sinrp   )
r,   r7   position_idsinv_freq_expandedposition_ids_expandedrl   freqsembrx   ry   r/   r/   r0   r=   l   s   0&zT5GemmaRotaryEmbedding.forwardr$   )	rC   rD   rE   r&   r)   no_gradr   r=   rG   r/   r/   r-   r0   rY   Z   s
    rY   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr2   r1   rn   )r?   r)   rw   )r7   x1x2r/   r/   r0   rotate_half|   s   r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkrx   ry   rz   unsqueeze_dimq_embedk_embedr/   r/   r0   apply_rotary_pos_emb   s
   

r   rX   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r?   rq   reshape)rX   r   batchnum_key_value_headsslenhead_dimr/   r/   r0   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskrV   scalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d urM|d d d d d d d |	jd f }|| }tj	j
|dtjd|j}tj	j||| jd}t||
}|dd }||fS )	N      r1   r   r2   )r"   rp   )ptrainingr   )r   r   num_key_value_groupsr)   matmulrv   tanhr?   r'   
functionalsoftmaxfloat32rr   rp   rV   r   
contiguous)r   r   r   r   r   rV   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr/   r/   r0   eager_attention_forward   s"   

&r   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	e
ej d
e
e de
ej dee de	eje
ej e
e	ej  f fddZ  ZS )T5GemmaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperrK   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
d | _| jj| _|j| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jj| _|j| dkrz|j| _d S d | _d S )Nr   r   rI   sliding_attention)r%   r&   rK   r   getattrrL   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr'   rN   attention_biasq_projk_projv_projo_projattn_logit_softcappinglayer_typessliding_windowr,   rK   r   r-   r/   r0   r&      s,   


$zT5GemmaSelfAttention.__init__NrX   position_embeddingsr   past_key_valuecache_positionr   r   c                 K   s,  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jrr| jnd| j| j| jd|\}}|jg |dR   }| |}||fS )Nr2   r   r1   )ry   rx   r   eagerr   rV   r   r   r   )r?   r   r   viewrv   r   r   r   updater   r   rK   _attn_implementationr   r   r   r   r   r   r   r   r   )r,   rX   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rx   ry   cache_kwargsattention_interfacer   r   r/   r/   r0   r=      s<   	


zT5GemmaSelfAttention.forward)NN)rC   rD   rE   __doc__r   rF   r&   r)   Tensorr>   r   r   
LongTensorr   r   r=   rG   r/   r/   r-   r0   r      s(    r   c                       s   e Zd ZdZdedef fddZ	ddejde	ej d	e	ej d
e	e
 dee deeje	ej e	eej  f fddZ  ZS )T5GemmaCrossAttentionr   rK   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
d | _| jj| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jj| _|jd u rutdd S )Nr   r   FrI   zBCross-attention needs cross_attention_hidden_size to be specified.)r%   r&   rK   r   r   rL   r   r   r   r   r   r   r   r   r'   rN   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   r-   r/   r0   r&     s0   



zT5GemmaCrossAttention.__init__NrX   r   encoder_hidden_statesr   r   r   c                 K   s  |d u rt d|jd d }g |d| jR }| ||dd}|d ur3|j| j}	|j	}
|d u s9|	sw|jd d }g |d| jR }| 
||dd}| ||dd}|d urv|
||| j\}}d|j| j< n|
j| j }|
j| j }t}| jjdkr| jjdkr|dd	rtd
 nt| jj }|| ||||f| jr| jnd| jd | jd|\}}|jg |dR   }| |}||fS )Nz5Encoder hidden state is required for cross attention.r2   r   r1   Tr   sdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   r   )r   r?   r   r   r   rv   
is_updatedra   r   cross_attention_cacher   r   r   	key_cachevalue_cacher   rK   r   loggerwarning_oncer   r   r   r   r   r   r   r   )r,   rX   r   r   r   r   r   r   r   r   curr_past_key_valueencoder_input_shapeencoder_hidden_shaper   r   r   r   r   r/   r/   r0   r=   9  sZ   


zT5GemmaCrossAttention.forwardr$   )rC   rD   rE   r   r   rF   r&   r)   r   r   r   r   r   r>   r=   rG   r/   r/   r-   r0   r     s"    #r   c                       s   e Zd ZdZdef fddZ			ddejdeejejf d	e	ej d
e	ej
 de	e deeje	eejejf  f fddZ  ZS )T5GemmaEncoderLayerzEncoder sub-layer.r   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)rK   r   r#   )r%   r&   rL   rK   r   r   attention_typer   	self_attnr    rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrH   mlppre_feedforward_layernormpost_feedforward_layernormr'   rT   rU   rV   r   r-   r/   r0   r&     s   

zT5GemmaEncoderLayer.__init__NFrX   r   r   rz   r   r   c           
   
   K   s   |}|  |}| jd|||||dd d|\}}| |}|| | }|}| |}| |}| |}|| | }|f}	|rG|	|f7 }	|	S )NF)rX   r   r   rz   r   	use_cacher   r/   )r   r   r   rV   r   r   r   )
r,   rX   r   r   rz   r   r   residualself_attn_weightsoutputsr/   r/   r0   r=     s0   
	





zT5GemmaEncoderLayer.forward)NNF)rC   rD   rE   r   rF   r&   r)   r   r>   r   r   boolFloatTensorr=   rG   r/   r/   r-   r0   r     s.    r   c                       s   e Zd ZdZdef fddZ								ddejdeejejf d	e	ej d
e	ej
 de	e de	e de	e de	ej
 de	ej de	ej deeje	eejejf  e	eejejf  f fddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                    sD   t  || t||d| _t|j|jd| _t|j|jd| _d S r   )	r%   r&   r   
cross_attnr    rL   r   pre_cross_attn_layernormpost_cross_attn_layernormr   r-   r/   r0   r&     s   zT5GemmaDecoderLayer.__init__NFrX   r   r   rz   r   r   r   r   r   encoder_attention_maskr   c                 K   s   |}|  |}| jd|||||d ur|jnd |||d|\}}| |}|| | }|}| |}| jd||	|
|||d|\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|f}|rt|||f7 }|S )N)rX   r   r   rz   r   r   r   r   )rX   r   r   r   r   r   r/   )r   r   self_attention_cacher   rV   r   r   r   r   r   r   )r,   rX   r   r   rz   r   r   r   r   r   r   r   r   r   cross_attn_weightsr   r/   r/   r0   r=     sN   
	




	


zT5GemmaDecoderLayer.forward)NNNFFNNN)rC   rD   rE   r   rF   r&   r)   r   r>   r   r   r	   r   r   r=   rG   r/   r/   r-   r0   r     sN    	
r   c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r   rL   
num_labelsclassifier_dropout_ratec                    s*   t    tj|d| _t||| _d S )N)r   )r%   r&   r'   rT   rV   rN   out_proj)r,   rL   r   r   r-   r/   r0   r&     s   
z"T5GemmaClassificationHead.__init__rX   r   c                 C   s   |  |}| |}|S r$   )rV   r   )r,   rX   r/   r/   r0   r=     s   

z!T5GemmaClassificationHead.forward)r   )rC   rD   rE   r   rF   r:   r&   r)   r   r=   rG   r/   r/   r-   r0   r     s    r   c                       r   )T5GemmaLMHeadz.Head for language modeling (generation) tasks.FrL   
vocab_sizerJ   c                    s    t    tj|||d| _d S )NrI   )r%   r&   r'   rN   r   )r,   rL   r   rJ   r-   r/   r0   r&   $  s   
zT5GemmaLMHead.__init__rX   r   c                 C   s   |  |}|S r$   )r   )r,   rX   logitsr/   r/   r0   r=   (  s   
zT5GemmaLMHead.forward)F)rC   rD   rE   r   rF   r   r&   r)   r   r=   rG   r/   r/   r-   r0   r   !  s    r   c                   @   sT   e Zd ZeZdZdZdgZdgZdZ	dZ
dZdZdZdZdZdZdd Zdd Zd	S )
T5GemmaPreTrainedModelmodelTT5GemmaBlockpast_key_valuesc                 C   sP  | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rC|jjjd|d |jd urA|jj|j 	  d S d S t|trQ|jjd d S t|tr|jjjd d }|jjjjd|| d t|jdr|jjd ur|jjj	  d S d S d S t|tr| j js|jjjd d }|jjjjd|| d d S d S d S )Nr   )r6   stdr9   r   r   rJ   )rK   initializer_rangers   r'   rN   r+   datanormal_rJ   zero_	Embeddingpadding_idxr    fill_r   r   r?   r`   r   tie_word_embeddings)r,   r   r  scaler/   r/   r0   _init_weights=  s2   




z$T5GemmaPreTrainedModel._init_weightsc                 C   s|   | j jj}| j jj}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r2   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rK   decoderbos_token_idpad_token_idr   	new_zerosr?   clonemasked_fill_)r,   	input_idsdecoder_start_token_idr  shifted_input_idsr/   r/   r0   _shift_rightT  s   

 z#T5GemmaPreTrainedModel._shift_rightN)rC   rD   rE   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_3_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_attention_backendr  r  r/   r/   r/   r0   r   -  s     r   c              
      &   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxr   c                    s,    d u rt jdt jdS  | |f t jS )Nr/   ro   )r)   onesr   rr   r%  r&  r'  r(  r   r/   r0   
inner_maskt  s   z/bidirectional_mask_function.<locals>.inner_maskrF   r   )r   r,  r/   r+  r0   bidirectional_mask_functiono  s   "r.  r   c              
      r$  )zH
    This creates bidirectional attention mask with sliding window.
    r%  r&  r'  r(  r   c                    s   |  |k ||  k @ S r$   r/   r*  r   r/   r0   r,    rB   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskr-  )r   r,  r/   r/  r0   *sliding_window_bidirectional_mask_function~  s   "r0  	token_idsr  c                 C   sX   | dur|du rt d| |k|jtj}|S tj|jd |jd f|jtjd}|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r   ri   rp   )r   rr   ri   r)   longr)  r?   )r1  rX   r  r   r/   r/   r0   make_default_2d_attention_mask  s   r4  c                       s   e Zd Z fddZdd Zdd Ze						ddeej	 d	eej
 d
eej	 deej dee dee dee defddZ  ZS )T5GemmaEncoderc                    s   t     j| _ j| _t j j| j| _t	 j j
d| _t d| _d| _t fddt jD | _t j| _|   d S )Nr   rK   Fc                       g | ]}t  |qS r/   )r   .0r   r6  r/   r0   
<listcomp>      z+T5GemmaEncoder.__init__.<locals>.<listcomp>)r%   r&   r  r  r   r'   r  rL   embed_tokensr    r   normrY   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layerslayersrT   rU   rV   	post_initrW   r-   r6  r0   r&     s   zT5GemmaEncoder.__init__c                 C      | j S r$   r<  r@   r/   r/   r0   get_input_embeddings     z#T5GemmaEncoder.get_input_embeddingsc                 C   s
   || _ d S r$   rF  r,   r   r/   r/   r0   set_input_embeddings     
z#T5GemmaEncoder.set_input_embeddingsNr  r   rz   inputs_embedsr   output_hidden_statesflash_attn_kwargsr   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d u |d uA r td|d u r)| |}tjd|jd |jd}|d u r>|	d}|d u rJt
||| j j}t| }	ts{| j |||d |d}
tdi |
dt|itdi |
t| j jt|dd}	|}| ||}tj| j jd	 |jd
}|| }|rdnd }|rdnd }| |}| jd | j j D ]%}|r||f7 }||||	|j ||fi |}|d }|r||d f7 }q| |}| |}|r||f7 }t|||dS )N:You must specify exactly one of input_ids or inputs_embedsr   r   ri   rK   input_embedsr   r   r  rz   or_mask_function)rS  and_mask_functionfull_attentionr         ?ro   r/   )last_hidden_staterX   
attentions)rK   r   rM  r   r<  r)   aranger?   ri   r   r4  r  rs   dictr   r.  r   r0  r   r>  tensorrL   rp   rV   rC  rB  r   r=  r   )r,   r  r   rz   rL  r   rM  rN  r   self_attn_mask_mappingmask_kwargsrX   r   
normalizerall_hidden_statesall_self_attnslayer_modulelayer_outputsr/   r/   r0   r=     s   





	


zT5GemmaEncoder.forwardNNNNNN)rC   rD   rE   r&   rG  rJ  r   r   r)   r   r   r   r   r   r   r   r=   rG   r/   r/   r-   r0   r5    s8    	r5  c                       s   e Zd Z fddZe											ddeej deej deej dee	 deej
 d	ee d
ee dee deej deej deej dee defddZ  ZS )T5GemmaDecoderc                    s8   t    t fddt jD | _|   d S )Nc                    r7  r/   )r   r8  r6  r/   r0   r:  #  r;  z+T5GemmaDecoder.__init__.<locals>.<listcomp>)r%   r&   r'   r@  rA  rB  rC  rD  rW   r-   r6  r0   r&     s
   zT5GemmaDecoder.__init__Nr  r   rz   r  rL  r   r   rM  r   r   r   rN  r   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}|
d u rAtd|d u rJ| 	|}| js[|r[|d u r[t
t t d}|	d u rw|d urg| nd}tj|||jd  |jd}	|d u r|	d}|d u r|d u rt||| j j}t| }ts| j |||	|d ur|jnd |d	}tdi |tdi |d
}t| }ts| j |
||	d d d	}dtdi |dt|ii}|}| ||}tj| j jd |jd}|| }|rdnd }|rdnd }|rdnd }| |}| jd | j j  D ]6}|r||f7 }|||||j! |||||	|
|d f
i |}|d }|rD||d f7 }||d f7 }q| "|}| |}|rX||f7 }t#|||||dS )NrO  zX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz0`encoder_hidden_states` must be given in decoder)r   r   r   r   rP  rQ  rU  rV  rS  rW  ro   r/   r1   )rX  r  rX   rY  cross_attentions)$rK   r   rM  r   r   r?  r   r   r   r<  r	   r   get_seq_lengthr)   rZ  r?   ri   r   r4  r  rs   r[  r   r   r   r.  r>  r\  rL   rp   rV   rC  rB  r   r=  r   )r,   r  r   rz   r  rL  r   r   rM  r   r   r   rN  past_seen_tokensr]  r^  cross_attn_mask_mappingrX   r   r_  r`  ra  all_cross_attnsrb  rc  r/   r/   r0   r=   )  s   


	




zT5GemmaDecoder.forward)NNNNNNNNNNN)rC   rD   rE   r&   r   r   r)   r   r   r	   r   r   r   r   r   r=   rG   r/   r/   r-   r0   re    sR    
	
re  c                #       s   e Zd Zdef fddZdd Zdd Zdd	 Zd
d Ze	e
														ddeej deej deej deej deej deej dee dee deej deej dee dee dee deej dee def ddZ  ZS ) T5GemmaModelrK   c                    s>   t  | |jstdt|j| _t|j| _|   d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r%   r&   is_encoder_decoderr   r5  encoderre  r  rD  rW   r-   r/   r0   r&     s   zT5GemmaModel.__init__c                 C   rE  r$   rm  r@   r/   r/   r0   get_encoder  rH  zT5GemmaModel.get_encoderc                 C   rE  r$   )r  r@   r/   r/   r0   get_decoder  rH  zT5GemmaModel.get_decoderc                 C   
   | j  S r$   rm  rG  r@   r/   r/   r0   rG    rK  z!T5GemmaModel.get_input_embeddingsc                 C      | j |S r$   rm  rJ  r,   new_embeddingsr/   r/   r0   rJ       z!T5GemmaModel.set_input_embeddingsNr  r   rz   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr  rL  decoder_inputs_embedsr   r   rM  r   rN  r   c                 K   s   |dur|n| j j}|du r| jd||||	||d|}|j}| jd||||
|||||||d|}t|j|j|j|j|j	|j|j|jdS )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

        **flash_attn_kwargs: flash attention related parameters.
        Nr  r   rz   rL  r   rM  )r  r   rz   rL  r  r   r   r   r   rM  r   )rX  r  decoder_hidden_statesdecoder_attentionsrf  encoder_last_hidden_stater   encoder_attentionsr/   )
rK   r   rm  rX  r  r   r  rX   rY  rf  )r,   r  r   rz   rx  ry  rz  r{  r  rL  r|  r   r   rM  r   rN  r   decoder_outputsr/   r/   r0   r=     sL   
zT5GemmaModel.forward)NNNNNNNNNNNNNN)rC   rD   rE   r   r&   ro  rp  rG  rJ  r   r   r   r)   r   r   
BoolTensorr   r	   r   r   r   r   r   r=   rG   r/   r/   r-   r0   rk    sn    	rk  c                       s   e Zd Zdef fddZdd Zdd Zee						dd	e	e
j d
e	e
j de	e
j de	e
j de	e de	e dee defddZ  ZS )T5GemmaEncoderModelrK   c                    s2   t  | |jrtdt|j| _|   d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r%   r&   rl  r   r5  rm  rD  rW   r-   r/   r0   r&   '  s
   zT5GemmaEncoderModel.__init__c                 C   rq  r$   rr  r@   r/   r/   r0   rG  0  rK  z(T5GemmaEncoderModel.get_input_embeddingsc                 C   rs  r$   rt  ru  r/   r/   r0   rJ  3  rw  z(T5GemmaEncoderModel.set_input_embeddingsNr  r   rz   rL  r   rM  rN  r   c           	   	   K   s"   | j d||||||d|}|S )zJ
        **flash_attn_kwargs: flash attention related parameters.
        r}  Nr/   rn  )	r,   r  r   rz   rL  r   rM  rN  r{  r/   r/   r0   r=   6  s   	zT5GemmaEncoderModel.forwardrd  )rC   rD   rE   r   r&   rG  rJ  r   r   r   r)   r   r   r   r   r   r   r   r=   rG   r/   r/   r-   r0   r  %  s:    		r  c                '       sZ  e Zd ZddgZddiZddgdgfiZdef fdd	Zd
d Zdd Z	dd Z
dd Zdd Zee																d+deej deej deej deej deej deej dee dee deej deej d eej d!ee d"ee d#ee d$eej d%eeejf d&eeej ef f"d'd(Zd ejfd)d*Z  ZS ),T5GemmaForConditionalGenerationz!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_reprX   r   rK   c                    sJ   d|_ t | t|| _|jj| _t|jj| j| _	d| _
|   d S )NTForMaskedLMLoss)rl  r%   r&   rk  r   r  r   r   rL   lm_head	loss_typerD  rW   r-   r/   r0   r&   W  s   

z(T5GemmaForConditionalGeneration.__init__c                 C   s   || j _d S r$   r  r   ru  r/   r/   r0   set_output_embeddingsb  rw  z5T5GemmaForConditionalGeneration.set_output_embeddingsc                 C      | j jS r$   r  r@   r/   r/   r0   get_output_embeddingse     z5T5GemmaForConditionalGeneration.get_output_embeddingsc                 C   s(   | j jr| | jj|    d S d S r$   )rK   r
  _tie_or_clone_weightsr  r   rp  rG  r@   r/   r/   r0   _tie_weightsh  s   z,T5GemmaForConditionalGeneration._tie_weightsc                 C   r  r$   )r   rm  r@   r/   r/   r0   ro  m  r  z+T5GemmaForConditionalGeneration.get_encoderc                 C   r  r$   )r   r  r@   r/   r/   r0   rp  p  r  z+T5GemmaForConditionalGeneration.get_decoderNr   r  r   rz   rx  ry  rz  r{  r  rL  r|  labelsr   r   rM  r   logits_to_keepr   c                 K   s2  | j r| jjdkrtd| jj d |dur%|du r%|
du r%| |}| jd|||||||||	|
||||d|}|j}t|t	rJt
| dn|}| |dd|ddf }|  j}|jdurs||j }t|}||j }d}|dur| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        r   ziIt is strongly recommended to train T5Gemma models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)r  r   rz   rx  ry  rz  r{  r  rL  r|  r   r   rM  r   )	lossr   r  r~  r  rf  r  r   r  r/   )r   rK   r   r   r   r  r   rX  rs   rF   slicer  rp  final_logit_softcappingr)   r   loss_functionr   r   r  r~  r  rf  r  r   r  )r,   r  r   rz   rx  ry  rz  r{  r  rL  r|  r  r   r   rM  r   r  loss_kwargsr  rX   slice_indicesr   decoder_configr  r/   r/   r0   r=   s  s`   #





z'T5GemmaForConditionalGeneration.forwardc                 C   s
   |  |S r$   )r  )r,   r  r/   r/   r0   %prepare_decoder_input_ids_from_labels  rK  zET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNNNr   )rC   rD   rE   _tied_weights_keys_tp_plan_pp_planr   r&   r  r  r  ro  rp  r   r   r   r)   r   r   r  r   r	   r   r   rF   r   r>   r   r=   r  rG   r/   r/   r-   r0   r  R  s    	Xr  c                          e Zd Zddedee f fddZdd Zdd	 Ze	e
												dd
eej deej deej deej deej deej dee deej deej deej dee dee defddZ  ZS ) T5GemmaForSequenceClassificationNrK   rl  c                    |   |dur||_ t | |j| _|j rt|| _nt|| _|jj}|j r*|j	j}t
|dd}t|| j|| _|   dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        Nr   皙?rl  r%   r&   r   rk  r   r  rm  rL   r  r   r   scorerD  r,   rK   rl  rL   classifier_dropoutr-   r/   r0   r&     s   
z)T5GemmaForSequenceClassification.__init__c                 C   rq  r$   r   rG  r@   r/   r/   r0   rG    rK  z5T5GemmaForSequenceClassification.get_input_embeddingsc                 C      | j | d S r$   r   rJ  rI  r/   r/   r0   rJ       z5T5GemmaForSequenceClassification.set_input_embeddingsr  r   rz   rx  ry  rz  r{  rL  r|  r  r   rM  r   c                 C   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrN| j|||||||||	d||d}|j}|j	}|j
}n| j||||||d}|j}|j}|j}| |}|durq|jd }n|jd }| j jdu r|d	krtd
| j jdu rd}nE|dur|| j jk|jtj}tj|jd |jtjd}|| d}| j jr|d	7 }tj||jd d	 d}nd}t| jj d |tj||jd|f }d}|
dur| j||
|| j d}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.Fr   rz   rx  ry  rz  r{  rL  r|  r   r   rM  r   rz   rL  r   rM  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r2   r2  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rP  )r   r  pooled_logitsrK   r  r   rX   rY  )rK   rl  NotImplementedErrorr.   rC   r   r  r   rX  r~  r  rX   rY  r  r?   r  rr   ri   r)   int32rZ  argmaxclampr   r   r  r   )r,   r  r   rz   rx  ry  rz  r{  rL  r|  r  r   rM  r   rX  rX   rY  r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  r  r/   r/   r0   r=     s   


z(T5GemmaForSequenceClassification.forwardr$   NNNNNNNNNNNN)rC   rD   rE   r   r   r   r&   rG  rJ  r   r   r)   r   r   r   r   r   r=   rG   r/   r/   r-   r0   r    sZ    	r  c                       r  )T5GemmaForTokenClassificationNrK   rl  c                    r  )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        Nr   r  r  r  r-   r/   r0   r&   g  s   
z&T5GemmaForTokenClassification.__init__c                 C   rq  r$   r  r@   r/   r/   r0   rG    rK  z2T5GemmaForTokenClassification.get_input_embeddingsc                 C   r  r$   r  rI  r/   r/   r0   rJ    r  z2T5GemmaForTokenClassification.set_input_embeddingsr  r   rz   rx  ry  rz  r{  rL  r|  r  r   rM  r   c                 C   s   | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrN| j|||||||||	d||d}|j}|j	}|j
}n| j||||||d}|j}|j}|j}| |}d}|
duru| ||
| j }t||||dS )	r  Nr  r  r  Fr  r  r  )rK   rl  r  r.   rC   r   r  r   rX  r~  r  rX   rY  r  r  r   )r,   r  r   rz   rx  ry  rz  r{  rL  r|  r  r   rM  r   rX  rX   rY  r   r  r/   r/   r0   r=     sb   

z%T5GemmaForTokenClassification.forwardr$   r  )rC   rD   rE   r   r   r   r&   rG  rJ  r   r   r)   r   r   r   r   r   r=   rG   r/   r/   r-   r0   r  e  sZ    	r  )r  rk  r  r   r  r  )Nr   )r   NN)Ntypingr   r   r   r)   torch.nnr'   activationsr   cache_utilsr   r   r	   
generationr
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_t5gemmar   r   
get_loggerrC   r   Moduler    rH   rY   r   r   r   rF   r   r:   r>   r   r   r   r   r   r   r   r   r.  r0  r   r4  r5  re  rk  r  r  r  r  __all__r/   r/   r/   r0   <module>   s    
"


#KjFKA
  %b, y