o
    ei1                     @   sh  d dl mZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8 e/9e:Z;G dd dej<Z=G dd dej<Z>G dd dej<Z?dd Z@eddWdd ZAd!ejBd"eCd#ejBfd$d%ZD	&		dXd'ej<d(ejBd)ejBd*ejBd+ejBdB d,eEd-eEdB d.eEdB d#eFejBejBf fd/d0ZGeeAG d1d2 d2ej<ZHeeAG d3d4 d4ej<ZIG d5d6 d6eZJG d7d8 d8eZKG d9d: d:ej<ZLG d;d< d<ej<ZMe-G d=d> d>e(ZNd+ejBdB d#efd?d@ZOdAeCd#efdBdCZPdDejQdB d!ejBdEeCdB d#ejBfdFdGZRG dHdI dIeNZSG dJdK dKeNZTe-G dLdM dMeNZUe-G dNdO dOeNZVG dPdQ dQeNeZWe-G dRdS dSeNZXe-G dTdU dUeNZYg dVZZdS )Y    )Callable)OptionalN   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )T5GemmaConfigT5GemmaModuleConfigc                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )T5GemmaRMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r*   nn	Parametertorchzerosweight)selfr)   r*   	__class__ j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr-   8   s   
zT5GemmaRMSNorm.__init__c                 C   s$   |t |djddd| j  S )N   T)keepdim)r0   rsqrtpowmeanr*   )r3   xr6   r6   r7   _norm=   s   $zT5GemmaRMSNorm._normc                 C   s*   |  | }|d| j   }||S )N      ?)r?   floatr2   type_as)r3   r>   outputr6   r6   r7   forward@   s   
zT5GemmaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler2   shaper*   r3   r6   r6   r7   
extra_reprG      zT5GemmaRMSNorm.extra_repr)r(   )
__name__
__module____qualname__intrA   r-   r?   rD   rH   __classcell__r6   r6   r4   r7   r'   7   s
    r'   c                       s$   e Zd Z fddZdd Z  ZS )
T5GemmaMLPc                    s   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _t|j| _d S )NFbias)r,   r-   confighidden_sizeintermediate_sizer.   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr3   rR   r4   r6   r7   r-   L   s   
zT5GemmaMLP.__init__c                 C   s2   |  | || | }| |}| |}|S r+   )rZ   rV   rW   r]   rX   )r3   r>   hidden_statesrX   r6   r6   r7   rD   W   s   

zT5GemmaMLP.forward)rJ   rK   rL   r-   rD   rN   r6   r6   r4   r7   rO   K   s    rO   c                       s~   e Zd ZU ejed< ddef fddZe			ddedB de	d de
dB d	ed
ef fddZe edd Z  ZS )T5GemmaRotaryEmbeddinginv_freqNrR   c                    s   t    |j| _|j| _|| _| jjd | _| j}| jdkr$t	| j }|| j|\}| _
| jd|dd | jd| dd d S )N	rope_typedefaultra   F)
persistentoriginal_inv_freq)r,   r-   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrR   rope_parametersrb   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r3   rR   devicerope_init_fnra   r4   r6   r7   r-   a   s   


zT5GemmaRotaryEmbedding.__init__rn   ztorch.deviceseq_lenreturnztorch.Tensorc                 C   sZ   | j d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNr@   r   r8   dtypern   ru   )	ri   getattrrS   num_attention_headsr0   arangeint64torA   )rR   rn   rp   baser)   attention_factorra   r6   r6   r7   rj   q   s   
&z6T5GemmaRotaryEmbedding.compute_default_rope_parametersc           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtr6|jjdkr6|jjnd}t	|dd+ | |  
dd}tj||fdd	}| | j }| | j }	W d    n1 slw   Y  |j|jd
|	j|jd
fS )Nr   r9   r$   mpscpuF)device_typeenabledr8   r)   rt   )ra   rA   expandrF   r{   rn   
isinstancetypestrr    	transposer0   catcosrk   sinru   )
r3   r>   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   r6   r6   r7   rD      s   0&zT5GemmaRotaryEmbedding.forwardr+   NNN)rJ   rK   rL   r0   Tensor__annotations__r%   r-   staticmethodr   rM   rE   rA   rj   no_gradr   rD   rN   r6   r6   r4   r7   r`   ^   s&   
 

r`   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr9   r8   r   )rF   r0   r   )r>   x1x2r6   r6   r7   rotate_half   s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embedr6   r6   r7   apply_rotary_pos_emb   s
   

r   r_   n_reprq   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r$   N)rF   r   reshape)r_   r   batchnum_key_value_headsslenrs   r6   r6   r7   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskr]   scalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d ur:|| }tjj	|dtj
d|j}tjj||| jd}t||
}|dd }||fS )N      r8   r   r9   )r)   ru   )ptrainingr$   )rs   r   num_key_value_groupsr0   matmulr   tanhr.   
functionalsoftmaxfloat32r{   ru   r]   r   
contiguous)r   r   r   r   r   r]   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputr6   r6   r7   eager_attention_forward   s    

r   c                       s   e Zd ZdZdedef fddZ				ddejde	ejejf dB d	ejdB d
e
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )T5GemmaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperrR   	layer_idxc                    s  t    t|dr|j| nd | _|| _|| _t|d|j|j	 | _
|j	|j | _|jd | _| jj| _|j| _tj|j|j	| j
 |jd| _tj|j|j| j
 |jd| _tj|j|j| j
 |jd| _tj|j	| j
 |j|jd| _| jj| _| jdkr|j| _d S d | _d S )Nlayer_typesrs   r   rP   sliding_attention)r,   r-   hasattrr   
layer_typerR   r   rw   rS   rx   rs   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr.   rU   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowr3   rR   r   r4   r6   r7   r-      s.   


 zT5GemmaSelfAttention.__init__Nr_   position_embeddingsr   past_key_valuescache_positionr   rq   c                 K   s   |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jrl| jnd| j| j| jd|\}}|jg |dR   }| |}||fS )Nr9   r$   r8   )r   r   r   r   r]   r   r   r   )rF   rs   r   viewr   r   r   r   updater   r   get_interfacerR   _attn_implementationr   r   r   r   r   r   r   r   r   )r3   r_   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r6   r6   r7   rD     s<   	


zT5GemmaSelfAttention.forwardNNNN)rJ   rK   rL   __doc__r&   rM   r-   r0   r   rE   r   
LongTensorr   r   rD   rN   r6   r6   r4   r7   r      s,    r   c                       s   e Zd ZdZdedef fddZ	ddejdejdB d	ejdB d
e	dB de
e deejejdB eej dB f fddZ  ZS )T5GemmaCrossAttentionr   rR   r   c                    s   t    || _|| _t|d|j|j | _|j|j | _	|j
d | _| jj| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _| jj| _|jd u rutdd S )Nrs   r   FrP   zBCross-attention needs cross_attention_hidden_size to be specified.)r,   r-   rR   r   rw   rS   rx   rs   r   r   r   r   r   r   r.   rU   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   r4   r6   r7   r-   ?  s0   



zT5GemmaCrossAttention.__init__Nr_   r   encoder_hidden_statesr   r   rq   c                 K   sz  |d u rt d|jd d }g |d| jR }| ||dd}|d ur3|j| j}	|j	}
|d u s9|	sw|jd d }g |d| jR }| 
||dd}| ||dd}|d urv|
||| j\}}d|j| j< n|
j| j j}|
j| j j}t| jjt}|| ||||f| jr| jnd| jd | jd|\}}|jg |dR   }| |}||fS )Nz5Encoder hidden state is required for cross attention.r9   r$   r8   Tr   r   )r   rF   rs   r   r   r   
is_updatedgetr   cross_attention_cacher   r   r   layerskeysvaluesr   r   rR   r   r   r   r   r   r   r   r   r   )r3   r_   r   r   r   r   r   r   r   r   curr_past_key_valuesencoder_input_shapeencoder_hidden_shaper   r   r   r   r   r6   r6   r7   rD   [  sN   


zT5GemmaCrossAttention.forwardr+   )rJ   rK   rL   r   r&   rM   r-   r0   r   r   r   r   rE   rD   rN   r6   r6   r4   r7   r   ;  s"    !r   c                       sr   e Zd ZdZdef fddZ			ddejdeejejf dB dejdB d	ej	dB d
eej
f f
ddZ  ZS )T5GemmaEncoderLayerzEncoder sub-layer.r   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)rR   r   r*   )r,   r-   rS   rR   r   r   attention_typer   	self_attnr'   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrO   mlppre_feedforward_layernormpost_feedforward_layernormr.   r[   r\   r]   r   r4   r6   r7   r-     s   

zT5GemmaEncoderLayer.__init__Nr_   r   r   r   rq   c                 K   sz   |}|  |}| jd||||d d|\}}| |}|| | }|}| |}| |}| |}|| | }|S )N)r_   r   r   r   r   r6   )r   r   r   r]   r   r   r   )r3   r_   r   r   r   r   residual_r6   r6   r7   rD     s&   





zT5GemmaEncoderLayer.forwardr   )rJ   rK   rL   r   rM   r-   r0   r   rE   r   FloatTensorrD   rN   r6   r6   r4   r7   r     s"    
r   c                       s   e Zd ZdZdef fddZ								ddejdeejejf dB d	ejdB d
ej	dB de
dB dedB dej	dB dejdB dejdB dejfddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _t||d| _t	|j|j
d| _t	|j|j
d| _d S r   )r,   r-   rS   rR   r   r   r   r   r   r'   r   r   r   rO   r   r   r   r.   r[   r\   r]   r   
cross_attnpre_cross_attn_layernormpost_cross_attn_layernormr   r4   r6   r7   r-     s$   

zT5GemmaDecoderLayer.__init__NFr_   r   r   r   r   	use_cacher   r   encoder_attention_maskrq   c
              
   K   s   |}|  |}| jd|||||d ur|jnd ||d|
\}}| |}|| | }|}| |}| jd|||	||d|
\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|S )N)r_   r   r   r   r   r   r   )r_   r   r   r   r   r6   )r   r   self_attention_cacher   r]   r   r   r   r   r   r   )r3   r_   r   r   r   r   r   r   r   r  r   r   r   r6   r6   r7   rD     sD   









zT5GemmaDecoderLayer.forward)NNNNFNNN)rJ   rK   rL   r   rM   r-   r0   r   rE   r   r	   boolr   rD   rN   r6   r6   r4   r7   r     s@    	
r   c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r   rS   
num_labelsclassifier_dropout_ratec                    s*   t    tj|d| _t||| _d S )N)r   )r,   r-   r.   r[   r]   rU   out_proj)r3   rS   r  r  r4   r6   r7   r-     s   
z"T5GemmaClassificationHead.__init__r_   rq   c                 C   s   |  |}| |}|S r+   )r]   r  )r3   r_   r6   r6   r7   rD     s   

z!T5GemmaClassificationHead.forward)r   )rJ   rK   rL   r   rM   rA   r-   r0   r   rD   rN   r6   r6   r4   r7   r    s    r  c                       r  )T5GemmaLMHeadz.Head for language modeling (generation) tasks.FrS   
vocab_sizerQ   c                    s    t    tj|||d| _d S )NrP   )r,   r-   r.   rU   r  )r3   rS   r
  rQ   r4   r6   r7   r-   !  s   
zT5GemmaLMHead.__init__r_   rq   c                 C   s   |  |}|S r+   )r  )r3   r_   logitsr6   r6   r7   rD   %  s   
zT5GemmaLMHead.forward)F)rJ   rK   rL   r   rM   r  r-   r0   r   rD   rN   r6   r6   r4   r7   r	    s    r	  c                       s   e Zd ZU eed< dZdZddgZdgZdZ	dZ
dZdZdZeeeddd	eedd
d	eedd
d	gdZe  fddZdd Z  ZS )T5GemmaPreTrainedModelrR   modelTr   r   r   r$   r   )index
layer_namer   )r_   
attentionsc                    s   t  | | jj}t|tr=|jjjd d }t	j
|jjd|| d t|jdr9|jjd ur;t	|jj d S d S d S t|tr_| jjs]|jjjd d }t	j
|jjd|| d d S d S d|jjv rmt	|j d S d S )Nr   r   r   )r=   stdrQ   RMSNorm)r,   _init_weightsrR   initializer_ranger   r  r  r2   rF   initnormal_r   rQ   zeros_r	  tie_word_embeddingsr5   rJ   )r3   r   r  scaler4   r6   r7   r  @  s    

z$T5GemmaPreTrainedModel._init_weightsc                 C   s|   | j jj}| j jj}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r9   r$   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rR   decoderbos_token_idpad_token_idr   	new_zerosrF   rm   masked_fill_)r3   	input_idsdecoder_start_token_idr  shifted_input_idsr6   r6   r7   _shift_rightR  s   

 z#T5GemmaPreTrainedModel._shift_right)rJ   rK   rL   r%   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r"   r   r   _can_record_outputsr0   r   r  r"  rN   r6   r6   r4   r7   r  *  s(   
 	r  c              
      &   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxrq   c                    s,    d u rt jdt jdS  | |f t jS )Nr6   rt   )r0   onesr  r{   r.  r/  r0  r1  r   r6   r7   
inner_maskr  s   z/bidirectional_mask_function.<locals>.inner_maskrM   r  )r   r5  r6   r4  r7   bidirectional_mask_functionm  s   "r7  r   c              
      r-  )zH
    This creates bidirectional attention mask with sliding window.
    r.  r/  r0  r1  rq   c                    s   |  |k ||  k @ S r+   r6   r3  r   r6   r7   r5    rI   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskr6  )r   r5  r6   r8  r7   *sliding_window_bidirectional_mask_functionz  s   "r9  	token_idsr  c                 C   sX   | dur|du rt d| |k|jtj}|S tj|jd |jd f|jtjd}|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r$   rv   )r   r{   rn   r0   longr2  rF   )r:  r_   r  r   r6   r6   r7   make_default_2d_attention_mask  s   r<  c                       sz   e Zd ZeedZ fddZee				dde	j
dB de	jdB de	j
dB de	jdB d	ee d
eeB fddZ  ZS )T5GemmaEncoder)r  r_   c                       t     j| _ j| _t j j| j| _t	 j j
d| _d| _t fddt jD | _t j| _t d| _|   d S )Nr   Fc                       g | ]}t  |qS r6   )r   .0r   rR   r6   r7   
<listcomp>      z+T5GemmaEncoder.__init__.<locals>.<listcomp>rB  r,   r-   r  padding_idxr
  r.   	EmbeddingrS   embed_tokensr'   r   normgradient_checkpointing
ModuleListrangenum_hidden_layersr   r[   r\   r]   r`   
rotary_emb	post_initr^   r4   rB  r7   r-        zT5GemmaEncoder.__init__Nr  r   r   inputs_embedsr   rq   c                 K   sf  |d u |d uA rt d|dd  |d u r| |}tjd|jd |jd}|d u r0|d}|d u r<t||| j	j
}t| }tsm| j	|||d |d}tdi |dt|itdi |t| j	jt|dd	}|}	tj| j	jd
 |	jd}
|	|
 }	| |	}	| |	|}| jd | j	j D ]}||	|||j |fi |}	q| |	}	| |	}	t|	dS )N:You must specify exactly one of input_ids or inputs_embedsr   r   r$   rn   rR   rQ  r   r   r   r   or_mask_function)rU  and_mask_functionfull_attentionr         ?rt   )last_hidden_stater6   )r   poprH  r0   ry   rF   rn   r   r<  rR   r  r   dictr   r7  r   r9  r   tensorrS   ru   r]   rN  r   rM  r   rI  r   )r3   r  r   r   rQ  r   r   self_attn_mask_mappingmask_kwargsr_   
normalizerr   layer_moduler6   r6   r7   rD     sb   


	



zT5GemmaEncoder.forwardr   )rJ   rK   rL   r   r   r,  r-   r!   r#   r0   r   r   r   r   r   rE   r   rD   rN   r6   r6   r4   r7   r=    s0    r=  c                       s   e Zd ZeeddeeddedZ fddZe	e
									ddejdB dejdB d	ejdB d
edB dejdB dedB dejdB dejdB dejdB dee deeB fddZ  ZS )T5GemmaDecoderr$   )r  )r  cross_attentionsr_   c                    r>  )Nr   Fc                    r?  r6   )r   r@  rB  r6   r7   rC    rD  z+T5GemmaDecoder.__init__.<locals>.<listcomp>rB  rE  r^   r4   rB  r7   r-     rP  zT5GemmaDecoder.__init__Nr  r   r   r   rQ  r   r   r   r  r   rq   c
                 K   s  |d u |d uA rt d|d u rt d|d u r| |}| js0|r0|d u r0tt| jdt }|d u rL|d ur<| nd}tj|||j	d  |j
d}|d u rU|d}|d u re|d u ret||| jj}t| }ts| j||||d urx|jnd |d}tdi |tdi |d}t|	 }ts| j||	|d d d}d	tdi |d
t|	ii}|}tj| jjd |jd}|| }| |}| ||}| jd | jj D ]}|||||j ||||||d	 f	i |
}q| |}| |}t||dS )NrR  z0`encoder_hidden_states` must be given in decoderrB  r   r$   rS  rT  rW  rX  rU  rY  rt   )rZ  r   r6   )r   rH  r   r	   r   rR   get_seq_lengthr0   ry   rF   rn   r   r<  r  r   r\  r  r   r   r7  r]  rS   ru   r]   rN  r   rM  r   rI  r   )r3   r  r   r   r   rQ  r   r   r   r  r   past_seen_tokensr^  r_  cross_attn_mask_mappingr_   r`  r   ra  r6   r6   r7   rD     s   

		



zT5GemmaDecoder.forward)	NNNNNNNNN)rJ   rK   rL   r"   r   r   r   r,  r-   r!   r#   r0   r   r   r	   r   r  r   r   rE   r   rD   rN   r6   r6   r4   r7   rb    sP    

	
rb  c                       s   e Zd Zdef fddZdd Zdd Zee												dd	e	j
dB d
e	jdB de	j
dB de	j
dB de	jdB de	j
dB dedB dedB de	jdB de	jdB dedB de	j
dB dee defddZ  ZS )T5GemmaModelrR   c                    s>   t  | |jstdt|j| _t|j| _|   d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r,   r-   is_encoder_decoderr   r=  encoderrb  r  rO  r^   r4   r6   r7   r-   p  s   zT5GemmaModel.__init__c                 C   
   | j  S r+   ri  get_input_embeddingsrG   r6   r6   r7   rl  {     
z!T5GemmaModel.get_input_embeddingsc                 C      | j |S r+   ri  set_input_embeddingsr3   new_embeddingsr6   r6   r7   rp  ~     z!T5GemmaModel.set_input_embeddingsNr  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   rQ  decoder_inputs_embedsr   r   r   rq   c                 K   s   |du r| j d||||	d|}|j}| jd||||
|||||d	|}t|j|j|ddr4|jn|jf|j|j|j|j|jdS )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        Nr  r   r   rQ  )	r  r   r   rQ  r   r   r  r   r   output_hidden_statesF)rZ  r   decoder_hidden_statesdecoder_attentionsrc  encoder_last_hidden_stater   encoder_attentionsr6   )	ri  rZ  r  r   r   r   r_   r  rc  )r3   r  r   r   rt  ru  rv  rw  r   rQ  rx  r   r   r   r   decoder_outputsr6   r6   r7   rD     sF   

zT5GemmaModel.forward)NNNNNNNNNNNN)rJ   rK   rL   r%   r-   rl  rp  r   r   r0   r   r   
BoolTensorr   r	   r   r  r   r   r   rD   rN   r6   r6   r4   r7   rg  n  s^    	
rg  c                       s   e Zd Zdef fddZdd Zdd Zee				dd	e	j
dB d
e	jdB de	j
dB de	jdB dee defddZ  ZS )T5GemmaEncoderModelrR   c                    s2   t  | |jrtdt|j| _|   d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r,   r-   rh  r   r=  ri  rO  r^   r4   r6   r7   r-     s
   zT5GemmaEncoderModel.__init__c                 C   rj  r+   rk  rG   r6   r6   r7   rl    rm  z(T5GemmaEncoderModel.get_input_embeddingsc                 C   rn  r+   ro  rq  r6   r6   r7   rp    rs  z(T5GemmaEncoderModel.set_input_embeddingsNr  r   r   rQ  r   rq   c                 K   s   | j d||||d|}|S )Nry  r6   )ri  )r3   r  r   r   rQ  r   rw  r6   r6   r7   rD     s   
zT5GemmaEncoderModel.forwardr   )rJ   rK   rL   r%   r-   rl  rp  r   r   r0   r   r   r   r   r   r   rD   rN   r6   r6   r4   r7   r    s.    	r  c                $       s.  e Zd ZddiZddiZddgdgfiZdef fdd	Zd
d Zdd Z	e
e														d$dejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dejdB dejdB dejdB dedB dejdB deejB dee deej eB f d d!Zdejfd"d#Z  ZS )%T5GemmaForConditionalGenerationzlm_head.out_proj.weightz!model.decoder.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr_   r  rR   c                    sJ   d|_ t | t|| _|jj| _t|jj| j| _	d| _
|   d S )NTForMaskedLM)rh  r,   r-   rg  r  r  r
  r	  rS   lm_head	loss_typerO  r^   r4   r6   r7   r-     s   

z(T5GemmaForConditionalGeneration.__init__c                 C   s   || j _d S r+   r  r  rq  r6   r6   r7   set_output_embeddings  rs  z5T5GemmaForConditionalGeneration.set_output_embeddingsc                 C   s   | j jS r+   r  rG   r6   r6   r7   get_output_embeddings  s   z5T5GemmaForConditionalGeneration.get_output_embeddingsNr   r  r   r   rt  ru  rv  rw  r   rQ  rx  labelsr   r   logits_to_keepr   rq   c                 K   s  |dur|du r|
du r|  |}| jd|||||||||	|
||d|}|j}t|tr4t| dn|}| |dd|ddf }|  j}|j	dur]||j	 }t
|}||j	 }d}|duro| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r  r   r   rt  ru  rv  rw  r   rQ  rx  r   r   )	lossr  r   r{  r|  rc  r}  r   r~  r6   )r"  r  rZ  r   rM   slicer  get_decoderrR   final_logit_softcappingr0   r   loss_functionr
  r   r   r{  r|  rc  r}  r   r~  )r3   r  r   r   rt  ru  rv  rw  r   rQ  rx  r  r   r   r  r   r  r_   slice_indicesr  decoder_configr  r6   r6   r7   rD     sP   





z'T5GemmaForConditionalGeneration.forwardc                 C   s
   |  |S r+   )r"  )r3   r  r6   r6   r7   %prepare_decoder_input_ids_from_labelsF  rm  zET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNr   )rJ   rK   rL   _tied_weights_keys_tp_plan_pp_planr%   r-   r  r  r   r   r0   r   r   r  r   r	   r  rM   r   r   r   rE   r   rD   r  rN   r6   r6   r4   r7   r    sr    	
Kr  c                          e Zd ZddededB f fddZdd Zdd	 Zee											dd
e
jdB de
jdB de
jdB de
jdB de
jdB de
jdB dedB de
jdB de
jdB de
jdB dee defddZ  ZS ) T5GemmaForSequenceClassificationNrR   rh  c                    |   |dur||_ t | |j| _|j rt|| _nt|| _|jj}|j r*|j	j}t
|dd}t|| j|| _|   dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        Nr  皙?rh  r,   r-   r  rg  r  r  ri  rS   r  rw   r  scorerO  r3   rR   rh  rS   classifier_dropoutr4   r6   r7   r-   L  s   
z)T5GemmaForSequenceClassification.__init__c                 C   rj  r+   r  rl  rG   r6   r6   r7   rl  c  rm  z5T5GemmaForSequenceClassification.get_input_embeddingsc                 C      | j | d S r+   r  rp  r3   r   r6   r6   r7   rp  f     z5T5GemmaForSequenceClassification.set_input_embeddingsr  r   r   rt  ru  rv  rw  rQ  rx  r  r   rq   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}|duru|jd }n|jd }| j jdu r|d	krtd
| j jdu rd}nE|dur|| j jk|jtj}tj|jd |jtjd}|| d}| j jr|d	7 }tj||jd d	 d}nd}t| jj d |tj||jd|f }d}|
dur| j||
|| j d}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   rt  ru  rv  rw  rQ  rx  r   r   r   rQ  r   r$   z=Cannot handle batch sizes > 1 if no padding token is defined.r9   rv   )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rS  )r  r  pooled_logitsrR   r  r  r_   r  )rR   rh  NotImplementedErrorr5   rJ   r   r"  r  rZ  r{  r|  r_   r  r  rF   r  r{   rn   r0   int32ry   argmaxclamploggerwarning_oncer  r   )r3   r  r   r   rt  ru  rv  rw  rQ  rx  r  r   outputsrZ  r_   r  r  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  r  r6   r6   r7   rD   i  s   


z(T5GemmaForSequenceClassification.forwardr+   
NNNNNNNNNN)rJ   rK   rL   r%   r  r-   rl  rp  r   r   r0   r   r   r   r   r   r   r   rD   rN   r6   r6   r4   r7   r  J  sR    	
r  c                       r  )T5GemmaForTokenClassificationNrR   rh  c                    r  )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        Nr  r  r  r  r4   r6   r7   r-     s   
z&T5GemmaForTokenClassification.__init__c                 C   rj  r+   r  rG   r6   r6   r7   rl    rm  z2T5GemmaForTokenClassification.get_input_embeddingsc                 C   r  r+   r  r  r6   r6   r7   rp    r  z2T5GemmaForTokenClassification.set_input_embeddingsr  r   r   rt  ru  rv  rw  rQ  rx  r  r   rq   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}d}|
dury| ||
| j }t||||dS )	r  Nr  r  r  Fr  r  r  )rR   rh  r  r5   rJ   r   r"  r  rZ  r{  r|  r_   r  r  r  r   )r3   r  r   r   rt  ru  rv  rw  rQ  rx  r  r   r  rZ  r_   r  r  r  r6   r6   r7   rD     sf   

z%T5GemmaForTokenClassification.forwardr+   r  )rJ   rK   rL   r%   r  r-   rl  rp  r   r   r0   r   r   r   r   r   r   r   rD   rN   r6   r6   r4   r7   r    sR    	
r  )r  rg  r  r  r  r  )r$   )r   NN)[collections.abcr   typingr   r0   torch.nnr.    r   r  activationsr   cache_utilsr   r   r	   
generationr
   integrationsr   r   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr    r!   utils.output_capturingr"   r#   configuration_t5gemmar%   r&   
get_loggerrJ   r  Moduler'   rO   r`   r   r   r   rM   r   rA   rE   r   r   r   r   r   r  r	  r  r7  r9  r   r<  r=  rb  rg  r  r  r  r  __all__r6   r6   r6   r7   <module>   s    
A

"LU4KB
^zO$g r