o
    iz                     @   s  d dl mZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 dZ5e$6e7Z8G dd de+Z9G dd deZ:G dd de0Z;G dd de.Z<G dd de1Z=G dd de-Z>G dd  d e-Z?d!eej@ d"efd#d$ZAd%eBd"efd&d'ZCG d(d) d)eZDG d*d+ d+eDZEG d,d- d-ejFZGG d.d/ d/ejFZHe"G d0d1 d1e/ZId2eejJ d3ej@d4eeB d"ej@fd5d6ZKG d7d8 d8eIZLG d9d: d:eLZMe"G d;d< d<eIZNe"G d=d> d>eIZOG d?d@ d@eIeZPe"G dAdB dBeIZQe"G dCdD dDeIZRg dEZSdS )F    )AnyCallableOptionalUnionN   )CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forwardz google/t5gemma-2b-2b-prefixlm-itc                   @      e Zd ZdS )T5GemmaModuleConfigN__name__
__module____qualname__ r.   r.   h/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr)   ?       r)   c                       sJ  e Zd ZdZdZdgZi dddddddd	d
ddddd	dddddddd	dddddddd	dddddd	iZdgdgfddgdgfdgdgfdgdgfddgdgfdgdgfdZ								 d-d!ee	e
eeef f  d"ee	e
eeef f  d#ed$ed%ed&ed'ed(ef fd)d*Z fd+d,Z  ZS ).T5GemmaConfiga  
    This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
    e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
    ```python
    >>> from transformers import T5GemmaConfig, T5GemmaModel
    >>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
    >>> model = T5GemmaModel(t5gemma_config)
    ```
    Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PretrainedConfig] for more information.
    Args:
        encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether tie input and output embeddings.
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the T5Gemma model (the same as Gemma 2).
        kwargs (additional keyword arguments, optional, *optional*):
            Will be passed to the PretrainedConfig base class.
    t5gemmapast_key_valuesz!encoder.layers.*.self_attn.q_projcolwisez!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.o_projrowwisezencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_projz!decoder.layers.*.self_attn.q_projz!decoder.layers.*.self_attn.k_projz!decoder.layers.*.self_attn.v_projz!decoder.layers.*.self_attn.o_projz"decoder.layers.*.cross_attn.q_projz"decoder.layers.*.cross_attn.k_projz"decoder.layers.*.cross_attn.v_projz"decoder.layers.*.cross_attn.o_projzdecoder.layers.*.mlp.gate_projzdecoder.layers.*.mlp.up_projzdecoder.layers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)zencoder.embed_tokenszencoder.layerszencoder.normzdecoder.embed_tokenszdecoder.layerszdecoder.normNT          encoderdecoderis_encoder_decoderdropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddings
vocab_sizec	                    sf  t |trtdi |}n|d u rt }nt |ts#J t| dt |tr0tdi |}n|d u r7|}nt |tsEJ t| dtdi | }tdi | }d|_||_||_|| _d|_d|_	||_||_|j
|_|| _dD ]}
|
|	vrt||
|	|
< qxt jdi |	 || _|	d|j	| _	|	d|j| _|| _|| _|| _|| _|| _d S )Nz is not supported.FT)bos_token_idpad_token_ideos_token_id	use_cacheinitializer_ranger.   )
isinstancedictr)   typeto_dict
is_decoderr?   rA   r<   rG   hidden_sizecross_attention_hidden_sizer=   getattrsuper__init__r>   getrH   r@   rB   rC   )selfr<   r=   r>   r?   r@   rA   rB   rC   kwargsspecial_token_key	__class__r.   r/   rR      sF   


zT5GemmaConfig.__init__c                    s>   g d}||v rt | j|| t | j|| t || d S )N)output_hidden_statesoutput_attentions_attn_implementationr?   rA   rC   )setattrr<   r=   rQ   __setattr__)rT   keyvalueshared_attr_with_submodulesrW   r.   r/   r]      s
   	zT5GemmaConfig.__setattr__)NNTr:   r:   r:   Tr;   )r+   r,   r-   __doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   r   r)   rJ   r   boolfloatintrR   r]   __classcell__r.   r.   rW   r/   r1   C   s    !




	:r1   c                   @   r(   )T5GemmaRMSNormNr*   r.   r.   r.   r/   rj      r0   rj   c                       s$   e Zd Z fddZdd Z  ZS )
T5GemmaMLPc                    s   t  | t|j| _d S N)rQ   rR   nnDropoutr?   dropoutrT   configrW   r.   r/   rR      s   zT5GemmaMLP.__init__c                 C   s2   |  | || | }| |}| |}|S rl   )act_fn	gate_projup_projro   	down_proj)rT   xr8   ru   r.   r.   r/   forward   s   

zT5GemmaMLP.forward)r+   r,   r-   rR   rw   ri   r.   r.   rW   r/   rk      s    rk   c                       s   e Zd Zd fdd	Z  ZS )T5GemmaRotaryEmbeddingNc                    s   t  || d S rl   )rQ   rR   )rT   rq   devicerW   r.   r/   rR      s   zT5GemmaRotaryEmbedding.__init__rl   )r+   r,   r-   rR   ri   r.   r.   rW   r/   rx      s    rx   c                       s&   e Zd Zdedef fddZ  ZS )T5GemmaSelfAttentionrq   	layer_idxc                    s   t  || |j| _d S rl   )rQ   rR   rM   	is_causalrT   rq   r{   rW   r.   r/   rR      s   zT5GemmaSelfAttention.__init__)r+   r,   r-   r)   rh   rR   ri   r.   r.   rW   r/   rz      s    rz   c                       s   e Zd Zdedef fddZedddd		dd
ejde	ej de	ej de	e
 dee deeje	ej e	eej  f fddZ  ZS )T5GemmaCrossAttentionrq   r{   c                    sj   t  || | `d| _|jd u rtdtj|j|j| j	 |j
d| _tj|j|j| j	 |j
d| _d S )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rQ   rR   sliding_windowr|   rO   
ValueErrorrm   Linearnum_key_value_headshead_dimattention_biask_projv_projr}   rW   r.   r/   rR      s   
zT5GemmaCrossAttention.__init__past_key_valuer3   4.58new_nameversionNr8   r9   encoder_hidden_statesrU   returnc                 K   s  |d u rt d|jd d }g |d| jR }| ||dd}|d ur3|j| j}	|j	}
|d u s9|	sw|jd d }g |d| jR }| 
||dd}| ||dd}|d urv|
||| j\}}d|j| j< n|
j| j j}|
j| j j}t}| jjdkrt| jj }|| ||||f| jr| jnd| jd | jd|\}}|jg |dR   }| |}||fS )	Nz5Encoder hidden state is required for cross attention.   r   Teagerr:   )ro   scalingr   softcap)r   shaper   q_projview	transpose
is_updatedrS   r{   cross_attention_cacher   r   updatelayerskeysvaluesr'   rq   r[   r   trainingrA   r   attn_logit_softcappingreshape
contiguouso_proj)rT   r8   r9   r   r3   rU   input_shapehidden_shapequery_statesr   curr_past_key_valueencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightsr.   r.   r/   rw      sN   	


zT5GemmaCrossAttention.forwardrl   )r+   r,   r-   r)   rh   rR   r   torchTensorr   r   r   r   tuplerw   ri   r.   r.   rW   r/   r~      s"    r~   r9   r   c              
      &   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    	batch_idxhead_idxq_idxkv_idxr   c                    s,    d u rt jdt jdS  | |f t jS )Nr.   dtype)r   onesrf   tor   r   r   r   r9   r.   r/   
inner_mask:  s   z/bidirectional_mask_function.<locals>.inner_maskrh   rf   )r9   r   r.   r   r/   bidirectional_mask_function5  s   "r   r   c              
      r   )zH
    This creates bidirectional attention mask with sliding window.
    r   r   r   r   r   c                    s   |  |k ||  k @ S rl   r.   r   r   r.   r/   r   G  s   z>sliding_window_bidirectional_mask_function.<locals>.inner_maskr   )r   r   r.   r   r/   *sliding_window_bidirectional_mask_functionB  s   "r   c                       sl   e Zd ZdZdef fddZ		ddejdeejejf de	ej d	e	ej
 d
eejf f
ddZ  ZS )T5GemmaEncoderLayerzEncoder sub-layer.r{   c                    s   t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)rq   r{   eps)rQ   rR   rN   rq   r{   layer_typesattention_typerz   	self_attnrj   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrk   mlppre_feedforward_layernormpost_feedforward_layernormrm   rn   r?   ro   r}   rW   r.   r/   rR   P  s   

zT5GemmaEncoderLayer.__init__Nr8   position_embeddingsr9   position_idsr   c                 K   sz   |}|  |}| jd||||d d|\}}| |}|| | }|}| |}| |}| |}|| | }|S )N)r8   r   r9   r   r3   r.   )r   r   r   ro   r   r   r   )rT   r8   r   r9   r   rU   residual_r.   r.   r/   rw   d  s&   





zT5GemmaEncoderLayer.forward)NN)r+   r,   r-   ra   rh   rR   r   r   r   r   
LongTensorFloatTensorrw   ri   r.   r.   rW   r/   r   M  s     
r   c                       s   e Zd ZdZdef fddZedddd							
						ddejde	ejejf de
ej de
ej de
e de
e de
ej de
ej de
ej dejfddZ  ZS )T5GemmaDecoderLayerz2Decoder sub-layer: an extra cross-attention layer.r{   c                    sD   t  || t||d| _t|j|jd| _t|j|jd| _d S r   )	rQ   rR   r~   
cross_attnrj   rN   r   pre_cross_attn_layernormpost_cross_attn_layernormr}   rW   r.   r/   rR     s   zT5GemmaDecoderLayer.__init__r   r3   r   r   NFr8   r   r9   r   rG   cache_positionr   encoder_attention_maskr   c
              
   K   s   |}|  |}| jd|||||d ur|jnd ||d|
\}}| |}|| | }|}| |}| jd|||	||d|
\}}| |}|| | }|}| |}| 	|}| 
|}|| | }|S )N)r8   r   r9   r   r3   rG   r   )r8   r   r9   r3   rG   r.   )r   r   self_attention_cacher   ro   r   r   r   r   r   r   )rT   r8   r   r9   r   r3   rG   r   r   r   rU   r   r   r.   r.   r/   rw     sD   









zT5GemmaDecoderLayer.forward)NNNFNNN)r+   r,   r-   ra   rh   rR   r   r   r   r   r   r   r	   rf   r   rw   ri   r.   r.   rW   r/   r     s@    	
r   c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5GemmaClassificationHeadz-Head for sentence-level classification tasks.r:   rN   
num_labelsr@   c                    s*   t    tj|d| _t||| _d S )N)p)rQ   rR   rm   rn   ro   r   out_proj)rT   rN   r   r@   rW   r.   r/   rR     s   
z"T5GemmaClassificationHead.__init__r8   r   c                 C   s   |  |}| |}|S rl   )ro   r   )rT   r8   r.   r.   r/   rw     s   

z!T5GemmaClassificationHead.forward)r:   )r+   r,   r-   ra   rh   rg   rR   r   r   rw   ri   r.   r.   rW   r/   r     s    r   c                       r   )T5GemmaLMHeadz.Head for language modeling (generation) tasks.FrN   rC   r   c                    s    t    tj|||d| _d S )Nr   )rQ   rR   rm   r   r   )rT   rN   rC   r   rW   r.   r/   rR     s   
zT5GemmaLMHead.__init__r8   r   c                 C   s   |  |}|S rl   )r   )rT   r8   logitsr.   r.   r/   rw     s   
zT5GemmaLMHead.forward)F)r+   r,   r-   ra   rh   rf   rR   r   r   rw   ri   r.   r.   rW   r/   r     s    r   c                   @   s6   e Zd ZU eed< dZdZddgZdd Zdd	 Z	d
S )T5GemmaPreTrainedModelrq   modelTr   r   c                 C   s   t | | | jj}t|tr=|jjjd d }|jjj	j
d|| d t|jdr9|jjd ur;|jjj	  d S d S d S t|tr_| jjs]|jjjd d }|jjj	j
d|| d d S d S d|jjv rm|jj	  d S d S )Nr   g      r:   )meanstdr   RMSNorm)r   _init_weightsrq   rH   rI   r   r   weightr   datanormal_hasattrr   zero_r   rB   rX   r+   )rT   moduler   scaler.   r.   r/   r     s    

z$T5GemmaPreTrainedModel._init_weightsc                 C   s|   | j jj}| j jj}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rq   r=   rD   rE   r   	new_zerosr   clonemasked_fill_)rT   r6   decoder_start_token_idrE   shifted_input_idsr.   r.   r/   _shift_right  s   

 z#T5GemmaPreTrainedModel._shift_rightN)
r+   r,   r-   r1   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r   r.   r.   r.   r/   r     s   
 r   	token_idsr8   rE   c                 C   sX   | dur|du rt d| |k|jtj}|S tj|jd |jd f|jtjd}|S )z%Construct the default attention mask.Nz3`pad_token_id` is required for padding information.r   r   ry   r   )r   r   ry   r   longr   r   )r   r8   rE   r9   r.   r.   r/   make_default_2d_attention_mask	  s   r   c                       sr   e Zd ZeedZ fddZe				ddee	j
 dee	j dee	j
 dee	j d	ee d
efddZ  ZS )T5GemmaEncoder)
attentionsr8   c                    s   t     j| _ j| _t j j| j| _t	 j j
d| _t d| _d| _t fddt jD | _t j| _|   d S )Nr   rq   Fc                       g | ]}t  |qS r.   )r   .0r{   r   r.   r/   
<listcomp>+      z+T5GemmaEncoder.__init__.<locals>.<listcomp>)rQ   rR   rE   padding_idxrC   rm   	EmbeddingrN   embed_tokensrj   r   normrx   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layersr   rn   r?   ro   	post_initrp   rW   r   r/   rR      s   zT5GemmaEncoder.__init__Nr6   r9   r   r7   rU   r   c                 K   sf  |d u |d uA rt d|dd  |d u r| |}tjd|jd |jd}|d u r0|d}|d u r<t||| j	j
}t| }tsm| j	|||d |d}tdi |dt|itdi |t| j	jt|dd	}|}	| |	|}
tj| j	jd
 |	jd}|	| }	| |	}	| jd | j	j D ]}||	|
||j |fi |}	q| |	}	| |	}	t|	dS )N:You must specify exactly one of input_ids or inputs_embedsr3   r   r   ry   rq   input_embedsr9   r   r3   r   or_mask_function)r  and_mask_functionfull_attentionsliding_attention      ?r   )last_hidden_stater.   )r   popr  r   aranger   ry   	unsqueezer   rq   rE   rI   rJ   r%   r   r&   r   r   r
  tensorrN   r   ro   r   r  r   r	  r   )rT   r6   r9   r   r7   rU   r   self_attn_mask_mappingmask_kwargsr8   r   
normalizerlayer_moduler.   r.   r/   rw   2  sb   	

	



zT5GemmaEncoder.forwardNNNN)r+   r,   r-   rz   r   _can_record_outputsrR   r   r   r   r   r   r   r   r   r   rw   ri   r.   r.   rW   r/   r     s.    r   c                       s   e Zd ZeeddeeddedZ fddZe										dde
ej de
ej d	e
ej d
e
e de
ej de
e de
ej de
ej de
ej dee defddZ  ZS )T5GemmaDecoderr   )index)r   cross_attentionsr8   c                    s8   t    t fddt jD | _|   d S )Nc                    r  r.   )r   r  r   r.   r/   r    r  z+T5GemmaDecoder.__init__.<locals>.<listcomp>)rQ   rR   rm   r  r  r  r   r  rp   rW   r   r/   rR   ~  s
   zT5GemmaDecoder.__init__Nr6   r9   r   r3   r7   rG   r   r   r   rU   r   c
                 K   s  |d u |d uA rt d|d u rt d|d u r| |}| js3|r3|d u r3tt| jdt| jd}|d u rO|d ur?| nd}tj|||j	d  |j
d}|d u rX|d}|d u rh|d u rht||| jj}t| }ts| j||||d ur{|jnd |d}tdi |tdi |d}t|	 }ts| j||	|d d d}d	tdi |d
t|	ii}|}| ||}tj| jjd |jd}|| }| |}| jd | jj D ]}|||||j ||||||d	 f	i |
}q| |}| |}t||dS )Nr  z0`encoder_hidden_states` must be given in decoderr   r   r   r  r  r  r  r  r  r   )r  r3   r.   )r   r  r   r	   r   rq   get_seq_lengthr   r  r   ry   r  r   rE   rI   rJ   r   r%   r&   r   r
  r  rN   r   ro   r   r  r   r	  r   )rT   r6   r9   r   r3   r7   rG   r   r   r   rU   past_seen_tokensr  r   cross_attn_mask_mappingr8   r   r!  r"  r.   r.   r/   rw     s   

		



zT5GemmaDecoder.forward)	NNNNNNNNN)r+   r,   r-   r   rz   r~   r   r$  rR   r   r   r   r   r   r	   r   rf   r   r   r   rw   ri   r.   r.   rW   r/   r%  w  sN    

	
r%  c                       s   e Zd Zdef fddZdd Zdd Zdd	 Zee		
	
	
	
	
	
	
	
	
	
	
	
dde
ej de
ej de
ej de
ej de
ej de
ej de
e de
e de
ej de
ej de
e de
ej dee defddZ  ZS )T5GemmaModelrq   c                    s>   t  | |jstdt|j| _t|j| _|   d S )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rQ   rR   r>   r   r   r<   r%  r=   r  rp   rW   r.   r/   rR     s   zT5GemmaModel.__init__c                 C   s   | j S rl   r<   rT   r.   r.   r/   get_encoder  s   zT5GemmaModel.get_encoderc                 C   
   | j  S rl   r<   get_input_embeddingsr-  r.   r.   r/   r1       
z!T5GemmaModel.get_input_embeddingsc                 C      | j |S rl   r<   set_input_embeddingsrT   new_embeddingsr.   r.   r/   r5       z!T5GemmaModel.set_input_embeddingsNr6   r9   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr3   r7   decoder_inputs_embedsrG   r   rU   r   c                 K   s   |du r| j d||||	d|}|j}| jd||||
|||||d	|}t|j|j|ddr4|jn|jf|j|j|j|j|jdS )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        Nr6   r9   r   r7   )	r6   r9   r   r7   r3   r   r   rG   r   rY   F)r  r3   decoder_hidden_statesdecoder_attentionsr'  encoder_last_hidden_stater   encoder_attentionsr.   )	r<   r  r=   r   r3   rS   r8   r   r'  )rT   r6   r9   r   r9  r:  r;  r<  r3   r7   r=  rG   r   rU   r   decoder_outputsr.   r.   r/   rw     sF   

zT5GemmaModel.forward)NNNNNNNNNNNN)r+   r,   r-   r1   rR   r.  r1  r5  r   r   r   r   r   r   
BoolTensorr   r	   r   rf   r   r   r   rw   ri   r.   r.   rW   r/   r+    s`    	
r+  c                       s   e Zd Zdef fddZdd Zdd Zee				dd	e	e
j d
e	e
j de	e
j de	e
j dee defddZ  ZS )T5GemmaEncoderModelrq   c                    s2   t  | |jrtdt|j| _|   d S )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rQ   rR   r>   r   r   r<   r  rp   rW   r.   r/   rR   9  s
   zT5GemmaEncoderModel.__init__c                 C   r/  rl   r0  r-  r.   r.   r/   r1  B  r2  z(T5GemmaEncoderModel.get_input_embeddingsc                 C   r3  rl   r4  r6  r.   r.   r/   r5  E  r8  z(T5GemmaEncoderModel.set_input_embeddingsNr6   r9   r   r7   rU   r   c                 K   s   | j d||||d|}|S )Nr>  r.   r,  )rT   r6   r9   r   r7   rU   r<  r.   r.   r/   rw   H  s   
zT5GemmaEncoderModel.forwardr#  )r+   r,   r-   r1   rR   r1  r5  r   r   r   r   r   r   r   r   r   r   rw   ri   r.   r.   rW   r/   rE  7  s.    	rE  c                %       sN  e Zd ZddgZddiZddgdgfiZdef fdd	Zd
d Zdd Z	dd Z
dd Zdd Zee														d*deej deej deej deej deej deej dee dee deej deej d eej d!ee d"eej d#eeejf d$ee d%eeej ef f d&d'Zd ejfd(d)Z  Z S )+T5GemmaForConditionalGenerationz!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_repr8   r   rq   c                    sJ   d|_ t | t|| _|jj| _t|jj| j| _	d| _
|   d S )NTForMaskedLM)r>   rQ   rR   r+  r   r=   rC   r   rN   lm_head	loss_typer  rp   rW   r.   r/   rR   a  s   

z(T5GemmaForConditionalGeneration.__init__c                 C   s   || j _d S rl   rI  r   r6  r.   r.   r/   set_output_embeddingsl  r8  z5T5GemmaForConditionalGeneration.set_output_embeddingsc                 C      | j jS rl   rK  r-  r.   r.   r/   get_output_embeddingso     z5T5GemmaForConditionalGeneration.get_output_embeddingsc                 C   s(   | j jr| | jj|    d S d S rl   )rq   rB   _tie_or_clone_weightsrI  r   get_decoderr1  r-  r.   r.   r/   _tie_weightsr  s   z,T5GemmaForConditionalGeneration._tie_weightsc                 C   rM  rl   )r   r<   r-  r.   r.   r/   r.  w  rO  z+T5GemmaForConditionalGeneration.get_encoderc                 C   rM  rl   )r   r=   r-  r.   r.   r/   rQ  z  rO  z+T5GemmaForConditionalGeneration.get_decoderNr   r6   r9   r   r9  r:  r;  r<  r3   r7   r=  labelsrG   r   logits_to_keeprU   r   c                 K   s  |dur|du r|
du r|  |}| jd|||||||||	|
||d|}|j}t|tr4t| dn|}| |dd|ddf }|  j}|j	dur]||j	 }t
|}||j	 }d}|duro| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r6   r9   r   r9  r:  r;  r<  r3   r7   r=  rG   r   )	lossr   r3   r?  r@  r'  rA  r   rB  r.   )r   r   r  rI   rh   slicerI  rQ  rq   final_logit_softcappingr   tanhloss_functionrC   r   r3   r?  r@  r'  rA  r   rB  )rT   r6   r9   r   r9  r:  r;  r<  r3   r7   r=  rS  rG   r   rT  rU   rC  r8   slice_indicesr   decoder_configrU  r.   r.   r/   rw   }  sP   





z'T5GemmaForConditionalGeneration.forwardc                 C   s
   |  |S rl   )r   )rT   rS  r.   r.   r/   %prepare_decoder_input_ids_from_labels  r2  zET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels)NNNNNNNNNNNNNr   )!r+   r,   r-   _tied_weights_keys_tp_plan_pp_planr1   rR   rL  rN  rR  r.  rQ  r   r   r   r   r   r   rD  r   r	   rf   r   rh   r   r   r   r   r   rw   r\  ri   r.   r.   rW   r/   rF  \  sx    	
KrF  c                          e Zd Zddedee f fddZdd Zdd	 Ze	e
										dd
eej deej deej deej deej deej dee deej deej deej dee defddZ  ZS ) T5GemmaForSequenceClassificationNrq   r>   c                    |   |dur||_ t | |j| _|j rt|| _nt|| _|jj}|j r*|j	j}t
|dd}t|| j|| _|   dS )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
        Nr@   皙?r>   rQ   rR   r   r+  r   rE  r<   rN   r=   rP   r   scorer  rT   rq   r>   rN   classifier_dropoutrW   r.   r/   rR     s   
z)T5GemmaForSequenceClassification.__init__c                 C   r/  rl   r   r1  r-  r.   r.   r/   r1    r2  z5T5GemmaForSequenceClassification.get_input_embeddingsc                 C      | j | d S rl   r   r5  rT   r_   r.   r.   r/   r5       z5T5GemmaForSequenceClassification.set_input_embeddingsr6   r9   r   r9  r:  r;  r<  r7   r=  rS  rU   r   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}|duru|jd }n|jd }| j jdu r|d	krtd
| j jdu rd}nE|dur|| j jk|jtj}tj|jd |jtjd}|| d}| j jr|d	7 }tj||jd d	 d}nd}t| jj d |tj||jd|f }d}|
dur| j||
|| j d}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r9   r   r9  r:  r;  r<  r7   r=  rG   r9   r   r7   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r   rS  pooled_logitsrq   rU  r   r8   r   )rq   r>   NotImplementedErrorrX   r+   r   r   r   r  r?  r@  r8   r   re  r   rE   r   ry   r   int32r  argmaxclamploggerwarning_oncerY  r   )rT   r6   r9   r   r9  r:  r;  r<  r7   r=  rS  rU   outputsr  r8   r   r   
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesrt  rU  r.   r.   r/   rw     s   


z(T5GemmaForSequenceClassification.forwardrl   
NNNNNNNNNN)r+   r,   r-   r1   r   rf   rR   r1  r5  r   r   r   r   r   r   r   r   r   r   rw   ri   r.   r.   rW   r/   ra    sR    	
ra  c                       r`  )T5GemmaForTokenClassificationNrq   r>   c                    rb  )z
        is_encoder_decoder (`Optional`, *optional*):
            Whether use encoder_decoder for token classification. When set to False, only encoder is used.
        Nr@   rc  rd  rf  rW   r.   r/   rR   ]  s   
z&T5GemmaForTokenClassification.__init__c                 C   r/  rl   rh  r-  r.   r.   r/   r1  u  r2  z2T5GemmaForTokenClassification.get_input_embeddingsc                 C   ri  rl   rj  rk  r.   r.   r/   r5  x  rl  z2T5GemmaForTokenClassification.set_input_embeddingsr6   r9   r   r9  r:  r;  r<  r7   r=  rS  rU   r   c                 K   s  | j jr|du r|durtd| jj d| j jr/|du r/|	du r/|du r*td| |}| j jrP| j|f||||||||	dd	|}|j}|j	}|j
}n| j|f|||d|}|j}|j}|j}| |}d}|
dury| ||
| j }t||||dS )	rm  Nrn  ro  rp  Frq  rr  ru  )rq   r>   rv  rX   r+   r   r   r   r  r?  r@  r8   r   re  rY  r   )rT   r6   r9   r   r9  r:  r;  r<  r7   r=  rS  rU   r|  r  r8   r   r   rU  r.   r.   r/   rw   {  sf   

z%T5GemmaForTokenClassification.forwardrl   r  )r+   r,   r-   r1   r   rf   rR   r1  r5  r   r   r   r   r   r   r   r   r   r   rw   ri   r.   r.   rW   r/   r  [  sR    	
r  )r1   r)   rF  r+  rE  r   ra  r  )Ttypingr   r   r   r   r   torch.nnrm   cache_utilsr   r   r	   configuration_utilsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r    r!   r"   r#   r$   r%   r&   r'   _CHECKPOINT_FOR_DOC
get_loggerr+   rz  r)   r1   rj   rk   rx   rz   r~   r   r   rh   r   r   r   Moduler   r   r   r   r   r   r%  r+  rE  rF  ra  r  __all__r.   r.   r.   r/   <module>   sp    (
 G4;2
]mR$r r