o
    ei                    @   s  d dl Z d dlmZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZmZmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZB G dd dejCZDG dd dejCZEG dd dejCZFdd ZGed dZd!d"ZHd#ejId$eJd%ejIfd&d'ZK	(		d[d)ejCd*ejId+ejId,ejId-ejIdB d.eLd/eLdB d0eLdB d%eMejIejIf fd1d2ZNeeHG d3d4 d4ejCZOeeHG d5d6 d6ejCZPG d7d8 d8e ZQG d9d: d:e ZRG d;d< d<ejCZSG d=d> d>ejCZTG d?d@ d@ejCZUG dAdB dBejVZWe3G dCdD dDe.ZXd\dFeJd%efdGdHZYG dIdJ dJeXZZG dKdL dLeXZ[d-ejIdB d%efdMdNZ\G dOdP dPeXZ]e3G dQdR dReXZ^G dSdT dTeXeZ_e3G dUdV dVeXZ`e3G dWdX dXeXZag dYZbdS )]    N)Callable)Optional   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCacheStaticCache)GenerationConfigGenerationMixinGenerationMode)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPoolingSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel   )T5Gemma2ConfigT5Gemma2DecoderConfigT5Gemma2EncoderConfigT5Gemma2TextConfigc                       s@   e Zd Zddedef fddZdd Zdd	 Zd
d Z  Z	S )T5Gemma2RMSNormư>dimepsc                    s&   t    || _tt|| _d S N)super__init__r3   nn	Parametertorchzerosweight)selfr2   r3   	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/t5gemma2/modeling_t5gemma2.pyr6   8   s   
zT5Gemma2RMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr)   T)keepdim)r9   rsqrtpowmeanr3   )r<   xr?   r?   r@   _norm=   s   $zT5Gemma2RMSNorm._normc                 C   s*   |  | }|d| j   }||S )N      ?)rG   floatr;   type_as)r<   rF   outputr?   r?   r@   forward@   s   
zT5Gemma2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler;   shaper3   r<   r?   r?   r@   
extra_reprG   s   zT5Gemma2RMSNorm.extra_repr)r1   )
__name__
__module____qualname__intrI   r6   rG   rL   rP   __classcell__r?   r?   r=   r@   r0   7   s
    r0   c                       s*   e Zd Zdef fddZdd Z  ZS )T5Gemma2MLPconfigc                    s   t    || _|j| _|j| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _	t
|j | _t|j| _d S )NFbias)r5   r6   rW   hidden_sizeintermediate_sizer7   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr<   rW   r=   r?   r@   r6   L   s   
zT5Gemma2MLP.__init__c                 C   s2   |  | || | }| |}| |}|S r4   )ra   r]   r^   rd   r_   )r<   rF   hidden_statesr_   r?   r?   r@   rL   W   s   

zT5Gemma2MLP.forward)rQ   rR   rS   r/   r6   rL   rU   r?   r?   r=   r@   rV   K   s    rV   c                       s   e Zd ZU ejed< ddef fddZe				ddedB de	d de
dB d	edB d
edef f
ddZe edddZ  ZS )T5Gemma2RotaryEmbeddinginv_freqNrW   c                    s   t    |j| _|j| _|| _tt|j| _i | _	| jD ]P}| jj
| }|d u r+q|d | j	|< | j}| j	| dkrCt| j	|  }|| j||d\}}| j| d|dd | j| d| dd t| | d| qd S )	N	rope_typedefault
layer_type	_inv_freqF
persistent_original_inv_freq_attention_scaling)r5   r6   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrW   listsetlayer_typesri   rope_parameterscompute_default_rope_parametersr   register_bufferclonesetattr)r<   rW   devicerl   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingr=   r?   r@   r6   a   s&   

z T5Gemma2RotaryEmbedding.__init__r}   ztorch.deviceseq_lenrl   returnztorch.Tensorc                 C   s^   | j | d }t| ddp| j| j }d}d|tjd|dtjdj|tjd|   }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNrH   r   r)   dtyper}   r   )	rx   getattrrZ   num_attention_headsr9   arangeint64torI   )rW   r}   r   rl   baser2   attention_factorrh   r?   r?   r@   ry   x   s   &z7T5Gemma2RotaryEmbedding.compute_default_rope_parametersc                 C   s  t | | d}t | | d}|d d d d f  |jd dd|j}|d d d d d f  }t|jjtrE|jjdkrE|jjnd}t	|dd	) | |  
dd
}	tj|	|	fdd}
|
 | }|
 | }W d    n1 syw   Y  |j|jd|j|jdfS )Nrm   rq   r   rA   r+   mpscpuF)device_typeenabledr)   r2   r   )r   rI   expandrN   r   r}   
isinstancetypestrr%   	transposer9   catcossinr   )r<   rF   position_idsrl   rh   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   r?   r?   r@   rL      s   .&zT5Gemma2RotaryEmbedding.forwardr4   NNNN)rQ   rR   rS   r9   Tensor__annotations__r/   r6   staticmethodr   rT   r   rM   rI   ry   no_gradr   rL   rU   r?   r?   r=   r@   rg   ^   s,   
 

#rg   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrA   r)   r   )rN   r9   r   )rF   x1x2r?   r?   r@   rotate_half   s   r   rotary_pos_embc                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embedr?   r?   r@   apply_rotary_pos_emb   s
   

r   rf   n_repr   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r+   N)rN   r   reshape)rf   r   batchnum_key_value_headsslenr   r?   r?   r@   	repeat_kv   s
   0r           modulequerykeyvalueattention_maskrd   scalingsoftcapc                 K   s   |d u r	| j d }t|| j}	t|| j}
t||	dd| }|d ur2|| }t|}|| }|d ur:|| }tjj	|dtj
d|j}tjj||| jd}t||
}|dd }||fS )N      r)   r   rA   )r2   r   )ptrainingr+   )r   r   num_key_value_groupsr9   matmulr   tanhr7   
functionalsoftmaxfloat32r   r   rd   r   
contiguous)r   r   r   r   r   rd   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputr?   r?   r@   eager_attention_forward   s    

r   c                       s   e Zd ZdZdedef fddZ				ddejdejd	ejdB d
e	dB dej
dB dee deejejdB eej dB f fddZ  ZS )T5Gemma2SelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrW   	layer_idxc                    <  t    t|dr|j| nd | _|| _|| _t|d|j|j	 | _
|j	|j | _|jd | _| jj| _d| _tj|j|j	| j
 |jd| _tj|j|j| j
 |jd| _tj|j|j| j
 |jd| _tj|j	| j
 |j|jd| _| jj| _| jdkr|jnd | _| jdk| _t|j
|jd| _t|j
|jd| _d S Nrw   r   r   FrX   sliding_attention)r2   r3   r5   r6   hasattrrw   rl   rW   r   r   rZ   r   r   r   r   query_pre_attn_scalarr   attention_dropout	is_causalr7   r\   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingr0   rms_norm_epsq_normk_normr<   rW   r   r=   r?   r@   r6     4   


zT5Gemma2SelfAttention.__init__Nrf   position_embeddingsr   past_key_valuescache_positionr   r   c                 K   s0  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}| |	}	| |
}
|\}}t	|	|
||\}	}
|d ura|||d}|
|
|| j|\}
}t| jjt}|| |	|
||f| jrv| jnd| j| jd|\}}|jg |dR   }| |}||fS )NrA   r+   r)   r   r   r   r   )rd   r   r   )rN   r   r   viewr   r   r   r   r   r   updater   r   get_interfacerW   _attn_implementationr   r   r   r   r   r   r   r   )r<   rf   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   r?   r?   r@   rL      s>   	

	

zT5Gemma2SelfAttention.forwardr   )rQ   rR   rS   __doc__r/   rT   r6   r9   r   r   
LongTensorr    r!   rM   rL   rU   r?   r?   r=   r@   r      s,    !r   c                       s   e Zd ZdZdedef fddZ		ddejde	ejejf d	ejdB d
ejde
dB dejdB dee de	ejejdB e	ej dB f fddZ  ZS )T5Gemma2MergedAttentionz6Merged self-attention and cross-attention for decoder.rW   r   c                    r   r   r   r   r=   r?   r@   r6   T  r   z T5Gemma2MergedAttention.__init__Nrf   r   merged_attention_maskencoder_hidden_statesr   r   r   r   c                 K   s^  |j d d }g |d| jR }	|j d d }
g |
d| jR }| ||	dd}| ||	dd}| ||	dd}| |}| |}|\}}t	||||\}}|d ur|||d}|j
}|||| j|\}}|j| j}|j}|d u s|s| ||dd}| ||dd}| |}|d ur|||| j\}}d|j| j< n|j| j j}|j| j j}|}|
d }tj||gdd}tj||gdd}t| jjt}|| ||||f| jr| jnd| jd|\}}|jg |dR   }| |}|d ur&|d	d | f }|d	| d f }nd
\}}|||fS )NrA   r+   r)   r   Tr   r   )rd   r   .NN) rN   r   r   r   r   r   r   r   r   r   self_attention_cacher   r   
is_updatedgetcross_attention_cachelayerskeysvaluesr9   r   r   r   rW   r   r   r   r   r   r   r   r   )r<   rf   r   r   r   r   r   r   r   r   cross_input_shapecross_hidden_shaper   r   r   r   r   r   r   r   r   cross_key_statescross_value_statescross_key_sizer   r   r   self_attn_weightscross_attn_weightsr?   r?   r@   rL   r  sr   







zT5Gemma2MergedAttention.forwardr   )rQ   rR   rS   r   r/   rT   r6   r9   r   rM   r	   r   r    r   rL   rU   r?   r?   r=   r@   r   P  s,    '	
r   c                       sr   e Zd ZdZdef fddZ			ddejdeejejf dB dejdB d	ej	dB d
eej
f f
ddZ  ZS )T5Gemma2EncoderLayerzEncoder sub-layer.r   c                       t    |j| _|| _|| _|j| | _t||d| _t	|j|j
d| _t	|j|j
d| _t|| _t	|j|j
d| _t	|j|j
d| _t|j| _d S N)rW   r   r3   )r5   r6   rZ   rW   r   rw   attention_typer   	self_attnr0   r   pre_self_attn_layernormpost_self_attn_layernormrV   mlppre_feedforward_layernormpost_feedforward_layernormr7   rb   rc   rd   r   r=   r?   r@   r6     s   

zT5Gemma2EncoderLayer.__init__Nrf   r   r   r   r   c                 K   sz   |}|  |}| jd||||d d|\}}| |}|| | }|}| |}| |}| |}|| | }|S )N)rf   r   r   r   r   r?   r  r  r  rd   r  r  r  )r<   rf   r   r   r   r   residual_r?   r?   r@   rL     s&   





zT5Gemma2EncoderLayer.forward)NNN)rQ   rR   rS   r   rT   r6   r9   r   rM   r   FloatTensorrL   rU   r?   r?   r=   r@   r	    s"    
r	  c                       s   e Zd ZdZdef fddZ						ddejdeejejf d	ejdB d
ej	dB de
dB dedB dej	dB dejdB dejfddZ  ZS )T5Gemma2DecoderLayerzFDecoder sub-layer: merged attention instead of vanilla self-attention.r   c                    r
  r  )r5   r6   rZ   rW   r   rw   r  r   r  r0   r   r  r  rV   r  r  r  r7   rb   rc   rd   r   r=   r?   r@   r6     s   

zT5Gemma2DecoderLayer.__init__NFrf   r   r   r   r   	use_cacher   r   r   c	                 K   s   |}
|  |}| jd||||||||d|	\}}}| |}|
| | }|}
| |}| |}| |}|
| | }|S )N)rf   r   r   r   r   r  r   r   r?   r  )r<   rf   r   r   r   r   r  r   r   r   r  r  r?   r?   r@   rL     s,   
	



zT5Gemma2DecoderLayer.forward)NNNFNN)rQ   rR   rS   r   rT   r6   r9   r   rM   r   r	   boolr  rL   rU   r?   r?   r=   r@   r    s8    	r  c                       F   e Zd ZdZddededef fddZdejd	ejfd
dZ	  Z
S )T5Gemma2LMHeadz.Head for language modeling (generation) tasks.FrZ   
vocab_sizerY   c                    s    t    tj|||d| _d S )NrX   )r5   r6   r7   r\   out_proj)r<   rZ   r  rY   r=   r?   r@   r6   B  s   
zT5Gemma2LMHead.__init__rf   r   c                 C   s   |  |}|S r4   )r  )r<   rf   logitsr?   r?   r@   rL   F  s   
zT5Gemma2LMHead.forward)F)rQ   rR   rS   r   rT   r  r6   r9   r   rL   rU   r?   r?   r=   r@   r  ?  s    r  c                       r  )T5Gemma2ClassificationHeadz-Head for sentence-level classification tasks.r   rZ   
num_labelsclassifier_dropout_ratec                    s*   t    tj|d| _t||| _d S )N)r   )r5   r6   r7   rb   rd   r\   r  )r<   rZ   r!  r"  r=   r?   r@   r6   N  s   
z#T5Gemma2ClassificationHead.__init__rf   r   c                 C   s   |  |}| |}|S r4   )rd   r  )r<   rf   r?   r?   r@   rL   S  s   

z"T5Gemma2ClassificationHead.forward)r   rQ   rR   rS   r   rT   rI   r6   r9   r   rL   rU   r?   r?   r=   r@   r   K  s    r   c                       s2   e Zd Zdef fddZdejfddZ  ZS )T5Gemma2MultiModalProjectorrW   c                    s   t    tt|jj|jj| _	t
|jj|jjd| _t|jj|jj | _t|jd | _| j| j | _tj| j| jd| _d S )Nr        ?)kernel_sizestride)r5   r6   r7   r8   r9   r:   vision_configrZ   text_configmm_input_projection_weightr0   layer_norm_epsmm_soft_emb_normrT   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider&  	AvgPool2davg_poolre   r=   r?   r@   r6   Z  s   
z$T5Gemma2MultiModalProjector.__init__vision_outputsc           	      C   sv   |j \}}}|dd}|||| j| j}| }| |}|d}|dd}| |}t	|| j
}||S )Nr+   r)   )rN   r   r   r/  r   r3  flattenr,  r9   r   r*  rJ   )	r<   r4  
batch_sizer  rZ   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputsr?   r?   r@   rL   j  s   



z#T5Gemma2MultiModalProjector.forward)	rQ   rR   rS   r.   r6   r9   r   rL   rU   r?   r?   r=   r@   r$  Y  s    r$  c                       sP   e Zd ZdZ		ddededededef
 fd	d
Zdejf fddZ	  Z
S )T5Gemma2TextScaledWordEmbeddingzCT5Gemma2 Embedding: override to add eoi token embedding separately.rH     num_embeddingsembedding_dimpadding_idxembed_scaleeoi_token_indexc                    sJ   t  ||| || _| jdt|dd || _tt	| j
| _d S )Nr@  Frn   )r5   r6   scalar_embed_scalerz   r9   tensorrA  r7   r8   r:   r>  eoi_embedding)r<   r=  r>  r?  r@  rA  r=   r?   r@   r6     s
   z(T5Gemma2TextScaledWordEmbedding.__init__	input_idsc                    s8   t  || j| jj }| j|j||| jk< |S r4   )r5   rL   r@  r   r;   r   rD  rA  )r<   rE  input_embeddingsr=   r?   r@   rL     s   z'T5Gemma2TextScaledWordEmbedding.forward)rH   r<  r#  r?   r?   r=   r@   r;  }  s     r;  c                       s   e Zd ZU eed< dZdZg dZdgZdZ	dZ
dZdZdZeegeeddd	eeddd	eed
dd	gdZdZe  fddZdd Z  ZS )T5Gemma2PreTrainedModelrW   modelT)r	  r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr   Fr+   r  )index
layer_namer)   
cross_attn)rf   
attentions)imagetextc                    sX  t  | t|trt|j d S t|tr(t|j t	|j
|j d S t|tr]|jjjd d }tj|jjd| jj| d t|jdrY|jjd ur[t|jj d S d S d S d|jjv rkt|j d S t|tr|jD ]6}|j}|j| dkrt|j|  }||j|d\}}tt|| d	| tt|| d
| qsd S d S )Nr   r   r   )rE   stdrY   RMSNormrj   rk   rm   rp   )r5   _init_weightsr   r$  initzeros_r*  r;  rD  	constant_r@  rB  r   r  r;   rN   normal_rW   initializer_ranger   rY   r>   rQ   rg   rw   ry   ri   r   copy_r   )r<   r   scalerl   r   r   r  r=   r?   r@   rT    s0   




z%T5Gemma2PreTrainedModel._init_weightsc                 C   s|   | j j}|j}|j}|du rtd||j}|dddf  |dddf< ||d< |du r4td||dk| |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .rA   r+   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	rW   decoderbos_token_idpad_token_id
ValueError	new_zerosrN   r{   masked_fill_)r<   rE  decoder_configdecoder_start_token_idr^  shifted_input_idsr?   r?   r@   %prepare_decoder_input_ids_from_labels  s    z=T5Gemma2PreTrainedModel.prepare_decoder_input_ids_from_labels)rQ   rR   rS   r,   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr	  r  r'   r   r   _can_record_outputsinput_modalitiesr9   r   rT  re  rU   r?   r?   r=   r@   rG    s*   
 rG  Tr   c              
      s(   dt dt dt dt dtf
 fdd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c           	         s\    rd}}nd d d d }}|| }|dk||k @ }|dk | |k @ }||B S )Nr   r+   r)   r?   )	rq  rr  rs  rt  left_window_sizeright_window_sizedist	left_mask
right_maskr   r   r?   r@   
inner_mask  s   z0sliding_window_mask_function.<locals>.inner_maskrT   r  )r   r   r{  r?   rz  r@   sliding_window_mask_function  s   $r}  c                       s   e Zd ZU eed< eedZ	ddedef fddZ	e
ee					ddejdB d	ejdB d
ejdB dejdB dejdB dee defddZ  ZS )T5Gemma2TextEncoderrW   )rO  rf   r<  rA  c                    s   t     j| _ j| _t j j| j jd |d| _t j j	d| _
d| _t fddt jD | _t j| _t | _|   d S )Nr%  r@  rA  r  Fc                       g | ]}t  |qS r?   )r	  .0r   rW   r?   r@   
<listcomp>      z0T5Gemma2TextEncoder.__init__.<locals>.<listcomp>r5   r6   r^  r?  r  r;  rZ   embed_tokensr0   r   normgradient_checkpointingr7   
ModuleListrangenum_hidden_layersr   rb   rc   rd   rg   
rotary_emb	post_initr<   rW   rA  r=   r  r@   r6     s$   
zT5Gemma2TextEncoder.__init__NrE  r   r   inputs_embedstoken_type_idsr   r   c              	   K   s4  |d u |d uA rt d|dd  |d u r| |}|d u r.tjd|jd |jdd}t| }t	sU| j
||d}tdi |tdi |dt| j
jdd	id
}|}	i }
| j
jD ]}| |	|||
|< q]| |	}	| jd | j
j D ]}||	|
|j ||j |fi |}	qw| |	}	| |	}	t|	dS )N:You must specify exactly one of input_ids or inputs_embedsr   r   r+   r}   )rW   r  r   and_mask_functionF)r   full_attentionr   )last_hidden_stater?   )r_  popr  r9   r   rN   r}   r   r   dictrW   r   r}  r   rw   r  rd   r   r  r  r  r   )r<   rE  r   r   r  r  r   self_attn_mask_mappingmask_kwargsrf   r   rl   layer_moduler?   r?   r@   rL   !  sL   
	


zT5Gemma2TextEncoder.forwardr<  )NNNNN)rQ   rR   rS   r/   r   r   r	  ro  rT   r6   r&   r(   r"   r9   r   r   r  r    r!   r   rL   rU   r?   r?   r=   r@   r~    sD   
 	r~  c                       s   e Zd ZU eed< 	ddedef fddZdd Zdd	 Ze	e
d
ejdee deeB fddZdejdB dejdB dejfddZe
						ddejdB dejdB dejdB dejdB d
ejdB dejdB dee defddZ  ZS )T5Gemma2EncoderrW   r<  rA  c                    sD   t  | tj|j|d| _tj|jd| _	t
|| _|   d S )N)rA  r  )r5   r6   r~  _from_configr)  
text_modelr*   from_configr(  vision_towerr$  multi_modal_projectorr  r  r=   r?   r@   r6   f  s
   
zT5Gemma2Encoder.__init__c                 C   
   | j  S r4   )r  get_input_embeddingsrO   r?   r?   r@   r  t     
z$T5Gemma2Encoder.get_input_embeddingsc                 C      | j |S r4   )r  set_input_embeddingsr<   new_embeddingsr?   r?   r@   r  w     z$T5Gemma2Encoder.set_input_embeddingspixel_valuesr   r   c                 K   s0   | j d|dd|}|j}| |}||_|S )NT)r  return_dictr?   )r  r  r  pooler_output)r<   r  r   r4  r  image_featuresr?   r?   r@   get_image_featuresz  s
   
z"T5Gemma2Encoder.get_image_featuresrE  Nr  r  c                 C   s   | j j}|du r&|du rtd||  tj|tj|jdk}|d}n||k}|	 }|
d||j}|jd |jd  }t||  | kd| d|  |S )	z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nz9Either `input_ids` or `inputs_embeds` has to be provided.)r   r}   rA   r   r+   z6Image features and image tokens do not match: tokens: z, features )rW   image_token_idr_  r  r9   rC  longr}   allsumr   	expand_asr   rN   r$   numel)r<   rE  r  r  r  special_image_maskn_image_tokensn_image_featuresr?   r?   r@   get_image_placeholder_mask  s"   
z*T5Gemma2Encoder.get_image_placeholder_maskr   r   r  c                 K   s   |d u |d uA rt d|d u r| j|}|d ur8| j|ddj}||j|j}| j|||d}	|	|	|}| jd|||d|}
|
S )Nr  T)r  )r  r  )r  r   r   r?   )
r_  r  r  r  r  r   r}   r   r  masked_scatter)r<   rE  r   r   r  r  r  r   r  
image_maskoutputsr?   r?   r@   rL     s&   zT5Gemma2Encoder.forwardr  )NNNNNN)rQ   rR   rS   r.   r   rT   r6   r  r  r#   r"   r9   r   r    r!   rM   r   r  r   r  r  r   rL   rU   r?   r?   r=   r@   r  c  sd   
 
	
r  c              
      s&   dt dt dt dt dtf
 fdd}|S )z4
    This creates bidirectional attention mask.
    rq  rr  rs  rt  r   c                    s,    d u rt jdt jdS  | |f t jS )Nr?   r   )r9   onesr  r   )rq  rr  rs  rt  r   r?   r@   r{    s   z/bidirectional_mask_function.<locals>.inner_maskr|  )r   r{  r?   r  r@   bidirectional_mask_function  s   "r  c                       s   e Zd ZU eed< eeddeeddedZddede	f fdd	Z
eee	
	
	
	
	
	
	
	
	
ddejd
B dejd
B dejd
B ded
B dejd
B ded
B dejd
B dejd
B dejd
B dee defddZ  ZS )T5Gemma2DecoderrW   r+   )rL  r)   )rO  cross_attentionsrf   r<  rA  c                    s   t     j| _ j| _t j j j jd |d| _t j j	d| _
d| _t fddt jD | _t j| _t | _|   d S )Nr%  r  r  Fc                    r  r?   )r  r  r  r?   r@   r    r  z,T5Gemma2Decoder.__init__.<locals>.<listcomp>r  r  r=   r  r@   r6     s$   
zT5Gemma2Decoder.__init__NrE  r   r   r   r  r  r   r   encoder_attention_maskr   r   c
              
   K   s  |d u |d uA rt d|d u rt d|d u r| |}| js0|r0|d u r0tt| jdt }|d u rL|d ur<| nd}tj|||j	d  |j
d}|d u rU|d}t| }ts| j||||d urh|jnd |d}dd	 |d
< tdi |tdi |d}t|	 }ts| j||	|d d d}dtdi |dt|	ii}tj|d |d gddtj|d |d gddd}|}i }| jjD ]}| |||||< q| |}| jd | jj D ]}||||j ||j |||||fi |
}q| |}| |}t||dS )Nr  z0`encoder_hidden_states` must be given in decoderr  r   r+   r  )rW   r  r   r   r   r   c                  W   s   t jdt jdS )NTr   )r9   rC  r  )argsr?   r?   r@   <lambda>%  s    z)T5Gemma2Decoder.forward.<locals>.<lambda>r  r  r  or_mask_functionrA   r   r   )r  r   r?   )r_  r  r   r	   r   rW   get_seq_lengthr9   r   rN   r}   r   r   r  r   r   r   r  r   rw   r  rd   r   r  r  r  r   )r<   rE  r   r   r   r  r  r   r   r  r   past_seen_tokensr  r  cross_attn_mask_mappingmerged_attn_mask_mappingrf   r   rl   r  r?   r?   r@   rL     s   


	

	

zT5Gemma2Decoder.forwardr  )	NNNNNNNNN)rQ   rR   rS   r-   r   r'   r   r  ro  rT   r6   r&   r(   r"   r9   r   r   r	   r  r  r    r!   r   rL   rU   r?   r?   r=   r@   r    sT   
 

	
r  c                !       s  e Zd ZdddZdef fddZdd Zd	d
 Zdd Zdd Z	e
e													d!dejdB dejdB dejdB dejdB dejdB dejdB dejdB dedB dedB dejdB dejdB dedB dejdB dee defdd Z  ZS )"T5Gemma2Modelz&encoder.text_model.embed_tokens.weightz-encoder.text_model.embed_tokens.eoi_embedding)zdecoder.embed_tokens.weightz"decoder.embed_tokens.eoi_embeddingrW   c                    s8   t  | t|j|j| _t|j|j| _|   d S r4   )r5   r6   r  encoderrA  r  r\  r  re   r=   r?   r@   r6   k  s   zT5Gemma2Model.__init__c                 C      | j S r4   )r  rO   r?   r?   r@   get_encodert     zT5Gemma2Model.get_encoderc                 C   r  r4   r\  rO   r?   r?   r@   get_decoderw  r  zT5Gemma2Model.get_decoderc                 C   r  r4   )r  r  rO   r?   r?   r@   r  z  r  z"T5Gemma2Model.get_input_embeddingsc                 C   r  r4   )r  r  r  r?   r?   r@   r  }  r  z"T5Gemma2Model.set_input_embeddingsNrE  r  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   r  decoder_inputs_embedsr  r   r   r   c                 K   sz   |du r| j d||||
|dd|}|j}| jd|||||	||||dd
|}t|j|j|j|j|j|j|j|jdS )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        NT)rE  r   r   r  r  r  )
rE  r   r   r  r   r   r  r  r   r  )r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsr?   )r  r  r\  r   r   rf   rO  r  )r<   rE  r  r   r   r  r  r  r  r   r  r  r  r   r   r   decoder_outputsr?   r?   r@   rL     sH   
zT5Gemma2Model.forward)NNNNNNNNNNNNN)rQ   rR   rS   _tied_weights_keysr,   r6   r  r  r  r  r#   r"   r9   r   r  
BoolTensorr   r	   r   r  r    r!   r   rL   rU   r?   r?   r=   r@   r  d  sn    		
r  c                &       s  e Zd ZddiZddiZddgdgfiZdef fdd	Zd
d Zdd Z	dd Z
dd Zdd Zdd Zeedejdee deeB fddZedd Zee															d6dejdB dejdB d ejdB d!ejdB d"ejdB d#ejdB d$ejdB d%edB d&edB d'ejdB d(ejdB d)ejdB d*edB d+ejdB d,eejB dee deej e B f"d-d.Z!d/e"d0e#d1e$d2ed3edef fd4d5Z%  Z&S )7 T5Gemma2ForConditionalGenerationzlm_head.out_proj.weightz,model.encoder.text_model.embed_tokens.weightzlm_head.out_projcolwise_gather_outputrf   r  rW   c                    sD   t  | t|| _|jj| _t|jj| j| _d| _	| 
  d S )NForMaskedLM)r5   r6   r  rH  r\  r  r  rZ   lm_head	loss_typer  re   r=   r?   r@   r6     s   

z)T5Gemma2ForConditionalGeneration.__init__c                 C   s   || j _d S r4   r  r  r  r?   r?   r@   set_output_embeddings  r  z6T5Gemma2ForConditionalGeneration.set_output_embeddingsc                 C   s   | j jS r4   r  rO   r?   r?   r@   get_output_embeddings  s   z6T5Gemma2ForConditionalGeneration.get_output_embeddingsc                 C   r  r4   rH  r  rO   r?   r?   r@   r    r  z5T5Gemma2ForConditionalGeneration.get_input_embeddingsc                 C      | j | d S r4   rH  r  r<   r   r?   r?   r@   r       z5T5Gemma2ForConditionalGeneration.set_input_embeddingsc                 C   r  r4   )rH  r  rO   r?   r?   r@   r    r  z,T5Gemma2ForConditionalGeneration.get_encoderc                 C   r  r4   )rH  r  rO   r?   r?   r@   r    r  z,T5Gemma2ForConditionalGeneration.get_decoderr  r   r   c                 K   s   |   j|fi |S r4   )r  r  )r<   r  r   r?   r?   r@   r    s   z3T5Gemma2ForConditionalGeneration.get_image_featuresc                 C   s
   |   jS r4   )r  r  rO   r?   r?   r@   r    s   
z-T5Gemma2ForConditionalGeneration.vision_towerNr   rE  r   r   r  r  r  r  r   r  r  labelsr  r   logits_to_keepc                 K   s  |dur|du r|du r|  |}| jd|||||||||	|
|||d|}|j}t|tr5t| dn|}| |dd|ddf }| jj}|j	dur]||j	 }t
|}||j	 }d}|duro| j||| jfi |}t|||j|j|j|j|j|j|jd	S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)rE  r  r   r   r  r  r  r  r   r  r  r  r   )	lossr  r   r  r  r  r  r   r  r?   )re  rH  r  r   rT   slicer  rW   r\  final_logit_softcappingr9   r   loss_functionr  r   r   r  r  r  r  r   r  )r<   rE  r  r   r   r  r  r  r  r   r  r  r  r  r   r  r   r  rf   slice_indicesr  rb  r  r?   r?   r@   rL     sR   "




z(T5Gemma2ForConditionalGeneration.forwardgeneration_configmodel_kwargsgeneration_moder6  max_cache_lengthc                    sJ  t  ||||| |jdu rdS |j}|du rd}nd|jv }t| jjdd}|`|`	||d}	|
d}
|
durqt|
tsEtdt|
jd	krT|
j
d	rTdS t|
j}|tkrh|d
 d	 jd |	d< |di |	|
_nttdi | jjdd|dt |d< t| dr| jdurt| jtstd|d | _dS dS dS )zMOverride cache preparation to support T5Gemma2-specific EncoderDecoder Cache.FN	offloadedTr  )rW   
offloadingr   zaThe `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model.r   r  r+   max_cache_len_cachezLThe internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.r?   )r5   _prepare_cache_for_generationr  cache_implementationcopydeepcopyrW   get_text_configr   rw   r   r   r	   r_  lenr   r   r   r
   rN   r   r   r  )r<   r  r  r  r6  r  r  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr=   r?   r@   r  E  sX   
	




z>T5Gemma2ForConditionalGeneration._prepare_cache_for_generation)NNNNNNNNNNNNNNr   )'rQ   rR   rS   r  _tp_plan_pp_planr,   r6   r  r  r  r  r  r  r#   r"   r9   r   r    r!   rM   r   r  propertyr  r   r  r  r   r	   r  rT   r   rL   r   r  r   r  rU   r?   r?   r=   r@   r    s    

	
Qr  c                          e Zd Zdef fddZdd Zdd Zee											dd	e	j
dB d
e	jdB de	jdB de	j
dB de	j
dB de	jdB de	j
dB dedB de	jdB de	jdB de	j
dB dee defddZ  ZS )!T5Gemma2ForSequenceClassificationrW   c                    R   t  | |j| _|jj| _t|| _t|dd}t| j| j|| _	| 
  d S Nr"  g?r5   r6   r!  r\  rZ   r  rH  r   r   scorer  r<   rW   classifier_dropoutr=   r?   r@   r6     s   

z*T5Gemma2ForSequenceClassification.__init__c                 C   r  r4   r  rO   r?   r?   r@   r    r  z6T5Gemma2ForSequenceClassification.get_input_embeddingsc                 C   r  r4   r  r  r?   r?   r@   r    r  z6T5Gemma2ForSequenceClassification.set_input_embeddingsNrE  r  r   r   r  r  r  r  r  r  r  r   r   c                 K   s4  |	dus|
durt d| jj d|du rtd|du r#| |}| j|f||||||||	|
dd
|}|j}|j}|j}| 	|}|j
d }|| jjk|jtj}tj|j
d |jtjd	}|| d}tj||j
d d
 d}|tj||jd|f }d}|dur| j|||| jd}t||||dS )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for .You have to specify input_idsF
r  r   r   r  r  r  r  r  r  r  r   rA   r   r+   )maxr  )r  r  pooled_logitsrW   r  r  rf   rO  )NotImplementedErrorr>   rQ   r_  re  rH  r  r  r  r  rN   rW   r^  r   r}   r9   int32r   argmaxclampr  r   )r<   rE  r  r   r   r  r  r  r  r  r  r  r   r  r  rf   rO  r  r6  non_pad_masktoken_indiceslast_non_pad_tokenr  r  r?   r?   r@   rL     sV   


z)T5Gemma2ForSequenceClassification.forwardNNNNNNNNNNN)rQ   rR   rS   r,   r6   r  r  r#   r"   r9   r   r  r   r   r    r!   r   rL   rU   r?   r?   r=   r@   r    sX    	
r  c                       r  )T5Gemma2ForTokenClassificationrW   c                    r	  r
  r  r  r=   r?   r@   r6     s   

z'T5Gemma2ForTokenClassification.__init__c                 C   r  r4   r  rO   r?   r?   r@   r    r  z3T5Gemma2ForTokenClassification.get_input_embeddingsc                 C   r  r4   r  r  r?   r?   r@   r    r  z3T5Gemma2ForTokenClassification.set_input_embeddingsNrE  r  r   r   r  r  r  r  r  r  r  r   r   c                 K   s   |	dus|
durt d| jj d|du rtd|du r#| |}| j|f||||||||	|
dd
|}|j}|j}|j}| 	|}d}|durS| 
||| j}t||||dS )r  Nr  r  r  Fr  r  )r  r>   rQ   r_  re  rH  r  r  r  r  r  rW   r   )r<   rE  r  r   r   r  r  r  r  r  r  r  r   r  r  rf   rO  r  r  r?   r?   r@   rL     sJ   

z&T5Gemma2ForTokenClassification.forwardr  )rQ   rR   rS   r,   r6   r  r  r#   r"   r9   r   r  r   r   r    r!   r   rL   rU   r?   r?   r=   r@   r    sX    	
r  )r  r  r  rG  r  r  )r+   )r   NN)T)cr  collections.abcr   typingr   r9   torch.nnr7    r   rU  activationsr   cache_utilsr   r   r	   r
   
generationr   r   r   integrationsr   r   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr    utilsr!   r"   r#   r$   utils.genericr%   r&   utils.output_capturingr'   r(   autor*   configuration_t5gemma2r,   r-   r.   r/   Moduler0   rV   rg   r   r   r   rT   r   rI   rM   r   r   r   r	  r  r  r   r$  	Embeddingr;  rG  r}  r~  r  r  r  r  r  r  r  __all__r?   r?   r?   r@   <module>   s   $	Q

"Q}4=$Veh _ NaX