o
    ij                     @   sb  d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ee Z!eeddG dd deZ"eeddG dd deZ#G dd dej$Z%eG dd deZ&eddG dd de&Z'eddG d d! d!e&eZ(g d"Z)dS )#zPyTorch PaliGemmamodel.    )	dataclass)OptionalUnionN)nn   )CacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPast)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    )custom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )PaligemmaModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__ r!   r!   m/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   +   s   
 r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	PaliGemmaCausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r$   r   r   r   r    r%   r&   r   r'   tupler(   r   r!   r!   r!   r"   r#   ;   s   
 r#   c                       s*   e Zd Zdef fddZdd Z  ZS )PaliGemmaMultiModalProjectorconfigc                    s(   t    tj|jj|jjdd| _d S )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr+   	__class__r!   r"   r/   Z   s   
z%PaliGemmaMultiModalProjector.__init__c                 C   s   |  |}|S N)r4   )r6   image_featuresr'   r!   r!   r"   forward^   s   
z$PaliGemmaMultiModalProjector.forward)r   r   r   r   r/   r;   __classcell__r!   r!   r7   r"   r*   Y   s    r*   c                   @   sD   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZdZdd Zd	S )
PaliGemmaPreTrainedModelr+    Tr*   r&   Fc                 C   sV   t | jd| j j}t|tjr'|jjj	d|d |j
d ur)|j
j  d S d S d S )Ninitializer_range        )meanstd)getattrr+   get_text_configr?   
isinstancer   r0   weightdatanormal_r-   zero_)r6   modulerB   r!   r!   r"   _init_weightsr   s   
z&PaliGemmaPreTrainedModel._init_weightsN)r   r   r   r   r    base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendrK   r!   r!   r!   r"   r=   d   s   
 r=   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                #       sR  e Zd ZddiZdZdef fddZdd Zd	d
 Zdd Z	dd Z
					d)dee fddZdejfddZdejdejdejfddZee													d*deej deej deej deej dee deej deej deej d eej d!ee d"ee d#ee d$ee d%ee d&eeef fd'd(Z  ZS )+PaliGemmaModelzlanguage_model.modellanguage_modelFr+   c                    s~   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| jjd ur,| jjnd| _| j jp7| j| _|   d S )N)r+   )r.   r/   r   from_configr1   vision_towerr*   multi_modal_projectortext_config
vocab_sizerV   r+   pad_token_idrD   dtypetext_config_dtype	post_init)r6   r+   rV   r7   r!   r"   r/      s   

zPaliGemmaModel.__init__c                 C   
   | j  S r9   )rV   get_input_embeddingsr6   r!   r!   r"   rb         
z#PaliGemmaModel.get_input_embeddingsc                 C      | j | d S r9   )rV   set_input_embeddingsr6   valuer!   r!   r"   rf         z#PaliGemmaModel.set_input_embeddingsc                 C   s
   || _ d S r9   rV   r6   decoderr!   r!   r"   set_decoder   rd   zPaliGemmaModel.set_decoderc                 C   s   | j S r9   rj   rc   r!   r!   r"   get_decoder   s   zPaliGemmaModel.get_decoderNis_trainingc                 C   sl  | j jjdkr|d urd|v r|S d S |d ur|n| j}t|t}t| jj	}|d u r.|}|j
d d \}	}
|r>| }nt|tjrI|j
d n|d |
 d }|d ur]| dkr]|S tj|
|f|| j|jd}|
dkr|rxtj|dd	}n
d|d d d |
f< |tj||jd
|ddk9 }|d d d d d d f |	ddd}|d ur4| }|j
d }|r|d u rtd|d d d d d d d |f |d d d d d d f |jdkd|d d d d d d d |f< |d d d d d d d |f |d d d d d d f |j }|dk}|d d d d d d d |f |||d d d d d d d |f< |S )Nflash_attention_2r@   r   rW   r   r      
fill_valuer^   devicediagonalrt   z/Token type ids must be provided during training)r+   r[   _attn_implementationtrainingrE   r   r   finfor_   minshapeget_max_cache_shapeTensordimfullrt   triuarangereshapeexpandclone
ValueErrormasked_fillto)r6   attention_masktoken_type_idsr&   cache_positioninput_tensorro   using_static_cache	min_dtypeinputs_lead_dimsequence_lengthtarget_lengthcausal_maskmask_lengthpadding_maskr!   r!   r"   _update_causal_mask   sZ   	


 $

 $ @  z"PaliGemmaModel._update_causal_maskpixel_valuesc                 C   s0   |  |}|j}| |}|| jjjd  }|S )a  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        g      ?)rY   last_hidden_staterZ   r+   r[   r2   )r6   r   image_outputsselected_image_featurer:   r!   r!   r"   get_image_features   s
   


z!PaliGemmaModel.get_image_features	input_idsinputs_embedsr:   c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}|jd |jd  }||  | krPtd| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)r^   rt   rW   r   r   z6Image features and image tokens do not match: tokens: z, features )rb   r   tensorr+   image_token_idlongrt   allsum	unsqueeze	expand_asr   r|   numelr   )r6   r   r   r:   special_image_maskn_image_tokensn_image_featuresr!   r!   r"   get_placeholder_mask   s   z#PaliGemmaModel.get_placeholder_maskr   position_idsr&   r   r   labels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictkwargsreturnc                 K   s  |du |duA rt d|dur|n| jj}|dur|n| jj}|dur&|n| jj}|duo1|	du}|durL| jj| jkrL|| jjk}| }d||< n|}|du rX|  |}|du rt|durd|	 nd}t
j|||jd  |jd}|du r|dd }|dur| |}||j|j}| j|||d}|||}| ||||||}| jd
|||||
||d|d	|}t|j|j|j|j|dur|d	S dd	S )i  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rw   )r   r:   T)	r   r   r&   r   r   r   r   r   r   )r   r&   r'   r(   r   r!   )r   r+   r   r   use_return_dictr   r\   r   rb   get_seq_lengthr   r   r|   rt   r   r   r   r^   r   masked_scatterr   rV   r   r   r&   r'   r(   )r6   r   r   r   r   r&   r   r   r   r   r   r   r   r   r   ro   r   llm_input_idspast_seen_tokensr:   r   outputsr!   r!   r"   r;     sj   /



zPaliGemmaModel.forward)NNNNN)NNNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingaccepts_loss_kwargsr   r/   rb   rf   rm   rn   r   boolr   r   r   r   
LongTensorr   r   r   r~   r   r   r
   r   r)   r   r;   r<   r!   r!   r7   r"   rU   }   s    
E
	

rU   c                %       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zdd Z	dd Z
dd Zedd Zedd Zedd Zee														d7deej deej deej deej d ee d!eej d"eej d#eej d$eej d%ee d&ee d'ee d(ee d)eeejf d*ee d+eeef f d,d-Z								.		d8 fd/d0	Z e!dejd1ed2ed3ej"d"ejd4efd5d6Z#  Z$S )9!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr+   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr,   )r.   r/   rU   modelr   r0   r[   r2   r\   r   r`   r5   r7   r!   r"   r/     s   
z*PaliGemmaForConditionalGeneration.__init__c                 C   ra   r9   )r   rb   rc   r!   r!   r"   rb     rd   z6PaliGemmaForConditionalGeneration.get_input_embeddingsc                 C   re   r9   )r   rf   rg   r!   r!   r"   rf     ri   z6PaliGemmaForConditionalGeneration.set_input_embeddingsc                 C   re   r9   )r   rm   rk   r!   r!   r"   rm     ri   z-PaliGemmaForConditionalGeneration.set_decoderc                 C   ra   r9   )r   rn   rc   r!   r!   r"   rn     rd   z-PaliGemmaForConditionalGeneration.get_decoderc                 C   s   | j |S r9   )r   r   )r6   r   r!   r!   r"   r     s   z4PaliGemmaForConditionalGeneration.get_image_featuresc                 C      | j jS r9   )r   rV   rc   r!   r!   r"   rV        z0PaliGemmaForConditionalGeneration.language_modelc                 C   r   r9   )r   rY   rc   r!   r!   r"   rY     r   z.PaliGemmaForConditionalGeneration.vision_towerc                 C   r   r9   )r   rZ   rc   r!   r!   r"   rZ     r   z7PaliGemmaForConditionalGeneration.multi_modal_projectorNr   r   r   r   r   r&   r   r   r   r   r   r   r   r   logits_to_keepr   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||d|d|}|d }t|trCt| dn|}| |dd|ddf }d}|	durh| j	d||	| j j
jd|}t|||j|j|j|jdS )r   NT)r   r   r   r   r   r&   r   r   r   r   r   r   r   r   )r%   r   r\   )r$   r%   r&   r'   r(   r   r!   )r+   r   r   r   r   rE   intslicer   loss_functionr[   r\   r#   r&   r'   r(   r   )r6   r   r   r   r   r&   r   r   r   r   r   r   r   r   r   r   r   r'   slice_indicesr%   r$   r!   r!   r"   r;     sN   /z)PaliGemmaForConditionalGeneration.forwardTc                    s   t  j|f||||||	|
|d|}|dd ur"|d  d7  < |d dkr,||d< |d uo3|d u}t|to=t|j}|d dkr]|r]|d urL|n|}| j||||||}||d< |S )N)r&   r   r   r   r   r   r   r   r   r   r   r   r   )	r.   prepare_inputs_for_generationgetrE   r   any
is_slidingr   r   )r6   r   r&   r   r   r   r   r   r   r   r   r   r   model_inputsro   is_static_hybrid_cacher   r   r7   r!   r"   r   	  s6   
z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generationr   r   r^   
batch_sizec                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nrq   rr   r   ru   rw   rW   r   )r   r   rz   r{   r   rt   r   r   r   r   r   r|   r   r   )r   r   r   r^   r   r   r   r   r   r   r   r!   r!   r"   5_prepare_4d_causal_attention_mask_with_cache_position8  s,    $
6  zWPaliGemmaForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNNNNr   )
NNNNNNNTNN)%r   r   r   r   _tied_weights_keysr   r/   rb   rf   rm   rn   r   propertyrV   rY   rZ   r   r   r   r   r   r   r~   r   r   r   r   r   r   r)   r#   r;   r   staticmethodr^   r   r<   r!   r!   r7   r"   r     s    


	

[/r   )r   r=   rU   )*r   dataclassesr   typingr   r   r   r   cache_utilsr   r   
generationr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   autor   configuration_paligemmar   
get_loggerr   loggerr   r#   Moduler*   r=   rU   r   __all__r!   r!   r!   r"   <module>   sN   

 ~ n