o
    eij                     @   s"  d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ e%e&Z'eeddG dd deZ(eeddG dd deZ)G dd dej*Z+dej,dB dej,dB d edB fd!d"Z-e d#d$d%d&			'	d:d(e
d%ej,d)ej,dB d*ej,d+edB d,ej,dB dej,dB d-ej.dB d.e/dB d/e/dB d e0fd0d1Z1eG d2d3 d3eZ2ed4dG d5d6 d6e2Z3ed4dG d7d8 d8e2eZ4g d9Z5dS );zPyTorch PaliGemmamodel.    )Callable)	dataclassN)nn   )Cache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    custom_introc                   @   s$   e Zd ZU dZdZejdB ed< dS )PaligemmaModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__ r%   r%   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   .   s   
 r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	PaliGemmaCausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r    r!   r(   r"   r#   r$   r)   r*   r   r+   tupler,   r   r%   r%   r%   r&   r'   >   s   
 r'   c                       s*   e Zd Zdef fddZdd Z  ZS )PaliGemmaMultiModalProjectorconfigc                    s(   t    tj|jj|jjdd| _d S )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr/   	__class__r%   r&   r3   ]   s   
z%PaliGemmaMultiModalProjector.__init__c                 C   s   |  |}|S N)r8   )r:   image_featuresr+   r%   r%   r&   forwarda   s   
z$PaliGemmaMultiModalProjector.forward)r   r   r    r   r3   r?   __classcell__r%   r%   r;   r&   r.   \   s    r.   token_type_idsimage_group_idsreturnc              
      s4   du rdS dt dt dt dt dtf
 fdd}|S )	z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxrC   c                    s   t |jd k |d}t |jd k |d}| |f }t |jd k |d}| |f }t |jd k |d} | |f }t | jd k |d} | |f }	t | jd k |	d}	|dk|dk@ }
||	k}|
|@ S )Nr   r   )r"   whereshape)rD   rE   rF   rG   
safe_q_idxsafe_kv_idxtoken_type_ids_at_q_idxtoken_type_ids_at_kv_idximage_group_ids_at_q_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockrB   rA   r%   r&   
inner_masks   s   z0token_type_ids_mask_function.<locals>.inner_mask)intbool)rA   rB   rT   r%   rS   r&   token_type_ids_mask_functiong   s   	$rW   input_embeds5.6.0inputs_embedsversionnew_nameFr/   attention_maskcache_positionr*   position_idspixel_valuesis_trainingis_first_iterationc
                 K   s*  |r
|du r
t d|  |||||d}|	r|	n|du p$|j p$|du}	|	s-|
ddsI|dur6d| }ntd t|dddddf }|dur|	r|dk|j	}t
jj|d	dd
ddddf }|| @ }tj| ddd }t||t|d}t||j	||d< tdi |S )a"  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Paligemma uses a bidirectional mask on the prompt tokens.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when training)r/   rZ   r^   r_   r*   r`   	use_cacheTr   zIt is a prefill stage but The `token_type_ids` is not provided. We recommend passing `token_type_ids` to the model to prevent bad attention masking.r   )r   r   )valuerH   )dimor_mask_functionr%   )
ValueErrorget_text_configis_initializedgetloggerwarning_oncer"   	ones_liketodevicer   
functionalpadcumsumrU   rI   	full_likerW   r	   )r/   rZ   r^   r_   r*   r`   rA   ra   rb   rc   kwargsmask_kwargsis_imageis_previous_imagenew_image_startrB   r%   r%   r&   create_causal_mask_mapping   s<   
"
rz   c                   @   s@   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdZdZdZdS )	PaliGemmaPreTrainedModelr/   model)imagetextTr.   r*   FN)r   r   r    r   r$   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr%   r%   r%   r&   r{      s   
 r{   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                "       s>  e Zd ZddiZdZdef fddZdd Zd	d
 Ze	e
dddejdee deeB fddZdejdejdejfddZe	e
													d$dejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dejdB dedB dedB d edB d!edB dee deeB fd"d#Z  ZS )%PaliGemmaModelzlanguage_model.modellanguage_modelFr/   c                    sd   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| j jp*| j| _|   d S )N)r/   )r2   r3   r   from_configr5   vision_towerr.   multi_modal_projectortext_config
vocab_sizer   r/   ri   dtypetext_config_dtype	post_init)r:   r/   r   r;   r%   r&   r3      s   

zPaliGemmaModel.__init__c                 C   
   | j  S r=   )r   get_input_embeddingsr:   r%   r%   r&   r         
z#PaliGemmaModel.get_input_embeddingsc                 C      | j | d S r=   )r   set_input_embeddingsr:   re   r%   r%   r&   r        z#PaliGemmaModel.set_input_embeddingszWObtains image last hidden states from the vision tower and apply multimodal projection.r   ra   ru   rC   c                 K   sB   | j |fddi|}|j}| |}|| jjjd  }||_|S )Nreturn_dictTg      ?)r   last_hidden_stater   r/   r   r6   pooler_output)r:   ra   ru   image_outputsselected_image_featurer>   r%   r%   r&   get_image_features  s   
z!PaliGemmaModel.get_image_features	input_idsrZ   r>   c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)r   rp   rH   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r"   tensorr/   image_token_idlongrp   allsumrJ   	unsqueeze	expand_asro   r   numel)r:   r   rZ   r>   special_image_maskn_image_tokensn_image_featuresr%   r%   r&   get_placeholder_mask  s   z#PaliGemmaModel.get_placeholder_maskNr^   r`   r*   rA   r_   labelsrd   output_attentionsoutput_hidden_statesr   c                 K   s  |du |duA rt d|dur|n| jj}|dur|n| jj}|dur&|n| jj}|durD| jj| jkrD|| jjk}| }d||< n|}|du rP|  |}|du rl|dur\|	 nd}t
j|||jd  |jd}|du rw|dd }|dur| j|ddj}||j|j}| j|||d}|||}t| }tst| j|||||||| jd		}| jd|||||
||d|d
	|}t|j|j|j|j|dur|dS ddS )  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )rp   T)r   )rZ   r>   )rb   )	r^   r`   r*   rZ   rd   r   r   r   r_   )r   r*   r+   r,   r   r%   )rh   r/   r   r   use_return_dictr   r   cloner   get_seq_lengthr"   arangerJ   rp   r   r   r   ro   r   r   masked_scatter
isinstancedictrz   trainingr   r   r   r*   r+   r,   )r:   r   ra   r^   r`   r*   rA   r_   rZ   r   rd   r   r   r   ru   r   llm_input_idspast_seen_tokensr>   causal_mask_mappingoutputsr%   r%   r&   r?   .  sz   1


zPaliGemmaModel.forward)NNNNNNNNNNNNN)r   r   r    _checkpoint_conversion_mappingaccepts_loss_kwargsr   r3   r   r   r   r   r"   r#   r   r   r-   r   r   
LongTensorr   Tensorr   rV   r
   r   r?   r@   r%   r%   r;   r&   r      s    
	
r   c                $       s  e Zd ZdddddZddiZdef fd	d
Zdd Zdd Ze	de
jdee fddZee															d/de
jdB de
jdB de
jdB de
jdB dedB de
jdB de
jdB de
jdB de
jdB dedB dedB dedB d edB d!ee
jB dee d"eeB f d#d$Z								%			&d0 fd'd(	Zeed)d*dd+		&d1dede
jde
jdB de
jdedB de
jdB de
jdB d,edB d"efd-d.Z  ZS )2!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr/   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr0   )r2   r3   r   r|   r   r4   r   r6   r   r   r   r9   r;   r%   r&   r3     s   
z*PaliGemmaForConditionalGeneration.__init__c                 C   r   r=   )r|   r   r   r%   r%   r&   r     r   z6PaliGemmaForConditionalGeneration.get_input_embeddingsc                 C   r   r=   )r|   r   r   r%   r%   r&   r     r   z6PaliGemmaForConditionalGeneration.set_input_embeddingsra   ru   c                 K   s   | j j|fi |S r=   )r|   r   )r:   ra   ru   r%   r%   r&   r     s   z4PaliGemmaForConditionalGeneration.get_image_featuresNr   r   r^   r`   r*   rA   r_   rZ   r   rd   r   r   r   logits_to_keeprC   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||d|d|}|d }t|trCt| dn|}| |dd|ddf }d}|	durh| j	d||	| j j
jd|}t|||j|j|j|jdS )r   NT)r   ra   rA   r^   r`   r*   rZ   rd   r   r   r   r   r_   r   )r)   r   r   )r(   r)   r*   r+   r,   r   r%   )r/   r   r   r   r|   r   rU   slicer   loss_functionr   r   r'   r*   r+   r,   r   )r:   r   ra   r^   r`   r*   rA   r_   rZ   r   rd   r   r   r   r   ru   r   r+   slice_indicesr)   r(   r%   r%   r&   r?     sN   1z)PaliGemmaForConditionalGeneration.forwardTFc                    sZ   t  j|f||||||	|
||d	|}|dd ur#|d  d7  < |s'|	s+||d< |S )N)	r*   rZ   r^   r`   r_   rd   r   rA   rc   r`   r   ra   )r2   prepare_inputs_for_generationrk   )r:   r   r*   rZ   r_   r`   ra   r^   rA   rd   r   r   rc   ru   model_inputsr;   r%   r&   r   #  s(   z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generationrX   rY   r[   rc   c           	      K   s.   t | ||||||fd|idd | D S )Nrc   c                 S   s   i | ]\}}|d kr||qS )ra   r%   ).0kvr%   r%   r&   
<dictcomp>f  s    zOPaliGemmaForConditionalGeneration.create_masks_for_generate.<locals>.<dictcomp>)rz   items)	r/   rZ   r^   r_   r*   r`   rA   rc   ru   r%   r%   r&   r	   O  s   	z;PaliGemmaForConditionalGeneration.create_masks_for_generate)NNNNNNNNNNNNNr   )NNNNNNNTNNF)NF)r   r   r    r   _tied_weights_keysr   r3   r   r   r   r"   r#   r   r   r   r   r   r   r   rV   rU   r-   r'   r?   r   staticmethodr   r   r   r	   r@   r%   r%   r;   r&   r     s    	
],
r   )r   r{   r   )NNFN)6r!   collections.abcr   dataclassesr   r"   r   cache_utilsr   configuration_utilsr   
generationr   masking_utilsr	   modeling_flash_attention_utilsr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   autor   configuration_paligemmar   
get_loggerr   rl   r   r'   Moduler.   r   rW   r#   rV   r   rz   r{   r   r   __all__r%   r%   r%   r&   <module>   s    


(	
J < =