o
    wi}l                     @   s  d Z ddlmZ ddlmZmZ ddlZddlZddlmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlmZ ddl m!Z! e"e#Z$eeddG dd deZ%eeddG dd deZ&G dd dej'Z(eG dd deZ)eddG dd de)Z*G d d! d!eeZ+ed"dG d#d$ d$e)eZ,g d%Z-dS )&zPyTorch PaliGemmamodel.    )	dataclass)OptionalUnionN)nn   )CacheHybridCacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPast)PreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tupleis_torchdynamo_compilinglogging   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    )custom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )PaligemmaModelOutputWithPasta  
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__ r#   r#   m/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   &   s   
 r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeeej ef  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	PaliGemmaCausalLMOutputWithPasta{  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r&   r   r    r!   r"   r'   r(   r   listr   r)   tupler*   r   r#   r#   r#   r$   r%   <   s   
 r%   c                       s*   e Zd Zdef fddZdd Z  ZS )PaliGemmaMultiModalProjectorconfigc                    s(   t    tj|jj|jjdd| _d S )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr.   	__class__r#   r$   r2   \   s   
z%PaliGemmaMultiModalProjector.__init__c                 C   s   |  |}|S N)r7   )r9   image_featuresr)   r#   r#   r$   forward`   s   
z$PaliGemmaMultiModalProjector.forward)r   r   r   r   r2   r>   __classcell__r#   r#   r:   r$   r-   [   s    r-   c                   @   sF   e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdZdZdZdd ZdS )PaliGemmaPreTrainedModel Tr-   r(   c                 C   sV   t | jd| j j}t|tjr'|jjj	d|d |j
d ur)|j
j  d S d S d S )Ninitializer_range        )meanstd)getattrr.   get_text_configrB   
isinstancer   r3   weightdatanormal_r0   zero_)r9   modulerE   r#   r#   r$   _init_weightsu   s   
z&PaliGemmaPreTrainedModel._init_weightsN)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_flash_attn_2_supports_sdpa_supports_flex_attn_supports_attention_backendrN   r#   r#   r#   r$   r@   f   s    r@   z{
    The Base Paligemma model which consists of a vision backbone and a language model withou language modeling head.,
    c                #       s<  e Zd ZddiZdZdef fddZdd Zd	d
 Zdd Z	dd Z
					d&dee fddZdejfddZee													d'dejdejdeej deej deeeej ef  deej deej deej deej dee dee d ee d!ee d"ee d#eeef fd$d%Z  ZS )(PaliGemmaModelzlanguage_model.modellanguage_modelFr.   c                    sj   t  | tj|jd| _t|| _|jj	| _	tj|jd}|| _
| jjd ur,| jjnd| _|   d S )N)r.   )r1   r2   r   from_configr4   vision_towerr-   multi_modal_projectortext_config
vocab_sizer\   r.   pad_token_id	post_init)r9   r.   r\   r:   r#   r$   r2      s   

zPaliGemmaModel.__init__c                 C   
   | j  S r<   )r\   get_input_embeddingsr9   r#   r#   r$   rf         
z#PaliGemmaModel.get_input_embeddingsc                 C      | j | d S r<   )r\   set_input_embeddingsr9   valuer#   r#   r$   rj         z#PaliGemmaModel.set_input_embeddingsc                 C   
   || _ d S r<   r\   r9   decoderr#   r#   r$   set_decoder   rh   zPaliGemmaModel.set_decoderc                 C      | j S r<   ro   rg   r#   r#   r$   get_decoder      zPaliGemmaModel.get_decoderNis_trainingc                 C   s  | j jjdkr|d urd|v r|S d S |d ur|n| j}t|t}t| jj	}|d u r.|}|j
d d \}	}
|r>| }nt|trH| }nt|tjrS|j
d n|d |
 d }|d urg| dkrg|S tj|
|f|| j|jd}|
dkr|rtj|dd	}n
d|d d d |
f< |tj||jd
|ddk9 }|d d d d d d f |	ddd}|d ur>| }|j
d }|r|d u rtd|d d d d d d d |f |d d d d d d f |jdkd|d d d d d d d |f< |d d d d d d d |f |d d d d d d f |j }|dk}|d d d d d d d |f |||d d d d d d d |f< |S )Nflash_attention_2rC   r   r]   r   r      
fill_valuedtypedevicediagonalr|   z/Token type ids must be provided during training)r.   ra   _attn_implementationtrainingrH   r	   r    finfor{   minshapeget_max_cache_shaper   Tensordimfullr|   triuarangereshapeexpandclone
ValueErrormasked_fillto)r9   attention_masktoken_type_idsr(   cache_positioninput_tensorrv   using_static_cache	min_dtypeinputs_lead_dimsequence_lengthtarget_lengthcausal_maskmask_lengthpadding_maskr#   r#   r$   _update_causal_mask   sX   	




 $

 $ @  z"PaliGemmaModel._update_causal_maskpixel_valuesc                 C   s0   |  |}|j}| |}|| jjjd  }|S )a  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        g      ?)r_   last_hidden_stater`   r.   ra   r5   )r9   r   image_outputsselected_image_featurer=   r#   r#   r$   get_image_features   s
   


z!PaliGemmaModel.get_image_features	input_idsr   position_idsr(   r   r   inputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictkwargsreturnc                 K   s:  |du |duA rt d|dur|n| jj}|dur|n| jj}|dur&|n| jj}|duo1|	du}|durL| jj| jkrL|| jjk}| }d||< n|}|du rX|  |}|du rt|durd|	 nd}t
j|||jd  |jd}|du r|dd }|dur| |}|du r||  t
j| jjt
j|jdk}n|| jjkd}|||j}t s||  | kr|jddjddd }t d	| d
|jd |jd   d||j|j}|||}| ||||||}| jd|||||
||d|d	|}t|j|j|j|j|dur|dS ddS )i  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r{   r|   r]   )r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r(   r   r   r   r   r   r   )r   r(   r)   r*   r   r#   ) r   r.   r   r   use_return_dictimage_token_idrb   r   rf   get_seq_lengthr    r   r   r|   	unsqueezer   tensorlong	expand_asr   r   numelsumr{   masked_scatterr   r\   r   r   r(   r)   r*   )r9   r   r   r   r   r(   r   r   r   r   r   r   r   r   r   rv   special_image_maskllm_input_idspast_seen_tokensr=   image_tokens_in_textr   outputsr#   r#   r$   r>      s   /


zPaliGemmaModel.forward)NNNNN)NNNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingaccepts_loss_kwargsr   r2   rf   rj   rr   rt   r   boolr   r    r!   r   r   r   
LongTensorr   r   r+   r   r   r   r,   r   r>   r?   r#   r#   r:   r$   r[      s~    
D	

r[   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r   r   r   r#   r#   r#   r$   r   u  s    r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                %       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zedd Zedd Zedd Zee														d;d ejd!ejd"eej d#eej d$eeeej ef  d%eej d&eej d'eej d(eej d)ee d*ee d+ee d,ee d-eeejf d.ee d/ee e!f f d0d1Z"								2		d< fd3d4	Z#e$d"ejd5ed6ed7ej%d&ejd8efd9d:Z&  Z'S )=!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr.   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr/   )r1   r2   r[   modelr   r3   ra   r5   rb   r   rd   r8   r:   r#   r$   r2     s   
z*PaliGemmaForConditionalGeneration.__init__c                 C   re   r<   )r   rf   rg   r#   r#   r$   rf     rh   z6PaliGemmaForConditionalGeneration.get_input_embeddingsc                 C   ri   r<   )r   rj   rk   r#   r#   r$   rj     rm   z6PaliGemmaForConditionalGeneration.set_input_embeddingsc                 C   rs   r<   r   rg   r#   r#   r$   get_output_embeddings  ru   z7PaliGemmaForConditionalGeneration.get_output_embeddingsc                 C   rn   r<   r   )r9   new_embeddingsr#   r#   r$   set_output_embeddings  rh   z7PaliGemmaForConditionalGeneration.set_output_embeddingsc                 C   ri   r<   )r   rr   rp   r#   r#   r$   rr     rm   z-PaliGemmaForConditionalGeneration.set_decoderc                 C   re   r<   )r   rt   rg   r#   r#   r$   rt     rh   z-PaliGemmaForConditionalGeneration.get_decoderc                 C   s   | j |S r<   )r   r   )r9   r   r#   r#   r$   r     s   z4PaliGemmaForConditionalGeneration.get_image_featuresc                 C      | j jS r<   )r   r\   rg   r#   r#   r$   r\        z0PaliGemmaForConditionalGeneration.language_modelc                 C   r   r<   )r   r_   rg   r#   r#   r$   r_     r   z.PaliGemmaForConditionalGeneration.vision_towerc                 C   r   r<   )r   r`   rg   r#   r#   r$   r`     r   z7PaliGemmaForConditionalGeneration.multi_modal_projectorNr   r   r   r   r   r(   r   r   r   r   r   r   r   r   logits_to_keepr   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd||||||||
|	||d|d|}|d }t|trCt| dn|}| |dd|ddf }d}|	durh| j	d||	| j j
jd|}t|||j|j|j|jdS )r   NT)r   r   r   r   r   r(   r   r   r   r   r   r   r   r   )r'   r   rb   )r&   r'   r(   r)   r*   r   r#   )r.   r   r   r   r   rH   intslicer   loss_functionra   rb   r%   r(   r)   r*   r   )r9   r   r   r   r   r(   r   r   r   r   r   r   r   r   r   r   r   r)   slice_indicesr'   r&   r#   r#   r$   r>     sN   /z)PaliGemmaForConditionalGeneration.forwardTc                    s   t  j|f||||||	|
|d|}|dd ur"|d  d7  < |d dkr,||d< |d uo3|d u}|d dkrVt|trV|d urE|n|}| j||||||}||d< |S )N)r(   r   r   r   r   r   r   r   r   r   r   r   r   )r1   prepare_inputs_for_generationgetrH   r   r   r   )r9   r   r(   r   r   r   r   r   r   r   r   r   r   model_inputsrv   r   r   r:   r#   r$   r     s4   
z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generationr   r   r{   
batch_sizec                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nrx   ry   r   r}   r   r]   r   )r   r    r   r   r   r|   r   r   r   r   r   r   r   r   )r   r   r   r{   r   r   r   r   r   r   r   r#   r#   r$   5_prepare_4d_causal_attention_mask_with_cache_position6  s,    $
6  zWPaliGemmaForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position)NNNNNNNNNNNNNr   )
NNNNNNNTNN)(r   r   r   r   _tied_weights_keysr   r2   rf   rj   r   r   rr   rt   r   propertyr\   r_   r`   r   r   r    r   r!   r   r   r   r+   r   r   r   r   r   r,   r%   r>   r   staticmethodr{   r   r?   r#   r#   r:   r$   r   x  s    


	

[.r   )r   r@   r[   ).r   dataclassesr   typingr   r   r    torch.utils.checkpointr   cache_utilsr   r   r	   
generationr
   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_paligemmar   
get_loggerr   loggerr   r%   Moduler-   r@   r[   r   r   __all__r#   r#   r#   r$   <module>   sR    
 q s