o
    iF                     @   s   d Z ddlmZmZ ddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ eeZeG dd deZeddG dd deZeddG dd dee	Zg dZdS )zPyTorch Fuyu model.    )OptionalUnionN)nn   )Cache)GenerationMixin)CausalLMOutputWithPast)PreTrainedModel)	AutoModel)auto_docstringcan_return_tuplelogging   )
FuyuConfigc                   @   s>   e Zd ZU eed< dZdZdZdZdZ	dZ
g ZdZdd ZdS )FuyuPreTrainedModelconfigfuyuTpast_key_valuesc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rA|jjjd|d |jd urC|jj|j 	  d S d S d S )Ng        )meanstd)r   initializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx)selfmoduler    r"   Z/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/fuyu/modeling_fuyu.py_init_weights.   s   

z!FuyuPreTrainedModel._init_weightsN)__name__
__module____qualname__r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placementr$   r"   r"   r"   r#   r   "   s   
 r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                       s2  e Zd ZddiZdef fddZdd Zdd	 Zd
d Zdd Z	de
jdee
j de
jde
jfddZde
jfddZde
jde
jde
jfddZe											d(dee
j dee
j dee
j dee
j d ee
j d!ee dee
j d"ee d#ee d$ee d%ee deeef fd&d'Z  ZS ))	FuyuModelzlanguage_model.modellanguage_modelr   c                    s\   t  | |j| _|jj| _t|j| _t	
|j|j |j |j| _d| _|   d S )NF)super__init__pad_token_idr   text_config
vocab_sizer
   from_configr3   r   r   
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initr    r   	__class__r"   r#   r5   B   s   
zFuyuModel.__init__c                 C   
   | j  S N)r3   get_input_embeddingsr    r"   r"   r#   rE   O      
zFuyuModel.get_input_embeddingsc                 C      | j | d S rD   )r3   set_input_embeddingsr    valuer"   r"   r#   rI   R      zFuyuModel.set_input_embeddingsc                 C   s
   || _ d S rD   r3   r    decoderr"   r"   r#   set_decoderU   rG   zFuyuModel.set_decoderc                 C   s   | j S rD   rM   rF   r"   r"   r#   get_decoderX   s   zFuyuModel.get_decoderword_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc              	   C   s   |j d t|kstdt|d|j d | }t|j d D ]A}tj|| dkddd }|| | }|j d || j d krVtd|| j d|j d| d	|| | |j|||f< q#|S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r    rR   rS   rT   output_embeddings	batch_idxdst_indicessrc_indicesr"   r"   r#   gather_continuous_embeddings[   s(   z&FuyuModel.gather_continuous_embeddingspixel_valuesc                    s    fdd|D }|S )a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        c                    s(   g | ]}  | j jjd qS )r   )r=   r_   r   dtypesqueeze).0patchrF   r"   r#   
<listcomp>   s    z0FuyuModel.get_image_features.<locals>.<listcomp>r"   )r    rf   kwargspatch_embeddingsr"   rF   r#   get_image_features   s   
zFuyuModel.get_image_features	input_idsinputs_embedsimage_featuresc                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|	d
||j}|jd |jd  }||  | krPtd| d| |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        Nrg   r`   r   r   z6Image features and image tokens do not match: tokens: z, features )rE   r]   tensorr   image_token_idlongr`   allsum	unsqueeze	expand_asr_   rX   numelrZ   )r    ro   rp   rq   special_image_maskn_image_tokensn_image_featuresr"   r"   r#   get_placeholder_mask   s   zFuyuModel.get_placeholder_maskNimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictc                 K   sp  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur4|dur4td|dur>|j\}}n|durI|j\}}}ntd|du rv|durX|jn|j}|durc| nd}t	j
||| t	j|d}|d}|du r| j |}|dur| |}t	j|dd|j|j}| j|||d}|||}| jd	|||||	|
||d|}|S )
a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr   rr   )dim)rp   rq   )rp   r   r   r   r   r   r   r   r"   )r   r   r   r   use_return_dictrZ   rX   r`   get_seq_lengthr]   arangerv   ry   r3   rE   rn   catr_   rg   r   masked_scatter)r    ro   r   r   r   r   r   rp   r   r   r   r   rl   
batch_size
seq_length_r`   past_key_values_lengthrm   r|   outputsr"   r"   r#   forward   sR   

	zFuyuModel.forward)NNNNNNNNNNN)r%   r&   r'   _checkpoint_conversion_mappingr   r5   rE   rI   rP   rQ   r]   Tensorlistre   FloatTensorrn   
LongTensorr   r   r   r   boolr   tupler   r   __classcell__r"   r"   rA   r#   r2   :   sz    
,
	

r2   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                !       s  e Zd ZddddZdgZdef fddZd	d
 Zdd Zdd Z	dd Z
ee													d%deej deej deej deej deej dee deej dee deej dee dee dee dee d eeef fd!d"Z						d& fd#d$	Z  ZS )'FuyuForCausalLMzmodel.language_modelzmodel.vision_embed_tokenslm_head)z^language_model.modelz^vision_embed_tokensz^language_model.lm_headzlm_head.weightr   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NF)r   )r4   r5   r2   modelr   r   r7   r<   r8   r   r?   r@   rA   r"   r#   r5     s   
zFuyuForCausalLM.__init__c                 C   rC   rD   )r   rE   rF   r"   r"   r#   rE     rG   z$FuyuForCausalLM.get_input_embeddingsc                 C   rH   rD   )r   rI   rJ   r"   r"   r#   rI     rL   z$FuyuForCausalLM.set_input_embeddingsc                 C   rH   rD   )r   rP   rN   r"   r"   r#   rP     rL   zFuyuForCausalLM.set_decoderc                 C   rC   rD   )r   rQ   rF   r"   r"   r#   rQ     rG   zFuyuForCausalLM.get_decoderNr   ro   r   r   r   r   r   rp   r   labelsr   r   r   logits_to_keeprU   c                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| j||||||||
||dd}|d }t|trGt| dn|}| 	|dd|ddf }d}|	durl| j
d||	| j jjd|}t|||j|j|jdS )a@  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```NT)ro   r   r   rp   r   r   r   r   r   r   r   r   )logitsr   r8   )lossr   r   hidden_states
attentionsr"   )r   r   r   r   r   r   r   intslicer   loss_functionr7   r8   r   r   r   r   )r    ro   r   r   r   r   r   rp   r   r   r   r   r   r   rl   r   r   slice_indicesr   r   r"   r"   r#   r     sF   6zFuyuForCausalLM.forwardc           
   	      sB   t  j|f||||||d|}	|d dkrd |	d< d |	d< |	S )N)r   r   rp   r   r   cache_positionr   r   r   )r4   prepare_inputs_for_generation)
r    ro   r   r   rp   r   r   r   rl   model_inputsrA   r"   r#   r   w  s    z-FuyuForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNNr   )NNNNNN)r%   r&   r'   r   _tied_weights_keysr   r5   rE   rI   rP   rQ   r   r   r   r]   r   r   r   r   r   r   r   r   r   r   r   r   r"   r"   rA   r#   r      s|    	

ar   )r   r   r2   )__doc__typingr   r   r]   r   cache_utilsr   
generationr   modeling_outputsr   modeling_utilsr	   models.auto.modeling_autor
   utilsr   r   r   configuration_fuyur   
get_loggerr%   loggerr   r2   r   __all__r"   r"   r"   r#   <module>   s2   
 : 