o
    eiD                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ eeZeG dd deZeddG dd deZeddG dd deeZg dZdS )zPyTorch Fuyu model.    N)nn   )Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)	AutoModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )
FuyuConfigc                   @   s:   e Zd ZU eed< dZdZdZdZdZ	dZ
dZg ZdZdS )FuyuPreTrainedModelconfigfuyu)imagetextTpast_key_valuesN)__name__
__module____qualname__r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placement r%   r%   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/fuyu/modeling_fuyu.pyr       s   
 r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                       s6  e Zd ZddiZdef fddZdd Zdd	 Zd
ej	de
ej	 dej	dej	fddZeedejdee deeB fddZdejdejdejfddZe											d%dejdB dej	dB dej	dB dej	dB dejdB dedB dejdB dedB d edB d!edB d"edB deeB fd#d$Z  ZS )&	FuyuModelzlanguage_model.modellanguage_modelr   c                    s\   t  | |j| _|jj| _t|j| _t	
|j|j |j |j| _d| _|   d S )NF)super__init__pad_token_idpadding_idxtext_config
vocab_sizer	   from_configr)   r   Linear
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initselfr   	__class__r%   r&   r+   6   s   
zFuyuModel.__init__c                 C   
   | j  S N)r)   get_input_embeddingsr9   r%   r%   r&   r>   C      
zFuyuModel.get_input_embeddingsc                 C      | j | d S r=   )r)   set_input_embeddingsr9   valuer%   r%   r&   rB   F      zFuyuModel.set_input_embeddingsword_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc              	   C   s   |j d t|kstdt|d|j d | }t|j d D ]A}tj|| dkddd }|| | }|j d || j d krVtd|| j d|j d| d	|| | |j|||f< q#|S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r9   rF   rG   rH   output_embeddings	batch_idxdst_indicessrc_indicesr%   r%   r&   gather_continuous_embeddingsI   s(   z&FuyuModel.gather_continuous_embeddingspixel_valueskwargsc                 K   s   |  |}t|dS )z
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        )last_hidden_state)r5   r   )r9   rZ   r[   patch_embeddingsr%   r%   r&   get_image_featuresu   s   
	
zFuyuModel.get_image_features	input_idsinputs_embedsimage_featuresc                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        NdtyperT   r   r   z6Image features and image tokens do not match, tokens: z, features: )r>   rQ   tensorr   image_token_idlongrT   allsumrL   	unsqueeze	expand_asrS   r   numel)r9   r_   r`   ra   special_image_maskn_image_tokensn_image_featuresr%   r%   r&   get_placeholder_mask   s   zFuyuModel.get_placeholder_maskNimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictc                 K   sl  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur4|dur4td|dur>|j\}}n|durI|j\}}}ntd|du rv|durX|jn|j}|durc| nd}t	j
||| t	j|d}|d}|du r| j |}|dur| j|ddj}||j|j}| j|||d}|||}| jd
|||||	|
||d	|}|S )a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr   rb   T)rx   )r`   ra   )r`   rs   rt   r   rv   rw   ru   rx   r%   )r   rv   rw   ru   use_return_dictrN   rL   rT   get_seq_lengthrQ   arangerg   rj   r)   r>   r^   r\   rS   rc   rp   masked_scatter)r9   r_   rq   rr   rs   rt   r   r`   ru   rv   rw   rx   r[   
batch_size
seq_length_rT   past_key_values_lengthr]   rm   outputsr%   r%   r&   forward   sR   
	zFuyuModel.forward)NNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r+   r>   rB   rQ   TensorlistrY   r   r   FloatTensorr
   r   tupler   r^   
LongTensorrp   r   boolr   r   __classcell__r%   r%   r:   r&   r(   .   s    
,

	
r(   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                        s  e Zd ZddddZddiZdef fdd	Zd
d Zdd Ze	e
													d#dejdB dejdB dejdB dejdB dejdB dedB dejdB dedB dejdB dedB dedB dedB dedB deeB fddZ							 d$ fd!d"	Z  ZS )%FuyuForCausalLMzmodel.language_modelzmodel.vision_embed_tokenslm_head)z^language_model.modelz^vision_embed_tokensz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NF)bias)r*   r+   r(   modelr   r1   r.   r4   r/   r   r7   r8   r:   r%   r&   r+      s   
zFuyuForCausalLM.__init__c                 C   r<   r=   )r   r>   r?   r%   r%   r&   r>      r@   z$FuyuForCausalLM.get_input_embeddingsc                 C   rA   r=   )r   rB   rC   r%   r%   r&   rB      rE   z$FuyuForCausalLM.set_input_embeddingsNr   r_   rq   rr   rs   rt   r   r`   ru   labelsrv   rw   rx   logits_to_keeprI   c                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| j||||||||
||dd}|d }t|trGt| dn|}| 	|dd|ddf }d}|	durl| j
d||	| j jjd|}t|||j|j|jdS )a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```NT)r_   rq   rr   r`   rs   rt   r   rv   rw   ru   rx   r   )logitsr   r/   )lossr   r   hidden_states
attentionsr%   )r   rv   rw   ru   ry   r   
isinstanceintslicer   loss_functionr.   r/   r   r   r   r   )r9   r_   rq   rr   rs   rt   r   r`   ru   r   rv   rw   rx   r   r[   r   r   slice_indicesr   r   r%   r%   r&   r      sF   8zFuyuForCausalLM.forwardFc	              
      sH   t  j|f|||||||d|	}
|s"|	ddr"d |
d< d |
d< |
S )N)r   rs   r`   rq   rr   cache_positionis_first_iterationru   Trr   rq   )r*   prepare_inputs_for_generationget)r9   r_   r   rs   r`   rq   rr   r   r   r[   model_inputsr:   r%   r&   r   _  s"   	z-FuyuForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNNr   )NNNNNNF)r   r   r   r   _tied_weights_keysr   r+   r>   rB   r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r%   r%   r:   r&   r      sz    	
cr   )r   r   r(   )__doc__rQ   r   cache_utilsr   
generationr   modeling_outputsr   r   modeling_utilsr   models.auto.modeling_autor	   processing_utilsr
   utilsr   r   r   r   r   configuration_fuyur   
get_loggerr   loggerr   r(   r   __all__r%   r%   r%   r&   <module>   s2   
 2 