o
    wiD                     @   s  d Z ddlmZmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ eeZeG dd deZG dd de
eZeddG dd deZeddG dd deeZg dZdS )zPyTorch Fuyu model.    )OptionalUnionN)nn   )GenerationMixin)FlashAttentionKwargs)CausalLMOutputWithPast)PreTrainedModel)	AutoModel)
LossKwargsauto_docstringcan_return_tuplelogging   )
FuyuConfigc                   @   s8   e Zd ZeZdZdZdZdZdZ	dZ
g ZdZdd ZdS )FuyuPreTrainedModelfuyuTpast_key_valuesc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rA|jjjd|d |jd urC|jj|j 	  d S d S d S )Ng        )meanstd)configinitializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx)selfmoduler    r#   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/fuyu/modeling_fuyu.py_init_weights/   s   

z!FuyuPreTrainedModel._init_weightsN)__name__
__module____qualname__r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placementr%   r#   r#   r#   r$   r   #   s    r   c                   @   s   e Zd ZdS )KwargsForCausalLMN)r&   r'   r(   r#   r#   r#   r$   r2   ;   s    r2   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                       s  e Zd ZddiZdef fddZdd Zdd	 Zd
d Zdd Z	de
jdee
j de
jde
jfddZde
jfddZe											d%de
jde
jde
jdee
j dee
j deee
j  dee
j dee d ee d!ee d"ee deeef fd#d$Z  ZS )&	FuyuModelzlanguage_model.modellanguage_modelr   c                    s\   t  | |j| _|jj| _t|j| _t	
|j|j |j |j| _d| _|   d S )NF)super__init__pad_token_idr    text_config
vocab_sizer
   from_configr5   r   r   
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initr!   r   	__class__r#   r$   r7   F   s   
zFuyuModel.__init__c                 C   
   | j  S N)r5   get_input_embeddingsr!   r#   r#   r$   rG   S      
zFuyuModel.get_input_embeddingsc                 C      | j | d S rF   )r5   set_input_embeddingsr!   valuer#   r#   r$   rK   V      zFuyuModel.set_input_embeddingsc                 C   
   || _ d S rF   r5   r!   decoderr#   r#   r$   set_decoderY   rI   zFuyuModel.set_decoderc                 C      | j S rF   rP   rH   r#   r#   r$   get_decoder\      zFuyuModel.get_decoderword_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc              	   C   s   |j d t|kstdt|d|j d | }t|j d D ]A}tj|| dkddd }|| | }|j d || j d krVtd|| j d|j d| d	|| | |j|||f< q#|S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r!   rW   rX   rY   output_embeddings	batch_idxdst_indicessrc_indicesr#   r#   r$   gather_continuous_embeddings_   s(   z&FuyuModel.gather_continuous_embeddingspixel_valuesc                    s    fdd|D }|S )a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        c                    s(   g | ]}  | j jjd qS )r   )r?   rd   r   dtypesqueeze).0patchrH   r#   r$   
<listcomp>   s    z0FuyuModel.get_image_features.<locals>.<listcomp>r#   )r!   rk   kwargspatch_embeddingsr#   rH   r$   get_image_features   s   
zFuyuModel.get_image_featuresN	input_idsimage_patchesimage_patches_indicesattention_maskposition_idsr   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|dur4|dur4td|dur>|j\}}n|durI|j\}}}ntd|}d}|durb|d d jd }|| }|du r|durm|jn|j}tj	||| tj
|d}|d}|du r| j |}|dur|du r| |}tj|dd}|| j jkd}|||j}||j|j}|||}| jd
|||||	|
||d	|}|S )a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr      )rl   re   )dim)ry   rw   rx   r   r{   r|   rz   r}   r#   )r   r{   r|   rz   use_return_dictr_   r]   re   rb   arangelong	unsqueezer5   rG   rs   catimage_token_id	expand_asrd   rl   masked_scatter)r!   rt   ru   rv   rw   rx   r   ry   rz   r{   r|   r}   rq   
batch_size
seq_length_seq_length_with_pastpast_key_values_lengthre   rr   special_image_maskoutputsr#   r#   r$   forward   sZ   

	zFuyuModel.forward)NNNNNNNNNNN)r&   r'   r(   _checkpoint_conversion_mappingr   r7   rG   rK   rS   rU   rb   Tensorlistrj   FloatTensorrs   r   
LongTensorr   boolr   tupler   r   __classcell__r#   r#   rC   r$   r4   >   sl    
,	

r4   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                !       s2  e Zd ZddddZdgZdef fddZd	d
 Zdd Zdd Z	dd Z
dd Zdd Zee													d+dejdejdejdeej deej deeej  deej dee deej d ee d!ee d"ee d#ee d$eeef fd%d&Z					d, fd'd(	Zed)d* Z  ZS )-FuyuForCausalLMzmodel.language_modelzmodel.vision_embed_tokenslm_head)z^language_model.modelz^vision_embed_tokensz^language_model.lm_headzlm_head.weightr   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NF)r   )r6   r7   r4   modelr   r   r9   r>   r:   r   rA   rB   rC   r#   r$   r7      s   
zFuyuForCausalLM.__init__c                 C   rE   rF   )r   rG   rH   r#   r#   r$   rG      rI   z$FuyuForCausalLM.get_input_embeddingsc                 C   rJ   rF   )r   rK   rL   r#   r#   r$   rK      rN   z$FuyuForCausalLM.set_input_embeddingsc                 C   rT   rF   r   rH   r#   r#   r$   get_output_embeddings  rV   z%FuyuForCausalLM.get_output_embeddingsc                 C   rO   rF   r   )r!   new_embeddingsr#   r#   r$   set_output_embeddings  rI   z%FuyuForCausalLM.set_output_embeddingsc                 C   rJ   rF   )r   rS   rQ   r#   r#   r$   rS     rN   zFuyuForCausalLM.set_decoderc                 C   rE   rF   )r   rU   rH   r#   r#   r$   rU     rI   zFuyuForCausalLM.get_decoderNr   rt   ru   rv   rw   rx   r   ry   rz   labelsr{   r|   r}   logits_to_keeprZ   c                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| j||||||||
||dd}|d }t|trGt| dn|}| 	|dd|ddf }d}|	durl| j
d||	| j jjd|}t|||j|j|jdS )a@  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```NT)rt   ru   rv   ry   rw   rx   r   r{   r|   rz   r}   r   )logitsr   r:   )lossr   r   hidden_states
attentionsr#   )r   r{   r|   rz   r   r   r   intslicer   loss_functionr9   r:   r   r   r   r   )r!   rt   ru   rv   rw   rx   r   ry   rz   r   r{   r|   r}   r   rq   r   r   slice_indicesr   r   r#   r#   r$   r     sF   5zFuyuForCausalLM.forwardc           	         s<   t  j|f|||||d|}|d urd |d< d |d< |S )N)r   rw   ry   ru   rv   rv   ru   )r6   prepare_inputs_for_generation)	r!   rt   r   rw   ry   ru   rv   rq   model_inputsrC   r#   r$   r   m  s   
z-FuyuForCausalLM.prepare_inputs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr#   c                 3   s$    | ]}| d  |jV  qdS )r   N)index_selectrd   re   )rn   
past_statebeam_idxr#   r$   	<genexpr>  s   " z1FuyuForCausalLM._reorder_cache.<locals>.<genexpr>)r   )r   r   reordered_past
layer_pastr#   r   r$   _reorder_cache  s   zFuyuForCausalLM._reorder_cache)NNNNNNNNNNNNr   )NNNNN)r&   r'   r(   r   _tied_weights_keysr   r7   rG   rK   r   r   rS   rU   r   r   rb   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodr   r   r#   r#   rC   r$   r      s    	

`r   )r   r   r4   ) __doc__typingr   r   rb   torch.utils.checkpointr   
generationr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr	   models.auto.modeling_autor
   utilsr   r   r   r   configuration_fuyur   
get_loggerr&   loggerr   r2   r4   r   __all__r#   r#   r#   r$   <module>   s6   
 ' &