o
    wi}R                     @   s4  d dl mZ d dlmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ eeddG dd deZeeddG dd deZG dd dejZeG dd deZeddG dd deZeddG dd dee
Zg d ZdS )!    )	dataclass)OptionalUnionN)nn   )ACT2FN)GenerationMixin)BaseModelOutputWithPastModelOutput)PreTrainedModel)auto_docstringcan_return_tupleis_torchdynamo_compiling   )	AutoModel   )VipLlavaConfigzM
    Base class for VipLlava outputs, with hidden states and attentions.
    )custom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )VipLlavaModelOutputWithPasta  
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__ r   r   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   %   s   
 r   zT
    Base class for VipLlava causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	VipLlavaCausalLMOutputWithPastaw  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r    r   r   r   r   r!   r"   listr#   tupler$   r   r   r   r   r   r   ;   s   
 r   c                       s*   e Zd Zdef fddZdd Z  ZS )VipLlavaMultiModalProjectorconfigc                    s   t    t|jtrdnt|j}tj||jj	 |j
d| _tj||jj	 |jj	dd| _t|j | _tj|jj	|jj	dd| _d S )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr(   num_feature_layers	__class__r   r   r-   [   s   

z$VipLlavaMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)r6   r9   r;   r<   )r=   r#   r   r   r   forwardj   s
   



z#VipLlavaMultiModalProjector.forward)r   r   r   r   r-   rB   __classcell__r   r   r?   r   r'   Z   s    r'   c                   @   s@   e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZdZdd ZdS )VipLlavaPreTrainedModel Tr"   c                 C   s   t | jd| j j}t|tjr)|jjj	d|d |j
d ur'|j
j  d S d S t|tjr>|jjd |j
j  d S d S )Ninitializer_rangeg        )meanstdg      ?)getattrr(   get_text_configrF   r.   r   r7   weightdatanormal_r+   zero_r2   fill_)r=   modulerH   r   r   r   _init_weights   s   
z%VipLlavaPreTrainedModel._init_weightsN)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_flex_attn_supports_attention_backendrQ   r   r   r   r   rD   r   s    rD   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    c                       s  e Zd ZddiZdef fddZdd Zdd	 Zd
d Zdd Z		d de
jdeeeee f  fddZe												d!de
jde
jdee
j dee
j deee
j  dee
j deeeee f  dee dee dee dee dee
j deeef fddZ  ZS )"VipLlavaModelzlanguage_model.modellanguage_modelr(   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S rA   )r,   r-   r   from_configr3   vision_towerr'   multi_modal_projectorr8   r^   	post_initr=   r(   r?   r   r   r-      s
   
zVipLlavaModel.__init__c                 C   
   | j  S rA   )r^   get_input_embeddingsr=   r   r   r   re         
z"VipLlavaModel.get_input_embeddingsc                 C      | j | d S rA   )r^   set_input_embeddingsr=   valuer   r   r   ri         z"VipLlavaModel.set_input_embeddingsc                 C   
   || _ d S rA   r^   r=   decoderr   r   r   set_decoder   rg   zVipLlavaModel.set_decoderc                 C      | j S rA   rn   rf   r   r   r   get_decoder      zVipLlavaModel.get_decoderNpixel_valuesr/   c                    sv   |dur|n| j j}| j|dd t|tr$ j| ddddf }n fdd|D }tj|dd}| |}|S )	aW  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        NT)output_hidden_statesr   c                    s&   g | ]} j | d d dd f qS )Nr   )r#   ).0indeximage_outputsr   r   
<listcomp>   s   & z4VipLlavaModel.get_image_features.<locals>.<listcomp>)dim)	r(   r/   r`   r.   r0   r#   r   catra   )r=   ru   r/   image_featuresr   ry   r   get_image_features   s   

z VipLlavaModel.get_image_features	input_idsattention_maskposition_idsr"   inputs_embeds	use_cacheoutput_attentionsrv   return_dictcache_positionreturnc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4td|dur@|dur@td|du rJ|  |}|dur| j||d}|| j jk	d}|
||j}t s||  | kr|| j jk }|jd |jd  }td| d	| ||j|j}|||}| jd||||||	|
d
|d	|}t|j|j|j|j|dur|ndd}|r|S | S )z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oneru   r/   r|   r   r   z6Image features and image tokens do not match: tokens: z, features T)	r   r   r"   r   r   r   rv   r   r   )last_hidden_stater"   r#   r$   r   r   )r(   r   rv   use_return_dictr/   
ValueErrorre   r   image_token_id	unsqueeze	expand_astodevicer   numelsumshapedtypemasked_scatterr^   r   r   r"   r#   r$   to_tuple)r=   r   ru   r   r   r"   r   r/   r   r   rv   r   r   	lm_kwargsr   special_image_maskn_image_tokensn_image_featuresoutputsoutputr   r   r   rB      sb   
zVipLlavaModel.forwardrA   )NNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r-   re   ri   rq   rs   r   r   r   r   r0   r%   r   r   
LongTensorTensorboolr&   r   rB   rC   r   r   r?   r   r]      sj    
	

r]   zV
    The VIPLLAVA model which consists of a vision backbone and a language model.
    c                #       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zde	j
fddZdd Zdd Zdd Z	d3dejdeeeee f  fddZedd Zedd Zed d! Zee														"d4d#ejdejd$eej d%eej d&eeej  d'eej deeeee f  d(eej d)ee d*ee d+ee d,ee d-eej d.eeejf deee f fd/d0Z!						d5 fd1d2	Z"  Z#S )6 VipLlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr(   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr*   )r,   r-   r]   modelr   r7   r8   r4   
vocab_sizer   rb   rc   r?   r   r   r-   )  s   
z)VipLlavaForConditionalGeneration.__init__c                 C   rd   rA   )r   re   rf   r   r   r   re   /  rg   z5VipLlavaForConditionalGeneration.get_input_embeddingsc                 C   rh   rA   )r   ri   rj   r   r   r   ri   2  rl   z5VipLlavaForConditionalGeneration.set_input_embeddingsr   c                 C   rr   rA   r   rf   r   r   r   get_output_embeddings5  rt   z6VipLlavaForConditionalGeneration.get_output_embeddingsc                 C   rm   rA   r   )r=   new_embeddingsr   r   r   set_output_embeddings8  rg   z6VipLlavaForConditionalGeneration.set_output_embeddingsc                 C   rh   rA   )r   rq   ro   r   r   r   rq   ;  rl   z,VipLlavaForConditionalGeneration.set_decoderc                 C      | j jS rA   )r   rs   rf   r   r   r   rs   >  s   z,VipLlavaForConditionalGeneration.get_decoderNru   r/   c                 C   s   | j j||dS )Nr   )r   r   )r=   ru   r/   r   r   r   r   A  s   z3VipLlavaForConditionalGeneration.get_image_featuresc                 C   r   rA   )r   r^   rf   r   r   r   r^   G     z/VipLlavaForConditionalGeneration.language_modelc                 C   r   rA   )r   r`   rf   r   r   r   r`   K  r   z-VipLlavaForConditionalGeneration.vision_towerc                 C   r   rA   )r   ra   rf   r   r   r   ra   O  r   z6VipLlavaForConditionalGeneration.multi_modal_projectorr   r   r   r   r"   r   labelsr   r   rv   r   r   logits_to_keepc                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| jd|||||||	||
|d|d|}|d }t|trLt| dn|}| 	|dd|ddf }d}|durm| j
||| j jjd}t|||j|j|j|jdS )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)r   ru   r   r   r"   r   r   r/   r   rv   r   r   r   )r!   r   r   )r    r!   r"   r#   r$   r   r   )r(   r   rv   r   r/   r   r.   r0   slicer   loss_functionr8   r   r   r"   r#   r$   r   )r=   r   ru   r   r   r"   r   r/   r   r   r   rv   r   r   r   r   r   r#   slice_indicesr!   r    r   r   r   rB   S  sH   6z(VipLlavaForConditionalGeneration.forwardc           
         s8   t  j|f|||||d|}	|d dkr||	d< |	S )N)r"   r   r   r   r   r   ru   )r,   prepare_inputs_for_generation)
r=   r   r"   r   ru   r   r   r   kwargsmodel_inputsr?   r   r   r     s   
z>VipLlavaForConditionalGeneration.prepare_inputs_for_generationrA   )NNNNNNNNNNNNNr   )NNNNNN)$r   r   r   r   _tied_weights_keysr   r-   re   ri   r   Moduler   r   rq   rs   r   r   r   r   r0   r%   r   propertyr^   r`   ra   r   r   r   r   r   r&   r   rB   r   rC   r   r   r?   r   r     s    



	

br   )r]   r   rD   ) dataclassesr   typingr   r   r   r   activationsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   utilsr   r   r   autor   configuration_vipllavar   r   r   r   r'   rD   r]   r   __all__r   r   r   r   <module>   sF     4