o
    wi3                     @   s   d dl mZmZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ eeZG d
d de	ZG dd deZG dd dejZG dd de
ZG dd deZG dd deZg dZdS )    )OptionalUnionN)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)auto_docstringis_torchdynamo_compilinglogging   )VipLlavaConfigc                   @      e Zd ZdS )VipLlavaModelOutputWithPastN__name__
__module____qualname__ r   r   j/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/vipllava/modular_vipllava.pyr   %       r   c                   @   r   )VipLlavaCausalLMOutputWithPastNr   r   r   r   r   r   )   r   r   c                       s*   e Zd Zdef fddZdd Z  ZS )VipLlavaMultiModalProjectorconfigc                    s   t    t|jtrdnt|j}tj||jj	 |j
d| _tj||jj	 |jj	dd| _t|j | _tj|jj	|jj	dd| _d S )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr   num_feature_layers	__class__r   r   r    .   s   

z$VipLlavaMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)r)   r,   r.   r/   )r0   hidden_statesr   r   r   forward=   s
   



z#VipLlavaMultiModalProjector.forward)r   r   r   r   r    r6   __classcell__r   r   r2   r   r   -   s    r   c                   @   r   )VipLlavaPreTrainedModelNr   r   r   r   r   r8   E   r   r8   c                   @   s   e Zd Z	ddejdeeeee f  fddZ	e
												ddejdejdeej deej d	eeej  d
eej deeeee f  dee dee dee dee deej deeef fddZdS )VipLlavaModelNpixel_valuesr"   c                    sv   |dur|n| j j}| j|dd t|tr$ j| ddddf }n fdd|D }tj|dd}| |}|S )	aW  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        NT)output_hidden_statesr   c                    s&   g | ]} j | d d dd f qS )Nr   )r5   ).0indeximage_outputsr   r   
<listcomp>d   s   & z4VipLlavaModel.get_image_features.<locals>.<listcomp>)dim)	r   r"   vision_towerr!   r#   r5   torchcatmulti_modal_projector)r0   r:   r"   image_featuresr   r>   r   get_image_featuresJ   s   

z VipLlavaModel.get_image_features	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsr;   return_dictcache_positionreturnc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4td|dur@|dur@td|du rJ|  |}|dur| j||d}|| j jk	d}|
||j}t s||  | kr|| j jk }|jd |jd  }td| d	| ||j|j}|||}| jd||||||	|
d
|d	|}t|j|j|j|j|dur|ndd}|r|S | S )z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either oner:   r"   rA   r   r   z6Image features and image tokens do not match: tokens: z, features T)	rJ   rK   rL   rM   rN   rO   r;   rP   rQ   )last_hidden_staterL   r5   
attentionsimage_hidden_statesr   )r   rO   r;   use_return_dictr"   
ValueErrorget_input_embeddingsrH   image_token_id	unsqueeze	expand_astodevicer   numelsumshapedtypemasked_scatterlanguage_modelr   rT   rL   r5   rU   to_tuple)r0   rI   r:   rJ   rK   rL   rM   r"   rN   rO   r;   rP   rQ   	lm_kwargsrG   special_image_maskn_image_tokensn_image_featuresoutputsoutputr   r   r   r6   i   sb   
zVipLlavaModel.forwardr4   )NNNNNNNNNNNN)r   r   r   rD   FloatTensorr   r   r#   listrH   r   
LongTensorTensorbooltupler   r6   r   r   r   r   r9   I   s^    
	

r9   c                !   @   s   e Zd Z	ddejdeeeee f  fddZ															ddej
dejdeej d	eej
 d
eeej  deej deeeee f  deej
 dee dee dee dee deej
 deeejf deeef fddZdS ) VipLlavaForConditionalGenerationNr:   r"   c                 C   s   | j j||dS )NrS   )modelrH   )r0   r:   r"   r   r   r   rH      s   z3VipLlavaForConditionalGeneration.get_image_featuresr   rI   rJ   rK   rL   rM   labelsrN   rO   r;   rP   rQ   logits_to_keeprR   c                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| jd|||||||	||
|d|d|}|d }t|trLt| dn|}| 	|dd|ddf }d}|durm| j
||| j jjd}t|||j|j|j|jdS )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)rI   r:   rJ   rK   rL   rM   rN   r"   rO   r;   rP   rQ   r   )logitsrt   
vocab_size)lossrv   rL   r5   rU   rV   r   )r   rO   r;   rW   r"   rs   r!   r#   slicelm_headloss_functionr+   rw   r   rL   r5   rU   rV   )r0   rI   r:   rJ   rK   rL   rM   r"   rt   rN   rO   r;   rP   rQ   ru   rf   rj   r5   slice_indicesrv   rx   r   r   r   r6      sH   4z(VipLlavaForConditionalGeneration.forwardr4   )NNNNNNNNNNNNNr   )r   r   r   rD   rl   r   r   r#   rm   rH   rn   ro   rp   rq   r   r6   r   r   r   r   rr      sh    
	

rr   )r9   rr   r8   )typingr   r   rD   r   (transformers.models.llava.modeling_llavar   r   r   r   r	   activationsr   utilsr   r   r   configuration_vipllavar   
get_loggerr   loggerr   r   Moduler   r8   r9   rr   __all__r   r   r   r   <module>   s   
qf