o
    eiR                     @   sL  d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZ eeddG dd deZeeddG dd deZG dd dejZeG dd deZ eddG dd de Z!eddG d d! d!e e	Z"g d"Z#dS )#    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuple   )	AutoModel   )VipLlavaConfigzM
    Base class for VipLlava outputs, with hidden states and attentions.
    custom_introc                   @   s$   e Zd ZU dZdZejdB ed< dS )VipLlavaModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__ r    r    l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   &   s   
 r   zT
    Base class for VipLlava causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	VipLlavaCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r#   r   r   r   r$   r%   r   r&   tupler'   r   r    r    r    r!   r"   ;   s   
 r"   c                       s*   e Zd Zdef fddZdd Z  ZS )VipLlavaMultiModalProjectorconfigc                    s   t    t|jtrdnt|j}tj||jj	 |j
d| _tj||jj	 |jj	dd| _t|j | _tj|jj	|jj	dd| _d S )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr*   num_feature_layers	__class__r    r!   r/   Z   s   

z$VipLlavaMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)r8   r;   r=   r>   )r?   r&   r    r    r!   forwardi   s
   



z#VipLlavaMultiModalProjector.forward)r   r   r   r   r/   rD   __classcell__r    r    rA   r!   r)   Y   s    r)   c                   @   s:   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdZdS )VipLlavaPreTrainedModelr*   model)imagetextTr%   N)r   r   r   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr    r    r    r!   rF   q   s   
 rF   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    c                       sF  e Zd ZddiZdef fddZdd Zdd	 Zee	d
d		d"de
jdeee B dB dedB dee deeB f
ddZde
jde
jde
jfddZe													d#de
jdB de
jdB de
jdB de
jdB dedB de
jdB deee B dB dedB dedB dedB dedB de
jdB deeB fd d!Z  ZS )$VipLlavaModel^language_model.modellanguage_modelr*   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S rC   )r.   r/   r   from_configr5   vision_towerr)   multi_modal_projectorr:   rU   	post_initr?   r*   rA   r    r!   r/      s
   
zVipLlavaModel.__init__c                 C   
   | j  S rC   )rU   get_input_embeddingsr?   r    r    r!   r\         
z"VipLlavaModel.get_input_embeddingsc                 C      | j | d S rC   )rU   set_input_embeddingsr?   valuer    r    r!   r`         z"VipLlavaModel.set_input_embeddingszWObtains image last hidden states from the vision tower and apply multimodal projection.r   Npixel_valuesr1   output_hidden_stateskwargsreturnc                    s   |dur|n| j j}| j|fddd| t|tr) j| ddddf }n fdd|D }tj|dd}| |}| _	 S )	\  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        NT)re   return_dictr   c                    s&   g | ]} j | d d dd f qS )Nr   )r&   ).0indeximage_outputsr    r!   
<listcomp>   s   & z4VipLlavaModel.get_image_features.<locals>.<listcomp>)dim)
r*   r1   rW   r0   r2   r&   r   catrX   pooler_output)r?   rd   r1   re   rf   image_featuresr    rl   r!   get_image_features   s"   
	
z VipLlavaModel.get_image_features	input_idsinputs_embedsrs   c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)dtypedevicero   r   r   z6Image features and image tokens do not match, tokens: z, features: )r\   r   tensorr*   image_token_idlongrx   allsumshape	unsqueeze	expand_astor   numel)r?   ru   rv   rs   special_image_maskn_image_tokensn_image_featuresr    r    r!   get_placeholder_mask   s   z"VipLlavaModel.get_placeholder_maskattention_maskposition_idsr%   	use_cacheoutput_attentionsri   cache_positionc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4td|du r>|  |}|dura| j||ddj}|	|j
|j}| j|||d}|||}| jd||||||	|
d|d	|}t|j|j|j|j|dur|ndd}|r|S | S )	z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedsT)rd   r1   ri   )rv   rs   )	r   r   r%   rv   r   r   re   ri   r   )last_hidden_stater%   r&   r'   r   r    )r*   r   re   use_return_dictr1   
ValueErrorr\   rt   rr   r   rx   rw   r   masked_scatterrU   r   r   r%   r&   r'   to_tuple)r?   ru   rd   r   r   r%   rv   r1   r   r   re   ri   r   	lm_kwargsrs   r   outputsoutputr    r    r!   rD      sT   
zVipLlavaModel.forward)NN)NNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r/   r\   r`   r   r   r   r   r2   listboolr   r   r(   r	   rt   
LongTensorr   Tensorr   r   rD   rE   r    r    rA   r!   rS      s    %
	
rS   zV
    The VIPLLAVA model which consists of a vision backbone and a language model.
    c                "       sl  e Zd ZdddddZddiZdef fd	d
Zdd Zdd Zde	j
fddZe	d*dejdeee B dB dee deeB fddZee														d+dejdB dejdB dejdB dejdB dedB dejdB deee B dB dejdB dedB d edB d!edB d"edB d#ejdB d$eejB deeB fd%d&Z							'd, fd(d)	Z  ZS )- VipLlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)rT   z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr*   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr,   )r.   r/   rS   rG   r   r9   r:   r6   
vocab_sizer   rY   rZ   rA   r    r!   r/   .  s   
z)VipLlavaForConditionalGeneration.__init__c                 C   r[   rC   )rG   r\   r]   r    r    r!   r\   4  r^   z5VipLlavaForConditionalGeneration.get_input_embeddingsc                 C   r_   rC   )rG   r`   ra   r    r    r!   r`   7  rc   z5VipLlavaForConditionalGeneration.set_input_embeddingsrg   c                 C   s   | j S rC   )r   r]   r    r    r!   get_output_embeddings:  s   z6VipLlavaForConditionalGeneration.get_output_embeddingsNrd   r1   rf   c                 K   s   | j jd||d|S )rh   )rd   r1   Nr    )rG   rt   )r?   rd   r1   rf   r    r    r!   rt   =  s
   z3VipLlavaForConditionalGeneration.get_image_featuresr   ru   r   r   r%   rv   labelsr   r   re   ri   r   logits_to_keepc                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| jd|||||||	||
|d|d|}|d }t|trLt| dn|}| 	|dd|ddf }d}|durm| j
||| j jjd}t|||j|j|j|jdS )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)ru   rd   r   r   r%   rv   r   r1   r   re   ri   r   r   )r$   r   r   )r#   r$   r%   r&   r'   r   r    )r*   r   re   r   r1   rG   r0   r2   slicer   loss_functionr:   r   r"   r%   r&   r'   r   )r?   ru   rd   r   r   r%   rv   r1   r   r   r   re   ri   r   r   r   r   r&   slice_indicesr$   r#   r    r    r!   rD   O  sH   8z(VipLlavaForConditionalGeneration.forwardFc	              	      s>   t  j|f||||||d|	}
|s|	dds||
d< |
S )N)r%   rv   r   r   r   is_first_iterationr   Trd   )r.   prepare_inputs_for_generationget)r?   ru   r%   rv   rd   r   r   r   r   rf   model_inputsrA   r    r!   r     s   z>VipLlavaForConditionalGeneration.prepare_inputs_for_generationrC   )NNNNNNNNNNNNNr   )NNNNNNF)r   r   r   r   _tied_weights_keysr   r/   r\   r`   r   Moduler   r   r   r   r2   r   r   r   r(   r	   rt   r   r   r   r   r   r"   rD   r   rE   r    r    rA   r!   r      s    	
dr   )rS   r   rF   )$dataclassesr   r   r   activationsr   cache_utilsr   
generationr   modeling_outputsr   r	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   autor   configuration_vipllavar   r   r"   r   r)   rF   rS   r   __all__r    r    r    r!   <module>   sJ     1