o
    	Û·i†P  ã                   @   s<  d dl mZ d dlmZmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ eeddG dd„ deƒƒƒZeeddG dd„ deƒƒƒZG dd„ dejƒZeG dd„ deƒƒZeddG dd„ deƒƒZeddG dd „ d eeƒƒZg d!¢Z dS )"é    )Ú	dataclass)ÚOptionalÚUnionN)Únné   )ÚACT2FN)ÚCache)ÚGenerationMixin)ÚBaseModelOutputWithPastÚModelOutput)ÚPreTrainedModel)Úauto_docstringÚcan_return_tupleé   )Ú	AutoModelé   )ÚVipLlavaConfigzM
    Base class for VipLlava outputs, with hidden states and attentions.
    )Úcustom_introc                   @   s$   e Zd ZU dZdZeej ed< dS )ÚVipLlavaModelOutputWithPastaÏ  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    NÚimage_hidden_states)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚtorchÚFloatTensorÚ__annotations__© r   r   úd/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   &   s   
 r   zT
    Base class for VipLlava causal language model (or autoregressive) outputs.
    c                   @   s„   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	ÚVipLlavaCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    NÚlossÚlogitsÚpast_key_valuesÚhidden_statesÚ
attentionsr   )r   r   r   r   r    r   r   r   r   r!   r"   r   r#   Útupler$   r   r   r   r   r   r   ;   s   
 r   c                       s*   e Zd Zdef‡ fdd„Zdd„ Z‡  ZS )ÚVipLlavaMultiModalProjectorÚconfigc                    s†   t ƒ  ¡  t|jtƒrdnt|jƒ}tj||jj	 |j
d| _tj||jj	 |jj	dd| _t|j | _tj|jj	|jj	dd| _d S )Nr   )ÚepsT©Úbias)ÚsuperÚ__init__Ú
isinstanceÚvision_feature_layersÚintÚlenr   Ú	LayerNormÚvision_configÚhidden_sizeÚprojector_layernorm_epsÚprojector_layernormÚLinearÚtext_configÚlinear_1r   Úprojector_hidden_actÚactÚlinear_2)Úselfr'   Únum_feature_layers©Ú	__class__r   r   r,   Z   s   
ÿ
ýz$VipLlavaMultiModalProjector.__init__c                 C   s,   |   |¡}|  |¡}|  |¡}|  |¡}|S ©N)r5   r8   r:   r;   )r<   r#   r   r   r   Úforwardi   s
   


z#VipLlavaMultiModalProjector.forward)r   r   r   r   r,   rA   Ú__classcell__r   r   r>   r   r&   Y   s    r&   c                   @   s6   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdS )ÚVipLlavaPreTrainedModelr'   Ú Tr"   N)r   r   r   r   r   Úbase_model_prefixÚsupports_gradient_checkpointingÚ_skip_keys_device_placementÚ_supports_flash_attnÚ_supports_sdpaÚ_can_compile_fullgraphÚ_supports_flex_attnÚ_supports_attention_backendr   r   r   r   rC   q   s   
 rC   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    c                       s:  e Zd ZddiZdef‡ fdd„Zdd„ Zdd	„ Zd
d„ Zdd„ Z		d#de
jdeeeee f  fdd„Zde
jde
jde
jfdd„Ze												d$dee
j dee
j dee
j dee
j dee dee
j deeeee f  dee dee dee dee dee
j d eeef fd!d"„ƒZ‡  ZS )%ÚVipLlavaModelzlanguage_model.modelÚlanguage_modelr'   c                    s>   t ƒ  |¡ t |j¡| _t|ƒ| _t |j¡| _	|  
¡  d S r@   )r+   r,   r   Úfrom_configr2   Úvision_towerr&   Úmulti_modal_projectorr7   rN   Ú	post_init©r<   r'   r>   r   r   r,   ˆ   s
   
zVipLlavaModel.__init__c                 C   ó
   | j  ¡ S r@   )rN   Úget_input_embeddings©r<   r   r   r   rU      ó   
z"VipLlavaModel.get_input_embeddingsc                 C   ó   | j  |¡ d S r@   )rN   Úset_input_embeddings©r<   Úvaluer   r   r   rY   “   ó   z"VipLlavaModel.set_input_embeddingsc                 C   s
   || _ d S r@   ©rN   ©r<   Údecoderr   r   r   Úset_decoder–   rW   zVipLlavaModel.set_decoderc                 C   ó   | j S r@   r]   rV   r   r   r   Úget_decoder™   ó   zVipLlavaModel.get_decoderNÚpixel_valuesr.   c                    sv   |dur|n| j j}| j|dd‰ t|tƒr$ˆ j| dd…dd…f }n‡ fdd„|D ƒ}tj|dd}|  |¡}|S )	aW  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        NT)Úoutput_hidden_statesr   c                    s&   g | ]}ˆ j | d d …dd …f ‘qS )Nr   )r#   )Ú.0Úindex©Úimage_outputsr   r   Ú
<listcomp>¶   s   & z4VipLlavaModel.get_image_features.<locals>.<listcomp>éÿÿÿÿ)Údim)	r'   r.   rP   r-   r/   r#   r   ÚcatrQ   )r<   rd   r.   Úimage_featuresr   rh   r   Úget_image_featuresœ   s   ÿ

z VipLlavaModel.get_image_featuresÚ	input_idsÚinputs_embedsrn   c                 C   s¤   |du r||   ¡ tj| jjtj|jdƒk}| d¡}n|| jjk}| ¡ }| 	d¡ 
|¡ |j¡}|jd |jd  }||  ¡ | ¡ krPtd|› d|› ƒ‚|S )zï
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)ÚdtypeÚdevicerk   r   r   z6Image features and image tokens do not match: tokens: z, features )rU   r   Útensorr'   Úimage_token_idÚlongrs   ÚallÚsumÚ	unsqueezeÚ	expand_asÚtoÚshapeÚnumelÚ
ValueError)r<   rp   rq   rn   Úspecial_image_maskÚn_image_tokensÚn_image_featuresr   r   r   Úget_placeholder_mask»   s   ÿÿz"VipLlavaModel.get_placeholder_maskÚattention_maskÚposition_idsr"   Ú	use_cacheÚoutput_attentionsre   Úreturn_dictÚcache_positionÚreturnc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4tdƒ‚|du r>|  ¡ |ƒ}|dur_| j||d}| |j	|j
¡}| j|||d}| ||¡}| jd||||||	|
d|dœ	|¤Ž}t|j|j|j|j|dur€|ndd}|rˆ|S | ¡ S )	zÃ
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embeds©rd   r.   )rq   rn   T)	rƒ   r„   r"   rq   r…   r†   re   r‡   rˆ   )Úlast_hidden_stater"   r#   r$   r   r   )r'   r†   re   Úuse_return_dictr.   r~   rU   ro   r{   rs   rr   r‚   Úmasked_scatterrN   r   r‹   r"   r#   r$   Úto_tuple)r<   rp   rd   rƒ   r„   r"   rq   r.   r…   r†   re   r‡   rˆ   Ú	lm_kwargsrn   r   ÚoutputsÚoutputr   r   r   rA   Ó   sP   ÿÿÿÿ÷
öûzVipLlavaModel.forwardr@   )NNNNNNNNNNNN)r   r   r   Ú_checkpoint_conversion_mappingr   r,   rU   rY   r`   rb   r   r   r   r   r/   Úlistro   Ú
LongTensorr‚   r   ÚTensorr   Úboolr%   r   rA   rB   r   r   r>   r   rM   €   sx    ÿÿ
ÿÿÿ
ÿóþýüûúùø	÷
öõôó
ñrM   zV
    The VIPLLAVA model which consists of a vision backbone and a language model.
    c                #       s˜  e Zd ZdddddœZdgZdef‡ fdd	„Zd
d„ Zdd„ Zde	j
fdd„Zdd„ Zdd„ Z	d1dejdeeeee f  fdd„Zedd„ ƒZedd„ ƒZedd„ ƒZee														 d2d!eej deej d"eej d#eej d$ee d%eej deeeee f  d&eej d'ee d(ee d)ee d*ee d+eej d,eeejf deee f fd-d.„ƒƒZ!						d3‡ fd/d0„	Z"‡  Z#S )4Ú VipLlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorÚlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr'   c                    s<   t ƒ  |¡ t|ƒ| _tj|jj|jjdd| _	|  
¡  d S )NFr)   )r+   r,   rM   Úmodelr   r6   r7   r3   Ú
vocab_sizer˜   rR   rS   r>   r   r   r,   '  s   
z)VipLlavaForConditionalGeneration.__init__c                 C   rT   r@   )r™   rU   rV   r   r   r   rU   -  rW   z5VipLlavaForConditionalGeneration.get_input_embeddingsc                 C   rX   r@   )r™   rY   rZ   r   r   r   rY   0  r\   z5VipLlavaForConditionalGeneration.set_input_embeddingsr‰   c                 C   ra   r@   )r˜   rV   r   r   r   Úget_output_embeddings3  rc   z6VipLlavaForConditionalGeneration.get_output_embeddingsc                 C   rX   r@   )r™   r`   r^   r   r   r   r`   6  r\   z,VipLlavaForConditionalGeneration.set_decoderc                 C   rT   r@   )r™   rb   rV   r   r   r   rb   9  rW   z,VipLlavaForConditionalGeneration.get_decoderNrd   r.   c                 C   s   | j j||dS )NrŠ   )r™   ro   )r<   rd   r.   r   r   r   ro   <  s   z3VipLlavaForConditionalGeneration.get_image_featuresc                 C   ó   | j jS r@   )r™   rN   rV   r   r   r   rN   B  ó   z/VipLlavaForConditionalGeneration.language_modelc                 C   rœ   r@   )r™   rP   rV   r   r   r   rP   F  r   z-VipLlavaForConditionalGeneration.vision_towerc                 C   rœ   r@   )r™   rQ   rV   r   r   r   rQ   J  r   z6VipLlavaForConditionalGeneration.multi_modal_projectorr   rp   rƒ   r„   r"   rq   Úlabelsr…   r†   re   r‡   rˆ   Úlogits_to_keepc                 K   sö   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| jd|||||||	||
|d|dœ|¤Ž}|d }t|tƒrLt| dƒn|}|  	|dd…|dd…f ¡}d}|durm| j
||| j jjd}t|||j|j|j|jdS )a¢  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)rp   rd   rƒ   r„   r"   rq   r…   r.   r†   re   r‡   rˆ   r   )r!   rž   rš   )r    r!   r"   r#   r$   r   r   )r'   r†   re   rŒ   r.   r™   r-   r/   Úslicer˜   Úloss_functionr7   rš   r   r"   r#   r$   r   )r<   rp   rd   rƒ   r„   r"   rq   r.   rž   r…   r†   re   r‡   rˆ   rŸ   r   r   r#   Úslice_indicesr!   r    r   r   r   rA   N  sH   6ÿÿôóúz(VipLlavaForConditionalGeneration.forwardc           
         s8   t ƒ j|f|||||dœ|¤Ž}	|d dkr||	d< |	S )N)r"   rq   rƒ   rˆ   rŸ   r   rd   )r+   Úprepare_inputs_for_generation)
r<   rp   r"   rq   rd   rƒ   rˆ   rŸ   ÚkwargsÚmodel_inputsr>   r   r   r£   ¯  s   ÿúù
z>VipLlavaForConditionalGeneration.prepare_inputs_for_generationr@   )NNNNNNNNNNNNNr   )NNNNNN)$r   r   r   r’   Ú_tied_weights_keysr   r,   rU   rY   r   ÚModuler›   r`   rb   r   r   r   r   r/   r“   ro   ÚpropertyrN   rP   rQ   r   r   r”   r•   r   r–   r%   r   rA   r£   rB   r   r   r>   r   r—     sž    üÿÿ
ÿ


ñþýüûúùø	÷
öõôóòñ
ïbør—   )rM   r—   rC   )!Údataclassesr   Útypingr   r   r   r   Úactivationsr   Úcache_utilsr   Ú
generationr	   Úmodeling_outputsr
   r   Úmodeling_utilsr   Úutilsr   r   Úautor   Úconfiguration_vipllavar   r   r   r§   r&   rC   rM   r—   Ú__all__r   r   r   r   Ú<module>   sH   ÿÿÿ ÿ 1