o
    ei3                     @   s  d dl Z d dl mZ d dlmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ eeZG dd deZG dd deZG dd dejZG dd deZG dd deZ G dd deZ!g dZ"dS )    N)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)BaseModelOutputWithPooling)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )VipLlavaConfigc                   @      e Zd ZdS )VipLlavaModelOutputWithPastN__name__
__module____qualname__ r   r   k/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vipllava/modular_vipllava.pyr   &       r   c                   @   r   )VipLlavaCausalLMOutputWithPastNr   r   r   r   r   r   *   r   r   c                       s*   e Zd Zdef fddZdd Z  ZS )VipLlavaMultiModalProjectorconfigc                    s   t    t|jtrdnt|j}tj||jj	 |j
d| _tj||jj	 |jj	dd| _t|j | _tj|jj	|jj	dd| _d S )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r	   projector_hidden_actactlinear_2)selfr   num_feature_layers	__class__r   r   r"   /   s   

z$VipLlavaMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)r+   r.   r0   r1   )r2   hidden_statesr   r   r   forward>   s
   



z#VipLlavaMultiModalProjector.forward)r   r   r   r   r"   r8   __classcell__r   r   r4   r   r   .   s    r   c                   @   r   )VipLlavaPreTrainedModelNr   r   r   r   r   r:   F   r   r:   c                   @   s   e Zd Zeedd		ddejdeee B dB de	dB de
e deeB f
d	d
Ze												ddejdB dejdB dejdB dejdB dedB dejdB deee B dB de	dB de	dB de	dB de	dB dejdB deeB fddZdS )VipLlavaModelzWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introNpixel_valuesr$   output_hidden_stateskwargsreturnc                    s   |dur|n| j j}| j|fddd| t|tr) j| ddddf }n fdd|D }tj|dd}| |}| _	 S )	\  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        NT)r>   return_dictr   c                    s&   g | ]} j | d d dd f qS )Nr   )r7   ).0indeximage_outputsr   r   
<listcomp>m   s   & z4VipLlavaModel.get_image_features.<locals>.<listcomp>)dim)
r   r$   vision_towerr#   r%   r7   torchcatmulti_modal_projectorpooler_output)r2   r=   r$   r>   r?   image_featuresr   rE   r   get_image_featuresK   s"   
	
z VipLlavaModel.get_image_features	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrB   cache_positionc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4td|du r>|  |}|dura| j||ddj}|	|j
|j}| j|||d}|||}| jd||||||	|
d|d	|}t|j|j|j|j|dur|ndd}|r|S | S )	z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedsT)r=   r$   rB   )rU   rO   )	rR   rS   rT   rU   rV   rW   r>   rB   rX   )last_hidden_staterT   r7   
attentionsimage_hidden_statesr   )r   rW   r>   use_return_dictr$   
ValueErrorget_input_embeddingsrP   rN   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelr   rY   rT   r7   rZ   to_tuple)r2   rQ   r=   rR   rS   rT   rU   r$   rV   rW   r>   rB   rX   	lm_kwargsrO   special_image_maskoutputsoutputr   r   r   r8   t   sT   
zVipLlavaModel.forward)NN)NNNNNNNNNNNN)r   r   r   r   r   rK   FloatTensorr%   listboolr   r   tupler   rP   
LongTensorTensorr
   r   r8   r   r   r   r   r;   J   st    %	
r;   c                    @   s   e Zd Ze	ddejdeee B dB dee	 de
eB fddZ														dd	ejdB dejdB d
ejdB dejdB dedB dejdB deee B dB dejdB dedB dedB dedB dedB dejdB deejB de
eB fddZdS ) VipLlavaForConditionalGenerationNr=   r$   r?   r@   c                 K   s   | j jd||d|S )rA   )r=   r$   Nr   )modelrP   )r2   r=   r$   r?   r   r   r   rP      s
   z3VipLlavaForConditionalGeneration.get_image_featuresr   rQ   rR   rS   rT   rU   labelsrV   rW   r>   rB   rX   logits_to_keepc                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| jd|||||||	||
|d|d|}|d }t|trLt| dn|}| 	|dd|ddf }d}|durm| j
||| j jjd}t|||j|j|j|jdS )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)rQ   r=   rR   rS   rT   rU   rV   r$   rW   r>   rB   rX   r   )logitsrr   
vocab_size)lossrt   rT   r7   rZ   r[   r   )r   rW   r>   r\   r$   rq   r#   r%   slicelm_headloss_functionr-   ru   r   rT   r7   rZ   r[   )r2   rQ   r=   rR   rS   rT   rU   r$   rr   rV   rW   r>   rB   rX   rs   rf   rh   r7   slice_indicesrt   rv   r   r   r   r8      sH   6z(VipLlavaForConditionalGeneration.forwardr6   )NNNNNNNNNNNNNr   )r   r   r   r   rK   rj   r%   rk   r   r   rm   r   rP   rn   ro   r
   rl   r   r8   r   r   r   r   rp      sr    	
rp   )r;   rp   r:   )#rK   r   (transformers.models.llava.modeling_llavar   r   r   r   r   activationsr	   cache_utilsr
   modeling_outputsr   processing_utilsr   utilsr   r   r   utils.genericr   configuration_vipllavar   
get_loggerr   loggerr   r   Moduler   r:   r;   rp   __all__r   r   r   r   <module>   s$   
pu