"""PyTorch Llava model."""

from dataclasses import dataclass

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, logging, torch_compilable_check
from ...utils.generic import can_return_tuple, merge_with_config_defaults
from ..auto import AutoModel
from .configuration_llava import LlavaConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Llava outputs, with hidden states and attentions.
    """
)
class LlavaModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: torch.FloatTensor | None = None
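

# Typical access pattern for these outputs (a sketch, assuming `model` is a loaded `LlavaModel`
# and `inputs` comes from the matching `AutoProcessor`):
#
#   out = model(**inputs)        # LlavaModelOutputWithPast
#   out.last_hidden_state        # (batch, seq_len, hidden): final text-stream hidden states
#   out.image_hidden_states      # projected vision features, or None for text-only inputs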


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Llava causal language model (or autoregressive) outputs.
    """
)
class LlavaCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    past_key_values: Cache | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None
    image_hidden_states: torch.FloatTensor | None = None
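

# Reading these outputs from the generation model (sketch; `model` is assumed to be a
# `LlavaForConditionalGeneration`):
#
#   out = model(**inputs, labels=inputs["input_ids"])   # LlavaCausalLMOutputWithPast
#   out.loss                                            # scalar LM loss over the labeled tokens
#   out.logits                                          # (batch, seq_len, vocab_size)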


class LlavaMultiModalProjector(nn.Module):
    def __init__(self, config: LlavaConfig):
        super().__init__()
        # When several vision feature layers are selected, their features are concatenated,
        # so the projector input dimension grows accordingly.
        num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * num_feature_layers,
            config.text_config.hidden_size,
            bias=config.multimodal_projector_bias,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
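

# Shape sketch for the projector above (illustrative only; the sizes assume a hypothetical
# CLIP-L/14-336 vision tower with hidden size 1024 paired with a 4096-dim language model):
#
#   projector = LlavaMultiModalProjector(config)
#   patches = torch.randn(1, 576, 1024)   # (batch, num_patches, vision_hidden_size)
#   out = projector(patches)              # -> (1, 576, 4096): patch features in the LLM embed space
#
# With a list-valued `vision_feature_layer`, the selected layers are concatenated on the hidden
# dimension first, so `linear_1` expects `len(vision_feature_layer) * vision_config.hidden_size` inputs.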
dZdZdZdS )LlavaPreTrainedModelr,   model)imagetextTr'   N)r   r   r   r   r!   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr"   r"   r"   r#   rF   m   s   
 rF   zu


@auto_docstring(
    custom_intro="""
    The Llava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class LlavaModel(LlavaPreTrainedModel):
    _checkpoint_conversion_mapping = {"^language_model.model": "language_model"}

    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = LlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    @can_return_tuple
    @merge_with_config_defaults
    @auto_docstring(
        custom_intro="Obtains image last hidden states from the vision tower and apply multimodal projection."
    )
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        output_hidden_states: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, return_dict=True, **kwargs)

        # If a single feature layer is requested, select it directly; otherwise concatenate the
        # requested layers along the hidden dimension.
        if isinstance(vision_feature_layer, int):
            selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
            if vision_feature_select_strategy == "default":
                selected_image_feature = selected_image_feature[:, 1:]
        else:
            hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
            # For the "default" strategy, crop the CLS token from each hidden state in the pool
            if vision_feature_select_strategy == "default":
                hs_pool = [hs[:, 1:] for hs in hs_pool]
            selected_image_feature = torch.cat(hs_pool, dim=-1)

        image_features = self.multi_modal_projector(selected_image_feature)

        if kwargs.get("image_sizes") is not None:
            split_sizes = (
                (torch.as_tensor(kwargs["image_sizes"], device=image_features.device) // self.config.vision_config.patch_size)
                .prod(-1)
                .tolist()
            )
            image_features = torch.split(image_features.squeeze(0), split_sizes)
        else:
            image_features = list(image_features)

        image_outputs.pooler_output = image_features
        return image_outputs

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        n_image_features = image_features.shape[0] * image_features.shape[1]
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        torch_compilable_check(
            inputs_embeds[special_image_mask].numel() == image_features.numel(),
            f"Image features and image tokens do not match, tokens: {n_image_tokens}, features: {n_image_features}",
        )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        cache_position: torch.LongTensor | None = None,
        image_sizes: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | LlavaModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
                return_dict=True,
            ).pooler_output
            image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        return LlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
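

# End-to-end flow of `LlavaModel.forward` (sketch): the two modalities are merged in embedding
# space. CLIP-style towers prepend a CLS token at position 0, which the "default" selection
# strategy drops ("full" keeps it), and the processor expands each `<image>` in the prompt into
# one `config.image_token_id` per retained patch:
#
#   inputs_embeds = embed(input_ids)                             # (batch, seq, hidden)
#   feats = projector(vision_tower(pixel_values))                # projected patch features
#   inputs_embeds = inputs_embeds.masked_scatter(mask, feats)    # fill the <image> slots
#   out = language_model(inputs_embeds=inputs_embeds, ...)       # ordinary decoder forward
#
# where `mask` marks exactly the placeholder positions, as computed by `get_placeholder_mask`.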


@auto_docstring(
    custom_intro="""
    The LLAVA model which consists of a vision backbone and a language model.
    """
)
class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}

    def __init__(self, config: LlavaConfig):
        super().__init__(config)
        self.model = LlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    @can_return_tuple
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        return self.model.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            **kwargs,
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        vision_feature_layer: int | list[int] | None = None,
        vision_feature_select_strategy: str | None = None,
        labels: torch.LongTensor | None = None,
        cache_position: torch.LongTensor | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        image_sizes: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | LlavaCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return LlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        is_first_iteration=False,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            is_first_iteration=is_first_iteration,
            **kwargs,
        )

        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["LlavaForConditionalGeneration", "LlavaPreTrainedModel", "LlavaModel"]