o
    wiu                     @   s|  d dl Zd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 d dlZddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. e/e0Z1	dAde
j2dej3dej3dej3deej3 de4de4fddZ5G dd de%Z6G dd  d e#Z7eG d!d" d"eZ8eed#d$G d%d& d&eZ9G d'd( d(e
j2Z:G d)d* d*e
j2Z;G d+d, d,e!Z<e
j=e6d-Z>G d.d/ d/eZ?G d0d1 d1e
j2Z@eG d2d3 d3e8ZAG d4d5 d5e+ZBdZCG d6d7 d7e
j2ZDG d8d9 d9e*ZEG d:d; d;e)ZFG d<d= d=e'ZGG d>d? d?e(ZHg d@ZIdS )B    N)	dataclass)CallableOptionalUnion   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging	torch_int   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfig        modulequerykeyvalueattention_maskscalingdropoutc                 K   s   |}|}	t ||dd| }
|d ur+|d d d d d d d |jd f }|
| }
tjj|
dd}
tjj|
|| jd}
t |
|	}|dd	 }||
fS )Nr   r   dim)ptrainingr   )
torchmatmul	transposeshapenn
functionalsoftmaxr'   r-   
contiguous)r!   r"   r#   r$   r%   r&   r'   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_output r<   j/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forward0   s   
&r>   c                   @      e Zd ZdS )InternVLVisionRMSNormN__name__
__module____qualname__r<   r<   r<   r=   r@   K       r@   c                
       sT   e Zd Zdef fddZ		ddejdeej deej dee	 fd	d
Z
  ZS )InternVLVisionAttentionconfigc                    sT   t    | `d| _|j}|rt| jnt | _	|r#t| j| _
d S t | _
d S )NF)super__init__num_key_value_groups	is_causaluse_qk_normr@   	embed_dimr2   Identityq_normk_norm)selfrG   qk_norm	__class__r<   r=   rI   P   s   
"z InternVLVisionAttention.__init__Nhidden_statesr%   output_attentionsr6   c                 K   s"  |  \}}}| |}| |}	| |}
| |}| |	}	|||| j| j	dd}|	||| j| j	dd}	|

||| j| j	dd}
t}| jjdkrXt| jj }|| ||	|
|f| jsddn| j| jdd|\}}|||| j}| |}| |}|r||f}|S |d f}|S )Nr   r   eagerr    F)r'   r&   rK   )sizeq_projk_projv_projrO   rP   reshape	num_headshead_dimr0   viewr>   rG   _attn_implementationr   r-   attention_dropoutscalerM   projection_layerprojection_dropout)rQ   rU   r%   rV   r6   
batch_sizeseq_len_query_statesr7   r8   attention_interfacer;   r9   outputoutputsr<   r<   r=   forward[   s@   




	


zInternVLVisionAttention.forwardNN)rB   rC   rD   r   rI   r.   Tensorr   r   r   rl   __classcell__r<   r<   rS   r=   rF   O   s    rF   c                   @   s:   e Zd ZeZdZdZdZdgZdZ	dZ
dZdZdd ZdS )InternVLVisionPreTrainedModelinternvl_visionpixel_valuesTInternVLVisionLayerc                 C   s:  t |tjtjtjfr%|jjjd| jj	d |j
dur#|j
j  dS dS t |tjrH|jjjd| jj	d |jdurF|jj|j   dS dS t |tjr]|j
j  |jjd dS t |tr|jj  |jdurs|jj  |jdur|jj  dS dS t |tr|jj| jj |jj| jj dS dS )zInitialize the weightsr    meanstdN      ?)
isinstancer2   LinearConv2dConvTranspose2dweightdatanormal_rG   initializer_rangebiaszero_	Embeddingpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsrs   lambda_1layer_scale_init_valuelambda_2)rQ   r!   r<   r<   r=   _init_weights   s0   





z+InternVLVisionPreTrainedModel._init_weightsN)rB   rC   rD   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr   r<   r<   r<   r=   rp      s    rp   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                   @   s   e Zd ZdZdS )$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rB   rC   rD   __doc__r<   r<   r<   r=   r      s    r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}|d |d  |d |d   }|d |d  |d |d  f}|| _|| _|| _|| _|| _tj	||||d| _
d S )Nr   r   )kernel_sizestride)rH   rI   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper2   rz   
projection)rQ   rG   r   r   r   r   r   r   rS   r<   r=   rI      s   
  z&InternVLVisionPatchEmbeddings.__init__rr   returnc           	      C   s^   |j \}}}}|| jkrtd| |}|j d |j d }}|ddd}|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )r1   r   
ValueErrorr   flattenr0   )	rQ   rr   re   r   heightwidth
embeddingspatch_heightpatch_widthr<   r<   r=   rl      s   

z%InternVLVisionPatchEmbeddings.forward)	rB   rC   rD   r   rI   r.   rn   rl   ro   r<   r<   rS   r=   r      s    r   c                       sl   e Zd ZdZdeddf fddZdejded	edejfd
dZ		ddejde
ej dejfddZ  ZS )r   zc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rG   r   Nc                    s   t    ttdd|j| _|jr!ttdd|j| _	nd | _	t
|| _|j| _t|jtjjr8|jn|j|jf| _| jj}|jrUttd|d |j| _nd | _t|j| _d S )Nr   )rH   rI   r2   	Parameterr.   zerosr   r   use_mask_tokenr   r   patch_embeddingsr   rx   r   collectionsabcIterabler    use_absolute_position_embeddingsr   Dropouthidden_dropout_probr'   )rQ   rG   r   rS   r<   r=   rI      s    


z!InternVLVisionEmbeddings.__init__r   r   r   c                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| jd  }	|| jd  }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr)   r         ?r   r   bicubicF)rX   modealign_cornersr*   )r1   r   r.   jit
is_tracingr   r   r\   permuter2   r3   interpolater_   cat)rQ   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedr+   
new_height	new_widthsqrt_num_positionsr<   r<   r=   interpolate_pos_encoding   s(   

z1InternVLVisionEmbeddings.interpolate_pos_encodingrr   bool_masked_posc                 C   s   |j \}}}}| |\}\}}| \}	}
}|d ur5| j|	|
d}|d|}|d|  ||  }| j|	dd}tj	||fdd}| j
d urT|| ||| }| |}|||ffS )Nr)   r   r*   )r1   r   rX   r   expand	unsqueezetype_asr   r.   r   r   r   r'   )rQ   rr   r   rg   r   r   r   r   r   re   rf   mask_tokensw
cls_tokensr<   r<   r=   rl   &  s   

z InternVLVisionEmbeddings.forwardN)rB   rC   rD   r   r   rI   r.   rn   intr   r   
BoolTensorrl   ro   r<   r<   rS   r=   r      s    +r   c                   @   r?   )InternVLVisionMLPNrA   r<   r<   r<   r=   r   @  rE   r   )
layer_normrms_normc                       s`   e Zd ZdZdeddf fddZ	ddejd	ede	e
ej e
ejejf f fd
dZ  ZS )rs   z?This corresponds to the Block class in the timm implementation.rG   r   Nc                    s   t    |j| _d| _t|| _t|| _t|j	 |j
|jd| _t|j	 |j
|jd| _|j}tj|t|j
 dd| _tj|t|j
 dd| _t|j| _d S )Nr   epsT)requires_grad)rH   rI   chunk_size_feed_forwardseq_len_dimrF   	attentionr   mlpNORM2FN	norm_typer   layer_norm_epslayernorm_beforelayernorm_afterr   r2   r   r.   onesr   r   r   r   r'   )rQ   rG   init_valuesrS   r<   r=   rI   J  s   


zInternVLVisionLayer.__init__FrU   rV   c                 C   sl   | j | ||d\}}| j| }|| }| |}| |}| |}| jd ur.| j| }|| }||fS )N)rV   )r   r   r   r   r   r'   r   )rQ   rU   rV   attention_outputattention_weightslayer_outputr<   r<   r=   rl   Y  s   






zInternVLVisionLayer.forward)F)rB   rC   rD   r   r   rI   r.   rn   boolr   tuplerl   ro   r<   r<   rS   r=   rs   G  s    rs   c                       sT   e Zd Zdeddf fddZe		ddejded	ede	e
ef fd
dZ  ZS )InternVLVisionEncoderrG   r   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r<   )rs   ).0irG   r<   r=   
<listcomp>{  s    z2InternVLVisionEncoder.__init__.<locals>.<listcomp>F)	rH   rI   rG   r2   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrQ   rG   rS   r   r=   rI   x  s   
 
zInternVLVisionEncoder.__init__FrU   rV   output_hidden_statesc           	      C   sz   |rdnd }|r
dnd }t | jD ]\}}|r||f }|||}|d }|r.||d f }q|r6||f }t|||dS )Nr<   r   r   last_hidden_staterU   
attentions)	enumerater   r
   )	rQ   rU   rV   r   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputsr<   r<   r=   rl   ~  s"   


zInternVLVisionEncoder.forward)FF)rB   rC   rD   r   rI   r   r.   rn   r   r   r   r
   rl   ro   r<   r<   rS   r=   r   w  s    
r   c                       st   e Zd Zdeddf fddZdd Zee			ddej	d	e
ej d
e
e de
e deeef f
ddZ  ZS )InternVLVisionModelrG   r   Nc                    sT   t  | || _t|| _t|| _|jrt	 ntj
|j|jd| _|   d S )Nr   )rH   rI   rG   r   r   r   encoderuse_mean_poolingr2   rN   r   r   r   	layernorm	post_initr   rS   r<   r=   rI     s   

zInternVLVisionModel.__init__c                 C   s   | j jS r   )r   r   )rQ   r<   r<   r=   get_input_embeddings  s   z(InternVLVisionModel.get_input_embeddingsrr   r   rV   r   c           	      C   sn   |dur|n| j j}|dur|n| j j}| j||d\}}| j|||d}|d }| |}t||j|jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        N)r   )rV   r   r   r   )	rG   rV   r   r   r   r   r   rU   r   )	rQ   rr   r   rV   r   embedding_outputrg   encoder_outputssequence_outputr<   r<   r=   rl     s    
zInternVLVisionModel.forward)NNN)rB   rC   rD   r   rI   r   r   r   r.   rn   r   r   r   r   r   r   rl   ro   r<   r<   rS   r=   r     s&    
r   c                   @   s   e Zd Zdd ZdS )InternVLPreTrainedModelc                 C   s   t | jd| j j}t|tjr)|jjj	d|d |j
d ur'|j
j  d S d S t|tjr>|j
j  |jjd d S d S )Nr   r    rt   rw   )getattrrG   get_text_configr   rx   r2   ry   r|   r}   r~   r   r   r   r   )rQ   r!   rv   r<   r<   r=   r     s   
z%InternVLPreTrainedModel._init_weightsN)rB   rC   rD   r   r<   r<   r<   r=   r    s    r  c                       s*   e Zd Zdef fddZdd Z  ZS )InternVLMultiModalProjectorrG   c                    sz   t    t|jjtd|j d  | _t	|jjtd|j d  |j
j| _t|j | _t	|j
j|j
j| _d S )Nr   r   )rH   rI   r2   r   vision_configr   r   downsample_ratior   ry   text_configlinear_1r   projector_hidden_actactlinear_2r   rS   r<   r=   rI     s   
"z$InternVLMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S r   )r   r
  r  r  )rQ   image_featuresrU   r<   r<   r=   rl     s
   



z#InternVLMultiModalProjector.forward)rB   rC   rD   r   rI   rl   ro   r<   r<   rS   r=   r    s    	r  c                   @   r?   )InternVLModelOutputWithPastNrA   r<   r<   r<   r=   r    rE   r  c                #   @   s  e Zd ZddejdefddZ		ddejdee	e
ee
 f  d	ee fd
dZee													ddejdejdeej deej deeej  deej dee	e
ee
 f  d	ee dee dee dee dee deej dee de	eef fddZdS )InternVLModelr   vision_featuresscale_factorc              	   C   s   |  \}}}}|| dks|| dkrtd|||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )rX   r   r_   r   r   r5   )rQ   r  r  re   r   r   channelsr<   r<   r=   pixel_shuffle  s   $zInternVLModel.pixel_shuffleNrr   vision_feature_layervision_feature_select_strategyc           
      K   s   |dur|n| j j}|dur|n| j j}| j j}|dkr$| j|dj}n	| j|dj| }|dkr>|ddddddf }|jd }t	|d }|jd }	|
|	||d}| j||d}|
|	d|jd }| |}|S )	a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        Nr)   )rr   defaultr   r   r   )r  )rG   r  r  r  vision_towerr   vision_modelrU   r1   r   r\   r  multi_modal_projector)
rQ   rr   r  r  r6   r  r  r  feature_sizere   r<   r<   r=   get_image_features  s(   


z InternVLModel.get_image_features	input_idsr%   position_idspast_key_valuesinputs_embeds	use_cacherV   r   return_dictcache_positionr6   r   c                 K   s  |
d ur|
n| j j}
|d ur|n| j j}|d ur|n| j j}|d ur$|n| j j}|d ur.|n| j j}|d u |d uA r>td|d u rH|  |}|d ur| j|||d}|d u rw||  t	j
| j jt	j|jdk}|jddjddd }n|| j jkd}|||j}|| j jk }t s||  | kr|| j jk }|jd |jd  }td| d	| ||j|j}|||}| jd|||||	|
|d
|d	|}t|j|j|j|j|d ur|dS d dS )Nz:You must specify exactly one of input_ids or inputs_embeds)rr   r  r  )dtypedevicer   r*   r   r)   z6Image features and image tokens do not match: tokens: z, features T)	r%   r  r  r   r!  rV   r   r"  r#  )r   r  rU   r   image_hidden_statesr<   )rG   rV   r   use_return_dictr  r  r   r   r  r.   tensorimage_token_idlongr%  sumr   	expand_astor   numelr1   r$  masked_scatterlanguage_modelr  r   r  rU   r   )rQ   r  rr   r%   r  r  r   r  r  r!  rV   r   r"  r#  r6   r  special_image_maskn_image_tokensn_image_featuresrk   r<   r<   r=   rl   R  st   

zInternVLModel.forward)r   rm   )NNNNNNNNNNNNN)rB   rC   rD   r.   rn   floatr  FloatTensorr   r   r   liststrr  r   r   
LongTensorr   r   r   r   r  rl   r<   r<   r<   r=   r    sr    &
5	

r  c                   @   r?   )InternVLCausalLMOutputWithPastNrA   r<   r<   r<   r=   r9    rE   r9  c                       s   e Zd Z fddZ  ZS ) InternVLForConditionalGenerationc                     s   t  jdi |  dS )ai  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```Nr<   )rH   rl   )super_kwargsrS   r<   r=   rl     s   $z(InternVLForConditionalGeneration.forward)rB   rC   rD   rl   ro   r<   r<   rS   r=   r:    s    r:  )rp   r   r  r  r:  )r    )Jcollections.abcr   dataclassesr   typingr   r   r   r.   torch.nnr2   torch.utils.checkpointactivationsr   modeling_flash_attention_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   r   configuration_internvlr   r   
get_loggerrB   loggerModulern   r4  r>   r@   rF   rp   r   r   r   r   r   r   rs   r   r   r  INTERNVL_INPUTS_DOCSTRINGr  r  r  r9  r:  __all__r<   r<   r<   r=   <module>   sz   


8%	&^0&5 1(