o
    eib                     @   s  d dl Zd dl mZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 	dDdej1dej2dej2dej2dej2dB de3de3fddZ4G d d! d!e'Z5G d"d# d#e%Z6eed$d%G d&d' d'eZ7G d(d) d)ej1Z8G d*d+ d+ej1Z9G d,d- d-e#Z:ej;e5d.Z<G d/d0 d0eZ=G d1d2 d2ej1Z>eG d3d4 d4eZ?eG d5d6 d6e?Z@G d7d8 d8e-ZAdZBG d9d: d:ej1ZCG d;d< d<e,ZDG d=d> d>e+ZEG d?d@ d@e)ZFG dAdB dBe*ZGg dCZHdS )E    N)Callable)	dataclass   )initialization)ACT2FN)Cache)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfig        modulequerykeyvalueattention_maskscalingdropoutc                 K   sx   |}|}	t ||dd| }
|d ur|
| }
tjj|
dd}
tjj|
|| jd}
t |
|	}|dd }||
fS )Nr   r   dim)ptrainingr   )	torchmatmul	transposenn
functionalsoftmaxr'   r,   
contiguous)r!   r"   r#   r$   r%   r&   r'   kwargs
key_statesvalue_statesattn_weightsattn_output r9   k/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forward.   s   
r;   c                   @      e Zd ZdS )InternVLVisionRMSNormN__name__
__module____qualname__r9   r9   r9   r:   r=   H       r=   c                       sH   e Zd Zdef fddZ	d
dejdejdB dee fdd	Z	  Z
S )InternVLVisionAttentionconfigc                    sV   t  | | `d| _|j}|rt| jnt | _	|r$t| j| _
d S t | _
d S )NF)super__init__num_key_value_groups	is_causaluse_qk_normr=   	embed_dimr0   Identityq_normk_norm)selfrD   qk_norm	__class__r9   r:   rF   M   s   "z InternVLVisionAttention.__init__Nhidden_statesr%   r4   c                 K   s  |  \}}}| |}| |}| |}	| |}| |}|||| j| j	dd}|||| j| j	dd}|	
||| j| j	dd}	t| jjt}
|
| |||	|f| js^dn| j| jdd|\}}|||| j}| |}| |}||fS )Nr   r   r    F)r'   r&   rH   )sizeq_projk_projv_projrL   rM   reshape	num_headshead_dimr/   viewr   get_interfacerD   _attn_implementationr;   r,   attention_dropoutscalerJ   projection_layerprojection_dropout)rN   rR   r%   r4   
batch_sizeseq_len_query_statesr5   r6   attention_interfacer8   r7   outputr9   r9   r:   forwardX   s:   




	


zInternVLVisionAttention.forwardN)r?   r@   rA   r   rF   r-   Tensorr   r   rg   __classcell__r9   r9   rP   r:   rC   L   s    rC   z7
    Class for outputs of [`InternVLVisionModel`].
    custom_introc                   @      e Zd ZdZdS )$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)r?   r@   rA   __doc__r9   r9   r9   r:   rn      s    rn   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}|d |d  |d |d   }|d |d  |d |d  f}|| _|| _|| _|| _|| _tj	||||d| _
d S )Nr   r   )kernel_sizestride)rE   rF   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper0   Conv2d
projection)rN   rD   rs   rt   ru   rv   rw   rx   rP   r9   r:   rF      s   
  z&InternVLVisionPatchEmbeddings.__init__pixel_valuesreturnc                 C   sL   |j \}}}}|| jkrtd| || jjj}|ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )	shaperu   
ValueErrorrz   toweightdtypeflattenr/   )rN   r{   ra   ru   heightwidth
embeddingsr9   r9   r:   rg      s   
z%InternVLVisionPatchEmbeddings.forward)	r?   r@   rA   ro   rF   r-   ri   rg   rj   r9   r9   rP   r:   rp      s    rp   c                       sl   e Zd ZdZdeddf fddZdejded	edejfd
dZ		ddejdej
dB dejfddZ  ZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rD   r|   Nc                    s   t    ttdd|j| _|jr!ttdd|j| _	nd | _	t
|| _|j| _t|jtjjr8|jn|j|jf| _| jj}|jrUttd|d |j| _nd | _t|j| _d S )Nr   )rE   rF   r0   	Parameterr-   zerosrv   	cls_tokenuse_mask_token
mask_tokenrp   patch_embeddingsrt   
isinstancers   collectionsabcIterablerw    use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probr'   )rN   rD   rw   rP   r9   r:   rF      s    


z!InternVLVisionEmbeddings.__init__r   r   r   c                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| jd  }	|| jd  }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr(   r         ?r   r   bicubicF)rS   modealign_cornersr)   )r}   r   r-   jit
is_tracingrt   r   rW   permuter0   r1   interpolaterZ   cat)rN   r   r   r   rw   num_positionsclass_pos_embedpatch_pos_embedr*   
new_height	new_widthsqrt_num_positionsr9   r9   r:   interpolate_pos_encoding   s(   

z1InternVLVisionEmbeddings.interpolate_pos_encodingr{   bool_masked_posc                 C   s   |j \}}}}| |}| \}}}|d ur1| j||d}	|d|	}
|d|
  |	|
  }| j|dd}tj	||fdd}| j
d urP|| ||| }| |}|S )Nr(   r   r)   )r}   r   rS   r   expand	unsqueezetype_asr   r-   r   r   r   r'   )rN   r{   r   rc   r   r   r   ra   rb   mask_tokensw
cls_tokensr9   r9   r:   rg      s   


z InternVLVisionEmbeddings.forwardrh   )r?   r@   rA   ro   r   rF   r-   ri   intr   
BoolTensorrg   rj   r9   r9   rP   r:   r      s    +r   c                   @   r<   )InternVLVisionMLPNr>   r9   r9   r9   r:   r     rB   r   )
layer_normrms_normc                       sT   e Zd ZdZdeddf fddZdejdeej eejejf B fdd	Z	  Z
S )
InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.rD   r|   Nc                    s   t    |j| _d| _t|| _t|| _t|j	 |j
|jd| _t|j	 |j
|jd| _|j}tj|t|j
 dd| _tj|t|j
 dd| _t|j| _d S )Nr   epsT)requires_grad)rE   rF   chunk_size_feed_forwardseq_len_dimrC   	attentionr   mlpNORM2FN	norm_typerv   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer0   r   r-   oneslambda_1lambda_2r   r   r'   )rN   rD   init_valuesrP   r9   r:   rF     s   


zInternVLVisionLayer.__init__rR   c                 C   sd   |  | |\}}| j| }|| }| |}| |}| |}| jd ur,| j| }|| }|S rh   )r   r   r   r   r   r'   r   )rN   rR   attention_outputrc   layer_outputr9   r9   r:   rg   -  s   





zInternVLVisionLayer.forward)r?   r@   rA   ro   r   rF   r-   ri   tuplerg   rj   r9   r9   rP   r:   r     s    r   c                       s>   e Zd Zdeddf fddZdejdeeB fddZ	  Z
S )	InternVLVisionEncoderrD   r|   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r9   )r   ).0irD   r9   r:   
<listcomp>M  s    z2InternVLVisionEncoder.__init__.<locals>.<listcomp>F)	rE   rF   rD   r0   
ModuleListrangenum_hidden_layerslayergradient_checkpointingrN   rD   rP   r   r:   rF   J  s   
 
zInternVLVisionEncoder.__init__rR   c                 C   s   | j D ]}||}qt|dS )N)last_hidden_state)r   r	   )rN   rR   layer_moduler9   r9   r:   rg   P  s
   

zInternVLVisionEncoder.forward)r?   r@   rA   r   rF   r-   ri   r   r	   rg   rj   r9   r9   rP   r:   r   I  s    r   c                       s^   e Zd ZU eed< dZdZdZdZdgZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
InternVLVisionPreTrainedModelrD   internvl_visionr{   )imagevideoTr   )rR   
attentionsc                    s   t  | t|tr+t|j |jdurt|j |jdur)t|j dS dS t|t	rDt
|j| jj t
|j| jj dS dS )zInitialize the weightsN)rE   _init_weightsr   r   initzeros_r   r   r   r   	constant_r   rD   r   r   )rN   r!   rP   r9   r:   r   n  s   



z+InternVLVisionPreTrainedModel._init_weights)r?   r@   rA   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rC   _can_record_outputsr-   no_gradr   rj   r9   r9   rP   r:   r   \  s    
 r   c                       sf   e Zd Zdeddf fddZdd Zeedd	e	dd
e	j
de	jdB deeB fddZ  ZS )InternVLVisionModelrD   r|   Nc                    sT   t  | || _t|| _t|| _|jrt	 ntj
|j|jd| _|   d S )Nr   )rE   rF   rD   r   r   r   encoderuse_mean_poolingr0   rK   	LayerNormrv   r   	layernorm	post_initr   rP   r9   r:   rF     s   

zInternVLVisionModel.__init__c                 C   s   | j jS rh   )r   r   )rN   r9   r9   r:   get_input_embeddings  s   z(InternVLVisionModel.get_input_embeddingsF)tie_last_hidden_statesr{   r   c                 K   s<   | j ||d}| |}|d }| |}t||j|jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   rR   r   )r   r   r   rn   rR   r   )rN   r{   r   r4   embedding_outputencoder_outputssequence_outputr9   r9   r:   rg     s   


zInternVLVisionModel.forwardrh   )r?   r@   rA   r   rF   r   r   r   r   r-   ri   r   r   rn   rg   rj   r9   r9   rP   r:   r   }  s    r   c                   @   rm   )InternVLPreTrainedModel)r   textr   N)r?   r@   rA   r   r9   r9   r9   r:   r     s    r   c                       s*   e Zd Zdef fddZdd Z  ZS )InternVLMultiModalProjectorrD   c                    sz   t    t|jjtd|j d  | _t	|jjtd|j d  |j
j| _t|j | _t	|j
j|j
j| _d S )Nr   r   )rE   rF   r0   r   vision_configrv   r   downsample_ratior   Lineartext_configlinear_1r   projector_hidden_actactlinear_2r   rP   r9   r:   rF     s   
"z$InternVLMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S rh   )r   r   r   r   )rN   image_featuresrR   r9   r9   r:   rg     s
   



z#InternVLMultiModalProjector.forward)r?   r@   rA   r   rF   rg   rj   r9   r9   rP   r:   r     s    	r   c                   @   r<   )InternVLModelOutputWithPastNr>   r9   r9   r9   r:   r     rB   r   c                   @   s  e Zd ZddejdefddZeee	dd		dd	ej
d
eee B dB dedB dee deeB f
ddZee										ddejdB d	ej
dB dejdB dejdB dedB dej
dB d
eee B dB dedB dejdB dee deeB fddZdS )InternVLModelr   vision_featuresscale_factorc              	   C   s   |  \}}}}|| dks|| dkrtd|||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )rS   r~   rZ   r   r   r3   )rN   r  r  ra   r   r   channelsr9   r9   r:   pixel_shuffle  s   $zInternVLModel.pixel_shufflezWObtains image last hidden states from the vision tower and apply multimodal projection.rk   Nr{   vision_feature_layervision_feature_select_strategyr4   r|   c                 K   s   |j | jd}| jj}|dkrd|d< | jd|dd|}|dkr&|j}n|j| }|dkr<|ddddddf }|jd }t|d	 }	|jd
 }
|	|
|	|	d}| j
||d}|	|
d|jd }| |}||_|S )a!  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_layer (`int` or `list[int]`):
            Layer index or list of layer indices to extract features from.
        )r   r(   Toutput_hidden_states)r{   return_dictdefaultNr   r   r   )r  r9   )r   r   rD   r   vision_towerr   rR   r}   r   rW   r  multi_modal_projectorpooler_output)rN   r{   r  r  r4   r   vision_outputsr  r  feature_sizera   r9   r9   r:   get_image_features  s&   



z InternVLModel.get_image_features	input_idsr%   position_idspast_key_valuesinputs_embedscache_positionc
                 K   s   |d u |d uA rt d|d u r|  |}|d ur:| j|||ddj}||j|j}| j|||d}|||}| j	d|||||	d|
}t
|j|j|j|j|d urY|dS d dS )Nz:You must specify exactly one of input_ids or inputs_embedsT)r{   r  r  r  )r  r   )r%   r  r  r  r  )r   r  rR   r   image_hidden_statesr9   )r~   r   r  r  r   devicer   get_placeholder_maskmasked_scatterlanguage_modelr   r   r  rR   r   )rN   r  r{   r%   r  r  r  r  r  r  r4   r   special_image_maskoutputsr9   r9   r:   rg     sH   	
zInternVLModel.forward)r   )NN)	NNNNNNNNN)r?   r@   rA   r-   ri   floatr  r   r   r   FloatTensorr   liststrr   r   r   r
   r  
LongTensorr   r   rg   r9   r9   r9   r:   r     sl    #.	
r   c                   @   r<   )InternVLCausalLMOutputWithPastNr>   r9   r9   r9   r:   r!  O  rB   r!  c                       s   e Zd Z fddZ  ZS ) InternVLForConditionalGenerationc                     s   t  jdi |  dS )ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```Nr9   )rE   rg   )super_kwargsrP   r9   r:   rg   T  s   $z(InternVLForConditionalGeneration.forward)r?   r@   rA   rg   rj   r9   r9   rP   r:   r"  S  s    r"  )r   r   r   r   r"  )r    )Icollections.abcr   r   dataclassesr   r-   torch.nnr0    r   r   activationsr   cache_utilsr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   r   configuration_internvlr   r   Moduleri   r  r;   r=   rC   rn   rp   r   r   r   r   r   r   r   r   r   INTERNVL_INPUTS_DOCSTRINGr   r   r   r!  r"  __all__r9   r9   r9   r:   <module>   s|   

6	%^. ) (