# transformers/models/internvl/modeling_internvl.py

import collections.abc
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    ModelOutput,
    auto_docstring,
    can_return_tuple,
    is_torchdynamo_compiling,
    torch_int,
)
from ..auto import AutoModel
from .configuration_internvl import InternVLConfig, InternVLVisionConfig
@use_kernel_forward_from_hub("RMSNorm")
class InternVLVisionRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|d ur+|d d d d d d d |jd f }|
| }
tjj|
dd}
tjj|
|| jd}
t |
|	}|dd	 }||
fS )Nr   r   r/   dim)ptrainingr   )
r$   matmul	transposer<   r"   
functionalsoftmaxrJ   rO   
contiguous)rD   rE   rF   rG   rH   rI   rJ   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr-   r-   r.   eager_attention_forwardG   s   
&r[   c                
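# Editor's illustrative sketch (not part of the upstream file): the eager path
# consumes (batch, num_heads, seq_len, head_dim) tensors and hands back the
# attention output already transposed to (batch, seq_len, num_heads, head_dim),
# ready to be reshaped to (batch, seq_len, embed_dim) by the caller. Any
# nn.Module works as `module`; only its `training` flag is read.
def _eager_attention_shape_demo():
    batch, heads, seq, dim = 2, 4, 16, 32
    q, k, v = (torch.randn(batch, heads, seq, dim) for _ in range(3))
    out, weights = eager_attention_forward(nn.Module(), q, k, v, None, scaling=dim**-0.5)
    assert out.shape == (batch, seq, heads, dim)
    assert weights.shape == (batch, heads, seq, seq)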
class InternVLVisionAttention(nn.Module):
    """Attention Class for InternVL Vision Encoder"""

    def __init__(self, config: InternVLVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        proj_dropout = config.projection_dropout
        qk_norm = config.use_qk_norm
        self.is_causal = False

        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
        self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()

        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ):
        batch_size, seq_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            is_causal=False,
            **kwargs,
        )
        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)

        output = self.projection_layer(attn_output)
        output = self.projection_dropout(output)

        outputs = (output, attn_weights) if output_attentions else (output, None)
        return outputs
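# Editor's illustrative sketch (not part of the upstream file): driving the
# attention block end to end with a deliberately tiny configuration. The size
# overrides below are demo assumptions, not InternVL defaults.
def _vision_attention_demo():
    config = InternVLVisionConfig(hidden_size=64, num_attention_heads=4)
    attention = InternVLVisionAttention(config)
    hidden_states = torch.randn(2, 10, config.hidden_size)
    output, weights = attention(hidden_states, output_attentions=True)
    assert output.shape == hidden_states.shape
    assert weights.shape == (2, config.num_attention_heads, 10, 10)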
dZdZdd ZdS )InternVLVisionPreTrainedModelinternvl_visionpixel_valuesTInternVLVisionLayerc                 C   s:  t |tjtjtjfr%|jjjd| jj	d |j
dur#|j
j  dS dS t |tjrH|jjjd| jj	d |jdurF|jj|j   dS dS t |tjr]|j
j  |jjd dS t |tr|jj  |jdurs|jj  |jdur|jj  dS dS t |tr|jj| jj |jj| jj dS dS )zInitialize the weightsrC   r5   stdN      ?)
isinstancer"   rj   Conv2dConvTranspose2dr&   datanormal_r]   initializer_ranger_   zero_	Embeddingpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsr   lambda_1layer_scale_init_valuelambda_2)r(   rD   r-   r-   r.   _init_weights   s0   





z+InternVLVisionPreTrainedModel._init_weightsN)r?   r@   rA   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr   r-   r-   r-   r.   r      s    r   z7
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`InternVLVisionModel`].
    """
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """
class InternVLVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)
class InternVLVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)
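# Editor's illustrative sketch (not part of the upstream file): feeding a
# pixel grid larger than `config.image_size` exercises the bicubic
# interpolation of the position encodings; the output still carries one
# embedding per patch plus the CLS slot. Assumes the default square image
# size, so doubling the resolution stays divisible by the patch size.
def _interpolated_positions_demo():
    config = InternVLVisionConfig()
    embeddings = InternVLVisionEmbeddings(config)
    side = 2 * config.image_size[0]
    out, (patch_height, patch_width) = embeddings(torch.randn(1, config.num_channels, side, side))
    assert out.shape == (1, patch_height * patch_width + 1, config.hidden_size)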
class InternVLVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states
NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        attention_output, attention_weights = self.attention(
            self.layernorm_before(hidden_states),  # layernorm is applied before self-attention
            output_attentions=output_attentions,
        )
        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output, attention_weights
ef fd
dZ  ZS )InternVLVisionEncoderr]   r   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r-   )r   ).0ir]   r-   r.   
<listcomp>  s    z2InternVLVisionEncoder.__init__.<locals>.<listcomp>F)	r    r!   r]   r"   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r+   r   r.   r!     s   
 
zInternVLVisionEncoder.__init__Fr7   rv   output_hidden_statesc           	      C   sz   |rdnd }|r
dnd }t | jD ]\}}|r||f }|||}|d }|r.||d f }q|r6||f }t|||dS )Nr-   r   r   last_hidden_stater7   
attentions)	enumerater   r   )	r(   r7   rv   r   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputsr-   r-   r.   r:     s"   


zInternVLVisionEncoder.forward)FF)r?   r@   rA   r   r!   r   r$   r   r   r   r;   r   r:   rB   r-   r-   r+   r.   r     s    
@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@auto_docstring
class InternVLPreTrainedModel(PreTrainedModel):
    config_class = InternVLConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    _supports_cache_class = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
class InternVLMultiModalProjector(nn.Module):
    def __init__(self, config: InternVLConfig):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)

    def forward(self, image_features):
        hidden_states = self.layer_norm(image_features)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
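# Editor's illustrative sketch (not part of the upstream file): after pixel
# shuffle with downsample_ratio r, every surviving spatial location carries
# vision_hidden_size * int(1 / r) ** 2 channels, which is exactly the input
# width of the projector above. Assumes InternVLConfig is default-constructible.
def _projector_shape_demo():
    config = InternVLConfig()
    projector = InternVLMultiModalProjector(config)
    in_dim = config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2
    image_features = torch.randn(1, 256, in_dim)
    assert projector(image_features).shape == (1, 256, config.text_config.hidden_size)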
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for InternVL outputs, with hidden states and attentions.
    """
)
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None
@auto_docstring(
    custom_intro="""
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class InternVLModel(InternVLPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: InternVLConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = InternVLMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to final shape
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
        vision_features = self.multi_modal_projector(vision_features)

        return vision_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, InternVLModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
            )

            if input_ids is None:
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                n_image_tokens = special_image_mask.sum(dim=1).sum(dim=-1)[0]
            else:
                special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
                special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
                n_image_tokens = (input_ids == self.config.image_token_id).sum()

            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return InternVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape to allow downsampling
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )

        # Permute dimensions to rearrange the spatial blocks
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        # Reshape to achieve final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )

        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features
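# Editor's illustrative sketch (not part of the upstream file): pixel_shuffle
# never reads `self`, so its shape arithmetic can be checked without building
# a model. With the default scale_factor of 0.5, a 32x32 grid of 1024-dim
# features becomes a 16x16 grid of 4096-dim features: 4x fewer tokens, each
# 4x wider, so the total element count is preserved.
def _pixel_shuffle_shape_demo():
    features = torch.randn(2, 32, 32, 1024)
    shuffled = InternVLModel.pixel_shuffle(None, features, scale_factor=0.5)
    assert shuffled.shape == (2, 16, 16, 4096)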
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for InternVL causal language model (or autoregressive) outputs.
    """
)
class InternVLCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
@auto_docstring(
    custom_intro="""
    The INTERNVL model which consists of a vision backbone and a language model.
    """
)
class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: InternVLConfig):
        super().__init__(config)
        self.model = InternVLModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        return self.model.get_image_features(
            pixel_values=pixel_values,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            **kwargs,
        )

    # Make modules available through conditional class for BC
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[tuple, InternVLCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return InternVLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in cached decoding stage, pixel values should be None because input ids do not contain
            # special image tokens anymore. Otherwise pixel values need to be passed to the model.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

src  )r   r	  r  r*  rc  )rC   )Bcollections.abcr   dataclassesr   typingr   r   r   r$   torch.nnr"   activationsr   
generationr   integrationsr	   modeling_flash_attention_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_internvlr   r   ru  r   r   r^  r[   r\   r   r   r   r   r   r   r   r   r   r	  r  r  r&  r*  r_  rb  rc  __all__r-   r-   r-   r.   <module>   s    
K%	&^0&5 G N