o
    ei                     @   s  d dl Zd dl mZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z- edG dd dej.Z/	dHdej.dej0dej0dej0dej0dB de1de1fd d!Z2G d"d# d#ej.Z3ee!d$d%G d&d' d'eZ4G d(d) d)ej.Z5G d*d+ d+ej.Z6G d,d- d-ej.Z7ej8e/d.Z9G d/d0 d0eZ:G d1d2 d2ej.Z;e!G d3d4 d4eZ<e!G d5d6 d6e<Z=e!G d7d8 d8eZ>G d9d: d:ej.Z?ee!d;d%G d<d= d=eZ@e!d>d%G d?d@ d@e>ZAee!dAd%G dBdC dCeZBe!dDd%G dEdF dFe>eZCg dGZDdS )I    N)Callable)	dataclass   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringtorch_compilable_check	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )InternVLVisionRMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )zD
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer!   	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/internvl/modeling_internvl.pyr$   /   s   

zInternVLVisionRMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   T)keepdim)	dtypetor'   float32powmeanrsqrtr*   r)   )r+   r1   input_dtypevariancer/   r/   r0   forward7   s
   zInternVLVisionRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler)   shaper*   r+   r/   r/   r0   
extra_repr>   s   z InternVLVisionRMSNorm.extra_repr)r    )
__name__
__module____qualname__floatr$   r'   Tensorr<   r@   __classcell__r/   r/   r-   r0   r   -   s    r           modulequerykeyvalueattention_maskscalingdropoutc                 K   sx   |}|}	t ||dd| }
|d ur|
| }
tjj|
dd}
tjj|
|| jd}
t |
|	}|dd }||
fS )Nr   r   r2   dim)ptrainingr   )	r'   matmul	transposer%   
functionalsoftmaxrN   rR   
contiguous)rH   rI   rJ   rK   rL   rM   rN   kwargs
key_statesvalue_statesattn_weightsattn_outputr/   r/   r0   eager_attention_forwardB   s   
r]   c                       sL   e Zd ZdZdef fddZ	ddejdejdB dee	 fd	d
Z
  ZS )InternVLVisionAttentionz+Attention Class for InternVL Vision Encoderconfigc                    sB  t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _
|j}|j}d| _tj| j| j| j |jd| _tj| j| j| j |jd| _tj| j| j| j |jd| _t| j| j| _|dkrt|nt | _|rt| jnt | _|rt| j| _d S t | _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r#   r$   r_   r,   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr%   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr   q_normk_norm)r+   r_   proj_dropoutqk_normr-   r/   r0   r$   _   s.   

"z InternVLVisionAttention.__init__Nr1   rL   rX   c                 K   s  |  \}}}| |}| |}| |}	| |}| |}|||| j| j	dd}|||| j| j	dd}|	
||| j| j	dd}	t| jjt}
|
| |||	|f| js^dn| j| jdd|\}}|||| j}| |}| |}||fS )Nr   r   rG   F)rN   rM   rk   )sizern   ro   rp   rt   ru   reshaperd   re   rT   viewr   get_interfacer_   _attn_implementationr]   rR   rh   rg   rb   rq   ri   )r+   r1   rL   rX   
batch_sizeseq_len_query_statesrY   rZ   attention_interfacer\   r[   outputr/   r/   r0   r<   {   s:   




	


zInternVLVisionAttention.forwardN)rA   rB   rC   __doc__r   r$   r'   rE   r   r   r<   rF   r/   r/   r-   r0   r^   \   s    r^   z7
    Class for outputs of [`InternVLVisionModel`].
    custom_introc                   @   s   e Zd ZdZdS )$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rA   rB   rC   r   r/   r/   r/   r0   r      s    r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}|d |d  |d |d   }|d |d  |d |d  f}|| _|| _|| _|| _|| _tj	||||d| _
d S )Nr   r   )kernel_sizestride)r#   r$   
image_size
patch_sizenum_channelsr,   num_patchespatch_shaper%   Conv2d
projection)r+   r_   r   r   r   r,   r   r   r-   r/   r0   r$      s   
  z&InternVLVisionPatchEmbeddings.__init__pixel_valuesr"   c                 C   sL   |j \}}}}|| jkrtd| || jjj}|ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )	r>   r   rf   r   r5   r)   r4   flattenrT   )r+   r   r}   r   heightwidth
embeddingsr/   r/   r0   r<      s   
z%InternVLVisionPatchEmbeddings.forward)	rA   rB   rC   r   r$   r'   rE   r<   rF   r/   r/   r-   r0   r      s    r   c                       sl   e Zd ZdZdeddf fddZdejded	edejfd
dZ		ddejdej
dB dejfddZ  ZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    r_   r"   Nc                    s   t    ttdd|j| _|jr!ttdd|j| _	nd | _	t
|| _|j| _t|jtjjr8|jn|j|jf| _| jj}|jrUttd|d |j| _nd | _t|j| _d S )Nr   )r#   r$   r%   r&   r'   zerosr,   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsrr   hidden_dropout_probrN   )r+   r_   r   r-   r/   r0   r$      s    


z!InternVLVisionEmbeddings.__init__r   r   r   c                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| jd  }	|| jd  }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr2   r         ?r   r   bicubicF)rx   modealign_cornersrO   )r>   r   r'   jit
is_tracingr   r   ry   permuter%   rU   interpolaterz   cat)r+   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrP   
new_height	new_widthsqrt_num_positionsr/   r/   r0   interpolate_pos_encoding   s(   

z1InternVLVisionEmbeddings.interpolate_pos_encodingr   bool_masked_posc                 C   s   |j \}}}}| |}| \}}}|d ur1| j||d}	|d|	}
|d|
  |	|
  }| j|dd}tj	||fdd}| j
d urP|| ||| }| |}|S )Nr2   r   rO   )r>   r   rx   r   expand	unsqueezetype_asr   r'   r   r   r   rN   )r+   r   r   r   r   r   r   r}   r~   mask_tokensw
cls_tokensr/   r/   r0   r<     s   


z InternVLVisionEmbeddings.forwardr   )rA   rB   rC   r   r   r$   r'   rE   intr   
BoolTensorr<   rF   r/   r/   r-   r0   r      s    +r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )InternVLVisionMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S r   )r#   r$   r_   r   
hidden_actactivation_fnr%   rl   r,   intermediate_sizefc1fc2r+   r_   r-   r/   r0   r$   8  s
   
zInternVLVisionMLP.__init__r1   r"   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r+   r1   r/   r/   r0   r<   ?  s   


zInternVLVisionMLP.forward)rA   rB   rC   r$   r'   rE   r<   rF   r/   r/   r-   r0   r   7  s    r   )
layer_normrms_normc                       sT   e Zd ZdZdeddf fddZdejdeej eejejf B fdd	Z	  Z
S )
InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.r_   r"   Nc                    s   t    |j| _d| _t|| _t|| _t|j	 |j
|jd| _t|j	 |j
|jd| _|j}tj|t|j
 dd| _tj|t|j
 dd| _t|j| _d S )Nr   r!   T)requires_grad)r#   r$   chunk_size_feed_forwardseq_len_dimr^   	attentionr   mlpNORM2FN	norm_typer,   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer%   r&   r'   r(   lambda_1lambda_2rr   r   rN   )r+   r_   init_valuesr-   r/   r0   r$   L  s   


zInternVLVisionLayer.__init__r1   c                 C   sd   |  | |\}}| j| }|| }| |}| |}| |}| jd ur,| j| }|| }|S r   )r   r   r   r   r   rN   r   )r+   r1   attention_outputr   layer_outputr/   r/   r0   r<   [  s   





zInternVLVisionLayer.forward)rA   rB   rC   r   r   r$   r'   rE   r=   r<   rF   r/   r/   r-   r0   r   I  s    r   c                       s>   e Zd Zdeddf fddZdejdeeB fddZ	  Z
S )	InternVLVisionEncoderr_   r"   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r/   )r   ).0ir_   r/   r0   
<listcomp>{  s    z2InternVLVisionEncoder.__init__.<locals>.<listcomp>F)	r#   r$   r_   r%   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r-   r   r0   r$   x  s   
 
zInternVLVisionEncoder.__init__r1   c                 C   s   | j D ]}||}qt|dS )N)last_hidden_state)r   r   )r+   r1   layer_moduler/   r/   r0   r<   ~  s
   

zInternVLVisionEncoder.forward)rA   rB   rC   r   r$   r'   rE   r=   r   r<   rF   r/   r/   r-   r0   r   w  s    r   c                       s^   e Zd ZU eed< dZdZdZdZdgZ	dZ
dZdZdZeedZe  fdd	Z  ZS )
InternVLVisionPreTrainedModelr_   internvl_visionr   )imagevideoTr   )r1   
attentionsc                    s   t  | t|tr+t|j |jdurt|j |jdur)t|j dS dS t|t	rDt
|j| jj t
|j| jj dS dS )zInitialize the weightsN)r#   _init_weightsr   r   initzeros_r   r   r   r   	constant_r   r_   r   r   )r+   rH   r-   r/   r0   r     s   



z+InternVLVisionPreTrainedModel._init_weights)rA   rB   rC   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r^   _can_record_outputsr'   no_gradr   rF   r/   r/   r-   r0   r     s    
 r   c                       sf   e Zd Zdeddf fddZdd Zeedd	e	dd
e	j
de	jdB deeB fddZ  ZS )InternVLVisionModelr_   r"   Nc                    sT   t  | || _t|| _t|| _|jrt	 ntj
|j|jd| _|   d S )Nr   )r#   r$   r_   r   r   r   encoderuse_mean_poolingr%   rs   	LayerNormr,   r   	layernorm	post_initr   r-   r/   r0   r$     s   

zInternVLVisionModel.__init__c                 C   s   | j jS r   )r   r   r?   r/   r/   r0   get_input_embeddings  s   z(InternVLVisionModel.get_input_embeddingsF)tie_last_hidden_statesr   r   c                 K   s<   | j ||d}| |}|d }| |}t||j|jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   r1   r   )r   r   r   r   r1   r   )r+   r   r   rX   embedding_outputencoder_outputssequence_outputr/   r/   r0   r<     s   


zInternVLVisionModel.forwardr   )rA   rB   rC   r   r$   r   r   r   r   r'   rE   r   r=   r   r<   rF   r/   r/   r-   r0   r     s    r   c                   @   s:   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdZdS )InternVLPreTrainedModelr_   model)r   textr   Tpast_key_valuesN)rA   rB   rC   r   r   r   r   r   _skip_keys_device_placementr   r   _can_compile_fullgraphr   r   r/   r/   r/   r0   r    s   
 r  c                       s*   e Zd Zdef fddZdd Z  ZS )InternVLMultiModalProjectorr_   c                    sz   t    t|jjtd|j d  | _t	|jjtd|j d  |j
j| _t|j | _t	|j
j|j
j| _d S )Nr   r   )r#   r$   r%   r   vision_configr,   r   downsample_ratior   rl   text_configlinear_1r   projector_hidden_actactlinear_2r   r-   r/   r0   r$     s   
"z$InternVLMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S r   )r   r  r  r  )r+   image_featuresr1   r/   r/   r0   r<     s
   



z#InternVLMultiModalProjector.forward)rA   rB   rC   r   r$   r<   rF   r/   r/   r-   r0   r
    s    	r
  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   @   s$   e Zd ZU dZdZejdB ed< dS )InternVLModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)rA   rB   rC   r   r  r'   FloatTensorr   r/   r/   r/   r0   r    s   
 r  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c                       sN  e Zd ZddiZdef fddZdd Zdd	 Zee	e
d
d		d$dejdeee B dB dedB dee deeB f
ddZdejdejdejfddZee
									d%dejdB dejdB dejdB dejdB dedB dejdB deee B dB dedB dejdB dee deeB fddZd&d ejd!efd"d#Z  ZS )'InternVLModel^language_model.modellanguage_modelr_   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S r   )r#   r$   r   from_configr  vision_towerr
  multi_modal_projectorr  r  r   r   r-   r/   r0   r$     s
   
zInternVLModel.__init__c                 C   
   | j  S r   )r  r   r?   r/   r/   r0   r        
z"InternVLModel.get_input_embeddingsc                 C      | j | d S r   )r  set_input_embeddingsr+   rK   r/   r/   r0   r  !     z"InternVLModel.set_input_embeddingszWObtains image last hidden states from the vision tower and apply multimodal projection.r   Nr   vision_feature_layervision_feature_select_strategyrX   r"   c                 K   s   |j | jd}| jj}|dkrd|d< | jd|dd|}|dkr&|j}n|j| }|dkr<|ddddddf }|jd }t|d	 }	|jd
 }
|	|
|	|	d}| j
||d}|	|
d|jd }| |}||_|S )a!  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_layer (`int` or `list[int]`):
            Layer index or list of layer indices to extract features from.
        )r4   r2   Toutput_hidden_states)r   return_dictdefaultNr   r   r   )scale_factorr/   )r5   r4   r_   r  r  r   r1   r>   r   ry   pixel_shuffler  pooler_output)r+   r   r"  r#  rX   r  vision_outputsvision_featureschannelsfeature_sizer}   r/   r/   r0   get_image_features$  s&   



z InternVLModel.get_image_features	input_idsinputs_embedsr  c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)r4   devicer2   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r'   tensorr_   image_token_idlongr1  allsumr>   r   	expand_asr5   r   numel)r+   r/  r0  r  special_image_maskn_image_tokensn_image_featuresr/   r/   r0   get_placeholder_maskW  s   z"InternVLModel.get_placeholder_maskrL   position_idsr  cache_positionc
                 K   s   |d u |d uA rt d|d u r|  |}|d ur:| j|||ddj}||j|j}| j|||d}|||}| j	d|||||	d|
}t
|j|j|j|j|d urY|dS d dS )Nz:You must specify exactly one of input_ids or inputs_embedsT)r   r"  r#  r%  )r0  r  )rL   r=  r  r0  r>  )r   r  r1   r   r  r/   )rf   r   r.  r)  r5   r1  r4   r<  masked_scatterr  r  r   r  r1   r   )r+   r/  r   rL   r=  r  r0  r"  r#  r>  rX   r  r9  outputsr/   r/   r0   r<   o  sH   	
zInternVLModel.forwardr   r+  r'  c              	   C   s   |  \}}}}|| dks|| dkrtd|||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )rx   rf   rz   r   r   rW   )r+   r+  r'  r}   r   r   r,  r/   r/   r0   r(    s   $zInternVLModel.pixel_shuffleNN)	NNNNNNNNN)r   )rA   rB   rC   _checkpoint_conversion_mappingr   r$   r   r  r   r   r   r'   r  r   liststrr   r   r=   r   r.  
LongTensorr<  rE   r   r  r<   rD   r(  rF   r/   r/   r-   r0   r    s    .
	
1r  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	InternVLCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitsr  r1   r   r  )rA   rB   rC   r   rG  r'   r  r   rH  r  r   r1   r=   r   r  r/   r/   r/   r0   rF    s   
 rF  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c                        sl  e Zd ZdddddZddiZdef fd	d
Zdd Zdd Zde	j
fddZe		d(dejdeee B dB dedB dee deeB f
ddZee												d)dejdB dejdB dejdB dejdB dedB dejdB deee B dB dedB dejdB d ejdB d!eejB d"ejdB dee deeB fd#d$Z							%d* fd&d'	Z  ZS )+ InternVLForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)r  z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr_   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr`   )r#   r$   r  r  r%   rl   r  r,   
vocab_sizerJ  r   r   r-   r/   r0   r$     s   
z)InternVLForConditionalGeneration.__init__c                 C   r  r   )r  r   r?   r/   r/   r0   r     r  z5InternVLForConditionalGeneration.get_input_embeddingsc                 C   r  r   )r  r  r   r/   r/   r0   r    r!  z5InternVLForConditionalGeneration.set_input_embeddingsr"   c                 C   s   | j S r   )rJ  r?   r/   r/   r0   get_output_embeddings  s   z6InternVLForConditionalGeneration.get_output_embeddingsNr   r"  r#  rX   c                 K   s   | j jd|||d|S )N)r   r"  r#  r/   )r  r.  )r+   r   r"  r#  rX   r/   r/   r0   r.    s   z3InternVLForConditionalGeneration.get_image_featuresr   r/  rL   r=  r  r0  labelsr>  logits_to_keepimage_sizesc                 K   s   | j d|||||||||
|d
|}|d }t|tr"t| dn|}| |dd|ddf }d}|	durG| jd||	| jjjd|}t	|||j
|j|j|jdS )ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```)
r/  r   rL   r=  r  r0  r"  r#  r>  rO  r   N)rH  rM  rK  )rG  rH  r  r1   r   r  r/   )r  r   r   slicerJ  loss_functionr_   r  rK  rF  r  r1   r   r  )r+   r/  r   rL   r=  r  r0  r"  r#  rM  r>  rN  rO  rX   r@  r1   slice_indicesrH  rG  r/   r/   r0   r<     s@   5z(InternVLForConditionalGeneration.forwardFc	              	      s>   t  j|f||||||d|	}
|s|	dds||
d< |
S )N)r  r0  rL   r>  rN  is_first_iteration	use_cacheTr   )r#   prepare_inputs_for_generationget)r+   r/  r  r0  r   rL   r>  rN  rS  rX   model_inputsr-   r/   r0   rU  g  s   z>InternVLForConditionalGeneration.prepare_inputs_for_generationrA  )NNNNNNNNNNr   N)NNNNNNF)rA   rB   rC   rB  _tied_weights_keysr   r$   r   r  r%   ModulerL  r   r'   r  r   rC  rD  r   r   r=   r   r.  r   rE  rE   r   rF  r<   rU  rF   r/   r/   r-   r0   rI    s    	
XrI  )r   r   r  r  rI  )rG   )Ecollections.abcr   r   dataclassesr   r'   torch.nnr%    r   r   activationsr   cache_utilsr   
generationr   integrationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   autor   configuration_internvlr   r   rY  r   rE   rD   r]   r^   r   r   r   r   r   r   r   r   r   r   r  r
  r  r  rF  rI  __all__r/   r/   r/   r0   <module>   s   
I	%^. ) 6 "