o
    wi8M                    @   s(  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( e#)e*Z+dXde	j,de	j-dee. fddZ/	dYde	j0de	j-de	j1de.fddZ2dYddZ3ee!ddG dd  d e Z4ee!d!dG d"d# d#e Z5G d$d% d%ej6Z7	&dZd'ej6d(e	j,d)e	j,d*e	j,d+ee	j, d,e8d-e8fd.d/Z9G d0d1 d1ej6Z:G d2d3 d3ej6Z;G d4d5 d5eZ<G d6d7 d7ej6Z=G d8d9 d9ej6Z>G d:d; d;ej6Z?G d<d= d=ej6Z@G d>d? d?ej6ZAG d@dA dAeZBG dBdC dCej6ZCe!G dDdE dEeZDG dFdG dGeDZEG dHdI dIeDZFG dJdK dKeeZGe!dLdG dMdN dNeDeZHG dOdP dPej6ZIe!dQdG dRdS dSeDZJe!dTdG dUdV dVeDeZKg dWZLdS )[zPyTorch KOSMOS-2 model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tuplelogging	torch_int   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 C   sj   |   \}}|dur|n|}| ddddddf |d|||}d| }||tjt|jS )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r   r   r    bszsrc_lenexpanded_maskinverted_mask r.   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_mask,   s
   *r0   input_ids_shapedevicepast_key_values_lengthc                 C   s   | \}}t j||ft |j|d}t j|d|d}|||d |ddk d ||}|dkrFt j	t j
||||d|gdd}|ddddddf |d||| S )zB
    Make causal mask used for bi-directional self-attention.
    )r2   r   r   r   r2   dimN)r&   fullr(   r)   aranger"   masked_fill_viewr$   catzerosr#   )r1   r   r2   r3   r*   r    r   	mask_condr.   r.   r/   _make_causal_mask:   s   "
 (r?   c                 C   s6   |  | }tj|dd|| | }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r6   )neintr&   cumsumtype_aslong)	input_idspadding_idxr3   r   incremental_indicesr.   r.   r/   "create_position_ids_from_input_idsL   s   rH   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	ee
e
ej   ed< dZee
ej  ed< dZee
ej  ed< dZeej ed< dZee
ej  ed< dZeed	< d
e
e fddZdS )Kosmos2ModelOutputa  
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsprojection_attentionsvision_model_outputreturnc                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS )text_model_outputrQ   Ngetattrto_tuple.0kselfr.   r/   	<genexpr>   
    
z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>tuplekeysr]   r.   r]   r/   rY         zKosmos2ModelOutput.to_tuple)__name__
__module____qualname____doc__rK   r   r&   FloatTensor__annotations__rL   rb   rM   rN   rO   rP   rQ   r   r   rY   r.   r.   r.   r/   rJ   \   s   
 rJ   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeeej   ed< dZeeej  ed< dZeeej  ed< dZeej ed< dZeeej  ed	< dZeed
< dee fddZdS )*Kosmos2ForConditionalGenerationModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    NlosslogitsrL   rM   rN   rO   rP   rQ   rR   c                    rS   )Nc                 3   rT   rU   rW   rZ   r]   r.   r/   r_      r`   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>ra   r]   r.   r]   r/   rY      rd   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple)re   rf   rg   rh   rl   r   r&   ri   rj   rm   rL   rb   rM   rN   rO   rP   rQ   r   r   rY   r.   r.   r.   r/   rk      s   
 rk   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )Kosmos2VisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   r4   
persistent)super__init__ro   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr&   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr9   r#   r^   ro   	__class__r.   r/   rz      s"   
"z Kosmos2VisionEmbeddings.__init__
embeddingsheightwidthrR   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr4   g      ?r   ru   bicubicF)r"   modealign_cornersr6   )shaper   weight	unsqueezer&   jit
is_tracingrv   r~   r   reshapepermuter   
functionalinterpolater;   r<   )r^   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr7   
new_height	new_widthsqrt_num_positionsr.   r.   r/   interpolate_pos_encoding   s*   



z0Kosmos2VisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model ().r   ru   r   r4   r6   )r   r}   
ValueErrorr   r   r   r$   flatten	transposer   r#   r&   r<   r   r   rv   )r^   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   r.   r.   r/   forward   s    
zKosmos2VisionEmbeddings.forwardF)re   rf   rg   r   rz   r&   TensorrA   r   ri   r   __classcell__r.   r.   r   r/   rn      s     )rn           modulequerykeyvalueattention_maskscalingdropoutc           
      K   sp   t ||dd| }|d ur|| }tjj|dd}tjj||| jd}t ||}	|	dd }	|	|fS )Nr4   r6   ptrainingr   ru   )	r&   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr.   r.   r/   eager_attention_forward  s   
r   c                       sh   e Zd ZdZ fddZ			ddejdeej deej d	ee d
e	ejeej f f
ddZ
  ZS )Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)ry   rz   ro   r{   r|   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   r   r.   r/   rz   '  s$   

zKosmos2VisionAttention.__init__NFrM   r   causal_attention_maskoutput_attentionsrR   c              
   C   sL  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
| jj	dkrY|durR|durR|| }n|durX|}n|du| _
t}| jj	dkrz| jj	dkrt|rttd nt| jj	 }|| ||	|
|| j
| j| jsdn| jd	\}}|||| }| |}|sd}||fS )
#Input shape: Batch x Time x Channelr   ru   flash_attention_2Neagersdpa`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r   )r   r   r   r   r;   r   r   r   ro   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )r^   rM   r   r   r   r   
seq_lengthr|   queriesrc   valuesattention_interfacer   r   r.   r.   r/   r   ;  sH   	






zKosmos2VisionAttention.forward)NNF)re   rf   rg   rh   rz   r&   r   r   r'   rb   r   r   r.   r.   r   r/   r   $  s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Kosmos2VisionMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)ry   rz   ro   r	   
hidden_actactivation_fnr   r   r{   intermediate_sizefc1fc2r   r   r.   r/   rz   u  s
   
zKosmos2VisionMLP.__init__rM   rR   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   r^   rM   r.   r.   r/   r   |  s   


zKosmos2VisionMLP.forward)re   rf   rg   rz   r&   r   r   r   r.   r.   r   r/   r   t  s    r   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )Kosmos2VisionEncoderLayerro   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)ry   rz   r{   r|   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   r   r.   r/   rz     s   


z"Kosmos2VisionEncoderLayer.__init__FrM   r   r   r   rR   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rM   r   r   r   )r   r   r   r   )r^   rM   r   r   r   residualr   outputsr.   r.   r/   r     s"   




z!Kosmos2VisionEncoderLayer.forwardr   )re   rf   rg   r   rz   r&   r   r   r'   rb   ri   r   r   r.   r.   r   r/   r     s    r   c                       st   e Zd ZdZdef fddZ					ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    ro   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r.   )r   r[   r   ro   r.   r/   
<listcomp>      z1Kosmos2VisionEncoder.__init__.<locals>.<listcomp>F)	ry   rz   ro   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr   r   r   r/   rz     s   
 
zKosmos2VisionEncoder.__init__Nr   r   r   output_hidden_statesreturn_dictrR   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }|sgtdd |	||fD S t|	||dS )	a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr.   )r   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r.   )r[   vr.   r.   r/   r_   	  s    z/Kosmos2VisionEncoder.forward.<locals>.<genexpr>)rK   rM   rN   )ro   r   r  use_return_dict	enumerater   rb   r   )r^   inputs_embedsr   r   r   r  r  encoder_statesall_attentionsrM   idxencoder_layerlayer_outputsr.   r.   r/   r     s6   &

zKosmos2VisionEncoder.forwardNNNNN)re   rf   rg   rh   r   rz   r   r&   r   r'   r   rb   r   r   r   r.   r.   r   r/   r     s*    	
r   c                       sj   e Zd Zdef fddZ					ddeej dee dee d	ed
ee de	e
ef fddZ  ZS )Kosmos2VisionTransformerro   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )ry   rz   ro   r{   rn   r   r   r   r   pre_layrnormr   encoderpost_layernorm)r^   ro   r|   r   r.   r/   rz     s   


z!Kosmos2VisionTransformer.__init__NFr   r   r  r   r  rR   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|s[||	f|dd   S t	||	|j
|jdS )Nz You have to specify pixel_values)r   )r  r   r  r  r   r   )rK   pooler_outputrM   rN   )ro   r   r  r  r   r   r  r  r  r   rM   rN   )
r^   r   r   r  r   r  rM   encoder_outputsrK   pooled_outputr.   r.   r/   r     s2   

z Kosmos2VisionTransformer.forwardNNNFN)re   rf   rg   r   rz   r   r&   ri   r'   r   rb   r   r   r   r.   r.   r   r/   r    s(    
r  c                       s   e Zd ZdZddededee f fddZddededee fd	d
Zeddededee fddZ	e
 				ddee
j dee
j dedee
j fddZdd Z  ZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nr   embedding_dimrF   c                    s4   t    d| _|| _|| _| || j || d S )Nru   )ry   rz   offsetr  rF   make_weights)r^   r   r  rF   r   r.   r/   rz   K  s
   
z1Kosmos2TextSinusoidalPositionalEmbedding.__init__num_embeddingsc                 C   sB   |  |||}t| dr|j| jj| jjd}| jd|dd d S )Nweightsr5   Frw   )get_embeddinghasattrr$   r  r   r2   r   )r^   r  r  rF   emb_weightsr.   r.   r/   r  S  s   
z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsc                 C   s   |d }t d|d  }ttj|tjd |  }tj| tjd d|d }tjt	|t
|gdd| d}|d dkrUtj|t| dgdd}|durad||ddf< |t S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        ru   i'  r   r   r   r6   r4   N)mathlogr&   expr9   int64floatr   r<   sincosr;   r=   r$   get_default_dtype)r  r  rF   half_dimembr.   r.   r/   r  [  s   	 $&z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingr   rE   r  r3   rv   c                 C   s   |d ur|  \}}|d u rt|| j||j}n|  d d \}}|d u r.| ||}| jd | | }|| j dkrK| || j | j	| j | j
d|d||| jjd  S )Nr4   r   r   )r"   rH   rF   r$   r2   &create_position_ids_from_inputs_embedsr  r  r  r  index_selectr;   r   detach)r^   rE   r  r3   rv   r*   seq_lenmax_posr.   r.   r/   r   q  s    *z0Kosmos2TextSinusoidalPositionalEmbedding.forwardc                 C   sV   |  dd }|d }tj| jd || j d tj|jd}|d| | S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr4   r   r5   r   )	r"   r&   r9   rF   rD   r2   r   r#   r   )r^   r  r3   input_shapesequence_lengthrv   r.   r.   r/   r)    s   	zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embedsr   )NNr   N)re   rf   rg   rh   rA   r   rz   r  staticmethodr  r&   no_gradr   r   r)  r   r.   r.   r   r/   r  G  s*     r  c                       s   e Zd ZdZ				ddedededed	ed
ef fddZdej	dej	fddZ
					ddej	deej	 deeej	  deej	 deej	 dedeej	eej	 eeej	  f fddZ  ZS )KosmosTextAttentionr   r   FTr|   r   r   
is_decoderadd_inner_attn_layernormrt   c                    s   t    || _|| _|| _|| _|| | _| j| | jkr*td| j d| d| jd | _|| _	t
j|||d| _t
j|||d| _t
j|||d| _t
j|||d| _d | _|rgt
j||jd| _d S d S )Nr   r   r   r   )rt   r   )ry   rz   ro   r|   r   r   r   r   r   r3  r   r   r   r   r   r   inner_attn_lnr   r   )r^   ro   r|   r   r   r3  r4  rt   r   r.   r/   rz     s,   



zKosmosTextAttention.__init__
projectionrR   c                 C   s6   |  d d | j| jf }||dddd}|S )Nr4   r   ru   r   r   )r"   r   r   r;   r   )r^   r6  new_projection_shapenew_projectionr.   r.   r/   _shape  s   zKosmosTextAttention._shapeNrM   encoder_hidden_statespast_key_valuer   layer_head_maskr   c                 K   st  |du}|j dd \}	}
|dur|n|}|r.|r.|d j d |j d kr.|d }|d }n,| | |}| | |}|durZ|sZtj|d |gdd}tj|d |gdd}| | |}| jri||f}t}| j	j
dkr| j	j
dkr|rtd nt| j	j
 }|| ||||f| jsd	n| j| jd
|\}}||	|
d }| jdur| |}| |}|||fS )r   Nru   r   r   r6   r   r   r   r   )r   r   r4   )r   r9  r   r   r&   r<   r   r3  r   ro   r   r   r   r   r   r   r   r   r   r5  r   )r^   rM   r:  r;  r   r<  r   r   is_cross_attentionr   r   current_states
key_statesvalue_statesquery_statesr   r   r   r.   r.   r/   r     sL    





zKosmosTextAttention.forward)r   FFT)NNNNF)re   rf   rg   rh   rA   r#  r'   rz   r&   r   r9  r   rb   r   r   r.   r.   r   r/   r2    sP    #		r2  c                       s*   e Zd Zdef fddZdd Z  ZS )Kosmos2TextFFNro   c                    sb   t    |j| _t|j | _|j| _t|j	|j
| _t|j
|j	| _tj|j
|jd| _d S r   )ry   rz   r   r	   activation_functionr   activation_dropoutr   r   r|   ffn_dimr   r   r   r   ffn_layernormr   r   r.   r/   rz     s   
zKosmos2TextFFN.__init__c                 C   sT   |  | |}tjj|| j| jd}| |}| |}tjj|| j| jd}|S )Nr   )	r   r   r   r   r   rD  r   rF  r   r   r.   r.   r/   r   %  s   

zKosmos2TextFFN.forward)re   rf   rg   r   rz   r   r   r.   r.   r   r/   rB    s    rB  c                       s   e Zd Zdef fddZ								ddejdeej d	eej d
eej deej deej deeej  dee	 dee	 deej
eeej
ej
f  f fddZ  ZS )Kosmos2TextBlockro   c                    s   t    |j| _t|| j|j|jddd| _|j| _tj	| j|j
d| _|jr@t|| j|j|jddd| _tj	| j|j
d| _t|| _tj	| j|j
d| _d S )NT)r|   r   r   r3  r4  r   F)ry   rz   r|   r2  attention_headsr   r   r   r   r   r   self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrB  ffnfinal_layer_normr   r   r.   r/   rz   0  s0   

zKosmos2TextBlock.__init__NFTrM   r   r:  encoder_attention_maskr<  cross_attn_layer_head_maskr;  r   	use_cacherR   c
              	   K   sH  |}|d ur|d d nd }|  |}| jd	|||||d|
\}}}tjj|| j| jd}|| }d }d }|d urt| dsHtd|  d|}| |}|d urY|dd  nd }| j	d	||||||d|
\}}}tjj|| j| jd}|| }|| }|}| 
|}| |}|| }|f}|r|||f7 }|	r||f7 }|S )
Nru   )rM   r;  r   r<  r   r   rK  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )rM   r:  r   r<  r;  r   r.   )rI  r   r   r   r   r   r  r   rL  rK  rN  rM  )r^   rM   r   r:  rO  r<  rP  r;  r   rQ  r   r   self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer   r.   r.   r/   r   M  s^   



	


zKosmos2TextBlock.forward)NNNNNNFT)re   rf   rg   r   rz   r&   r   r   rb   r'   ri   r   r   r.   r.   r   r/   rG  /  s>     	
rG  c                &       sD  e Zd ZdZdef fddZdd Z					d d	eej	 d
eej	 deej	 de
deej	 f
ddZe															d!deej	 deej	 d
eej	 deej	 deej	 deej	 deej	 deej	 deeej  d	eej	 deej	 dee dee dee dee dee deeef f"ddZ  ZS )"Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    ro   c                    s   t     | _ j| _ j| _ jrt jnd| _	t
j j j jd| _t j j jd| _t
 fddt jD | _t
 j j| _d| _d S )Nr!   )rF   )r   r  rF   c                    r   r.   )rG  r   r   r.   r/   r     r   z3Kosmos2TextTransformer.__init__.<locals>.<listcomp>F)ry   rz   ro   r   	layerdropscale_embeddingr  sqrtr|   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr  max_position_embeddingsembed_positionsr   r   r   r   r   
layer_normr  r   r   r   r/   rz     s   
 
zKosmos2TextTransformer.__init__c                 C   s`   d }|d dkrt ||j|j|d}|d ur.t||j|d d|j}|d u r*|n|| }|S )Nr4   r   )r2   r3   r    )r?   r   r2   r0   r$   )r^   r   r.  r  r3   combined_attention_maskexpanded_attn_maskr.   r.   r/   _prepare_decoder_attention_mask  s   z6Kosmos2TextTransformer._prepare_decoder_attention_maskNr   r  rO   img_input_maskr3   rv   c           	      C   s   |d u r	|  |}|d ur!||jd|d||jtjd< || j }| j||||d}||j}|| }t	j
j|| j| jd}|S )Nr4   r   )rE   r  r3   rv   r   )r_  r$   r2   r;   r"   r&   r'   r\  ra  r   r   r   r   )	r^   rE   r  rO   rg  r3   rv   	positionsrM   r.   r.   r/   forward_embedding  s"   



z(Kosmos2TextTransformer.forward_embeddingrE   r   image_embeds_position_maskr:  rO  	head_maskcross_attn_head_maskrL   rQ  r   r  r  r   rR   c              
   K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur$|n| j j}|d ur4|
d ur4td|d urD|j}|d|d }n|
d urQ|
 d d }ntd|	d urb|	d d jd nd}|dkrld }d }| j	||
||||d}| 
||||}|d ur|d urt||
j|d d}tjj|| j| jd}| jr| jr|rtd	 d
}|rdnd }|rdnd }|r|d urdnd }|rdnd }t||gddgD ](\}}|d ur| d t| jkrtd| dt| j d| d  dqt| jD ]s\}}|r
||f7 }| jrtg }|| jk rq|	d ur#|	| nd }||||f||d ur4|| nd |d ur>|| nd |||d|}|d }|r[|||rVdnd f7 }|rq||d f7 }|d urq||d f7 }q| |}|r||f7 }t|||||dS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer4   z5You have to specify either input_ids or inputs_embedsr   ru   )rE   r  rO   rg  r3   rv   rc  r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr.   rk  rl  zThe `z` should be specified for z layers, but it is for .)rO  r<  rP  r;  r   rQ  r   r   )rK   rL   rM   rN   cross_attentions)ro   r   r  rQ  r  r   r   r;   r"   ri  rf  r0   r   r   r   r   r   r  r   r   ziplenr   r  r&   randrY  rb  r   )r^   rE   r   rO   rj  r:  rO  rk  rl  rL   r  rv   rQ  r   r  r  r   r.  r3   rM   all_hidden_statesall_self_attnsall_cross_attentionspresent_key_value_states	attn_mask	mask_namer
  decoder_layerdropout_probabilityr;  r  r.   r.   r/   r     s   	






zKosmos2TextTransformer.forward)NNNr   NNNNNNNNNNNNNNNN)re   rf   rg   rh   r   rz   rf  r   r&   r   rA   ri  r   listri   r'   r   r   r   rb   r   r   r   r.   r.   r   r/   rX    s    
#	

rX  c                   @   s0   e Zd ZeZdZddgZdZdZdZ	dd Z
dS )Kosmos2PreTrainedModelTr   rG  c                 C   s  t | tr
| jj}nt | ttfr| jjj}t | ttfr"| jj	}nt | ttfr.| jj
j	}t |tr`tjj|jd|jd | d tjj|jj|jj| d tjj|jj|jj| d dS t |tr|jd d|jj d  | }|jd | }tjj|jj|d tjj|jj|d tjj|jj|d tjj|jj|d |jjdur|jjj  |jjdur|jjj  |jjdur|jjj  |jjdur|jjj  dS dS t |tr/|jjd d|jj d  | }d|jj d | }tjj|j j|d tjj|j!j|d |j jdur|j jj  |j!jdur-|j!jj  dS dS t |t"rU|j#jj  |j#jj$d |j%jj  |j%jj$d dS t |t&r{|j'jj  |j'jj$d |j(jj  |j(jj$d dS t |t)rtjj|jj|d tjj|jj|d tjj|jj|d tjj|jj|d |jjdur|jjj  |jjdur|jjj  |jjdur|jjj  |jjdur|jjj  dS dS t |t*rtjj|j j|d tjj|j!j|d |j jdur|j jj  |j!jdur|j!jj  dS dS t |trAtjj|j+j|d |j+jdur?|j+jj  dS dS t |t,rctjj|j-j|d |j-jdura|j-jj  dS dS t |t.r|j/jjjd|d |j/j0dur|j/jj|j/j0   dS dS dS )zInitialize the weightsr   r   )meanstd)r~  ru   Nr!   )1
isinstanceKosmos2VisionModelro   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configrn   r   initnormal_r   r|   r   r   initializer_ranger   r   r   r   r   r   r   rt   datazero_r   r{   r   r   r   r   fill_r   r  r  r  r2  rB  lm_headKosmos2ImageToTextProjectiondenserX  r_  rF   )r^   r   factorr~  in_proj_stdout_proj_stdfc_stdr.   r.   r/   _init_weights  s   





 
 z$Kosmos2PreTrainedModel._init_weightsN)re   rf   rg   r   config_classsupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_2_supports_sdpar  r.   r.   r.   r/   r|    s    r|  c                       s   e Zd ZeZdZdef fddZdejfddZ	e
						ddeej d
ee dee dedee deeef fddZ  ZS )r  r   ro   c                    "   t  | t|| _|   d S r   )ry   rz   r  model	post_initr   r   r.   r/   rz        
zKosmos2VisionModel.__init__rR   c                 C   
   | j jjS r   )r  r   r   r]   r.   r.   r/   get_input_embeddings     
z'Kosmos2VisionModel.get_input_embeddingsNFr   r  r   r  c                 C   s   | j |||||dS )N)r   r   r  r   r  r  )r^   r   r   r  r   r  r.   r.   r/   r     s   	zKosmos2VisionModel.forwardr  )re   rf   rg   r   r  main_input_namerz   r   Moduler  r   r   r&   ri   r'   r   rb   r   r   r   r.   r.   r   r/   r    s0    
r  c                '       s  e Zd ZeZdef fddZdejfddZdd Z	e
e																														dd
eej deej deej deej deej deej deej deej deeej  deej deej dee dee dee dee dee deeef f"ddZ  ZS )r  ro   c                    r  r   )ry   rz   rX  r  r  r   r   r.   r/   rz     r  zKosmos2TextModel.__init__rR   c                 C      | j jS r   r  r_  r]   r.   r.   r/   r       z%Kosmos2TextModel.get_input_embeddingsc                 C      || j _d S r   r  r^   r   r.   r.   r/   set_input_embeddings
     z%Kosmos2TextModel.set_input_embeddingsNrE   r   rO   rj  r:  rO  rk  rl  rL   r  rv   rQ  r   r  r  r   c                 K   s0   | j d|||||||||	|
|||||d|S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        rE   r   rO   rj  r:  rO  rk  rl  rL   r  rv   rQ  r   r  r  Nr.   r  )r^   rE   r   rO   rj  r:  rO  rk  rl  rL   r  rv   rQ  r   r  r  r   r.   r.   r/   r     s&   $zKosmos2TextModel.forwardrz  )re   rf   rg   r   r  rz   r   r  r  r  r   r   r   r&   r   r{  ri   r'   r   r   r   rb   r   r   r   r.   r.   r   r/   r    sr    	

r  c                   @   s   e Zd ZdS )KwargsForCausalLMN)re   rf   rg   r.   r.   r.   r/   r  E  s    r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                )       sf  e Zd ZeZdgZdef fddZdejfddZ	dd	 Z
dejfd
dZdd Zee																d&deej deej deej deej deej deej deej deej deeej  deej deej deej dee dee dee dee dee deeef f$d d!Z						d' fd"d#	Zed$d% Z  ZS )(r  zlm_head.weightro   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NF)in_featuresout_featuresrt   )
ry   rz   rX  r  r   r   r|   r]  r  r  r   r   r.   r/   rz   R  s   
zKosmos2TextForCausalLM.__init__rR   c                 C   r  r   r  r]   r.   r.   r/   r  [  r  z+Kosmos2TextForCausalLM.get_input_embeddingsc                 C   r  r   r  r  r.   r.   r/   r  ^  r  z+Kosmos2TextForCausalLM.set_input_embeddingsc                 C   s   | j S r   r  r]   r.   r.   r/   get_output_embeddingsa  s   z,Kosmos2TextForCausalLM.get_output_embeddingsc                 C   s
   || _ d S r   r  r^   new_embeddingsr.   r.   r/   set_output_embeddingsd  r  z,Kosmos2TextForCausalLM.set_output_embeddingsNrE   r   rO   rj  r:  rO  rk  rl  rL   r  rv   labelsrQ  r   r  r  r   c                 K   s   |dur|n| j j}|dur|rtd d}| jd	|||||||||	|
||||dd|}| |d }d}|durJ| jd	||| j jd|}t|||j	|j
|j|jdS )
aK  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.FTr  r   )rm   r  r]  )rl   rm   rL   rM   rN   rn  r.   )ro   r  r   warningr  r  loss_functionr]  r   rL   rM   rN   rn  )r^   rE   r   rO   rj  r:  rO  rk  rl  rL   r  rv   r  rQ  r   r  r  r   r   	lm_logitsrl   r.   r.   r/   r   g  sH   )
zKosmos2TextForCausalLM.forwardc              
      s   t || jjdd}	|d urd }d }n%|d ur7| \}
}| d }tj|tj|
|| ftj|jdfdd}t	 j
|f||||||	|d|}|S )Nr   )rF   r3   r4   )r"   r   r2   r   r6   )rL   r   rO   rj  rQ  rv   cache_position)rH   ro   r^  r"   r&   r<   r=   r'   r2   ry   prepare_inputs_for_generation)r^   rE   rO   rj  rL   r   rQ  r  model_kwargsrv   r   r,  mask_lenmodel_inputsr   r.   r/   r    s>   	z4Kosmos2TextForCausalLM.prepare_inputs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr.   c                 3   s$    | ]}| d  |jV  qdS )r   N)r*  r$   r2   )r[   
past_statebeam_idxr.   r/   r_     s   " z8Kosmos2TextForCausalLM._reorder_cache.<locals>.<genexpr>)rb   )rL   r  reordered_past
layer_pastr.   r  r/   _reorder_cache  s   z%Kosmos2TextForCausalLM._reorder_cache)NNNNNNNNNNNNNNNN)NNNNNN) re   rf   rg   r   r  _tied_weights_keysrz   r   r  r  r  r  r  r   r   r   r&   r   r{  ri   
LongTensorr'   r   r  r   rb   r   r   r  r0  r  r   r.   r.   r   r/   r  H  s    		

R1r  c                       s.   e Zd ZdZdef fddZdd Z  ZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)ro   c                    sb   t    t|jj|jj| _t	t
|j|jj| _t|j|jj|jj|jjddd| _d S )NF)r   r3  r4  )ry   rz   r   r   r  r{   r  r|   r  r   r&   r   latent_query_numlatent_queryr2  rH  r   x_attnr   r   r.   r/   rz     s   
z%Kosmos2ImageToTextProjection.__init__c                 C   sZ   |  |}| jd|ddd}tj||gdd}| j||d d d d\}}}||fS )Nr   r4   r   r6   )rM   r:  r;  r   r   )r  r  r   r#   r"   r&   r<   r  )r^   featuresrM   r  key_value_statesr   r   r.   r.   r/   r     s   
z$Kosmos2ImageToTextProjection.forward)re   rf   rg   rh   r   rz   r   r   r.   r.   r   r/   r    s    r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c                %       s.  e Zd ZeZdZdef fddZdejfddZ	dd	 Z
	
	
ddejdee dee fddZee													
	d deej deej deej deej deej deeej  deej deej deej dee dee dee dedee dee deeef f ddZ  ZS )!r  r   ro   c                    :   t  | t|j| _t|j| _t|| _	| 
  d S r   )ry   rz   r  r  
text_modelr  r  vision_modelr  image_to_text_projectionr  r   r   r.   r/   rz      s
   
zKosmos2Model.__init__rR   c                 C   r  r   r  r  r_  r]   r.   r.   r/   r  *  r  z!Kosmos2Model.get_input_embeddingsc                 C      || j j_d S r   r  r  r.   r.   r/   r  -     z!Kosmos2Model.set_input_embeddingsFreturn_attentionsr   c                 C   sN   | j ||d}| j j|d }tjj|dd}| |\}}|r%||fS |S )aD  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        )r   r   r   r4   r6   )r  r  r  r   r   	normalizer  )r^   r   r  r   rQ   rO   rP   r.   r.   r/   get_image_features0  s   zKosmos2Model.get_image_featuresNrE   rj  r   rk  rL   rO   r  rv   rQ  r   r  r  r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}d}d}|du r8|du r.td| j|d|d\}}| jd||||||||	|
||dd|}t|j|j	|j
|j|||dS )aE  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r  r   )rE   r   rO   rj  rk  rL   r  rv   rQ  r   r  r  )rK   rL   rM   rN   rO   rP   rQ   r.   )ro   r   r  r  r   r  r  rJ   rK   rL   rM   rN   )r^   r   rE   rj  r   rk  rL   rO   r  rv   rQ  r   r  r   r  r   rQ   rP   r   r.   r.   r/   r   O  sJ   <
zKosmos2Model.forward)FF)NNNNNNNNNNNNFN)re   rf   rg   r   r  r  rz   r   r  r  r  r&   ri   r   r'   r  r   r   r   r{  r   r   r   rb   rJ   r   r   r.   r.   r   r/   r    s    

	

r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c                #       sj  e Zd ZeZdZdgZdef fddZdej	fddZ
d	d
 Zdej	fddZdd Zee													d!deej deej deej deej deej deeej  deej deej deej deej dee dee dee dee deeef fddZ					d"deej deej deej deej deej f
dd Z  ZS )#r  r   ztext_model.lm_head.weightro   c                    r  r   )ry   rz   r  r  r  r  r  r  r  r  r  r   r   r.   r/   rz     s
   
z(Kosmos2ForConditionalGeneration.__init__rR   c                 C   r  r   r  r]   r.   r.   r/   r    r  z4Kosmos2ForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  r  r.   r.   r/   r    r  z4Kosmos2ForConditionalGeneration.set_input_embeddingsc                 C   s
   | j  S r   )r  r  r]   r.   r.   r/   r    r  z5Kosmos2ForConditionalGeneration.get_output_embeddingsc                 C   s   | j | d S r   )r  r  r  r.   r.   r/   r    s   z5Kosmos2ForConditionalGeneration.set_output_embeddingsNrE   rj  r   rk  rL   rO   r  rv   r  rQ  r   r  r   c                 K   s   |dur|n| j j}|dur|n| j j}d}d}|du rD|du r$td| j|||d}| jj|d }tjj	|dd}| 
|\}}| jd
||||||||	|
|||dd|}t|j|j|j|j|j|||d	S )a5  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r  r   r4   r6   T)rE   r   rO   rj  rk  rL   r  rv   r  rQ  r   r  r  )rl   rm   rL   rM   rN   rO   rP   rQ   r.   )ro   r   r  r   r  r  r  r   r   r  r  r  rk   rl   rm   rL   rM   rN   )r^   r   rE   rj  r   rk  rL   rO   r  rv   r  rQ  r   r  r   rQ   rP   
lm_outputsr.   r.   r/   r     sV   Gz'Kosmos2ForConditionalGeneration.forwardc                 K   s   | dd }|d ur|d urtd| d|d u r |d ur |}|d u rA| |}| jj|d }tjj|dd}| |\}}	| j	j
d||||d|}
|
S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r4   r6   )rE   r   rO   rj  r.   )popr   r  r  r  r   r   r  r  r  generate)r^   r   rj  rE   r   rO   r   r  rQ   rP   outputr.   r.   r/   r  P  s*   


z(Kosmos2ForConditionalGeneration.generate)NNNNNNNNNNNNNr  )re   rf   rg   r   r  r  r  rz   r   r  r  r  r  r  r   r   r   r&   r   r{  ri   r  r'   r   r  r   rb   rk   r   r  r   r.   r.   r   r/   r    s    	

yr  )r  r  r|  r   )r   )r   )Mrh   r  dataclassesr   typingr   r   r   r   r&   torch.utils.checkpointr   activationsr	   
generationr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   configuration_kosmos2r   r   r   
get_loggerre   r   r   r   rA   r0   Sizer2   r?   rH   rJ   rk   r  rn   r#  r   r   r   r   r   r  r  r2  rB  rG  rX  r|  r  r  r  r  r  r  r  __all__r.   r.   r.   r/   <module>   s    
 

&,[
P3Y7Xyr `\!G '#  <