o
    i@                    @   s.  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- e&.e/Z0dXde	j1de	j2dee3 fddZ4	dYde	j5de	j2de	j6de3fddZ7dYddZ8ee$dd G d!d" d"e"Z9ee$d#d G d$d% d%e"Z:G d&d' d'e
j;Z<	(dZd)e
j;d*e	j1d+e	j1d,e	j1d-ee	j1 d.e=d/e=fd0d1Z>G d2d3 d3e
j;Z?G d4d5 d5e
j;Z@G d6d7 d7eZAG d8d9 d9e
j;ZBG d:d; d;e
j;ZCG d<d= d=e
j;ZDG d>d? d?e
j;ZEG d@dA dAe
j;ZFG dBdC dCeZGG dDdE dEe
j;ZHe$G dFdG dGeZIG dHdI dIeIZJG dJdK dKeIZKe$dLd G dMdN dNeIeZLG dOdP dPe
j;ZMe$dQd G dRdS dSeIZNe$dTd G dUdV dVeIeZOg dWZPdS )[zPyTorch KOSMOS-2 model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)deprecate_kwarg   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 C   sj   |   \}}|dur|n|}| ddddddf |d|||}d| }||tjt|jS )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r"   r#   r$   bszsrc_lenexpanded_maskinverted_mask r2   `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_mask-   s
   *r4   input_ids_shapedevicepast_key_values_lengthc                 C   s   | \}}t j||ft |j|d}t j|d|d}|||d |ddk d ||}|dkrFt j	t j
||||d|gdd}|ddddddf |d||| S )zB
    Make causal mask used for bi-directional self-attention.
    )r6   r   r   r#   r6   dimN)r*   fullr,   r-   aranger&   masked_fill_viewr(   catzerosr'   )r5   r#   r6   r7   r.   r$   r"   	mask_condr2   r2   r3   _make_causal_mask;   s   "
 (rC   c                 C   s6   |  | }tj|dd|| | }| | S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r:   )neintr*   cumsumtype_aslong)	input_idspadding_idxr7   r"   incremental_indicesr2   r2   r3   "create_position_ids_from_input_idsM   s   rL   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeeej  ed< dZeeej  ed< dZeej ed< dZeeej  ed< dZeed	< d
ee fddZdS )Kosmos2ModelOutputa  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsprojection_attentionsvision_model_outputreturnc                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS )text_model_outputrU   Ngetattrto_tuple.0kselfr2   r3   	<genexpr>   
    
z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>tuplekeysra   r2   ra   r3   r]         zKosmos2ModelOutput.to_tuple)__name__
__module____qualname____doc__rO   r   r*   FloatTensor__annotations__rP   r
   rQ   rf   rR   rS   rT   rU   r   r   r]   r2   r2   r2   r3   rN   ]   s   
 rN   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeej ed< dZeeej  ed	< dZeed
< dee fddZdS )*Kosmos2ForConditionalGenerationModelOutputa*  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    NlosslogitsrP   rQ   rR   rS   rT   rU   rV   c                    rW   )Nc                 3   rX   rY   r[   r^   ra   r2   r3   rc      rd   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>re   ra   r2   ra   r3   r]      rh   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple)ri   rj   rk   rl   rp   r   r*   rm   rn   rq   rP   r
   rQ   rf   rR   rS   rT   rU   r   r   r]   r2   r2   r2   r3   ro      s   
 ro   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )Kosmos2VisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   r8   
persistent)super__init__rs   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr*   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr=   r'   rb   rs   	__class__r2   r3   r~      s"   
"z Kosmos2VisionEmbeddings.__init__
embeddingsheightwidthrV   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr8   g      ?r   ry   bicubicF)r&   modealign_cornersr:   )shaper   weight	unsqueezer*   jit
is_tracingrz   r   r   reshapepermuter   
functionalinterpolater?   r@   )rb   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr;   
new_height	new_widthsqrt_num_positionsr2   r2   r3   interpolate_pos_encoding   s*   



z0Kosmos2VisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model ().r#   ry   r   r8   r:   )r   r   
ValueErrorr   r   r#   r(   flatten	transposer   r'   r*   r@   r   r   rz   )rb   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   r2   r2   r3   forward   s    
zKosmos2VisionEmbeddings.forwardF)ri   rj   rk   r!   r~   r*   TensorrE   r   rm   r   __classcell__r2   r2   r   r3   rr      s     )rr           modulequerykeyvalueattention_maskscalingdropoutc           
      K   sp   t ||dd| }|d ur|| }tjj|dd}tjj||| jd}t ||}	|	dd }	|	|fS )Nr8   r:   ptrainingr   ry   )	r*   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr2   r2   r3   eager_attention_forward	  s   
r   c                       sh   e Zd ZdZ fddZ			ddejdeej deej d	ee d
e	ejeej f f
ddZ
  ZS )Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r}   r~   rs   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   r   r2   r3   r~   "  s$   

zKosmos2VisionAttention.__init__NFrQ   r   causal_attention_maskoutput_attentionsrV   c              
   C   s0  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
| jj	dkrY|durR|durR|| }n|durX|}n|du| _
t}| jj	dkrlt| jj	 }|| ||	|
|| j
| j| js{dn| jd\}}|||| }| |}|sd}||fS )#Input shape: Batch x Time x Channelr   ry   flash_attention_2Neagerr   )r   r   r   )r   r   r   r   r?   r   r   r   rs   _attn_implementationr   r   r   r   r   r   r   r   r   )rb   rQ   r   r   r   r   
seq_lengthr   queriesrg   valuesattention_interfacer   r   r2   r2   r3   r   6  s@   	






zKosmos2VisionAttention.forward)NNF)ri   rj   rk   rl   r~   r*   r   r   r+   rf   r   r   r2   r2   r   r3   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Kosmos2VisionMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)r}   r~   rs   r	   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   r   r2   r3   r~   j  s
   
zKosmos2VisionMLP.__init__rQ   rV   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   rb   rQ   r2   r2   r3   r   q  s   


zKosmos2VisionMLP.forward)ri   rj   rk   r~   r*   r   r   r   r2   r2   r   r3   r   i  s    r   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )Kosmos2VisionEncoderLayerrs   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)r}   r~   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   r   r2   r3   r~   z  s   


z"Kosmos2VisionEncoderLayer.__init__FrQ   r   r   r   rV   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rQ   r   r   r   )r   r   r   r   )rb   rQ   r   r   r   residualr   outputsr2   r2   r3   r     s"   




z!Kosmos2VisionEncoderLayer.forwardr   )ri   rj   rk   r!   r~   r*   r   r   r+   rf   rm   r   r   r2   r2   r   r3   r   y  s    r   c                       sx   e Zd ZdZdef fddZe					ddeej	 deej	 dee
 d	ee
 d
ee
 deeef fddZ  ZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    rs   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r2   )r   )r_   r   rs   r2   r3   
<listcomp>  s    z1Kosmos2VisionEncoder.__init__.<locals>.<listcomp>F)	r}   r~   rs   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr   r   r   r3   r~     s   
 
zKosmos2VisionEncoder.__init__Nr   r   r   output_hidden_statesreturn_dictrV   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ] \}
}|r<||	f }||	|||d}|d }	|rQ||d f }q1|rY||	f }t|	||dS )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr2   )r   r   r   )rO   rQ   rR   )rs   r   r   use_return_dict	enumerater   r   )rb   inputs_embedsr   r   r   r   r   encoder_statesall_attentionsrQ   idxencoder_layerlayer_outputsr2   r2   r3   r     s2   '

zKosmos2VisionEncoder.forward)NNNNN)ri   rj   rk   rl   r!   r~   r   r   r*   r   r+   r   rf   r   r   r   r2   r2   r   r3   r     s,    
r   c                       sj   e Zd Zdef fddZ					ddeej dee dee d	ed
ee de	e
ef fddZ  ZS )Kosmos2VisionTransformerrs   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )r}   r~   rs   r   rr   r   r   r   r   pre_layrnormr   encoderpost_layernorm)rb   rs   r   r   r2   r3   r~     s   


z!Kosmos2VisionTransformer.__init__NFr   r   r   r   r   rV   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|s[||	f|dd   S t	||	|j
|jdS )Nz You have to specify pixel_values)r   )r  r   r   r   r   r   )rO   pooler_outputrQ   rR   )rs   r   r   r  r   r   r
  r  r  r   rQ   rR   )
rb   r   r   r   r   r   rQ   encoder_outputsrO   pooled_outputr2   r2   r3   r     s2   

z Kosmos2VisionTransformer.forwardNNNFN)ri   rj   rk   r!   r~   r   r*   rm   r+   r   rf   r   r   r   r2   r2   r   r3   r	    s(    
r	  c                       s   e Zd ZdZddededee f fddZddededee fd	d
Zeddededee fddZ	e
 				ddee
j dee
j dedee
j fddZdd Z  ZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nr   embedding_dimrJ   c                    s4   t    d| _|| _|| _| || j || d S )Nry   )r}   r~   offsetr  rJ   make_weights)rb   r   r  rJ   r   r2   r3   r~   ?  s
   
z1Kosmos2TextSinusoidalPositionalEmbedding.__init__num_embeddingsc                 C   sB   |  |||}t| dr|j| jj| jjd}| jd|dd d S )Nweightsr9   Fr{   )get_embeddinghasattrr(   r  r#   r6   r   )rb   r  r  rJ   emb_weightsr2   r2   r3   r  G  s   
z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsc                 C   s   |d }t d|d  }ttj|tjd |  }tj| tjd d|d }tjt	|t
|gdd| d}|d dkrUtj|t| dgdd}|durad||ddf< |t S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        ry   i'  r   r   r   r:   r8   N)mathlogr*   expr=   int64floatr   r@   sincosr?   rA   r(   get_default_dtype)r  r  rJ   half_dimembr2   r2   r3   r  O  s   	 $&z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingr   rI   r  r7   rz   c                 C   s   |d ur|  \}}|d u rt|| j||j}n|  d d \}}|d u r.| ||}| jd | | }|| j dkrK| || j | j	| j | j
d|d||| jjd  S )Nr8   r   r   )r&   rL   rJ   r(   r6   &create_position_ids_from_inputs_embedsr  r  r  r  index_selectr?   r   detach)rb   rI   r  r7   rz   r.   seq_lenmax_posr2   r2   r3   r   e  s    *z0Kosmos2TextSinusoidalPositionalEmbedding.forwardc                 C   sV   |  dd }|d }tj| jd || j d tj|jd}|d| | S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr8   r   r9   r   )	r&   r*   r=   rJ   rH   r6   r   r'   r   )rb   r  r7   input_shapesequence_lengthrz   r2   r2   r3   r$    s   	zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embedsr   )NNr   N)ri   rj   rk   rl   rE   r   r~   r  staticmethodr  r*   no_gradr   r   r$  r   r2   r2   r   r3   r  ;  s*     r  c                       s   e Zd ZdZ					ddededed	ee d
ee dee dee f fddZe	dddd						dde
jdee
j dee dee
j dee
j dedee
j dee
jee
j ee f fddZ  ZS )KosmosTextAttentionr   r   FTNr   r   r   
is_decoderadd_inner_attn_layernormrx   	layer_idxc	           	         s   t    || _|| _|| _|| _|| | _| j| | jkr*td| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d | _|rjtj||jd| _d S d S )Nr   r   r   r   )rx   r   )r}   r~   rs   r   r   r   r   r   r   r.  r0  r   r   r   r   r   r   inner_attn_lnr   r   )	rb   rs   r   r   r   r.  r/  rx   r0  r   r2   r3   r~     s.   


zKosmosTextAttention.__init__past_key_valuerP   4.58new_nameversionrQ   encoder_hidden_statesr   layer_head_maskr   cache_positionrV   c                 K   s  |du}	|j dd \}
}| |}||
|| j| jdd}d}|dur>t|tr<|j	| j
}|	r8|j}n|j}n|}|	rB|n|}|	r[|dur[|r[|j| j
 j}|j| j
 j}nJ| |}| |}||
d| j| jdd}||
d| j| jdd}|dur|	s|nd}|||| j
d|i\}}|	rt|trd|j| j
< t}| jjdkrt| jj }|| ||||f| jsd	n| j| jd
|\}}||
|d }| jdur| |}| |}||fS )r   Nry   r   Fr8   r9  Tr   r   )r   r   )r   r   r?   r   r   r   
isinstancer   
is_updatedgetr0  cross_attention_cacheself_attention_cacher   rg   r   r   r   updater   rs   r   r   r   r   r   r   r   r1  r   )rb   rQ   r7  rP   r   r8  r   r9  r   is_cross_attentionr   r   query_statesr;  curr_past_key_valuecurrent_states
key_statesvalue_statesr   r   r   r2   r2   r3   r     s^   







zKosmosTextAttention.forward)r   FFTN)NNNNFN)ri   rj   rk   rl   rE   r  r   r+   r~   r   r*   r   r
   rf   r   r   r2   r2   r   r3   r-    s\    	%
r-  c                       s*   e Zd Zdef fddZdd Z  ZS )Kosmos2TextFFNrs   c                    sb   t    |j| _t|j | _|j| _t|j	|j
| _t|j
|j	| _tj|j
|jd| _d S r   )r}   r~   r   r	   activation_functionr   activation_dropoutr   r   r   ffn_dimr   r   r   r   ffn_layernormr   r   r2   r3   r~     s   
zKosmos2TextFFN.__init__c                 C   sT   |  | |}tjj|| j| jd}| |}| |}tjj|| j| jd}|S )Nr   )	r   r   r   r   r   rH  r   rJ  r   r   r2   r2   r3   r     s   

zKosmos2TextFFN.forward)ri   rj   rk   r    r~   r   r   r2   r2   r   r3   rF    s    rF  c                       s   e Zd Zddef fddZedddd									
	ddejdeej deej deej deej deej dee	 dee
 dee
 deej deejeeejejf  f fddZ  ZS )Kosmos2TextBlockNrs   c              	      s   t    |j| _t|| j|j|jdd|d| _|j| _tj	| j|j
d| _|jrBt|| j|j|jdd|d| _tj	| j|j
d| _t|| _tj	| j|j
d| _d S )NT)r   r   r   r.  r/  r0  r   F)r}   r~   r   r-  attention_headsr   r   r   r   r   r   self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrF  ffnfinal_layer_norm)rb   rs   r0  r   r2   r3   r~   $  s4   
		
zKosmos2TextBlock.__init__r2  rP   r3  r4  FTrQ   r   r7  encoder_attention_maskr8  cross_attn_layer_head_maskr   	use_cacher9  rV   c              
   K   s   |}|  |}| jd||||||
d|\}}tjj|| j| jd}|| }d }|d urbt| ds:td|  d|}| |}| j	d|||||||
d|\}}tjj|| j| jd}|| }|}| 
|}| |}|| }|f}|r}|||f7 }|S )N)rQ   rP   r   r8  r   r9  r   rO  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rQ   r7  r   r8  rP   r   r9  r2   )rM  r   r   r   r   r   r  r   rP  rO  rR  rQ  )rb   rQ   r   r7  rS  r8  rT  rP   r   rU  r9  r   r   self_attn_weightscross_attn_weightsr   r2   r2   r3   r   C  sV   

	






zKosmos2TextBlock.forwardr   )	NNNNNNFTN)ri   rj   rk   r    r~   r   r*   r   r   r
   r+   rf   rm   r   r   r2   r2   r   r3   rK  #  sF    	
rK  c                '       sF  e Zd ZdZdef fddZdd Z					d!d	eej	 d
eej	 deej	 de
deej	 f
ddZ																d"deej	 deej	 d
eej	 deej	 deej	 deej	 deej	 deej	 dee d	eej	 deej	 dee dee dee dee deej	 dee deeef f$dd Z  ZS )#Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    rs   c                    s   t     | _ j| _ j| _ jrt jnd| _	t
j j j jd| _t j j jd| _t
 fddt jD | _t
 j j| _d| _d S )Nr%   )rJ   )r   r  rJ   c                    s   g | ]}t  |d qS ))r0  )rK  )r_   ir   r2   r3   r     s    z3Kosmos2TextTransformer.__init__.<locals>.<listcomp>F)r}   r~   rs   r   	layerdropscale_embeddingr  sqrtr   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr  max_position_embeddingsembed_positionsr   r   r   r   r   
layer_normr   r   r   r   r3   r~     s   
 
zKosmos2TextTransformer.__init__c                 C   s`   d }|d dkrt ||j|j|d}|d ur.t||j|d d|j}|d u r*|n|| }|S )Nr8   r   )r6   r7   r$   )rC   r#   r6   r4   r(   )rb   r   r)  r  r7   combined_attention_maskexpanded_attn_maskr2   r2   r3   _prepare_decoder_attention_mask  s   z6Kosmos2TextTransformer._prepare_decoder_attention_maskNr   r  rS   img_input_maskr7   rz   c           	      C   s   |d u r	|  |}|d ur!||jd|d||jtjd< || j }| j||||d}||j}|| }t	j
j|| j| jd}|S )Nr8   r   )rI   r  r7   rz   r   )r`  r(   r6   r?   r&   r*   r+   r]  rb  r   r   r   r   )	rb   rI   r  rS   rh  r7   rz   	positionsrQ   r2   r2   r3   forward_embedding  s"   



z(Kosmos2TextTransformer.forward_embeddingrI   r   image_embeds_position_maskr7  rS  	head_maskcross_attn_head_maskrP   rU  r   r   r   r9  r   rV   c                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d ur*|
d ur*td|d ur:|j}|d|d }n|
d urG|
 d d }ntd| jrZ| j	rZ|rZt
d d}|rw|	d u rw|d urqtt| j dt| j dnt| j d}	|rt|	trt
d t|	}	|	d ur|	 nd}|dkrd }d }| j||
||||d	}| ||||}|d ur|d urt||
j|d d
}tjj|| j| j	d}|rdnd }|rdnd }|r|d urdnd }t||gddgD ]*\}}|d ur| d t| jkrtd| dt| j d| d  dqt| jD ]\\}}|r%||f7 }| j	r6tg }|| jk r6q||||f||d urE|| nd |d urO|| nd |	|||d|}|d }|rt||d f7 }|d urt||d f7 }q|  |}|r||f7 }t!||	|||dS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer8   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )rI   r  rS   rh  r7   rz   rd  r   r2   rl  rm  zThe `z` should be specified for z layers, but it is for .)rS  r8  rT  rP   r   rU  r9  r   ry   )rO   rP   rQ   rR   cross_attentions)"rs   r   r   rU  r   r   r?   r&   r   r   loggerwarning_oncer   r   r:  rf   from_legacy_cacheget_seq_lengthrj  rg  r4   r#   r   r   r   ziplenr   r  r*   randrZ  rc  r   )rb   rI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   rU  r   r   r   r9  r   r)  r7   rQ   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer  decoder_layerdropout_probabilityr  r2   r2   r3   r     s   

	






zKosmos2TextTransformer.forward)NNNr   NNNNNNNNNNNNNNNNN)ri   rj   rk   rl   r    r~   rg  r   r*   r   rE   rj  r
   r+   r   r   r   rf   r   r   r   r2   r2   r   r3   rX    s    
%	

rX  c                   @   s>   e Zd ZU eed< dZddgZdZdZdZ	de
jfddZdS )	Kosmos2PreTrainedModelrs   Tr   rK  r   c                 C   s6  t | tr
| jj}nt | ttfr| jjj}t | ttfr"| jj	}nt | ttfr.| jj
j	}t |tr`tjj|jd|jd | d tjj|jj|jj| d tjj|jj|jj| d n"t |tr|jd d|jj d  | }|jd | }tjj|jj|d tjj|jj|d tjj|jj|d tjj|jj|d nt |tr|jjd d|jj d  | }d|jj d | }tjj|jj|d tjj|jj|d nt |trtjj|jj|d tjj|jj|d tjj|jj|d tjj|jj|d n{t |t r"tjj|jj|d tjj|jj|d n`t |tr3tjj|j!j|d nOt |t"rKtjj|j#j|d tj|j$ n7t |t%rn|j&jj'jd|d |j&j(durm|j&jj'|j&j( )  nt |tj*r|jj'+d |j,j')  t |tj-r|j,dur|j,j')  dS dS dS )zInitialize the weightsr   r   )meanstd)r  ry   Nr%   ).r:  Kosmos2VisionModelrs   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configrr   r   initnormal_r   r   r   r   initializer_ranger   r   r   r   r   r   r   r   r   r   r   r-  rF  lm_headKosmos2ImageToTextProjectiondenselatent_queryrX  r`  datarJ   zero_r   fill_rx   r   )rb   r   factorr  in_proj_stdout_proj_stdfc_stdr2   r2   r3   _init_weightsy  s`   





 

 z$Kosmos2PreTrainedModel._init_weightsN)ri   rj   rk   r   rn   supports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpar   Moduler  r2   r2   r2   r3   r  p  s   
 r  c                       s   e Zd ZU eed< dZdef fddZdejfddZ	e
						ddeej d
ee dee dedee deeef fddZ  ZS )r  rs   r   c                    "   t  | t|| _|   d S r   )r}   r~   r	  model	post_initr   r   r2   r3   r~        
zKosmos2VisionModel.__init__rV   c                 C   
   | j jjS r   )r  r   r   ra   r2   r2   r3   get_input_embeddings     
z'Kosmos2VisionModel.get_input_embeddingsNFr   r   r   r   c                 C   s   | j |||||dS )N)r   r   r   r   r   r  )rb   r   r   r   r   r   r2   r2   r3   r     s   	zKosmos2VisionModel.forwardr  )ri   rj   rk   r!   rn   main_input_namer~   r   r  r  r   r   r*   rm   r+   r   rf   r   r   r   r2   r2   r   r3   r    s0   
 
r  c                )       s  e Zd ZU eed< def fddZdejfddZe	e
																ddeej d	eej d
eej deej deej deej deej deej dee deej deej dee dee dee dee deej dee deeef f$ddZ  ZS )r  rs   c                    r  r   )r}   r~   rX  r  r  r   r   r2   r3   r~     r  zKosmos2TextModel.__init__rV   c                 C      | j jS r   r  r`  ra   r2   r2   r3   r       z%Kosmos2TextModel.get_input_embeddingsNrI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   rU  r   r   r   r9  r   c                 K   sp   | j di d|d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d||S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        rI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   rU  r   r   r   r9  Nr2   r  )rb   rI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   rU  r   r   r   r9  r   r2   r2   r3   r     sD   
%	
zKosmos2TextModel.forwardr~  )ri   rj   rk   r    rn   r~   r   r  r  r   r   r   r*   r   r
   r+   r   r   r   rf   r   r   r   r2   r2   r   r3   r    sv   
 	

r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                +       sX  e Zd ZU eed< dgZdef fddZdejfddZ	dejfdd	Z
ee	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
	
d!deej deej deej deej deej deej deej deej dee deej deej deej dee dee dee dee deej dee deeef f&ddZ	
	
	
	
	
	
	
d" fdd 	Z  ZS )#r  rs   zlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NF)in_featuresout_featuresrx   )
r}   r~   rX  r  r   r   r   r^  r  r  r   r   r2   r3   r~     s   
zKosmos2TextForCausalLM.__init__rV   c                 C   r  r   r  ra   r2   r2   r3   r  (  r  z+Kosmos2TextForCausalLM.get_input_embeddingsc                 C   s   | j S r   )r  ra   r2   r2   r3   get_output_embeddings+  s   z,Kosmos2TextForCausalLM.get_output_embeddingsNrI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   labelsrU  r   r   r   r9  r   c                 K   s   |dur|n| j j}|dur|rtd d}| jdi d|d|d|d|d|d	|d
|d|d|	d|
d|d|d|d|ddd||}| |d }d}|durj| jd||| j jd|}t|||j	|j
|j|jdS )aK  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.FrI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   rU  r   r   r   Tr9  r   )rq   r  r^  )rp   rq   rP   rQ   rR   ro  r2   )rs   r  rp  warningr  r  loss_functionr^  r   rP   rQ   rR   ro  )rb   rI   r   rS   rk  r7  rS  rl  rm  rP   r  rz   r  rU  r   r   r   r9  r   r   	lm_logitsrp   r2   r2   r3   r   .  sf   *

	
zKosmos2TextForCausalLM.forwardc	              
      s   |d dkrd }d }n1|d ur<|d ur|  d d n|  \}
}|  d }tj|tj|
|| ftj|jdfdd}t j|f|||||||d|	}|dd  |S )Nr   r8   )r&   r#   r6   r   r:   )rP   r   rS   rk  r  rU  r9  rz   )	r&   r*   r@   rA   r+   r6   r}   prepare_inputs_for_generationpop)rb   rI   rS   rk  rP   r   r  rU  r9  model_kwargsr   r'  mask_lenmodel_inputsr   r2   r3   r    s6   $	z4Kosmos2TextForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNNNNNNN)NNNNNNN)ri   rj   rk   r    rn   _tied_weights_keysr~   r   r  r  r  r   r   r   r*   r   r
   
LongTensorr+   r   r   r   rf   r   r   r  r   r2   r2   r   r3   r    s   
 		

Tr  c                       s.   e Zd ZdZdef fddZdd Z  ZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)rs   c                    sb   t    t|jj|jj| _t	t
|j|jj| _t|j|jj|jj|jjddd| _d S )NF)r   r.  r/  )r}   r~   r   r   r  r   r  r   r  r   r*   r   latent_query_numr  r-  rL  r   x_attnr   r   r2   r3   r~     s   
z%Kosmos2ImageToTextProjection.__init__c                 C   sX   |  |}| jd|ddd}tj||gdd}| j||d d d d\}}||fS )Nr   r8   r   r:   )rQ   r7  rP   r   r   )r  r  r   r'   r&   r*   r@   r  )rb   featuresrQ   r  key_value_statesr   r2   r2   r3   r     s   

z$Kosmos2ImageToTextProjection.forward)ri   rj   rk   rl   r   r~   r   r   r2   r2   r   r3   r    s    r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c                %       s.  e Zd ZU eed< dZdef fddZdejfddZ	dd	 Z
	
	
ddejdee dee fddZee													
	d deej deej deej deej deej dee deej deej deej dee dee dee dedee dee deeef f ddZ  ZS )!r  rs   r   c                    :   t  | t|j| _t|j| _t|| _	| 
  d S r   )r}   r~   r  r  
text_modelr  r  vision_modelr  image_to_text_projectionr  r   r   r2   r3   r~     s
   
zKosmos2Model.__init__rV   c                 C   r  r   r  r  r`  ra   r2   r2   r3   r    r  z!Kosmos2Model.get_input_embeddingsc                 C      || j j_d S r   r  rb   r   r2   r2   r3   set_input_embeddings     z!Kosmos2Model.set_input_embeddingsFreturn_attentionsr   c                 C   sN   | j ||d}| j j|d }tjj|dd}| |\}}|r%||fS |S )aD  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        )r   r   r   r8   r:   )r  r  r  r   r   	normalizer  )rb   r   r  r   rU   rS   rT   r2   r2   r3   get_image_features  s   zKosmos2Model.get_image_featuresNrI   rk  r   rl  rP   rS   r  rz   rU  r   r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}d}d}|du r8|du r.td| j|d|d\}}| jd||||||||	|
||dd|}t|j|j	|j
|j|||dS )aE  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r  r   )rI   r   rS   rk  rl  rP   r  rz   rU  r   r   r   )rO   rP   rQ   rR   rS   rT   rU   r2   )rs   r   r   r  r   r  r  rN   rO   rP   rQ   rR   )rb   r   rI   rk  r   rl  rP   rS   r  rz   rU  r   r   r   r   r   rU   rT   r   r2   r2   r3   r     sJ   <
zKosmos2Model.forward)FF)NNNNNNNNNNNNFN)ri   rj   rk   r   rn   r  r~   r   r  r  r  r*   rm   r   r+   r  r   r   r   r
   r   r   r   rf   rN   r   r   r2   r2   r   r3   r    s   
 

	

r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c                #       s~  e Zd ZU eed< dZdgZdef fddZdej	fddZ
d	d
 Zdej	fddZdd Zee													d!deej deej deej deej deej dee deej deej deej deej dee dee dee dee deeef fddZe 						d"deej deej deej deej deej deej fdd Z  ZS )#r  rs   r   ztext_model.lm_head.weightc                    r  r   )r}   r~   r  r  r  r  r  r  r  r  r  r   r   r2   r3   r~   }  s
   
z(Kosmos2ForConditionalGeneration.__init__rV   c                 C   r  r   r  ra   r2   r2   r3   r    r  z4Kosmos2ForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  r  r2   r2   r3   r    r  z4Kosmos2ForConditionalGeneration.set_input_embeddingsc                 C   s
   | j  S r   )r  r  ra   r2   r2   r3   r    r  z5Kosmos2ForConditionalGeneration.get_output_embeddingsc                 C   s   | j | d S r   )r  set_output_embeddings)rb   new_embeddingsr2   r2   r3   r    s   z5Kosmos2ForConditionalGeneration.set_output_embeddingsNrI   rk  r   rl  rP   rS   r  rz   r  rU  r   r   r   c                 K   s   |dur|n| j j}|dur|n| j j}d}d}|du rD|du r$td| j|||d}| jj|d }tjj	|dd}| 
|\}}| jd
||||||||	|
|||dd|}t|j|j|j|j|j|||d	S )a5  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r   r   r8   r:   T)rI   r   rS   rk  rl  rP   r  rz   r  rU  r   r   r   )rp   rq   rP   rQ   rR   rS   rT   rU   r2   )rs   r   r   r   r  r  r  r   r   r  r  r  ro   rp   rq   rP   rQ   rR   )rb   r   rI   rk  r   rl  rP   rS   r  rz   r  rU  r   r   r   rU   rT   
lm_outputsr2   r2   r3   r     sV   Gz'Kosmos2ForConditionalGeneration.forwardc                 K   s   | dd }|d ur|d urtd| d|d u r |d ur |}|d u rA| |}	| jj|	d }tjj|dd}| |\}}
| j	j
d|||||d|}|S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r8   r:   )rI   r   rS   rk  r  r2   )r  r   r  r  r  r   r   r  r  r  generate)rb   r   rk  rI   r   rS   r  r   r  rU   rT   outputr2   r2   r3   r    s,   

	z(Kosmos2ForConditionalGeneration.generate)NNNNNNNNNNNNN)NNNNNN)ri   rj   rk   r   rn   r  r  r~   r   r  r  r  r  r  r   r   r   r*   r   r
   r  r+   r   r   r   rf   ro   r   r,  r  r   r2   r2   r   r3   r  r  s   
 	

wr  )r  r  r  r   )r   )r   )Qrl   r  dataclassesr   typingr   r   r   r   r*   r   activationsr	   cache_utilsr
   r   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   configuration_kosmos2r   r    r!   
get_loggerri   rp  r   r#   rE   r4   Sizer6   rC   rL   rN   ro   r  rr   r  r   r   r   r   r   r	  r  r-  rF  rK  rX  r  r  r  r  r  r  r  __all__r2   r2   r2   r3   <module>   s    
 

#)[
J3X7Xyg g=!F #  ?