o
    ei9                    @   sP  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 e)1e2Z3dZde	j4de	j5de6dB fddZ7	d[de	j8de	j5de	j9de6fddZ:ee'G dd  d eZ;ee'd!d"G d#d$ d$e%Z<ee'd%d"G d&d' d'e%Z=G d(d) d)e
j>Z?	*d\d+e
j>d,e	j4d-e	j4d.e	j4d/e	j4dB d0e@d1e@fd2d3ZAG d4d5 d5e
j>ZBG d6d7 d7e
j>ZCG d8d9 d9eZDG d:d; d;e
j>ZEG d<d= d=e
j>ZFG d>d? d?e
j>ZGG d@dA dAe
j>ZHG dBdC dCe
j>ZIG dDdE dEeZJG dFdG dGe
j>ZKe'G dHdI dIe!ZLG dJdK dKeLZMG dLdM dMeLZNe'dNd"G dOdP dPeLeZOG dQdR dRe
j>ZPe'dSd"G dTdU dUeLZQe'dVd"G dWdX dXeLeZRg dYZSdS )]zPyTorch KOSMOS-2 model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)is_flash_attention_requested   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 C   sj   |   \}}|dur|n|}| ddddddf |d|||}d| }||tjt|jS )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r!   r"   r#   bszsrc_lenexpanded_maskinverted_mask r1   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_mask/   s
   *r3   input_ids_shapedevicepast_key_values_lengthc                 C   s   | \}}t j||ft |j|d}t j|d|d}|||d |ddk d ||}|dkrFt j	t j
||||d|gdd}|ddddddf |d||| S )zB
    Make causal mask used for bi-directional self-attention.
    )r5   r   r   r"   r5   dimN)r)   fullr+   r,   aranger%   masked_fill_viewr'   catzerosr&   )r4   r"   r5   r6   r-   r#   r!   	mask_condr1   r1   r2   _make_causal_mask=   s   "
 (rB   c                   @   s(   e Zd ZU dZdZeej dB ed< dS )'BaseModelOutputWithProjectionAttentionsaq  
    projection_attentions (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    Nprojection_attentions)	__name__
__module____qualname____doc__rD   tupler)   FloatTensor__annotations__r1   r1   r1   r2   rC   N   s   
 	rC   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
eej dB ed< dZeej dB ed< dZejdB ed< dZeej dB ed< dZeed	< d
ee fddZdS )Kosmos2ModelOutputa  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsrD   vision_model_outputreturnc                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS )text_model_outputrS   Ngetattrto_tuple.0kselfr1   r2   	<genexpr>   
    
z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>rI   keysr_   r1   r_   r2   r[         zKosmos2ModelOutput.to_tuple)rE   rF   rG   rH   rN   r)   rJ   rK   rO   r	   rP   rI   rQ   rR   rD   rS   r   r   r[   r1   r1   r1   r2   rM   ]   s   
 rM   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dZeej dB ed	< dZeed
< dee fddZdS )*Kosmos2ForConditionalGenerationModelOutputa*  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    NlosslogitsrO   rP   rQ   rR   rD   rS   rT   c                    rU   )Nc                 3   rV   rW   rY   r\   r_   r1   r2   ra      rb   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>rc   r_   r1   r_   r2   r[      re   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple)rE   rF   rG   rH   rg   r)   rJ   rK   rh   rO   r	   rP   rI   rQ   rR   rD   rS   r   r   r[   r1   r1   r1   r2   rf      s   
 rf   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )Kosmos2VisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   r7   
persistent)super__init__rj   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr)   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr<   r&   r`   rj   	__class__r1   r2   rv      s"   
"z Kosmos2VisionEmbeddings.__init__
embeddingsheightwidthrT   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr7   g      ?r   rp   bicubicF)r%   modealign_cornersr9   )shaper   weight	unsqueezer)   jit
is_tracingrq   rz   r   reshapepermuter   
functionalinterpolater>   r?   )r`   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr:   
new_height	new_widthsqrt_num_positionsr1   r1   r2   interpolate_pos_encoding   s*   



z0Kosmos2VisionEmbeddings.interpolate_pos_encodingFpixel_valuesc              
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| jjj}| |j|d}|ddd}| j	
|dd}	tj|	|gdd	}
|r[|
| |
|| }
|
S |
| | j }
|
S )
NzInput image size (*z) doesn't match model ().r"   rp   r   r7   r9   )r   ry   
ValueErrorr   r   r"   r'   flatten	transposer}   r&   r)   r?   r   r   rq   )r`   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   r1   r1   r2   forward   s    
zKosmos2VisionEmbeddings.forwardF)rE   rF   rG   r    rv   r)   Tensorintr   rJ   r   __classcell__r1   r1   r   r2   ri      s     )ri           modulequerykeyvalueattention_maskscalingdropoutc           
      K   sp   t ||dd| }|d ur|| }tjj|dd}tjj||| jd}t ||}	|	dd }	|	|fS )Nr7   r9   ptrainingr   rp   )	r)   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr1   r1   r2   eager_attention_forward	  s   
r   c                       sh   e Zd ZdZ fddZ			ddejdejdB dejdB d	edB d
eejejdB f f
ddZ	  Z
S )Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)ru   rv   rj   rw   rx   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   r   r1   r2   rv   "  s$   

zKosmos2VisionAttention.__init__NFrP   r   causal_attention_maskoutput_attentionsrT   c              
   C   s"  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
t| j	sX|durQ|durQ|| }n|durW|}n|du| _
t| j	jt}|| ||	|
|| j
| j| jstdn| jd\}}|||| }| |}|sd}||fS )#Input shape: Batch x Time x Channelr   rp   Nr   )r   r   r   )r   r   r   r   r>   r   r   r   r   rj   r   r   get_interface_attn_implementationr   r   r   r   r   r   r   )r`   rP   r   r   r   r   
seq_lengthrx   queriesrd   valuesattention_interfacer   r   r1   r1   r2   r   6  s@   	







zKosmos2VisionAttention.forward)NNF)rE   rF   rG   rH   rv   r)   r   r*   rI   r   r   r1   r1   r   r2   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )Kosmos2VisionMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)ru   rv   rj   r   
hidden_actactivation_fnr   r   rw   intermediate_sizefc1fc2r   r   r1   r2   rv   j  s
   
zKosmos2VisionMLP.__init__rP   rT   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   r`   rP   r1   r1   r2   r   q  s   


zKosmos2VisionMLP.forward)rE   rF   rG   rv   r)   r   r   r   r1   r1   r   r2   r   i  s    r   c                       sV   e Zd Zdef fddZ	ddejdejdedB d	ee	 d
e
ej f
ddZ  ZS )Kosmos2VisionEncoderLayerrj   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S Neps)ru   rv   rw   rx   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   r   r1   r2   rv   z  s   


z"Kosmos2VisionEncoderLayer.__init__FrP   r   r   Nr   rT   c                 K   sj   |}|  |}| jd|||d|\}}|| }|}| |}| |}|| }|f}|r3||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rP   r   r   Nr1   )r   r   r   r   )r`   rP   r   r   r   residualr   outputsr1   r1   r2   r     s$   




z!Kosmos2VisionEncoderLayer.forwardr   )rE   rF   rG   r    rv   r)   r   r*   r   r   rI   rJ   r   r   r1   r1   r   r2   r   y  s    r   c                       sh   e Zd ZdZdef fddZe				ddejdB de	dB de	dB d	e	dB d
e
eB f
ddZ  ZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    rj   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r1   )r   )r]   r   rj   r1   r2   
<listcomp>  s    z1Kosmos2VisionEncoder.__init__.<locals>.<listcomp>F)	ru   rv   rj   r   
ModuleListrangenum_hidden_layerslayersgradient_checkpointingr   r   r   r2   rv     s   
 
zKosmos2VisionEncoder.__init__Nr   r   output_hidden_statesreturn_dictrT   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]#\}
}|r<||	f }||	|fd|i|}|d }	|rT||d f }q1|r\||	f }t|	||dS )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr1   r   r   r   )rN   rP   rQ   )rj   r   r   use_return_dict	enumerater   rC   )r`   inputs_embedsr   r   r   r   r   encoder_statesall_attentionsrP   idxencoder_layerlayer_outputsr1   r1   r2   r     s6    

zKosmos2VisionEncoder.forward)NNNN)rE   rF   rG   rH   r    rv   r   r)   r   r*   rI   r   r   r   r1   r1   r   r2   r     s&    r   c                       sf   e Zd Zdef fddZ					ddejdB dedB dedB d	ed
edB dee	B fddZ
  ZS )Kosmos2VisionTransformerrj   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )ru   rv   rj   rw   ri   r   r   r   r   pre_layrnormr   encoderpost_layernorm)r`   rj   rx   r   r1   r2   rv     s   


z!Kosmos2VisionTransformer.__init__NFr   r   r   r   r   rT   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|s[||	f|dd   S t	||	|j
|jdS )Nz You have to specify pixel_values)r   )r   r   r   r   r   r   )rN   pooler_outputrP   rQ   )rj   r   r   r   r   r   r  r  r  r   rP   rQ   )
r`   r   r   r   r   r   rP   encoder_outputsrN   pooled_outputr1   r1   r2   r     s2   

z Kosmos2VisionTransformer.forwardNNNFN)rE   rF   rG   r    rv   r)   rJ   r*   rI   r   r   r   r1   r1   r   r2   r    s(    r  c                       s   e Zd ZdZddedededB f fddZddedededB fd	d
ZeddedededB fddZe	
 				dde	jdB de	jdB dede	jdB fddZedd ZedddZ  ZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nr   embedding_dimpadding_idxc                    s:   t    d| _|| _|| _|| _| || j || d S )Nrp   )ru   rv   offsetr   r
  r  make_weights)r`   r   r
  r  r   r1   r2   rv   7  s   
z1Kosmos2TextSinusoidalPositionalEmbedding.__init__num_embeddingsc                 C   sB   |  |||}t| dr|j| jj| jjd}| jd|dd d S )Nweightsr8   Frs   )get_embeddinghasattrr'   r  r"   r5   r   )r`   r  r
  r  emb_weightsr1   r1   r2   r  @  s   
z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsc                 C   s   |d }t d|d  }ttj|tjd |  }tj| tjd d|d }tjt	|t
|gdd| d}|d dkrUtj|t| dgdd}|durad||ddf< |t S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        rp   i'  r   r   r   r9   r7   N)mathlogr)   expr<   int64floatr   r?   sincosr>   r@   r'   get_default_dtype)r  r
  r  half_dimembr1   r1   r2   r  H  s   	 $&z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingr   	input_idsr   r6   rq   c                 C   s   |d ur|  \}}|d u r| || j||j}n|  d d \}}|d u r1| ||| j}| jd | | }|| j dkrN| || j | j	| j | j
d|d||| jjd  S )Nr7   r   r   )r%   "create_position_ids_from_input_idsr  r'   r5   &create_position_ids_from_inputs_embedsr  r  r  r
  index_selectr>   r   detach)r`   r  r   r6   rq   r-   seq_lenmax_posr1   r1   r2   r   ^  s$   *z0Kosmos2TextSinusoidalPositionalEmbedding.forwardc                 C   sR   |   dd }|d }tj|d || d tj| jd}|d| | S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr7   r   r8   r   )r%   r)   r<   longr5   r   r&   r   )r   r6   r  input_shapesequence_lengthrq   r1   r1   r2   r  {  s   zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embedsc                 C   s6   |  | }tj|dd|| | }| | S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r9   )ner   r)   cumsumtype_asr$  )r  r  r6   r!   incremental_indicesr1   r1   r2   r    s   zKKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_input_idsr   )NNr   Nr   )rE   rF   rG   rH   r   rv   r  staticmethodr  r)   no_gradr   r   r  r  r   r1   r1   r   r2   r	  3  s0     	
r	  c                       s   e Zd ZdZ					ddededed	edB d
edB dedB dedB f fddZ					ddej	dej	dB de
dB dej	dB dedej	dB deej	ej	dB e
dB f fddZ  ZS )KosmosTextAttentionr   r   FTNrx   r   r   
is_decoderadd_inner_attn_layernormro   	layer_idxc	           	         s   t    || _|| _|| _|| _|| | _d| _| j| | jkr-td| j d| d| jd | _	|| _
|| _tj|||d| _tj|||d| _tj|||d| _tj|||d| _d | _|rmtj||jd| _d S d S )NTr   r   r   r   )ro   r   )ru   rv   rj   rx   r   r   r   r   r   r   r/  r1  r   r   r   r   r   r   inner_attn_lnr   r   )	r`   rj   rx   r   r   r/  r0  ro   r1  r   r1   r2   rv     s0   


zKosmosTextAttention.__init__rP   encoder_hidden_statesrO   r   r   cache_positionrT   c                 K   s  |du}|j dd \}	}
| |}||	|
| j| jdd}d}|dur>t|tr<|j	| j
}|r8|j}n|j}n|}|rB|n|}|r[|dur[|r[|j| j
 j}|j| j
 j}nJ| |}| |}||	d| j| jdd}||	d| j| jdd}|dur|s|nd}|||| j
d|i\}}|rt|trd|j| j
< t| jjt}|| ||||f| jsdn| j| jd	|\}}||	|
d }| jdur| |}| |}||fS )
r   Nrp   r   Fr7   r4  Tr   )r   r   )r   r   r>   r   r   r   
isinstancer   
is_updatedgetr1  cross_attention_cacheself_attention_cacher   rd   r   r   r   updater   r   rj   r   r   r   r   r   r   r   r2  r   )r`   rP   r3  rO   r   r   r4  r   is_cross_attentionr   r   query_statesr6  curr_past_key_valuescurrent_states
key_statesvalue_statesr   r   r   r1   r1   r2   r     s^   







zKosmosTextAttention.forward)r   FFTNr  )rE   rF   rG   rH   r   r  r*   rv   r)   r   r	   rI   r   r   r1   r1   r   r2   r.    sT    	)	r.  c                       s*   e Zd Zdef fddZdd Z  ZS )Kosmos2TextFFNrj   c                    sb   t    |j| _t|j | _|j| _t|j	|j
| _t|j
|j	| _tj|j
|jd| _d S r   )ru   rv   r   r   activation_functionr   activation_dropoutr   r   rx   ffn_dimr   r   r   r   ffn_layernormr   r   r1   r2   rv     s   
zKosmos2TextFFN.__init__c                 C   sT   |  | |}tjj|| j| jd}| |}| |}tjj|| j| jd}|S )Nr   )	r   r   r   r   r   rC  r   rE  r   r   r1   r1   r2   r   $  s   

zKosmos2TextFFN.forward)rE   rF   rG   r   rv   r   r   r1   r1   r   r2   rA    s    rA  c                       s   e Zd Zddef fddZ							ddejdejdB d	ejdB d
ejdB dedB dedB dedB dejdB de	ej
e	ej
ej
f dB f fddZ  ZS )Kosmos2TextBlockNrj   c              	      s   t    |j| _t|| j|j|jdd|d| _|j| _tj	| j|j
d| _|jrBt|| j|j|jdd|d| _tj	| j|j
d| _t|| _tj	| j|j
d| _d S )NT)rx   r   r   r/  r0  r1  r   F)ru   rv   rx   r.  attention_headsr   r   r   r   r   r   self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrA  ffnfinal_layer_norm)r`   rj   r1  r   r1   r2   rv   /  s4   
		
zKosmos2TextBlock.__init__FTrP   r   r3  encoder_attention_maskrO   r   	use_cacher4  rT   c	              	   K   s   |}
|  |}| jd|||||d|	\}}tjj|| j| jd}|
| }d }|d ur`t| ds9td|  d|}
| |}| j	d||||||d|	\}}tjj|| j| jd}|
| }|}
| 
|}| |}|
| }|f}|r{|||f7 }|S )N)rP   rO   r   r   r4  r   rJ  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rP   r3  r   rO   r   r4  r1   )rH  r   r   r   r   r   r  r   rK  rJ  rM  rL  )r`   rP   r   r3  rN  rO   r   rO  r4  r   r   self_attn_weightscross_attn_weightsr   r1   r1   r2   r   N  sR   





	

zKosmos2TextBlock.forwardr   )NNNNFTN)rE   rF   rG   r   rv   r)   r   r	   r*   rI   rJ   r   r   r1   r1   r   r2   rF  .  s8    "	rF  c                "       s*  e Zd ZdZdef fddZdd Z					dd	ejdB d
ejdB dejdB de	dejdB f
ddZ
														d dejdB dejdB d
ejdB dejdB dejdB dejdB dedB d	ejdB dejdB dedB dedB dedB dedB dejdB dee deeB f ddZ  ZS )!Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    rj   c                    s   t     | _ j| _ j| _ jrt jnd| _	t
j j j jd| _t j j jd| _t
 fddt jD | _t
 j j| _d| _d S )Nr$   )r  )r   r
  r  c                    s   g | ]}t  |d qS ))r1  )rF  )r]   ir   r1   r2   r     s    z3Kosmos2TextTransformer.__init__.<locals>.<listcomp>F)ru   rv   rj   r   	layerdropscale_embeddingr  sqrtrx   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr	  max_position_embeddingsembed_positionsr   r   r   r   r   
layer_normr   r   r   r   r2   rv     s   
 
zKosmos2TextTransformer.__init__c                 C   s`   d }|d dkrt ||j|j|d}|d ur.t||j|d d|j}|d u r*|n|| }|S )Nr7   r   )r5   r6   r#   )rB   r"   r5   r3   r'   )r`   r   r%  r   r6   combined_attention_maskexpanded_attn_maskr1   r1   r2   _prepare_decoder_attention_mask  s   z6Kosmos2TextTransformer._prepare_decoder_attention_maskNr   r   rR   img_input_maskr6   rq   c           	      C   s   |d u r	|  |}|d ur!||jd|d||jtjd< || j }| j||||d}||j}|| }t	j
j|| j| jd}|S )Nr7   r   )r  r   r6   rq   r   )rZ  r'   r5   r>   r%   r)   r*   rW  r\  r   r   r   r   )	r`   r  r   rR   rb  r6   rq   	positionsrP   r1   r1   r2   forward_embedding  s"   



z(Kosmos2TextTransformer.forward_embeddingr  r   image_embeds_position_maskr3  rN  rO   rO  r   r   r   r4  r   rT   c              	   K   sd  |d ur|n| j j}|d ur|n| j j}|
d ur|
n| j j}
|d ur*|d ur*td|d ur:|j}|d|d }n|d urG| d d }ntd| jrZ| j	rZ|
rZt
d d}
|
r{|d u r{|d ush| j jrutt| j dt| j dnt| j d}|d ur| nd}|dkrd }d }| j||||||	d}| ||||}|d ur|d urt||j|d d	}tjj|| j| j	d
}|rdnd }|rdnd }|r|d urdnd }t| jD ]C\}}|r||f7 }| j	rtg }|| jk rq||||f||||
|d|}|d }|r||d f7 }|d ur||d f7 }q| |}|r)||f7 }t|||||dS )NzDYou cannot specify both input_ids and inputs_embeds at the same timer7   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   )r  r   rR   rb  r6   rq   r^  r   r1   )rN  rO   r   rO  r4  r   rp   )rN   rO   rP   rQ   cross_attentions)rj   r   r   rO  r   r   r>   r%   r   r   loggerwarning_onceis_encoder_decoderr   r
   get_seq_lengthrd  ra  r3   r"   r   r   r   r   r   r)   randrT  r]  r   )r`   r  r   rR   re  r3  rN  rO   r   rq   rO  r   r   r   r4  r   r%  r6   rP   all_hidden_statesall_self_attnsall_cross_attentionsr   decoder_layerdropout_probabilityr   r1   r1   r2   r     s   
	


	


zKosmos2TextTransformer.forward)NNNr   NNNNNNNNNNNNNNN)rE   rF   rG   rH   r   rv   ra  r)   r   r   rd  r	   r*   r   r   rI   r   r   r   r1   r1   r   r2   rR    s    
%	
rR  c                   @   sJ   e Zd ZU eed< dZdZddgZdZdZ	dZ
e dejfdd	Zd
S )Kosmos2PreTrainedModelrj   )imagetextTr   rF  Fr   c                 C   sh  t | tr
| jj}nt | ttfr| jjj}t | ttfr"| jj	}nt | ttfr.| jj
j	}t |trntj|jd|jd | d tj|jj|jj| d tj|jj|jj| d t|jt|jjd d n-t |tr|jd d|jj d  | }|jd | }tj|jj|d tj|jj|d tj|jj|d tj|jj|d nt |t r|jj!d d|jj d  | }d|jj! d | }tj|j"j|d tj|j#j|d nt |t$rtj|jj|d tj|jj|d tj|jj|d tj|jj|d nt |t%r$tj|j"j|d tj|j#j|d nwt |tr4tj|j&j|d ngt |t'rJtj|j(j|d t|j) nQt |t*rmtj|j+jd|d |j+j,durlt-|j+j|j+j,  n.t |t.j/rt0|j t-|j1 nt |t2r|3|j4|j5 |j6|j,}t|j7| t |t.j8r|j1durt-|j1 dS dS dS )	zInitialize the weightsr   r   )meanstd)rv  r7   rr   rp   N)9r5  Kosmos2VisionModelrj   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configri   initnormal_r}   rx   r   r   initializer_ranger   copy_rq   r)   r<   r   r&   r   r   r   r   r   r   r   rw   r   r   r.  rA  lm_headKosmos2ImageToTextProjectiondenselatent_queryrR  rZ  r  zeros_r   r   ones_ro   r	  r  r   r  r
  r  r   )r`   r   factorrv  in_proj_stdout_proj_stdfc_stdr  r1   r1   r2   _init_weightsl  sl   





&

 z$Kosmos2PreTrainedModel._init_weightsN)rE   rF   rG   r   rK   input_modalitiessupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpar)   r-  r   Moduler  r1   r1   r1   r2   rr  b  s   
 rr  c                       s   e Zd ZU eed< dZdZdef fddZdej	fddZ
e							
		ddejd	B ded	B ded	B deded	B deeB fddZ  ZS )rw  rj   r   )rs  c                    "   t  | t|| _|   d S r   )ru   rv   r  model	post_initr   r   r1   r2   rv        
zKosmos2VisionModel.__init__rT   c                 C   
   | j jjS r   )r  r   r   r_   r1   r1   r2   get_input_embeddings     
z'Kosmos2VisionModel.get_input_embeddingsNFr   r   r   r   c                 K   s   | j |||||dS )N)r   r   r   r   r   r  )r`   r   r   r   r   r   r   r1   r1   r2   r     s   
zKosmos2VisionModel.forwardr  )rE   rF   rG   r    rK   main_input_namer  rv   r   r  r  r   r)   rJ   r*   rI   rC   r   r   r1   r1   r   r2   rw    s2   
 rw  c                $       s  e Zd ZU eed< dZdef fddZdejfddZ	e
e														dd	ejdB d
ejdB dejdB dejdB dejdB dejdB dedB dejdB dejdB dedB dedB dedB dedB dejdB dee deeB f ddZ  ZS )r|  rj   )rt  c                    r  r   )ru   rv   rR  r  r  r   r   r1   r2   rv     r  zKosmos2TextModel.__init__rT   c                 C      | j jS r   r  rZ  r_   r1   r1   r2   r       z%Kosmos2TextModel.get_input_embeddingsNr  r   rR   re  r3  rN  rO   r   rq   rO  r   r   r   r4  r   c                 K   s.   | j d|||||||||	|
||||d|S )aN  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        )r  r   rR   re  r3  rN  rO   r   rq   rO  r   r   r   r4  Nr1   r  )r`   r  r   rR   re  r3  rN  rO   r   rq   rO  r   r   r   r4  r   r1   r1   r2   r     s$   zKosmos2TextModel.forwardrq  )rE   rF   rG   r   rK   r  rv   r   r  r  r   r   r)   r   r	   r*   r   r   rI   r   r   r   r1   r1   r   r2   r|    sl   
 	
r|  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                &       sB  e Zd ZU eed< ddiZdef fddZdejfddZ	dejfd	d
Z
ee															d"dejdB dejdB dejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dedB dedB dedB dejdB deejB dee deeB f"ddZ								d# fd d!	Z  ZS )$r}  rj   zlm_head.weightzmodel.embed_tokens.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NF)in_featuresout_featuresro   )
ru   rv   rR  r  r   r   rx   rX  r  r  r   r   r1   r2   rv     s   
zKosmos2TextForCausalLM.__init__rT   c                 C   r  r   r  r_   r1   r1   r2   r    r  z+Kosmos2TextForCausalLM.get_input_embeddingsc                 C   s   | j S r   )r  r_   r1   r1   r2   get_output_embeddings  s   z,Kosmos2TextForCausalLM.get_output_embeddingsNr   r  r   rR   re  r3  rN  rO   r   rq   labelsrO  r   r   r4  logits_to_keepr   c                 K   s   |
dur|rt d d}| jd|||||||||	||||d|}|j}t|tr1t| dn|}| |dd|ddf }d}|
durU| jd||
| j	j
d|}t|||j|j|j|jdS )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)r  r   rR   re  r3  rN  rO   r   rq   rO  r   r   r4  )rh   r  rX  )rg   rh   rO   rP   rQ   rf  r1   )rg  warningr  rN   r5  r   slicer  loss_functionrj   rX  r   rO   rP   rQ   rf  )r`   r  r   rR   re  r3  rN  rO   r   rq   r  rO  r   r   r4  r  r   r   rP   slice_indicesrh   rg   r1   r1   r2   r   "  sF   #
zKosmos2TextForCausalLM.forwardFc
                    s   |	s	|r	d }d }n1|d ur:|d ur|  d d n|  \}}|  d }tj|tj||| ftj|jdfdd}t j|f||||||||	d|
}|dd  |S )Nr7   )r%   r"   r5   r   r9   )rO   r   rR   re  r   rO  r4  is_first_iterationrq   )	r%   r)   r?   r@   r*   r5   ru   prepare_inputs_for_generationpop)r`   r  rR   re  rO   r   r   rO  r4  r  model_kwargsr   r"  mask_lenmodel_inputsr   r1   r2   r  m  s8   $
z4Kosmos2TextForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNNNNr   )NNNNNNNF)rE   rF   rG   r   rK   _tied_weights_keysrv   r   r  r  r  r   r   r)   r   r	   
LongTensorr*   r   r   r   rI   r   r   r  r   r1   r1   r   r2   r}  	  s   
 		
Lr}  c                       s.   e Zd ZdZdef fddZdd Z  ZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)rj   c                    sb   t    t|jj|jj| _t	t
|j|jj| _t|j|jj|jj|jjddd| _d S )NF)r   r/  r0  )ru   rv   r   r   r{  rw   r  rx   r  r{   r)   r|   latent_query_numr  r.  rG  r   x_attnr   r   r1   r2   rv     s   
z%Kosmos2ImageToTextProjection.__init__c                 C   sX   |  |}| jd|ddd}tj||gdd}| j||d d d d\}}||fS )Nr   r7   r   r9   )rP   r3  rO   r   r   )r  r  r   r&   r%   r)   r?   r  )r`   featuresrP   r  key_value_statesr   r1   r1   r2   r     s   

z$Kosmos2ImageToTextProjection.forward)rE   rF   rG   rH   r   rv   r   r   r1   r1   r   r2   r    s    r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c                "       s,  e Zd ZU eed< dZdef fddZdejfddZ	dd	 Z
ee	
ddejdedB dee deeB fddZee												
	ddejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dedB dedB dedB dededB dee deeB fddZ  ZS )ry  rj   r   c                    :   t  | t|j| _t|j| _t|| _	| 
  d S r   )ru   rv   r|  r  
text_modelrw  r{  vision_modelr  image_to_text_projectionr  r   r   r1   r2   rv     s
   
zKosmos2Model.__init__rT   c                 C   r  r   r  r  rZ  r_   r1   r1   r2   r    r  z!Kosmos2Model.get_input_embeddingsc                 C      || j j_d S r   r  r`   r   r1   r1   r2   set_input_embeddings     z!Kosmos2Model.set_input_embeddingsFr   Nr   c                 K   sx   d|v rt dt |dd  | jd||dd|}| jj|d }tjj	|dd}| 
|\}}||_||_|S )	Nreturn_attentionsz`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict` and access `projection_attentions` from the returned `ModelOutput` instead.T)r   r   r   r   r7   r9   r1   )warningswarnFutureWarningr  r  r  r  r   r   	normalizer  r  rD   )r`   r   r   r   vision_outputrR   rD   r1   r1   r2   get_image_features  s&   zKosmos2Model.get_image_featuresr  re  r   rO   rR   r   rq   rO  r   r   r   c                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}d}d}|du r<|du r.td| j||dd}|j}|j}| jd||||||||	|
|dd|}t	|j
|j|j|j|||dS )a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r   r   )r  r   rR   re  rO   r   rq   rO  r   r   r   )rN   rO   rP   rQ   rR   rD   rS   r1   )rj   r   r   r   r   r  r  rD   r  rM   rN   rO   rP   rQ   )r`   r   r  re  r   rO   rR   r   rq   rO  r   r   r   r   r   rS   rD   image_featuresr   r1   r1   r2   r     sL   =zKosmos2Model.forwardr   )NNNNNNNNNNNFN)rE   rF   rG   r   rK   r  rv   r   r  r  r  r   r   r)   rJ   r*   r   r   rI   rC   r  r   r	   r   rM   r   r   r1   r1   r   r2   ry    s   
 
	
ry  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c                "       s|  e Zd ZU eed< dZddiZdef fddZdej	fdd	Z
d
d Zdej	fddZdd Zee													d#dejdB dejdB dejdB dejdB dedB dejdB dejdB dejdB dejdB dedB dedB dedB deejB dee deeB fdd Ze 						d$dejdB dejdB dejdB dejdB dejdB dejdB fd!d"Z  ZS )%rz  rj   r   ztext_model.lm_head.weightz$text_model.model.embed_tokens.weightc                    r  r   )ru   rv   r}  r  r  rw  r{  r  r  r  r  r   r   r1   r2   rv   q  s
   
z(Kosmos2ForConditionalGeneration.__init__rT   c                 C   r  r   r  r_   r1   r1   r2   r  |  r  z4Kosmos2ForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  r  r1   r1   r2   r    r  z4Kosmos2ForConditionalGeneration.set_input_embeddingsc                 C   s
   | j  S r   )r  r  r_   r1   r1   r2   r    r  z5Kosmos2ForConditionalGeneration.get_output_embeddingsc                 C   s   | j | d S r   )r  set_output_embeddings)r`   new_embeddingsr1   r1   r2   r    s   z5Kosmos2ForConditionalGeneration.set_output_embeddingsNr   r  re  r   rO   rR   r   rq   r  rO  r   r   r  r   c                 K   s   |dur|n| j j}|dur|n| j j}d}d}|du rD|du r$td| j|||d}| jj|d }tjj	|dd}| 
|\}}| jd	||||||||	|
|||d|}t|j|j|j|j|j|||dS )
a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r   r   r7   r9   )r  r   rR   re  rO   r   rq   r  rO  r   r   r  )rg   rh   rO   rP   rQ   rR   rD   rS   r1   )rj   r   r   r   r  r  r  r   r   r  r  r  rf   rg   rh   rO   rP   rQ   )r`   r   r  re  r   rO   rR   r   rq   r  rO  r   r   r  r   rS   rD   
lm_outputsr1   r1   r2   r     sT   Iz'Kosmos2ForConditionalGeneration.forwardc                 K   s   | dd }|d ur|d urtd| d|d u r |d ur |}|d u rA| |}	| jj|	d }tjj|dd}| |\}}
| j	j
d|||||d|}|S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r7   r9   )r  r   rR   re  r   r1   )r  r   r  r  r  r   r   r  r  r  generate)r`   r   re  r  r   rR   r   r   r  rS   rD   outputr1   r1   r2   r    s,   

	z(Kosmos2ForConditionalGeneration.generate)NNNNNNNNNNNNr   )NNNNNN)rE   rF   rG   r   rK   r  r  rv   r   r  r  r  r  r  r   r   r)   r   r	   r  r*   r   r   r   rI   rf   r   r-  r  r   r1   r1   r   r2   rz  f  s   
 	
xrz  )rz  ry  rr  r   r+  )r   )TrH   r  r  collections.abcr   dataclassesr   typingr   r)   r    r   r  activationsr   cache_utilsr	   r
   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_kosmos2r   r   r    
get_loggerrE   rg  r   r"   r   r3   Sizer5   rB   rC   rM   rf   r  ri   r  r   r   r   r   r   r  r	  r.  rA  rF  rR  rr  rw  r|  r}  r  ry  rz  __all__r1   r1   r1   r2   <module>   s    
 
#)[
J2Q7mwb SE#> #  @