o
    eis                  	   @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZ dd	lmZ eeeef  eeeeeef  B eeeeef   B eeeeeef   B ZeedB eedB eedB eedB  B  B  B  ZG d
d de	ddZG dd deddZG dd de
ddZeG dd deZdeeeeef dedeeef fddZdededefddZ dd Z!dd  Z"d!d" Z#d&d$d%Z$dgZ%dS )'zProcessor class for KOSMOS-2.    N   )BatchFeature)
ImageInput)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedToken)BatchEncoding	TextInput)auto_docstringc                   @   s2   e Zd ZU dZedB ed< eed< edB ed< dS )Kosmos2ImagesKwargsa  
    bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
        The bounding bboxes associated to `texts`.
    num_image_tokens (`int`, *optional* defaults to 64):
        The number of (consecutive) places that are used to mark the placeholders to store image information.
        This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
    first_image_token_id (`int`, *optional*):
        The token id that will be used for the first place of the subsequence that is reserved to store image
        information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
    Nbboxesnum_image_tokensfirst_image_token_id)__name__
__module____qualname____doc__
NestedList__annotations__int r   r   l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/kosmos2/processing_kosmos2.pyr   '   s
   
 r   F)totalc                   @   s   e Zd ZU dZeed< dS )Kosmos2TextKwargsz
    add_eos_token (`bool`, defaults to `False`):
    Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
    add_eos_tokenN)r   r   r   r   boolr   r   r   r   r   r   8   s   
 r   c                
   @   s@   e Zd ZU eed< eed< dddddddddd	ddid	Zd
S )Kosmos2ProcessorKwargstext_kwargsimages_kwargsTFr   )	add_special_tokenspaddingstridereturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsverboser   r   @   )r    r!   N)r   r   r   r   r   r   	_defaultsr   r   r   r   r   A   s    
 
r   c                       s  e Zd Zd! fdd	Ze		d"dedB deee B dee	 de
fd	d
Zdd Zdd Z			d#deee B dedB dededB deee B f
ddZd$ddZd$ddZedd Zdedeee  eee  B defddZdeeef eeeeef B deeef fdd Z  ZS )%Kosmos2Processor   c                    s   d|_ d| _d| _d| _d| _d| _d| _d| _d	| _d
| _	d| _
d| _| j| j| j| j| j| j| j| j| j	| j
| jg| _|| _dd t| jD }g }| j| D ]}|t|dddd qQ|| t || dS )z
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
        Fz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding>c                 S   s"   g | ]}d t |d dqS )<patch_index_   >)strzfill.0xr   r   r   
<listcomp>   s   " z-Kosmos2Processor.__init__.<locals>.<listcomp>T)lstriprstrip
normalizedN)r(   	eod_token	boi_token	eoi_token	eoc_token	eol_token	bop_token	eop_token	boo_token	eoo_token	dom_token	grd_token
tag_tokensnum_patch_index_tokensrangeappendr
   
add_tokenssuper__init__)selfimage_processor	tokenizerrG   kwargspatch_index_tokenstokens_to_addtoken	__class__r   r   rL   X   s>   
zKosmos2Processor.__init__NimagestextrP   returnc                    sJ  |d u r|d u rt djtfdjji|}|d dd }|d dd}|d dd }|d d	d
}|d d }	|d d }
|d dd }t }|d urdj|fi |d }|	| |d urƈj
||||d}|	r|st|trjj | }nt|trfdd|D }|d d o||d d< |d u r|
nd
|d d< |d u r|nd |d d< jd%d|i|d }|	| |	|d d< |
|d d< ||d d< |d ur#|d ur#|d u rjjd }|	}t|d }tt||| }dgdg|  dg }g }g }|d }t|tr |g}|d g|d< |D ]7}|d | | ||| d   }|| t|}|rFdg| }|dgt|t|  7 }|| q"t|trtdd t|jD dd d}|d \}}|d \}}|d d o||d d< d |d d< jd%d|| gi|d }t|jd  | krjjdkr҇ fdd|D } fdd|D } fdd|d D |d< n'jjdkr fd d|D } fd!d|D } fd"d|d D |d< t|tr|d u r|d }|d d |d< |d }|	t||d |d#|d$ |S )&Nz*You have to specify either images or text.tokenizer_init_kwargsr!   r   r   r*   r   r    r   Fr"   r#   return_tensors)r   c                    s   g | ]
} j j | qS r   )rO   	bos_token)r5   srM   r   r   r7          z-Kosmos2Processor.__call__.<locals>.<listcomp>rW      r   	input_idsattention_maskc                 S   s   g | ]
\}}|t |fqS r   len)r5   idxr6   r   r   r   r7      r^   c                 S   s   | d S Nr   )r6   r   r   r   <lambda>   s    z+Kosmos2Processor.__call__.<locals>.<lambda>)keyrf   rightc                    s&   g | ]}|j jg t|   qS r   rO   pad_token_idrc   r4   max_len_paddedrM   r   r   r7         & c                    "   g | ]}|d g t |   qS r   rb   r4   rm   r   r   r7          c                    ro   rp   rb   r4   rq   r   r   r7      rr   leftc                    s&   g | ]}j jg t|  | qS r   rj   r4   rl   r   r   r7      rn   c                    "   g | ]}d g t |  | qS rp   rb   r4   rq   r   r   r7      rr   c                    rt   rp   rb   r4   rq   r   r   r7      rr   )r`   ra   image_embeds_position_mask)datatensor_typer   )
ValueError_merge_kwargsr   rO   init_kwargspop
setdefaultr   rN   updatepreprocess_examples
isinstancer2   r[   listunk_token_idr   rH   rI   copyrc   sorted	enumerater`   padding_sider   )rM   rV   rW   rP   output_kwargsr   r   r   r   r"   r#   rZ   encodingimage_encodingtext_encodingwith_bosstart_indeximage_token_idsbase_image_embeds_position_maskr`   ru   all_input_idstext_idsmasksorted_length_min_len_not_paddedrd   r   rl   r   __call__   s   




 









zKosmos2Processor.__call__c                 C   s   |du rdS t |tstd|D ];}|du rqt |ts |g}|D ])}t |trGt|dkr8tdd |D sKt|dkrGtdd |D sKtdq"qdS )	a  
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        Nz@`bboxes` (for a single text example) should be `None` or a list.   c                 s       | ]}t |tV  qd S N)r   r   r4   r   r   r   	<genexpr>/      zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>r0   c                 s   r   r   )r   floatr4   r   r   r   r   0  r   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)r   r   rx   tuplerc   all)rM   r   bboxelementr   r   r   _check_bboxes_for_single_text  s&   


z.Kosmos2Processor._check_bboxes_for_single_textc                 C   s.   |  }|d ur| d| }| ||}|S )N )strip_insert_patch_index_tokens)rM   rW   imager   img_info_tokensr   r   r   _preprocess_single_example9  s
   z+Kosmos2Processor._preprocess_single_exampler*   textsr   r   c           	         sB  j g| }dj g| jg  d}t|trd}|g}|du r+dgt| }nt|ts3|g}t|t|krJtdt| dt| d|sU| |g}n|durmt|tsbtd|D ]}| qdndgt| }t|t|krtd	t| dt| d fd
dt	|||D }|s|d }|S )a-  Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, list[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, list[TextInput]]`: The processed texts with image and patch index tokens.
        r   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got c                    s"   g | ]\}}} ||| qS r   )r   )r5   rW   r   r   r   rM   r   r   r7   {  s    z8Kosmos2Processor.preprocess_examples.<locals>.<listcomp>r   )
r<   joinr=   r   r2   rc   r   rx   r   zip)	rM   r   rV   r   r   
img_tokensbatchedr6   resultr   r   r   r~   C  sD   




z$Kosmos2Processor.preprocess_examplesTc                 C   s    | | jd }|rt|S |S re   )splitr=   +clean_text_and_extract_entities_with_bboxes)rM   rW   cleanup_and_extractcaptionr   r   r   post_process_generation  s   z(Kosmos2Processor.post_process_generationc                    s(    j |fd|i|} fdd|D S )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        skip_special_tokensc                    s   g | ]	} j |d dqS )F)r   )r   )r5   rW   r]   r   r   r7     s    zDKosmos2Processor.post_process_image_text_to_text.<locals>.<listcomp>)batch_decode)rM   generated_outputsr   rP   generated_textsr   r]   r   post_process_image_text_to_text  s   z0Kosmos2Processor.post_process_image_text_to_textc                 C   s   | j j}| jj}|| dg S )Nru   )rO   model_input_namesrN   )rM   tokenizer_input_namesimage_processor_input_namesr   r   r   r     s   z"Kosmos2Processor.model_input_namesc                 C   sP  |d u s
t |dkr|S ttjd|d}t |t |kr,tdt | dt | dd}g }t||D ]\\}}| \}}	||||	  |	}|d u rOq5t|t	rW|g}g }
t
dd |D sftd	|D ]}| |\}}|
| d
|  qht |
dkrq5d|
}|d| d q5|t |k r|||d   d|}|S )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c                 s   s    | ]}|d uV  qd S r   r   )r5   boxr   r   r   r     s    z>Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>zTThe multiple bounding boxes for a single phrase should not contain any `None` value.r   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )rc   r   refinditerrx   r   spanrI   r   r   r   #_convert_bbox_to_patch_index_tokensr   )rM   rW   r   matched_phrasescurr_posbuffermatchedr   r   endpatch_index_stringsr   patch_index_1patch_index_2position_strr   r   r   r     sB   


z+Kosmos2Processor._insert_patch_index_tokensr   c                 C   sh   t |dkr|\}}ntt| j}t||\}}dt|d d}dt|d d}||fS )Nr   r/   r0   r1   )rc   r   mathsqrtrG   coordinate_to_patch_indexr2   r3   )rM   r   idx_1idx_2num_patches_per_sidetoken_1token_2r   r   r   r     s   
z4Kosmos2Processor._convert_bbox_to_patch_index_tokens)r-   )NN)NNr*   )T)r   r   r   rL   r   r   r   r   r	   r   r   r   r   r   	BboxInputr   r2   r~   r   r   propertyr   r   r   r   r   __classcell__r   r   rT   r   r,   V   sR    1
 #



B

*-
r,   r   r   rX   c                 C   s   | \}}}}||kr||kst dt|| }t|| }t|| d }t|| d }	|| | }
|	| | }|
|fS )a  Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
    zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.r_   )rx   r   floorceil)r   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxr   r   r   r     s   r   r   r   c                 C   s   d| }| | }| | }|| }|| }| |kr-|| }|| }	|| | }
|| | }n=||ks5||krJ|| }|| }	|| | }
|| | }n || |d  }|| |d  }	|| |d  }
|| |d  }||	|
|fS )a  
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    g      ?r   r   )r   r   r   	cell_sizer   r   r   r   r   r   r   r   r   r   r   patch_index_to_coordinate  s(   r   c              	   C   s4  d}t || }g }|D ]}|d}| \}}}|s,d}|dd |dd f}|d}	g }
|	D ];}t d|}t d|dd }|rp|rp|r_|
t|dt|df q5|
t|dt|df q5|r|||||
f q|
D ]}d|d  d	|d  d
}||||gf q~q|S )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>r   Nr   r.   z<patch_index_(\d+)>r_   r/   z><patch_index_r1   )	r   r   r   groupsr   searchrI   r   group)rW   patternmatchesentities_with_patch_indicesmatchr   
phrase_tagphrasematch_contentpatch_index_pairsentity_bboxespairr6   yr   entityr   r   r   #extract_entities_with_patch_indices/  s4   

$"r   c                 C   sP   | \}\}}t tdd|d| }t tdd|d| }|||ff}|S )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)rc   r   sub)r   rW   entity_namestartr   adjusted_startadjusted_endadjusted_entityr   r   r   adjust_entity_positionsi  s
   r   c                 C   s   |   }t| t|   }g }|D ]5\}\}}}t|t|  }	t|t|  }
|| |	 }|| |
 }|  }||||f|f q||fS )z9Remove the spaces around the text and the entities in it.)r   rc   r8   r9   rI   )rW   entitiesnew_textleading_spacesnew_entitiesr   r   r   r   entity_name_leading_spacesentity_name_trailing_spacesr   r   r   _cleanup_spacess  s   r      c           
         sp   t dd| }t| }g }|D ]#}|dd |d }}t|| } fdd|D }	|||	f  qt||S )a  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```r   r   r   r   c                    s    g | ]}t |d  |d  qS )r   r_   )r   )r5   r   r   r   r   r7     s     z?clean_text_and_extract_entities_with_bboxes.<locals>.<listcomp>)r   r   r   r   rI   r  )
rW   r   processed_textr   r   itemr   r   r   bboxes_in_coordsr   r  r   r     s   

r   )r  )&r   r   r   r   image_processing_utilsr   image_utilsr   processing_utilsr   r   r   r   r	   tokenization_pythonr
   tokenization_utils_baser   r   utilsr   r   r   r   r   r   r   r   r   r   r,   r   r   r   r   r  r   __all__r   r   r   r   <module>   sD   0	   *-:


