o
    ߥiQ                     @   s   d Z ddlmZ ddlmZmZmZmZ ddlZddl	Z
ddlZddlZddlmZmZ ddlmZ ddlmZ dd	 ZG d
d deZG dd deZdd Zdd Zdd Zdeeeejf  fddZG dd deZG dd deZdS )z"
Processor class for GeoLayoutLM.
    )defaultdict)DictIterableListUnionN)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
transforms)	LoadImagec                 C   sF   |  d| dd  }t|dkr|d dkr|dd  }|S |}|S )Nzpad    r   u   ▁)tokenizelen)	tokenizertexttokstoks2 r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/vldoc/processing.pycustom_tokenize   s
   $r   c                   @   s   e Zd ZdZ									ddededeeef d	ed
edede	ee
e f de	ee
e f deddfddZde	ejejjf fddZde	eejejjef fddZdS )ImageProcessorz
    Construct a GeoLayoutLM image processor
    Args:
        do_preprocess (`bool`): whether to do preprocess to unify the image format,
            resize and convert to tensor.
        do_rescale: only works when we disable do_preprocess.
    TFNp?do_preprocess	do_resize
image_size
do_rescalerescale_factordo_normalize
image_mean	image_std	apply_ocrreturnc
                 K   s   || _ || _|d ur|nddd| _|o| | _|| _|| _|d u r$tn|}|d u r,tn|}t|t	r8|||fn|| _
t|t	rE|||fn|| _|	| _|
| _t | _d S )N   )heightwidth)r   r   sizer   r   r   r   r   
isinstancefloatr   r   r   kwargsr	   ToTensortotensor)selfr   r   r   r   r   r   r   r   r   r'   r   r   r   __init__"   s0   zImageProcessor.__init__imagec                 C   sd   t |dddddddf }|jdd }| jr)t|| jd | jd f}| |}||fS )z? unify the image format, resize and convert to tensor.
        N   r#   r"   )r
   convert_to_ndarrayshaper   cv2resizer$   r)   )r*   r,   size_rawimage_ptr   r   r   
preprocessA   s   "
zImageProcessor.preprocessimagesc           
      C   s   t |ts|g}g }| jr&tt|D ]}| || \||< }|| qtj|dd}| j	r5|| j
 }| jrXt| jdddd}t| jdddd}|| |d  }d}| jratdt|dkrid}|||d}	|	S )	z]
        Args:
            images: list of np.ndarrays, PIL images or image tensors.
        r   dimr      g:0yE>Nz!OCR service is not available yet!)r6   	ocr_infos	sizes_raw)r%   listr   ranger   r5   appendtorchstackr   r   r   tensorr   viewr   r   NotImplementedError)
r*   r6   r;   ir3   	images_ptmustdr:   datar   r   r   __call__M   s0   

zImageProcessor.__call__)	TFNFr   TNNT)__name__
__module____qualname____doc__boolr   strintr&   r   r   r+   npndarrayPILImager5   r<   rI   r   r   r   r   r      sD    	
	

$r   c                   @   s   e Zd Zdd Zdd ZdS )OCRUtilsc                 C   s
   d| _ d S )Nv0)version)r*   r   r   r   r+   q   s   
zOCRUtils.__init__c                 C      t )zd
        sort boxes, filtering or other preprocesses
        should return sorted ocr_infos
        rC   )r*   r:   r   r   r   rI   t   s   zOCRUtils.__call__N)rJ   rK   rL   r+   rI   r   r   r   r   rU   o   s    rU   c                 C   sl   t | dkst | dksJ tt | D ]}|d@ r'tdt| | || |< qtdt| | || |< q| S )N      r   r   )r   r=   maxmin)boxr"   r#   rD   r   r   r   	bound_box|   s   r_   c              	   C   s8   | d | d | d | d | d | d | d | d g}|S )Nr   r   r.   r9   r   )box2pbox4pr   r   r   
bbox2pto4p   s   *rb   c              	   C   sx   t | d | d | d | d t | d | d | d | d t| d | d | d | d t| d | d | d | d g}|S )	Nr   r.   rZ      r   r9         )r]   r\   )ra   r`   r   r   r   
bbox4pto2p   s   rf   tensor_dictsc                 C   s^   t t}| D ]}| D ]\}}|| | qqi }| D ]\}}tj|dd||< q|S )Nr   r7   )r   r<   itemsr>   r?   r@   )rg   one_dicttdkvres_dictr   r   r   stack_tensor_dict   s   rn   c                   @   s   e Zd Z				ddedededefdd	Zd
edefddZ						d d
edede	e	 de	e	 dedefddZ
d
ededefddZdede	e	 de	e	 dedef
ddZ					d!de	e	 dede	e	 de	e	 def
ddZdS )"TextLayoutSerializerr!   TNmax_seq_lengthmax_block_numuse_roberta_tokenizer	ocr_utilsc                 C   sl   d| _ || _|| _|| _|| _|| _|| _|| _|j| _|j	| _
|j| _|j| _dgd | _dgd | _d S )NrV   g        r[   r   rZ   )rW   rp   rq   r   r#   r"   rr   rs   pad_token_idbos_token_idcls_token_ideos_token_idsep_token_idunk_token_idcls_bbs_wordcls_bbs_line)r*   rp   rq   r   r#   r"   rr   rs   r   r   r   r+      s   zTextLayoutSerializer.__init__ocr_info
label_infoc                 C   rX   NrY   )r*   r|   r}   r   r   r   	label2seq   s   zTextLayoutSerializer.label2seq	input_ids	bbox_line	bbox_wordr#   r"   c                 C   st  |dur)t |t |ksJ t |t |ksJ | |||||\}}}}	}
}}n|dus/J | |||\}}}}	}
}}i }tj| jtjd| j |d< tj| jtjd|d< tj| j	tjd|d< tj| j	tjd|d< tj| jdtj
d|d< tj| jd	tj
d|d
< tj| jtjd|d< tj| jtjd|d< tj| jtjd|d< ||gd	 }||gd }| jg| | jg }| jg| |g }| jg| |g }t |}t |	}t||d d|< d|d d|< t|	|d d|< d|d d|< t|
|d d|d < t||d d|d < |d |d  |d< t||d d|d < t||d d|ddf< |d ddg df | |d ddg df< |d ddg df | |d ddg df< t||d
 d|ddf< |d
 ddddgf | d |d
 ddddgf< |d
 ddddgf | d |d
 ddddgf< |d
  |d
< |S )a  
        Either ocr_info or (input_ids, bbox_line, bbox_word)
            should be provided.
        If (input_ids, bbox_line, bbox_word) is provided,
            convinient plug into the serialization (customization)
            is offered. The tokens must be organised by blocks and words.
        Else, ocr_info must be provided, to be parsed
            to sequences directly (the simplest way).
        Args:
            ocr_info: [
                {"text": "xx", "box": [a,b,c,d],
                 "words": [{"text": "x", "box": [e,f,g,h]}, ...]},
                ...
            ]
            bbox_line: the coordinate value should match the original image
                (i.e., not be normalized).
        N)dtyper   attention_maskfirst_token_idxesfirst_token_idxes_maskr[   bbox_4p_normalizedrZ   bboxline_rank_idline_rank_inner_idword_rank_idr.   r   )r   r.   rZ   rc   )r   r9   rd   re   r   i  r9   )r   halfseq2seqocr_info2seqr?   onesrp   int64rt   zerosrq   float32rv   rx   r{   rz   rA   long)r*   r|   r   r   r   r#   r"   bbs_wordbbs_liner   line_rank_idsline_rank_inner_idsword_rank_ids	token_seqsep_bbs_wordsep_bbs_line
len_tokens	len_linesr   r   r   serialize_single   s   













z%TextLayoutSerializer.serialize_singlec                 C   s  g }g }g }g }g }g }	g }
d}t |D ]\}}|| jkrd}|r# n|d }t|||}d}t |d D ]\}}|d }|d }t|||}t|}| jrUt| j|}n| j|}| j|}t	|dkrm|
| j t	|}t	|| | jd krd} n_|r|
t	|d  || ||g|  ||g|  |
|d g|  ||d g|  |rt	|	dkr|	d	 dkrd
|	d	< |	dg|d dg   d}q5|	|dg  q5qt	|	dkr|	d	 dkrd
|	d	< ||||||	|
fS )NFTr^   wordsr   r   r.   r   r-   r9   )	enumeraterq   r_   rb   rr   r   r   r   convert_tokens_to_idsr   r>   ry   rp   extend)r*   r|   r#   r"   r   r   r   r   r   r   r   
early_stopline_idxlinelboxis_first_wordword_id	word_infowtextwboxwbox4pwtokens
wtoken_idsn_tokensr   r   r   r   '  sj   


z!TextLayoutSerializer.ocr_info2seqc                 C   s  g }g }g }g }	g }
g }t |}d\}}d}d}t|D ]}|| }|| }|du p-||k}|du p5||k}||}}t |dkrEt|}t |dksMJ t|||}t |dkr]t|}t |dkseJ t|||}|| || |r{|d7 }|r|d7 }||d  t |
dkr|
d dkrd	|
d< |
d d}n|
d |	| || qt |
dkr|
d dkrd	|
d< |||||	|
|fS )
z
        for convinient plug into the serialization, given the 3 customized sequences.
        They should not contain special tokens like [CLS] or [SEP].
        )NNr   r   Nr[   rZ   r-   r.   r9   )r   r=   rf   r_   rb   r>   )r*   r   r   r   r#   r"   r   r   r   r   r   r   n_real_tokenslb_prevwb_prevline_idr   rD   lb_nowwb_now
line_start
word_startr   r   r   r   _  s\   





z TextLayoutSerializer.halfseq2seqr:   bboxes_linebboxes_wordr;   c              	   K   s  |d urt |nt |}|d u r| j| jfg| }g }|d urUt |t |ks)J t |t |ks3J t||||D ]\}	}
}}|\}}| d |	|
|||}|| q:n-|d us]J d| jd urg| |}t||D ]\}}|\}}| j|||d}|| qlt|}|S )Nz2For serialization, ocr_infos must not be NoneType!)r#   r"   )r   r"   r#   zipr   r>   rs   rn   )r*   r:   r   r   r   r;   r'   	n_samplesseqsinput_idr   r   r3   r"   r#   r   r|   pt_seqsr   r   r   rI     s6   	


zTextLayoutSerializer.__call__)r!   r!   TN)NNNNr!   r!   )NNNNN)rJ   rK   rL   rP   rN   rU   r+   r<   r   r   r   r   r   rI   r   r   r   r   ro      st    

e8
;ro   c                	   @   sv   e Zd ZdZ					ddededefdd	Z			dd
ee	e
jejjef dee dede	fddZ	dddZdS )	Processora  Construct a GeoLayoutLM processor.

    Args:
        max_seq_length: max length for token
        max_block_num: max number of text lines (blocks or segments)
        img_processor: type of ImageProcessor.
        tokenizer: to tokenize strings.
        use_roberta_tokenizer: Whether the tokenizer is originated from RoBerta tokenizer
            (True by default).
        ocr_utils: a tool to preprocess ocr_infos.
        width: default width. It can be used only when all the images are of the same shape.
        height: default height. It can be used only when all the images are of the same shape.

    In `serialize_from_tokens`, the 3 sequences (i.e., `input_ids`, `bboxes_line`, `bboxes_word`)
        must not contain special tokens like [CLS] or [SEP].
    The boxes in `bboxes_line` and `bboxes_word` can be presented by either 2 points or 4 points.
    The value in boxes should keep original.
    Here is an example of the 3 arguments:
        ```
        input_ids ->
        [[6, 2391, 6, 31833, 6, 10132, 6, 2283, 6, 17730, 6, 2698, 152]]
        bboxes_line ->
        [[[230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38],
            [230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38],
            [257, 155, 338, 191], [257, 155, 338, 191], [257, 155, 338, 191], [257, 155, 338, 191],
            [257, 155, 338, 191]]]
        bboxes_word ->
        [[[231, 2, 267, 2, 267, 38, 231, 38], [231, 2, 267, 2, 267, 38, 231, 38],
            [264, 7, 298, 7, 298, 36, 264, 36], [264, 7, 298, 7, 298, 36, 264, 36],
            [293, 3, 329, 3, 329, 41, 293, 41], [293, 3, 329, 3, 329, 41, 293, 41],
            [330, 4, 354, 4, 354, 39, 330, 39], [330, 4, 354, 4, 354, 39, 330, 39],
            [258, 156, 289, 156, 289, 193, 258, 193], [258, 156, 289, 156, 289, 193, 258, 193],
            [288, 158, 321, 158, 321, 192, 288, 192], [288, 158, 321, 158, 321, 192, 288, 192],
            [321, 156, 336, 156, 336, 190, 321, 190]]]
        ```

    NTr!   img_processorrr   rs   c	           
   	   K   s.   || _ || _|	| _t|||||||d| _d S )N)rr   rs   )r   r   r'   ro   
serializer)
r*   rp   rq   r   r   rr   rs   r#   r"   r'   r   r   r   r+     s   
zProcessor.__init__r6   r:   
token_seqsr;   c           	      C   s   |  |}|d }|d u r|d n|}|d u r|d n|}|d u r)| j||d}n| j	 dd|i|}|d us<J di }||d< | D ]
\}}|| ||< qF|S )Nr6   r:   r;   )r;   z token_seqs must not be NoneType!r,   r~   )r   r   rh   )	r*   r6   r:   r   r;   img_databatchrk   rl   r   r   r   rI     s&   
zProcessor.__call__c                 C   s*   i }||d< ||d< ||d< | |d ||S )Nr   r   r   r   )r*   r6   r   r   r   r;   
half_batchr   r   r   serialize_from_tokens  s
   zProcessor.serialize_from_tokens)NTNr!   r!   )NNNr~   )rJ   rK   rL   rM   r   rN   rU   r+   r   r<   rQ   rR   rS   rT   rO   r   dictrI   r   r   r   r   r   r     s8    *

r   ) rM   collectionsr   typingr   r   r   r   r1   numpyrQ   rS   r?   timm.data.constantsr   r   torchvisionr	   modelscope.preprocessors.imager
   r   objectr   rU   r_   rb   rf   rO   Tensorrn   ro   r   r   r   r   r   <module>   s*   V
  