o
    gi                     @   s   d Z ddlZddlZddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ G dd deZdS )zP
    This implementation of EventListener extracts all text from a PDF Document
    N)Decimal)Canvas)CanvasStreamProcessor)BeginPageEvent)ChunkOfTextRenderEvent)LeftToRightComparator)EndPageEvent)Event)EventListener)Document)Pagec                   @   s   e Zd ZdZdd ZdefddZdefddZd	ed
dfddZ	de
fddZd
ejeef fddZeded
ejeef fddZdS )SimpleTextExtractionzT
    This implementation of EventListener extracts all text from a PDF Document
    c                 C   s   i | _ i | _d| _d S )N)_text_render_info_per_page_text_per_page_current_pageself r   \/home/ubuntu/.local/lib/python3.10/site-packages/borb/toolkit/text/simple_text_extraction.py__init__!   s   
zSimpleTextExtraction.__init__pagec                 C   s   |  j d7  _ d S )N   )r   )r   r   r   r   r   _begin_page,   s   z SimpleTextExtraction._begin_pagec           	      C   s  | j | jv r| j| j  ng }dd |D }dd |D }t|dkr$d S t|ttjd}|d  j	}|d  j
}d}|D ]}t| j	| dkrxt|dkrx|dr^|dd	 }|d
7 }|| 7 }| j
| j }| j	}q@| ds|dr|| 7 }| j
| j }q@t|| j
 }t| d}||td |k rdnd7 }|| 7 }| j
| j }q@|| j| j < d S )Nc                 S   s   g | ]
}|  d ur|qS N)get_text.0xr   r   r   
<listcomp>8   s    z2SimpleTextExtraction._end_page.<locals>.<listcomp>c                 S   s(   g | ]}t | d ddkr|qS )  r   )lenr   replacer   r   r   r   r   9   s   ( r   )keyr!   
   r    r   
r   g?)r   r   r"   sorted	functools
cmp_to_keyr   cmpget_baselineyr   absendswithr   width
startswithround0get_space_character_width_estimate_in_user_spacer   r   )	r   r   trislast_baseline_bottomlast_baseline_righttexttdeltaspace_widthr   r   r   	_end_page/   s@   "

zSimpleTextExtraction._end_pageeventreturnNc                 C   sL   t |tr
| | t |tr| |  t |tr$| |  d S d S r   )
isinstancer   _render_textr   r   get_pager   r:   )r   r;   r   r   r   _event_occurredd   s   



z$SimpleTextExtraction._event_occurredtext_render_infoc                 C   s.   | j | jvrg | j| j < | j| j  | d S r   )r   r   append)r   rA   r   r   r   r>   l   s   z!SimpleTextExtraction._render_textc                 C   s   | j S )z@
        This function returns all text on a given page
        )r   r   r   r   r   r   x   s   zSimpleTextExtraction.get_textpdfc                 C   s   i }t |   p
d}td|D ]5}| |}t|d d }t }|t	| t
|t g ||g |t| | d ||< q|S )z
        This function returns the text for a given PDF (per page)
        :param pdf:     the PDF to be analyzed
        :return:        the text per page (represented by typing.Dict[int, str])
        r   ContentsDecodedBytes)intget_document_infoget_number_of_pagesranger?   ioBytesIOr   r@   r   r   r   readr   r   )rC   text_per_pagenumber_of_pagespage_nrr   page_sourcelr   r   r   get_text_from_pdf~   s   
z&SimpleTextExtraction.get_text_from_pdf)__name__
__module____qualname____doc__r   r   r   r:   r	   r@   r   r>   typingDictrF   strr   staticmethodr   rR   r   r   r   r   r      s    5"r   )rV   rJ   rW   decimalr   r(   borb.pdf.canvas.canvasr   'borb.pdf.canvas.canvas_stream_processorr   &borb.pdf.canvas.event.begin_page_eventr   0borb.pdf.canvas.event.chunk_of_text_render_eventr   r   $borb.pdf.canvas.event.end_page_eventr   $borb.pdf.canvas.event.event_listenerr	   r
   borb.pdf.document.documentr   borb.pdf.page.pager   r   r   r   r   r   <module>   s    