o
    gi                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ G dd deZdS )z^
    This implementation of EventListener extracts all paragraphs of text from a PDF Document
    N)Decimal)disjointset)Canvas)CanvasStreamProcessor)BeginPageEvent)EndPageEvent)	Rectangle)LayoutElement)	Paragraph)Document)Page)SimpleLineOfTextExtractionc                       s   e Zd ZdZededfdedef fddZdef fd	d
ZdededefddZ	de
jee
je f fddZedede
jee
je f fddZ  ZS )SimpleParagraphExtractionzb
    This implementation of EventListener extracts all paragraphs of text from a PDF Document
    g?gffffff?%minimum_horizontal_overlap_percentagemaximum_multiplied_leadingc                    s$   t t|   || _|| _i | _d S )N)superr   __init__&_minimum_horizontal_overlap_percentage_maximum_multiplied_leading_paragraphs_per_page)selfr   r   	__class__ a/home/ubuntu/.local/lib/python3.10/site-packages/borb/toolkit/text/simple_paragraph_extraction.pyr   !   s
   
z"SimpleParagraphExtraction.__init__pagec              	      s  t t| | t }| j| j D ]}|| q|D ]f}|D ]a}||kr&q||||kr1q| 	 dksA| 	 dkrBq| 
| | t| j| j }t| j| j t| j| j }|| jkr|| jkr|||  nqqg }| D ]m}	dd |	D }
ddd |
D d d }t||
d  |
d  |
d  d}ttdd |
D td	d |
D td
d |
D tdd |
D  tdd |
D tdd |
D  |_t|tsJ || q|| j| j< d S )Nr   c                 S   s   g | ]}|qS r   r   .0xr   r   r   
<listcomp>c   s    z7SimpleParagraphExtraction._end_page.<locals>.<listcomp> c                 S   s   g | ]}|  d  qS )
)get_textr   r   r   r   r   f   s    )textfont
font_color	font_sizec                 S      g | ]}|  jqS r   get_previous_layout_boxr   r   lr   r   r   r   p       c                 S   r(   r   r*   yr+   r   r   r   r   q   r-   c                 S       g | ]}|  j|  j qS r   )r*   r   widthr+   r   r   r   r   s       c                 S   r(   r   r)   r+   r   r   r   r   y   r-   c                 S   r0   r   )r*   r/   heightr+   r   r   r   r   {   r2   c                 S   r(   r   r.   r+   r   r   r   r      r-   )r   r   	_end_pager   _lines_of_text_per_page_current_page_numberaddfindr*   	get_width_overlapminr1   absr/   r3   r   r   unionsetsjoinr
   get_fontget_font_colorget_font_sizer   max_previous_layout_box
isinstanceappendr   )r   r   line_of_text_disjoint_setline_of_textl0l1overlap_percentageleading
paragraphsline_of_text_partitionlines_of_texttxtpr   r   r   r4   1   s   




z#SimpleParagraphExtraction._end_pager0r1returnc                 C   s   t |j|j|j t|j|j|j k rtdS t |j|j|j t|j|j|j k r0tdS t t|j|j|j t|j|j|j }tt |j|j|j t |j|j|j }t|| S )Nr   )rC   r   r1   r;   r   r<   )r   rR   rS   abr   r   r   r:      s   ((**z"SimpleParagraphExtraction._overlapc                 C   s   | j S )zE
        This function returns the paragraphs on a given PDF
        )r   )r   r   r   r   get_paragraphs   s   z(SimpleParagraphExtraction.get_paragraphspdfc                 C   s   i }t |   p
d}td|D ]5}| |}t|d d }t }|t	| t
|t g ||g |t| | d ||< q|S )z
        This function returns the Paragraph objects for a given PDF (per page)
        :param pdf:     the PDF to be analyzed
        :return:        the Paragraph objects per page (represented by typing.Dict[int, typing.List[Paragraph]])
        r   ContentsDecodedBytes)intget_document_infoget_number_of_pagesrangeget_pageioBytesIOr   _event_occurredr   r   r   readr   rW   )rX   paragraphs_per_pagenumber_of_pagespage_nrr   page_sourcer,   r   r   r   get_paragraphs_from_pdf   s   	
z1SimpleParagraphExtraction.get_paragraphs_from_pdf)__name__
__module____qualname____doc__r   r   r   r4   r   r:   typingDictr[   Listr
   rW   staticmethodr   rh   __classcell__r   r   r   r   r      s$    
Xr   )rl   r`   rm   decimalr   borb.datastructure.disjoint_setr   borb.pdf.canvas.canvasr   'borb.pdf.canvas.canvas_stream_processorr   &borb.pdf.canvas.event.begin_page_eventr   $borb.pdf.canvas.event.end_page_eventr   "borb.pdf.canvas.geometry.rectangler   %borb.pdf.canvas.layout.layout_elementr	   %borb.pdf.canvas.layout.text.paragraphr
   borb.pdf.document.documentr   borb.pdf.page.pager   0borb.toolkit.text.simple_line_of_text_extractionr   r   r   r   r   r   <module>   s    