o
    8wi3+                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ erTd dlZddlmZ dddefddZeG dd dZ ddde!fddZ"dS )    N)	dataclassfield)BytesIO)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathxopen)no_op_if_value_is_nullstring_to_dict   )FeatureTypepdfpdfplumber.pdf.PDFreturnc                 C   sL   t  }| jD ]	}||jj q| W  d   S 1 sw   Y  dS )z-Convert a pdfplumber.pdf.PDF object to bytes.N)r   pageswriter   streamgetvalue)r   bufferpage r   R/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/features/pdf.pypdf_to_bytes   s
   
$r    c                   @   s  e Zd ZU dZdZeed< edddZe	e
 ed< dZee
 ed	< ee e d
Zee ed< ed dddZe
ed< dd Zdee
eeedf defddZddeddfddZdedee
df f fddZdeejejejf dejfddZ ddejdejfddZ!dS )Pdfa  
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        mode (`str`, *optional*):
            The mode to convert the pdf to. If `None`, the native mode of the pdf is used.
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
    'path': 'path/to/pdf/file.pdf'}
    ```
    TdecodeNF)defaultrepridr   dtypebytespathpa_type)r#   initr$   _typec                 C   s   | j S N)r*   )selfr   r   r   __call__K   s   zPdf.__call__valuer   c                 C   s   t jrddl}nd}t|tr|ddS t|ttfr d|dS |dur/t||jjr/t	|S |
ddurFtj|d rFd|
ddS |
ddusT|
ddur_|
d|
ddS td| d)	zEncode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields
        r   Nr)   r(   r)   r'   r(   zRA pdf sample should have one of 'path' or 'bytes' but they are missing or None in .)r   PDFPLUMBER_AVAILABLE
pdfplumber
isinstancestrr(   	bytearrayr   PDFencode_pdfplumber_pdfgetosr)   isfile
ValueError)r.   r0   r4   r   r   r   encode_exampleN   s    





zPdf.encode_examplec                 C   s.  | j stdtjrddl}ntd|du ri }|d |d }}|du r{|du r2td| dt|r=||}|S |	d	d
 }|
tjrMtjntj}zt||d }	||	}
W n tyi   d}
Y nw t|
d}t|d|d}||S |t|}|}W d   |S 1 sw   Y  |S )ai  Decode example pdf file into pdf data.

        Args:
            value (`str` or `dict`):
                A string with the absolute pdf file path, a dictionary with
                keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`
        zKDecoding is disabled for this feature. Please use Pdf(decode=True) instead.r   Nz6To support decoding pdfs, please install 'pdfplumber'.r)   r(   z@A pdf should have one of 'path' or 'bytes' but both are None in r2   ::repo_idtokenrbdownload_config)r"   RuntimeErrorr   r3   r4   ImportErrorr=   r   opensplit
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   r:   r   r   r   )r.   r0   token_per_repo_idr4   r)   bytes_r   
source_urlpatternrA   rC   rF   fpr   r   r   decode_exampleo   sD   





zPdf.decode_exampler   c                 C   s(   ddl m} | jr| S |d|ddS )zfIf in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.r   )Valuebinarystringr'   )featuresrV   r"   )r.   rV   r   r   r   flatten   s   zPdf.flattenstoragec                 C   s<  t j|jr%t jdgt| t  d}t jj||gddg|	 d}nst j
|jrJt jdgt| t  d}t jj||gddg|	 d}nNt j|jr|jddkr_|d}nt jdgt| t  d}|jddkr{|d}nt jdgt| t  d}t jj||gddg|	 d}t|| jS )a  Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that can be converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the image bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
        - `pa.list(*)` - it must contain the pdf array data

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Ntyper(   r)   maskr   )patypes	is_stringr]   arraylenrW   StructArrayfrom_arraysis_null	is_binaryrX   	is_structget_field_indexr   r   r*   )r.   r[   bytes_array
path_arrayr   r   r   cast_storage   s     zPdf.cast_storagec                    s   du ri t fdd tj fdd| D t d}tjdd |d D t d}tjj||gd	dg|	 d
}t
|| jS )a4  Embed PDF files into the Arrow array.

        Args:
            storage (`pa.StructArray`):
                PyArrow array to embed.

        Returns:
            `pa.StructArray`: Array in the PDF arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        Nc                    s   |  dd }|tjrtjntj}t||}|d ur# |d nd }t|d}t	| d|d}|
 W  d    S 1 s@w   Y  d S )Nr?   r@   rA   rB   rD   rE   )rJ   rK   r   rL   rM   rN   r   r:   r   r   read)r)   rQ   rR   source_url_fieldsrC   rF   rS   )rO   r   r   path_to_bytes   s   

$z(Pdf.embed_storage.<locals>.path_to_bytesc                    s8   g | ]}|d ur|d d u r |d n|d nd qS )Nr(   r)   r   ).0x)rp   r   r   
<listcomp>   s    *z%Pdf.embed_storage.<locals>.<listcomp>r\   c                 S   s$   g | ]}|d urt j|nd qS r-   )r;   r)   basename)rq   r)   r   r   r   rs      s   $ r)   r(   r^   )r   r`   rc   	to_pylistrW   r   rX   re   rf   rg   r   r*   )r.   r[   rO   rk   rl   r   )rp   rO   r   embed_storage   s    
zPdf.embed_storager-   )"__name__
__module____qualname____doc__r"   bool__annotations__r   r%   r	   r6   r&   r   r`   structrW   rX   r*   r   r,   r/   r
   r(   r7   dictr>   rU   r   rZ   StringArrayre   	ListArrayrm   rv   r   r   r   r   r!      s   
 $$ !:$%r!   c                 C   s:   t | drt | jdr| jjr| jjddS dt| dS )aA  
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF has an associated file path, returns the path. Otherwise, serializes
    the PDF content into bytes.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with "path" or "bytes" field.
    r   nameNr1   )hasattrr   r   r    )r   r   r   r   r9     s   r9   )#r;   dataclassesr   r   ior   typingr   r   r   r   r	   r
   pyarrowr`    r   download.download_configr   tabler   utils.file_utilsr   r   utils.py_utilsr   r   r4   rY   r   r(   r    r!   r~   r9   r   r   r   r   <module>   s$      g