o
    .iw$                     @   s   d dl Z d dlmZmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ erRd dlZddlmZ dddefddZeG dd dZdS )    N)	dataclassfield)BytesIO)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathxopen)string_to_dict   )FeatureTypepdfpdfplumber.pdf.PDFreturnc                 C   sL   t  }| jD ]	}||jj q| W  d   S 1 sw   Y  dS )z-Convert a pdfplumber.pdf.PDF object to bytes.N)r   pageswriter   streamgetvalue)r   bufferpage r   I/home/ubuntu/.local/lib/python3.10/site-packages/datasets/features/pdf.pypdf_to_bytes   s
   
$r   c                   @   s  e Zd ZU dZdZeed< dZee	 ed< dZ
ee	 ed< ee e dZee ed	< ed d
d
dZe	ed< dd Zdee	eedf defddZdddefddZddeddfddZdedee	df f fddZdeejejejf dejfddZ dS )Pdfa  
    **Experimental.**
    Pdf [`Feature`] to read pdf documents from a pdf file.

    Input: The Pdf feature accepts as input:
    - A `str`: Absolute path to the pdf file (i.e. random access is allowed).
    - A `dict` with the keys:
        - `path`: String with relative path of the pdf file in a dataset repository.
        - `bytes`: Bytes of the pdf file.
      This is useful for archived files with sequential access.

    - A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

    Args:
        mode (`str`, *optional*):
            The mode to convert the pdf to. If `None`, the native mode of the pdf is used.
        decode (`bool`, defaults to `True`):
            Whether to decode the pdf data. If `False`,
            returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.

    Examples:

    ```py
    >>> from datasets import Dataset, Pdf
    >>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
    >>> ds.features["pdf"]
    Pdf(decode=True, id=None)
    >>> ds[0]["pdf"]
    <pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
    >>> ds = ds.cast_column("pdf", Pdf(decode=False))
    >>> ds[0]["pdf"]
    {'bytes': None,
    'path': 'path/to/pdf/file.pdf'}
    ```
    TdecodeNidr   dtypebytespathpa_typeF)defaultinitrepr_typec                 C   s   | j S N)r'   )selfr   r   r   __call__K   s   zPdf.__call__valuer   c                 C   s   t jrddl}nd}t|tr|ddS t|trd|dS |dur.t||jjr.| |S |	ddurEt
j|d rEd|	ddS |	ddusS|	ddur^|	d|	ddS td| d)	zEncode example into a format for Arrow.

        Args:
            value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
                Data passed as input to Pdf feature.

        Returns:
            `dict` with "path" and "bytes" fields
        r   Nr&   r%   r&   r$   r%   zRA pdf sample should have one of 'path' or 'bytes' but they are missing or None in .)r   PDFPLUMBER_AVAILABLE
pdfplumber
isinstancestrr%   r   PDFencode_pdfplumber_pdfgetosr&   isfile
ValueError)r-   r/   r3   r   r   r   encode_exampleN   s    







zPdf.encode_exampler   c                 C   s:   t | drt | jdr| jjr| jjddS dt| dS )aa  
        Encode a pdfplumber.pdf.PDF object into a dictionary.

        If the PDF has an associated file path, returns the path. Otherwise, serializes
        the PDF content into bytes.

        Args:
            pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

        Returns:
            dict: A dictionary with "path" or "bytes" field.
        r   nameNr0   )hasattrr   r=   r   )r   r   r   r   r7   o   s   zPdf.encode_pdfplumber_pdfc                 C   s.  | j stdtjrddl}ntd|du ri }|d |d }}|du r{|du r2td| dt|r=||}|S |	d	d
 }|
tjrMtjntj}zt||d }	||	}
W n tyi   d}
Y nw t|
d}t|d|d}||S |t|}|}W d   |S 1 sw   Y  |S )ai  Decode example pdf file into pdf data.

        Args:
            value (`str` or `dict`):
                A string with the absolute pdf file path, a dictionary with
                keys:

                - `path`: String with absolute or relative pdf file path.
                - `bytes`: The bytes of the pdf file.

            token_per_repo_id (`dict`, *optional*):
                To access and decode pdf files from private repositories on
                the Hub, you can pass a dictionary
                repo_id (`str`) -> token (`bool` or `str`).

        Returns:
            `pdfplumber.pdf.PDF`
        zKDecoding is disabled for this feature. Please use Pdf(decode=True) instead.r   Nz6To support decoding pdfs, please install 'pdfplumber'.r&   r%   z@A pdf should have one of 'path' or 'bytes' but both are None in r1   z::repo_id)tokenrb)download_config)r!   RuntimeErrorr   r2   r3   ImportErrorr;   r   opensplit
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   r8   r   r   r   )r-   r/   token_per_repo_idr3   r&   bytes_r   
source_urlpatternr@   rA   rC   fpr   r   r   decode_example   sD   





zPdf.decode_exampler   c                 C   s(   ddl m} | jr| S |d|ddS )zfIf in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.r   )Valuebinarystringr$   )featuresrS   r!   )r-   rS   r   r   r   flatten   s   zPdf.flattenstoragec                 C   s<  t j|jr%t jdgt| t  d}t jj||gddg|	 d}nst j
|jrJt jdgt| t  d}t jj||gddg|	 d}nNt j|jr|jddkr_|d}nt jdgt| t  d}|jddkr{|d}nt jdgt| t  d}t jj||gddg|	 d}t|| jS )a  Cast an Arrow array to the Pdf arrow storage type.
        The Arrow types that can be converted to the Pdf pyarrow storage type are:

        - `pa.string()` - it must contain the "path" data
        - `pa.binary()` - it must contain the image bytes
        - `pa.struct({"bytes": pa.binary()})`
        - `pa.struct({"path": pa.string()})`
        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
        - `pa.list(*)` - it must contain the pdf array data

        Args:
            storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
                PyArrow array to cast.

        Returns:
            `pa.StructArray`: Array in the Pdf arrow storage type, that is
                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
        N)typer%   r&   )maskr   )patypes	is_stringrY   arraylenrT   StructArrayfrom_arraysis_null	is_binaryrU   	is_structget_field_indexr   r   r'   )r-   rX   bytes_array
path_arrayr   r   r   cast_storage   s     zPdf.cast_storager,   )!__name__
__module____qualname____doc__r!   bool__annotations__r"   r	   r5   r#   r   r[   structrT   rU   r'   r   r   r+   r.   r
   r%   dictr<   r7   rR   r   rW   StringArrayr`   	ListArrayrh   r   r   r   r   r       s   
 $$!:(r    ) r9   dataclassesr   r   ior   typingr   r   r   r   r	   r
   pyarrowr[    r   download.download_configr   tabler   utils.file_utilsr   r   utils.py_utilsr   r3   rV   r   r%   r   r    r   r   r   r   <module>   s      