o
    8wi!                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlZd dlZ	d dl
mZ d dlZd dlZd dlmZ d dlmZ ejjeZdd Zdd	 Zd
d ZeG dd dejZG dd dejZdS )    N)	dataclass)Optional)
table_cast)readlinec                  O   >   zt jjj| i |W S  ty   t jjj| i | Y S w N)pdiojsonujson_dumpsAttributeErrordumpsargskwargs r   `/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/datasets/packaged_modules/json/json.pyr      
   r   c                  O   r   r   )r   r	   r
   ujson_loadsr   loadsr   r   r   r   r      r   r   c                 K   s(   t jjjdkrd|d< tj| fi |S )N   pyarrowdtype_backend)datasetsconfigPANDAS_VERSIONmajorr   	read_json)path_or_bufr   r   r   r   pandas_read_json#   s   r   c                       s   e Zd ZU dZdZeej ed< dZ	e
ed< dZee
 ed< dZee
 ed< dZeed	< dZee ed
< dZeed< dZee ed<  fddZ  ZS )
JsonConfigzBuilderConfig for JSON.Nfeaturesutf-8encodingencoding_errorsfieldTuse_threads
block_sizei   	chunksizenewlines_in_valuesc                    s   t    d S r   )super__post_init__self	__class__r   r   r+   6   s   zJsonConfig.__post_init__)__name__
__module____qualname____doc__r!   r   r   Features__annotations__r#   strr$   r%   r&   boolr'   intr(   r)   r+   __classcell__r   r   r.   r   r    )   s   
 r    c                   @   s>   e Zd ZeZdd Zdd ZdejdejfddZ	d	d
 Z
dS )Jsonc                 C   s\   | j jd urtd | j j| j _| j jdurtd | j jd ur&tdtj	| j j
dS )NzTThe JSON loader parameter `block_size` is deprecated. Please use `chunksize` insteadTzZThe JSON loader parameter `use_threads` is deprecated and doesn't have any effect anymore.zEThe JSON loader parameter `newlines_in_values` is no longer supported)r!   )r   r'   loggerwarningr(   r&   r)   
ValueErrorr   DatasetInfor!   r,   r   r   r   _info=   s   
z
Json._infoc                    s   | j jstd| j j d j_ | j j}g }| D ]!\}}t|tr*|g} fdd|D }|	t
j|d|id q|S )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=Tc                    s   g | ]}  |qS r   )
iter_files).0file
dl_managerr   r   
<listcomp>S   s    z*Json._split_generators.<locals>.<listcomp>files)name
gen_kwargs)r   
data_filesr=   download_configextract_on_the_flydownload_and_extractitems
isinstancer6   appendr   SplitGenerator)r-   rD   rI   splits
split_namerF   r   rC   r   _split_generatorsI   s   
zJson._split_generatorspa_tablereturnc                 C   sn   | j jd ur5t| j jt|j D ]}| j jj|j}||tj	d gt
| |d}qt|| j jj}|S )N)type)r   r!   setcolumn_namesarrow_schemar%   rV   append_columnpaarraylenr   )r-   rT   column_namerV   r   r   r   _cast_tableW   s   "zJson._cast_tablec                 c   s   t tj|D ]\}}| jjd urkt|| jj| jjd}t	|
 }W d    n1 s0w   Y  || jj }ttt|}|j dgkrZ| jjrVt| jjndg|_tjj|dd}|| |fV  q	t|da}d}t| jjd d}	| jjd ur| jjnd	}
	 |
| jj}|sn6z|| 7 }W n ttjfy   |t|7 }Y nw | jjdkr|j| jj|
dd}zV	 ztjt |tj!|	dd}W nB tj"tj#fy } z0t$|tj"rdt%|vs|	t&|kr t'(dt&| d|	 d|	d  d |	d9 }	W Y d }~nd }~ww qW n tj"y } zz"t|| jj| jjd}t|}W d    n	1 s>w   Y  W n t)y^   t'*d| dt+| d|  |w |j dgkrw| jjrst| jjndg|_z
tjj|dd}W n' tj"y } zt'*d| dt+| d|  t)d| dd d }~ww || |fV  W Y d }~nd }~ww ||f| |fV  |d7 }qW d    n	1 sw   Y  q	d S )N)r#   errorsr   textF)preserve_indexrb    i @  strictTr"   )r`   )r'   )read_options
straddlingz	Batch of z* bytes couldn't be parsed with block_size=z. Retrying with block_size=r   .zFailed to load JSON from file 'z' with error z: z=Failed to convert pandas DataFrame to Arrow Table from file 'z<Failed to convert pandas DataFrame to Arrow Table from file    ),	enumerate	itertoolschainfrom_iterabler   r%   openr#   r$   r   readr   r	   StringIOr   columnstolistr!   listr[   Tablefrom_pandasr_   maxr(   r   r   UnsupportedOperationdecodeencodepajr   BytesIOReadOptionsArrowInvalidArrowNotImplementedErrorrN   r6   r]   r;   debugr=   errorrV   )r-   rF   file_idxrB   fdatasetdfrT   	batch_idxr'   r$   batcher   r   r   _generate_tablesb   s   

 
zJson._generate_tablesN)r0   r1   r2   r    BUILDER_CONFIG_CLASSr?   rS   r[   rt   r_   r   r   r   r   r   r:   :   s    r:   )r	   rk   dataclassesr   typingr   pandasr   r   r[   pyarrow.jsonr
   rz   r   datasets.configdatasets.tabler   datasets.utils.file_utilsr   utilslogging
get_loggerr0   r;   r   r   r   BuilderConfigr    ArrowBasedBuilderr:   r   r   r   r   <module>   s$    