o
    R
i[                     @   s  d dl Z d dlZzd dlZW n ey   dZY nw d dlZd dlZd dlm	Z	m
Z
 d dlmZ d dlmZ zd dlmZ d dlmZmZmZ W n eyS   dZY nw zd dlZd dlmZ d dlmZmZ W n eyu   d ZZY nw ejjZejjdd Zejjd	d
 Z ejjdd Z!ejjdd Z"ejjdd Z#ejjdd Z$ejjdd Z%ejjdd Z&ejjdd Z'ejjdd Z(ejjdd Z)ejjdd Z*ejjdd  Z+ejjd!d" Z,ejjd#d$ Z-ejjd%d& Z.ejjd'd( Z/ejjd)d* Z0ejjd+d, Z1ejjd-d. Z2ejjd/d0 Z3ejjd1d2 Z4ejjd3d4 Z5ejjd5d6 Z6ejjd7d8 Z7ejjej8d9g d:ej8d;d<d=gd>d? Z9ejjd@dA Z:ejjdBdC Z;dS )D    N)LocalFileSystemSubTreeFileSystem)guid)Version)_read_table_test_dataframe_write_table)_roundtrip_pandas_dataframealltypes_samplec                 C   s   t dd}| d }tj|}d|jjv sJ t|| t|j}d|v s(J t	
|d d}|d dd ddd	d
gksAJ d S )N'  sizepandas_roundtrip.parquet   pandasutf8index_columnsranger      )kindnamestartstopstep)r
   paTablefrom_pandasschemametadatar   pqread_metadatajsonloadsdecode)tempdirdffilenamearrow_tabler   js r(   U/home/ubuntu/.local/lib/python3.10/site-packages/pyarrow/tests/parquet/test_pandas.py#test_pandas_parquet_custom_metadata7   s   

r*   c              	   C   s   t t dt  t dt  t dt  g}ttj	dtj
dtj	dtjdg dd}tdd	gd
d gd d gd}t jj||dd}t jj||dd}|jj|jddr]J |j|jsfJ tj| d |d}|| || d S )Nintfloatstring   dtype)ABBAEDDAACDC)r+   r,   r-         g?F)r   preserve_indexT)check_metadatazmerged.parquet)r   )r   r   fieldint16float32r-   pd	DataFramenparangeuint8r   r   equalsr   ParquetWriterwrite_table)r#   r   df1df2table1table2writerr(   r(   r)   :test_merging_parquet_tables_with_different_pandas_metadataK   s,   
rH   c                 C   s   | d }t dd}ddddd|_tj|}d	|jjd
 v s!J t|| t	|j}t
|d
 d}d|v s<J |d |jksEJ d S )Nzmetadata_persistence.parquetr   r   zhalf-precisionzsingle precisionzdouble precisionz%Attributes Persistence Test DataFrame)float16r:   float64
desciptions
   attributesr   r   
attributes)r
   attrsr   r   r   r   r   r   r   r   r    r!   r"   )r#   r%   r$   tabler   r'   r(   r(   r)   $test_attributes_metadata_persistenceh   s   

rO   c                 C   s   t dd}tjjtt|j|jd d d ddgd|_| d }tj	|}|j
jd us.J t|| t|}| }t|| d S )N
   r   level_1level_2namesr   )r
   r;   
MultiIndexfrom_tupleslistzipcolumnsr   r   r   r   pandas_metadatar   r   read_pandas	to_pandastmassert_frame_equal)r#   r$   r%   r&   
table_readdf_readr(   r(   r)   %test_pandas_parquet_column_multiindex   s   


rb   c                 C   s   t dd}| d }tjj|dd}|jj}|d rJ |d s!J t|| t|}|jj}|d r5J |jj	}|jj	|ksAJ |
 }t|| d S )Nr   r   r   Fr6   r   rZ   )r
   r   r   r   r   r[   r   r   r\   r   r]   r^   r_   )r#   r$   r%   r&   r'   r`   r   ra   r(   r(   r)   <test_pandas_parquet_2_roundtrip_read_pandas_no_index_written   s   


rd   c                  C   X   t d} tj| }t }t||dd | }t|}t|	 }t
| | d S )Nr   2.6versionr   r   r   r   BufferOutputStreamr   getvalueBufferReaderr   r]   r^   r_   r$   r&   imosbufreaderra   r(   r(   r)   )test_pandas_parquet_native_file_roundtrip      
rq   c                  C   sj   t d} tj| }t }t||dd | }t|}tj	|ddgd
 }t| ddg | d S )Nr   rf   rg   stringsr?   rZ   )r   r   r   r   rj   r   rk   rl   r   r\   r]   r^   r_   rm   r(   r(   r)   test_read_pandas_column_subset   s   
ru   c                  C   re   )Nr   rf   rg   ri   rm   r(   r(   r)   #test_pandas_parquet_empty_roundtrip   rr   rv   c                  C   sJ   ddiddiddigdd} t j| d}tj|}t }t|| d S )	N	page_typer   record_typenon_consecutive_homer   1001)agg_col	uid_first)data)r;   r<   r   r   r   rj   r   )r}   r$   r&   rn   r(   r(   r)   !test_pandas_can_write_nested_data   s   r~   c           	      C   s   | d }d}t tj|tjdtj|tjdtj|tjdtj|dkg dd}t	j
|}|d}t||dd	 W d    n1 sHw   Y  t| }t|}| }t|| d S )
Nzpandas_pyfile_roundtrip.parquetr5   r/   r   )foobarNbazqux)int64r:   rJ   boolrs   wbrf   rg   )r;   r<   r=   r>   r   r:   rJ   randomrandnr   r   r   openr   ioBytesIO
read_bytesr   r]   r^   r_   )	r#   r%   r   r$   r&   fr}   r`   ra   r(   r(   r)   $test_pandas_parquet_pyfile_roundtrip   s"   r   c           
      C   s  d}t jd tt j|t jdt j|t jdt j|t jdt j|t j	dt j|t j
dt j|t j
dt j|t jdt j|t jdt j|t jdt j|t jdt j|dkd}| d }tj|}dD ]}t||d|d t|}| }t|| qgdD ]}t||d|d	 t|}| }t|| qd
D ]$}	|	dkrtjj|	sqt||d|	d t|}| }t|| qd S )Nr   r   r/   )r?   uint16uint32uint64int8r9   int32r   r:   rJ   r   r   )TFrf   )rh   use_dictionary)rh   write_statistics)NONESNAPPYGZIPLZ4ZSTDr   )rh   compression)r=   r   seedr;   r<   r>   r?   r   r   r   r9   r   r   r:   rJ   r   r   r   r   r   r   r]   r^   r_   libCodecis_available)
r#   r   r$   r%   r&   r   r`   ra   r   r   r(   r(   r)   )test_pandas_parquet_configuration_options   sV   r   c                  C   sH   t dd} tddt|  d| _d| j_t| ddi}t||  d S )Nd   r   r   rP   r   flavorspark)	r   r=   r>   lenindexr   r	   r^   r_   )r$   resultr(   r(   r)   +test_spark_flavor_preserves_pandas_metadata)  s
   
r   c                 C   s   t ddt ddit dt dt dt did}t| d }t j|ddjdd	d
}tj|}t|| t	|}|
 }t|| d S )Nz2017-06-30 01:31:00g*_c@z2017-06-30 01:32:00)closetimedata.parquetzdatetime64[us]r/   r   Fdrop)r;   	Timestampstrr<   	set_indexr   r   r   r   r   r]   r^   r_   )r#   r}   pathdfxtdfxr&   	result_dfr(   r(   r)    test_index_column_name_duplicate3  s$   


r   c           	      C   s   d}t t|}tjjg d|gddgd}tjd|i|d}tj|}| d }t	|| t
|}||s9J | }t|| d S )	Nr.   )r   r   r   foobarsome_numbersrT   numbers)r   zdup_multi_index_levels.parquet)rX   r   r;   rV   from_arraysr<   r   r   r   r   r   r@   r]   r^   r_   )	r#   num_rowsr   r   r$   rN   r%   result_tabler   r(   r(   r)    test_multiindex_duplicate_valuesQ  s   

r   c                 C   sB   d}t jt|dd ddd}t| d }| }t|| d S )N  carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39\s{2,}r   pythonsep	index_colheaderenginezv0.7.1.parquet)r;   read_csvr   r   r   r]   r^   r_   datadirexpected_stringexpectedrN   r   r(   r(   r)   &test_backwards_compatible_index_namingg  s   r   c                 C   sJ   d}t jt|dg dddd }t| d }| }t|| d S )Nr   r   cutcolorclarityr   r   r   zv0.7.1.all-named-index.parquet)	r;   r   r   r   
sort_indexr   r]   r^   r_   r   r(   r(   r)   1test_backwards_compatible_index_multi_level_named|  s   
r   c                 C   s\   d}t jt|dg dddd }|jg d|_t| d }| }t	
|| d S )	Nr   r   r   r   r   r   )r   Nr   zv0.7.1.some-named-index.parquet)r;   r   r   r   r   r   	set_namesr   r]   r^   r_   r   r(   r(   r)   6test_backwards_compatible_index_multi_level_some_named  s   r   c              	   C   s   t dt tjkrtd tg dg dtjddddd	}tjjg d	tjddddgd
d gd|_	| d }t
|}| }t|| t
|dgd}| }t||dg jdd d S )Nz2.2.0zRegression in pandas 2.2.0r      r.   )g?g?g333333?z
2017-01-01r.   zEurope/Brussels)periodstzabcr   rT   z'v0.7.1.column-metadata-handling.parquetr   rt   Tr   )r   r;   __version__pytestskipr<   
date_rangerV   r   r   r   r]   r^   r_   reset_index)r   r   r   rN   r   r(   r(   r)   2test_backwards_compatible_column_metadata_handling  s,   
r   c                  C   s   t jddgddggddgd} | d d| d< | dg} tj| }t }t	|| t
|  }t|jt js@J |j| jsIJ d S )	Nr   r   r   dc1c2rt   category)r;   r<   astyper   r   r   r   rj   r   rB   r\   rk   r]   
isinstancer   CategoricalIndexr@   )r$   rN   bosref_dfr(   r(   r)   )test_categorical_index_survives_roundtrip  s   r   c                  C   sh   t dt jg dg dddi} tj| }t }t|| |	 }t
| }t||  d S )Nr   )r   r   r   r   )r   r   r   T)
categoriesordered)r;   r<   Categoricalr   r   r   rj   r   rB   rk   r\   r]   r^   r_   )r$   rN   r   contentsr   r(   r(   r)   )test_categorical_order_survives_roundtrip  s   

r   c                  C   s   t d gd dgd d} | ddd}tj| }tj|}t }tj||ddd t	|
 }|d |d sAJ |d	 |d	 sLJ d S )
Nr   g      ?)colr+   r   rf   rP   )rh   
chunk_sizer   r   )r;   r<   r   r   r   r   rj   r   rB   
read_tablerk   r@   )r$   df_categoryrN   	table_catro   r   r(   r(   r)   *test_pandas_categorical_na_type_row_groups  s   r   c                  C   s   t jg ddd} g d}tdtjj| |di}t }t	t
|| t|  }|jjdks8J |jjj|k sCJ t|| d S )N)r   r   r   r   r   rQ   r   r   r/   )r   r   r   x)r   r   )r=   arrayr;   r<   r   
from_codesr   rj   r   rB   rN   r   rk   r]   r   r0   catr   allr^   r_   )codesr   r$   ro   r   r(   r(   r)   !test_pandas_categorical_roundtrip  s   
r   c                 C   s   t tjt dk rtd tjdg didd}|d}tdg di}|d}t|d 	 t|d 	 ks@J t|d j
jj	 t|d j
jj	 ksZJ t| d }tt|| t| }t|| d S )	Nz1.3.0z:PyArrow backed string data type introduced in pandas 1.3.0r   )r   r   r   zstring[pyarrow]r/   r   zcat.parquet)r   r;   r   r   r   r<   r   r   r   	to_pylistr   r   valuesr   r   rB   rN   r   r]   r^   r_   )r#   rC   rD   r   r   r(   r(   r)   )test_categories_with_string_pyarrow_dtype  s    


(r   c                 C   s   t dg dd}|d d|d< t|}tj|t| d dgd tt| d 	 }t
|dg |dg  t|t| d	  tt| d	 	 }t
|dg |dg  t|t| d
  tt| d
 	 }t
|dg |dg  d S )Nr   r   partr   r   Int64case1r   partition_colscase2r   )r;   r<   r   r   rN   r   write_to_datasetr   r   r]   r^   r_   rB   )r#   r$   rN   r   r(   r(   r)   5test_write_to_dataset_pandas_preserve_extensiondtypes$  s   
r  c                 C   s  t g dg dd}t jg ddd|_t|}|ddg  }|d d	|d< tj	|t
| d
 dgd tt
| d
  }t|| t	|t
| d  tt
| d  }t|| t|t
| d  tt
| d  }t|| d S )N)r   r   r   r   r   r   idxr   r   r   r   r  r  r  r   )r;   r<   Indexr   r   rN   copyr   r   r  r   r   r]   r^   r_   rB   )r#   r$   rN   df_catr   r(   r(   r)   +test_write_to_dataset_pandas_preserve_index9  s    
r  r6   )TFNmetadata_fname	_metadata_common_metadatac                    sL  d}d}| t   }|  g }g }g }t|D ]L}	t||	d}
tjtj|	| |	d | dddd|
_||	 d }t	j
j|
|d	}|d }|jjd u sOJ t|| || ||
 || qt	j
j|
|d	}t|j||  t|}d
dg |j d }t fdd|D }|dur|
jjnd |j_t|| d S )Nr5   )r   r   r   r/   r   r  z.parquetrc   r?   rs   rt   c                    s   g | ]}|  qS r(   r(   ).0r   rt   r(   r)   
<listcomp>~  s    z<test_dataset_read_pandas_common_metadata.<locals>.<listcomp>F)r   mkdirr   r   r;   r	  r=   r>   r   r   r   r   replace_schema_metadatar   r   r   appendr   write_metadataParquetDatasetr\   r]   concatr   r^   r_   )r#   r6   r  nfilesr   dirpath	test_dataframespathsir$   r   rN   table_for_metadatadatasetr   r   r(   rt   r)   (test_dataset_read_pandas_common_metadataR  s>   





r   c                 C   sV   t dg di}| d }t|| tjdtt| t d}|t	
|s)J d S )Nr   r   r   )
filesystem)r;   r<   r   r   r\   r   r   r   r@   r   rN   )r#   r$   r%   r   r(   r(   r)   %test_read_pandas_passthrough_keywords  s   
r"  c                 C   s   t t ddgddggt ddgd}| d }tt t }ttd	|td
t g}tj	||}t
|| t| }t|| d S )N)id	something)value2else)r#  
something2)valueelse2r   r   )col1col2r   r*  r+  )r;   r<   Seriesr   map_r-   r   r8   r   r   r   r   r\   r]   r^   r_   )r#   r$   r%   udtr   r&   r   r(   r(   r)   test_read_pandas_map_fields  s   "
r/  )<r   r    numpyr=   ImportErrorr   pyarrowr   
pyarrow.fsr   r   pyarrow.utilr   pyarrow.vendored.versionr   pyarrow.parquetparquetr   pyarrow.tests.parquet.commonr   r   r   pandasr;   pandas.testingtestingr^   r	   r
   mark
pytestmarkr*   rH   rO   rb   rd   rq   ru   rv   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  parametrizer   r"  r/  r(   r(   r(   r)   <module>   s   










,
	












/
