o
    'i	                     @   s  d dl Z d dlZd dlmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
Z
d dlZd dlmZ d dlmZ d dlmZmZmZ zd dlmZ d dlmZmZ W n ey[   dZY nw zd dlZd dlmZ d d	lm Z  d d
lm!Z! W n ey   d ZZY nw zd dl"Z#W n ey   dZ#Y nw e
j$jZ%dd Z&dd Z'e
j$j"dd Z(e
j$j"dd Z)e
j$jdd Z*e
j$jdd Z+e
j$jdd Z,dd Z-dd Z.dd Z/e
j$j0dd  Z1e
j$jd!d" Z2e
j$jd#d$ Z3d%d& Z4e
j$jd'd( Z5d)d* Z6e
j$7d+de8 ge
j$7d,d-d.d/ Z9d0d1 Z:d2d3 Z;d4d5 Z<d6d7 Z=d8d9 Z>d:d; Z?d<d= Z@d>d? ZAe
j$jd@dA ZBe
j$jdBdC ZCe
j$jdDdE ZDdFdG ZEdHdI ZFe
j$jdJdK ZGdLdM ZHe
j$je
j$jIe
j$JdNe
j$JdOdPdQ ZKe
j$7dRdSdT dUdT dVdT dWdT ge
j$7dXdYdZgd[d\ ZLd]d^ ZMd_d` ZNdadb ZOdcdd ZPdedf ZQdgdh ZRe
j$jSdidj ZTe
j$7dkdldmdnggdodp ZUdS )q    N)OrderedDict)copytree)Decimal)fs)util)_check_roundtrip_roundtrip_table_test_table)_read_table_write_table)dataframe_with_lists)alltypes_samplec                 C   s   t dg di}tjtdd t|| d dd W d    n1 s$w   Y  tjtdd t|| d dd	 W d    d S 1 sEw   Y  d S )
Na         z"Unsupported Parquet format versionmatchztest_version.parquetz2.2versionz%Unsupported Parquet data page version)data_page_version)patablepytestraises
ValueErrorr   )tempdirr    r   ^/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/pyarrow/tests/parquet/test_basic.pytest_parquet_invalid_version<   s   
"r    c                  C   sH   t g dd } t jj| gdgd}ddg}|D ]}t||d qd S )Nr   i f0namesi   i   )data_page_size)r   arrayTablefrom_arraysr   )arrt
page_sizestarget_page_sizer   r   r   test_set_data_page_sizeF   s   r,   c                  C   s   t d} t| dddd d S )Nd   
   r   2.4)r$   write_batch_sizer   )r	   r   r   r   r   r   test_set_write_batch_sizeP   s   
r2   c                  C   sX   t d} t| dddd tt t| dddd W d    d S 1 s%w   Y  d S )Nr-   r   r.   r/   )dictionary_pagesize_limitr$   r   r   )r	   r   r   r   	TypeErrorr1   r   r   r   "test_set_dictionary_pagesize_limitY   s   "r5   c               	   C   s   g } t jtdd}| t j|gd  t \}}t j|}| t j|gd  dD ]}dD ]}| D ]
}t|d||d q8q4q0d S )Nr.   sizer   )z1.0z2.0)TF2.6)r   r   use_dictionary)	r   RecordBatchfrom_pandasr   appendr&   from_batchesr   r   )tablesbatchdf_r   r9   r   r   r   r   test_chunked_table_writee   s"   
rB   c                 C   s   t dd}tj|}t|ddidd t| d }t|d}t||dd	 W d    n1 s1w   Y  tj	|dd
}|
|sDJ d S )Nr.   r6   
memory_mapTr8   read_table_kwargsr   tmp_filewbr   )rC   r   r   r&   r;   r   stropenr   pqread_pandasequalsr   r@   r   filenamef
table_readr   r   r   test_memory_mapx      

rR   c                 C   s   t dd}tj|}t|ddidd t| d }t|d}t||dd	 W d    n1 s1w   Y  tj	|d
d}|
|sDJ d S )Nr.   r6   buffer_sizei  r8   rD   rF   rG   r   i   )rT   rH   rN   r   r   r   test_enable_buffered_stream   rS   rU   c                 C   sj   t jt dggdg}d}| | }| rJ t|t| | s&J tt|}||s3J d S )N*   intsz	foo # bar)	r   r&   r'   r%   existsr   rI   r
   rM   )r   r   rO   pathrQ   r   r   r   test_special_chars_filename   s   rZ   c                   C   sv   t jtdd td  W d    n1 sw   Y  t jtdd td  W d    d S 1 s4w   Y  d S )NNoner   )r   r   r4   rK   
read_tableParquetFiler   r   r   r   test_invalid_source   s   "r^   c              	   C   s|  ddl m} G dd d}| d }tdg di}t|| |jd|d	 tjtd
d t	j
|dgd W d    n1 sAw   Y  tjtdd t	j
|ddgd W d    n1 saw   Y  tjtdd t	j
||jd W d    n1 sw   Y  tjtdd t	
|  W d    n1 sw   Y  t	
|}||ksJ W d    d S 1 sw   Y  d S )Nr   )mockc                   @   s   e Zd Zdd ZdS )z;test_read_table_without_dataset.<locals>.MockParquetDatasetc                 _      t d)NMockParquetDataset)ImportError)selfargskwargsr   r   r   __init__      zDtest_read_table_without_dataset.<locals>.MockParquetDataset.__init__N)__name__
__module____qualname__rf   r   r   r   r   ra      s    ra   test.parquetr   r   z#pyarrow.parquet.core.ParquetDataset)newzthe 'filters' keywordr   )integer=r   )filterszthe 'partitioning' keywordweekcolor)partitioningzthe 'schema' argumentschemathe 'source' argument)unittestr_   r   r   r   patchr   r   r   rK   r\   rt   )r   r_   ra   rY   r   resultr   r   r   test_read_table_without_dataset   s*   

"ry   c                  C   s*   t jttdgdgd} t| dd d S )Ni@  r!   r"   r   )row_group_size)r   r   listranger   )r)   r   r   r   (test_file_with_over_int16_max_row_groups   s   r}   c                  C   s   t dd} tj| }tjjdd | D |jjd}|jdj	t
 ks)J |jdj	tt
 ks9J t|dd	 d S )
Nr.   r6   c                 S   s   g | ]}| d dd  qS )r   N)chunk).0colr   r   r   
<listcomp>   s    z.test_empty_table_roundtrip.<locals>.<listcomp>r"   null	null_listr8   r   )r   r   r&   r;   r'   itercolumnsrt   r#   fieldtyper   list_r   )r@   r   r   r   r   test_empty_table_roundtrip   s   
 
r   c                  C   s$   t  } tjj| dd}t| d S )NF)preserve_index)pd	DataFramer   r&   r;   r   )r@   emptyr   r   r   test_empty_table_no_columns   s   r   c                     sp   t t tt d g t dddgg}  fdd| D } fdd|D }tj|t }t| d S )N)int32list_stringr   )Gc                    s$   g | ]}t j|t  d  qS )r   )r   r%   structflattenr   r?   colsr   r   r      s    zEtest_write_nested_zero_length_array_chunk_failure.<locals>.<listcomp>c                    s"   g | ]}t jj|t  d qS )rs   )r   r:   r'   rt   r   r   r   r   r      s    )	r   r   r   r   stringr&   r=   rt   r   )data	my_arrays
my_batchestblr   r   r   1test_write_nested_zero_length_array_chunk_failure   s   

r   c                 C   s   | d }t dtjdtjdi}t|| t|}| }t	|| t
| d }t dtjdtjdi}t|| t|}| }t	|| d S )Nzzzz.parquetxr.   dtype)r   r   nparangeint64r   r
   	to_pandastmassert_frame_equalrI   )r   rY   r@   rQ   df_readr   r   r   test_multiple_path_types  s   

r   c                 C   s   | d }t dg di}t|| t|}t|}||s"J tt	 t|t
 d W d    d S 1 s;w   Y  d S )Nrk   r   r   
filesystem)r   r   r   r   FSProtocolClassr
   rM   r   r   r4   r   
FileSystem)r   rY   r   fs_protocol_objrx   r   r   r   test_fspath  s   

"r   r   name)data.parquetu   例.parquetc                 C   s   t dg di}| | }t|t| t|  tj||d}W d    n1 s,w   Y  ||s8J |	  |
 rBJ t|  tj|||d W d    n1 sZw   Y  t|}||skJ d S )Nr   r   r   )r   r   rK   write_tablerI   r   
change_cwdr\   rM   unlinkrX   )r   r   r   r   rY   rx   r   r   r   test_relative_paths$  s   
r   c                   C   s:   t t td W d    d S 1 sw   Y  d S )Nzi-am-not-existing.parquet)r   r   FileNotFoundErrorrK   r\   r   r   r   r   test_read_non_existing_file=  s   "r   c                  C   sT   G dd dt j} tjtdd t| d W d    d S 1 s#w   Y  d S )Nc                   @   s   e Zd Zdd Zdd ZdS )z3test_file_error_python_exception.<locals>.BogusFilec                 W   r`   NzorglubZeroDivisionErrorrc   rd   r   r   r   readE  rg   z8test_file_error_python_exception.<locals>.BogusFile.readc                 W   r`   r   r   r   r   r   r   seekH  rg   z8test_file_error_python_exception.<locals>.BogusFile.seekN)rh   ri   rj   r   r   r   r   r   r   	BogusFileD  s    r   r   r       )ioBytesIOr   r   r   rK   r\   )r   r   r   r    test_file_error_python_exceptionC  s   "r   c                 C   s   t dg di}t|t| d  tt| d d}t|}W d    n1 s,w   Y  ||s8J tt| d d}tt |}W d    n1 sTw   Y  ||s`J d S )Nr   r   r   rb)	r   r   rK   r   rI   rJ   r\   rM   
PythonFile)r   r   rP   rx   r   r   r   test_parquet_read_from_bufferP  s   r   c                  C   s,  t ttttd} t ttttd}t ddgd }| | g}t jj|ddgd}t	||dddd	 t	||ddgdgd	 t	||dddgddgd	 t jj| | ||gg d
d}t	||ddgddgd t jj|gdgd}t
jtdd t	||ddd W d    d S 1 sw   Y  d S )Nr-   TF2   r   br"   gzip)expectedcompressionr9   use_byte_stream_splitr   r   cdr   r   )r   r9   r   tmpBYTE_STREAM_SPLIT only supportsr   )r   r   r9   )r   r%   r{   mapfloatr|   intr&   r'   r   r   r   IOError)	arr_floatarr_intarr_bool
data_floatr   mixed_tabler   r   r   test_byte_stream_split^  s:   "r   c              	   C   sX  t jttttdt ddd}t jttttdt ddd}t jttttdt ddd}t dd	gd
 }|||g}t jj|g dd}t	||dd	dd t
j| d}tj||dd	dd t|}|jd}	|jd}
|	jdksJ |
jdksJ t	||dd	ddddd t jj||||gg dd}t	||d	dd d S )Nr-      r   r      	      TFr   r   r   r   r"   r   )r   r   r9   store_decimal_as_integerrk   )r   r9   r   r   r   INT32INT64DELTA_BINARY_PACKEDr   r   )r   r   r9   r   column_encodingr   )r   r9   r   )r   r%   r{   r   r   r|   
decimal128r&   r'   r   osrY   joinrK   r   r]   rt   columnphysical_type)r   arr_decimal_1_9arr_decimal_10_18arr_decimal_gt18r   data_decimalr   pqtestfile_path
pqtestfilepqcol_decimal_1_9pqcol_decimal_10_18r   r   r   r   test_store_decimal_as_integer  s^   






r   c               
   C   s  t ttttd} t ttttd}t jdd tdD t  d}t jdd tdD t dd}t g dd }t jj	| ||||gg d	d
}t
||ddddddd t
||ddd t
||dddddd t
||dddddd t
||ddddddd t
||dddid tjtdd t
||dddddd W d    n1 sw   Y  tjtdd t
||dddddd W d    n1 sw   Y  tjtdd t
||ddd W d    n1 sw   Y  tjtdd t
||dddid W d    n	1 sw   Y  tt t
||dgddid W d    n	1 s8w   Y  tt t
||ddid  W d    n	1 sWw   Y  tt t
||ddgddddd! W d    n	1 s{w   Y  tt t
||dd"ddddd! W d    n	1 sw   Y  tt t
||dd"d W d    d S 1 sw   Y  d S )#Nr-   c                 S   s   g | ]}t |qS r   )rI   r   r   r   r   r   r     s    z(test_column_encoding.<locals>.<listcomp>r   c                 S   s   g | ]	}t |d qS )r.   )rI   zfillr   r   r   r   r     s    r.   )FTFF   )r   r   r   r   er"   FBYTE_STREAM_SPLITPLAINr   )r   r9   r   r   r   DELTA_LENGTH_BYTE_ARRAYDELTA_BYTE_ARRAYr   RLEr   r   )r   r   r   z)DELTA_BINARY_PACKED encoder only supportsz+'RLE_DICTIONARY' is already used by defaultRLE_DICTIONARYz/Unsupported column encoding: 'MADE_UP_ENCODING'r   MADE_UP_ENCODINGr   )r   r   )r   r9   r   r   T)r   r%   r{   r   r   r|   r   binaryr&   r'   r   r   r   r   OSErrorr   r4   )r   r   arr_binarr_flbar   r   r   r   r   test_column_encoding  s    


$r  c               	   C   s   t ttttd} | | g}t jj|ddgd}t||ddd t||ddd t||dd	d
d t||dddd
d t||ddd t||ddd g d}t	
 }|D ]#\}}tttf t||||d W d    n1 sww   Y  qYd S )N  r   r   r"   r   r   )r   r   compression_levelr   snappyr   )r   r   r   r   lz4r   ))r     )r   i)r[   i  )lzo   )r   r  )r   r%   r{   r   r   r|   r&   r'   r   r   r   r   r   r   r   r   )r(   r   r   invalid_combinationsbufcodeclevelr   r   r   test_compression_level:  s>   	r  c                  C   sP   t g d} d}t j| g|g}t|ddid}d}|jd j|ks&J d S )N)r   r   r   r   r  zprohib; ,	{}flavorspark)write_table_kwargsprohib______r   )r   r%   r&   r'   r   rt   r   )a0r   r   rx   expected_namer   r   r    test_sanitized_spark_field_namesg  s   r  c                  C   sl   t dd} tj| }t }t||ddd |d t|dd}|d t|d	d}|	|s4J d S )
Ni'  r6   SNAPPYr8   )r   r   r   T)use_threadsF)
r   r   r&   r;   r   r   r   r   r
   rM   )r@   r   r  table1table2r   r   r   test_multithreaded_readr  s   


r  c                  C   s   t jtdgg dd} tj|  }t	 }t
||dd |d t|}||s0J tt t
||dd W d    d S 1 sHw   Y  d S )Nr  )ABCD)columns)
chunk_sizer   )r   r   r   r   r   r&   r;   reset_indexr   r   r   r   r
   rM   r   r   r   )r   r   r  rx   r   r   r   test_min_chunksize  s   
"r#  c                 C   s   t tdttddtdddtjddd	d
g dt tdt jdddt jddddt jddddd	}t	j
|}| d }z	t||dd W n
 t	jyX   Y nw | r_J d S )Nabcr   r  r      u1      @      @float64r   TFT20130101periodsz
US/Eastern)r-  tzns)r-  freq)	r   r   r   r   r   rP   ghirF   r/   r   )r   r   r{   r|   r   r   astypeCategorical
date_ranger   r&   r;   r   ArrowExceptionrX   )r   r@   pdfrO   r   r   r   (test_write_error_deletes_incomplete_file  s(   
r9  c              
   C   sN   d}zt | W d S  ty& } z||jd v sJ W Y d }~d S d }~ww )Nznonexistent-file.parquetr   )rK   r\   	Exceptionrd   )r   rY   r   r   r   r   test_read_non_existent_file  s    r;  c                 C   sH   t   t jdd t| d  W d    d S 1 sw   Y  d S )Nerror)actionzv0.7.1.parquet)warningscatch_warningssimplefilterrK   r\   )datadirr   r   r   test_read_table_doesnt_warn  s   
"rB  c                  C   s`   t jt ddggdg} t }tj| |dd |d t	|}t
| |   d S )Nr$  defsome_colr   r   r   )r   r&   r'   r%   r   r   rK   r   r   r\   r   r   r   )r   rP   	roundtripr   r   r   test_zlib_compression_bug  s   

rG  c              	   C   s   t | d }tjtjdd" t|d}W d    n1 sw   Y  t| W d    n1 s3w   Y  tjtjdd( t|d}|d W d    n1 sVw   Y  t| W d    d S 1 skw   Y  d S )Nrk   zsize is 0 bytesr   rG   zsize is 4 bytess   ffff)	rI   r   r   r   ArrowInvalidrJ   rK   r\   write)r   rY   rP   r   r   r   test_parquet_file_too_small  s   "rJ  zignore:RangeIndex:FutureWarningz.ignore:tostring:DeprecationWarning:fastparquetc           	      C   s   t d}ttdttddtjddddg d	tjd
ddt	g dd}t
|}t| d }tj||d d ||}| }t|| t| d }||| t|}|d t|d< t| | d S )Nfastparquetr$  r   r  r'  r(  r)  r   r*  r+  r   r,  )r   r   r   )r   r   r   r   r   rP   zcross_compat_arrow.parquetrE  z cross_compat_fastparquet.parquetrP   )r   importorskipr   r   r{   r|   r   r   r6  r5  r   r   rI   rK   r   r]   r   r   r   rI  rL   r4  object)	r   fpr@   r   
file_arrowfp_filedf_fpfile_fastparquettable_fpr   r   r   $test_fastparquet_cross_compatibility  s*   



rT  array_factoryc                   C      t dd gd S Nr   r.   r   r%   r   r   r   r   <lambda>       rY  c                   C      t dd gd  S rW  r   r%   dictionary_encoder   r   r   r   rY        c                   C   rV  N r.   rX  r   r   r   r   rY    rZ  c                   C   r[  r_  r\  r   r   r   r   rY    r^  read_dictionaryFTc                 C   s   t jd|  i}t }tj||dd |d |rdgnd }tj|d|d}|j	D ]}|j
\}| d }| |jd ksCJ q,d S )	Nr   T)r9   r   F)r  ra  r       )r   r&   from_pydictr   r   rK   r   r   r\   r  chunksbuffers
to_pybytesr7   )rU  ra  
orig_tablebior   r   r~   r  r   r   r   test_buffer_contents  s   

ri  c                 C   sP   t jt tdgdgd}| d }tj||dd t|}||s&J d S )Nr  rW   r"   zarrow-10480.pyarrow.gzGZIPrE  )r   r   r%   r|   rK   r   r\   rM   )r   r   rY   rx   r   r   r   "test_parquet_compression_roundtrip  s
   
rk  c                 C   s   t jt jg ddgdg}| d }d}t||j}t|D ]}|| q W d    n1 s2w   Y  t	|}|j
j|ksDJ t|D ]}|||sTJ qHd S )Nr   r   r!   zempty_row_groups.parquetr   )r   r&   r'   r%   rK   ParquetWriterrt   r|   r   r]   metadatanum_row_groupsread_row_grouprM   )r   r   rY   
num_groupswriterr3  readerr   r   r   test_empty_row_groups&  s   
rs  c                 C   sV   d gd }| dg tj|gdg}| d }t|| t|}||ks)J d S )Ni   r   r   zarrow-11607.parquet)r<   r   r&   r'   rK   r   r\   )r   r   r   rY   r  r   r   r   test_reads_over_batch8  s   

rt  c                 C   s   | d }|j dd tjg dg dgddgd}t||d	  tjg d
g dgddgd}t||d  tt|}tjg dg dgddgd}||ksTJ d S )N dataset_column_order_permutationT)exist_okr   )皙?皙?333333?r   r   r"   zdata1.parquet)皙?      ?333333?)r  r   r%  zdata2.parquet)r   r   r   r  r   r%  )rw  rx  ry  rz  r{  r|  )mkdirr   r   rK   r   r\   rI   )r   casedata1data2r   r  r   r   r    test_permutation_of_column_orderF  s   
r  c                 C   s  | d }t ttd}d}t j|g| dd t|D d}t|| tjt	dd tj
|d	| d
 W d    n1 sAw   Y  tjt	dd tj
||d W d    n1 s_w   Y  tj
|d| d
}||kssJ tj
|d| d}||ksJ t
|}||ksJ d S )Nzlargethrift.parquetr.   r  c                 S   s   g | ]}d | qS )some_long_column_name_r   )r   r3  r   r   r   r   `  r^  z+test_thrift_size_limits.<locals>.<listcomp>r"   z1Couldn't deserialize thrift:.*Exceeded size limitr   r   )thrift_string_size_limit)thrift_container_size_limitr-   r   )r   r%   r{   r|   r   rK   r   r   r   r   r\   )r   rY   r%   num_colsr   gotr   r   r   test_thrift_size_limitsY  s4   
r  c           
      C   s  | d }t dg di}tj||dd tj|dd}||ks"J t| }|d |d ks2J |d |d |d< |d< | d	 }|| tj|d
d}||ksUJ |t dg diksbJ tj	t
dd tj|dd}W d   n1 s{w   Y  tj|d
d}| }	|	|ksJ |	t dg diksJ tj|dd}tj	t
dd | }W d   dS 1 sw   Y  dS )zUCheck that checksum verification works for datasets created with
    pq.write_table()zcorrect.parquetr   r   r   r   r  Twrite_page_checksumpage_checksum_verification   $   zcorrupted.parquetFr   r   r   r  CRC checksum verificationr   N)r   r   rK   r   r\   	bytearray
read_byteswrite_bytesr   r   r   r]   r   )
r   original_path
table_origtable_checkbin_datacorrupted_pathtable_corruptrA   corrupted_pq_filetable_corrupt2r   r   r   +test_page_checksum_verification_write_tablet  s<   

"r  c                 C   s>  t dg di}| d }tj||dd t| }t|dks#J |d }tj|dd}||ks4J t|	 }|d	 |d
 ksDJ |d
 |d	 |d	< |d
< | d }t
|| ||j }|| tj|dd}	|	|ksqJ |	t dg diks~J tjtdd tj|dd}
W d   dS 1 sw   Y  dS )zXCheck that checksum verification works for datasets created with
    pq.write_to_datasetr   r  correct_dirTr  r   r   r  r  r  corrupted_dirFr  r  r   N)r   r   rK   write_to_datasetr{   iterdirlenr\   r  r  r   r   r  r   r   r   )r   r  original_dir_pathoriginal_file_path_listr  r  r  corrupted_dir_pathcorrupted_file_pathr  rA   r   r   r   test_checksum_write_to_dataset  s4   


"r  sourcez/tmp/z/tmp/file1.parquetz/tmp/file2.parquetc                 C   sP   |  tjdd  tjtdd tj|d W d    d S 1 s!w   Y  d S )Nzpyarrow.datasetru   r   )r  )setitemsysmodulesr   r   r   rK   r\   )monkeypatchr  r   r   r   9test_read_table_raises_value_error_when_ds_is_unavailable  s   "r  )Vr   r  collectionsr   r   r>  shutilr   decimalr   r   pyarrowr   r   pyarrow.testsr   pyarrow.tests.parquet.commonr   r   r	   pyarrow.parquetparquetrK   r
   r   rb   pandasr   pandas.testingtestingr   pyarrow.tests.pandas_examplesr   r   numpyr   mark
pytestmarkr    r,   r2   r5   rB   rR   rU   rZ   r^   ry   slowr}   r   r   r   r   r   parametrizeLocalFileSystemr   r   r   r   r   r   r  r  r  r  r#  r9  r;  rB  rG  rJ  rK  filterwarningsrT  ri  rk  rs  rt  r  r  r  datasetr  r  r   r   r   r   <module>   s   












&6 -





$:
4