o
    $ikQ                     @   s  d dl Z d dlZd dlmZmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d d	lm Z  d d
l!m"Z"m#Z#m$Z$ d dl%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 zd dl6Z6W n e7y   dZ6Y nw erd dl8Z8d dl9m:Z: edZ;e <e=Z>edZ?dZ@edeAe2d ZBde3defddZCde3defddZDG dd de	ZEG dd de'ZFdd d!eAde
eA fd"d#ZGG d$d% d%e&ZHG d&d' d'e,ZIdS )(    N)TYPE_CHECKINGAnyCallableDictIteratorListMappingOptionalTupleTypeVarUnion)parse)get_pyarrow_version)env_integer)TENSOR_COLUMN_NAME)convert_to_pyarrow_arraypyarrow_table_from_pydict)transform_polarstransform_pyarrow)shuffle)row_reprrow_repr_prettyrow_str)TableBlockAccessorTableBlockBuilder)BlockBlockAccessorBlockColumnBlockColumnAccessorBlockExecStatsBlockMetadataWithSchema	BlockTypeU)DEFAULT_TARGET_MAX_BLOCK_SIZEDataContext)Expr)SortKeyTz13.0.0
__bsp_stub#RAY_DATA_ARROW_MAX_CHUNK_SIZE_BYTES    contextreturnc                 C      | j s| jr	tjS tjS N)
use_polarsuse_polars_sortr   sortr   r+    r3   [/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/data/_internal/arrow_block.pyget_sort_transformJ      r5   c                 C   r-   r.   )r/   r0   r   concat_and_sortr   r2   r3   r3   r4   get_concat_and_sort_transformQ   r6   r8   c                   @   s   e Zd ZdZdefddZdeeee f defddZ	de
fd	d
Zdd Zdeeef fddZdd Zdd Zdd ZdS )ArrowRowzA
    Row of a tabular Dataset backed by a Arrow Table block.
    rowc                 C   s
   || _ d S r.   )_row)selfr:   r3   r3   r4   __init__]      
zArrowRow.__init__keyr,   c                    sj   ddl m} | dtt dtf fdd}t|t}|r!|gn|}||}|d u r-d S |r3|d S |S )Nr   ) get_arrow_extension_tensor_typeskeysr,   c                    s    j j}t|| d jrt fdd| D S  j | }t|dkr(d S dd |jD }z
tdd |D W S  t	yE   | Y S w )Nr   c                    s   g | ]}t j j|d dqS )r   )col_namerow_idx)ArrowBlockAccessor_build_tensor_rowr;   ).0r?   r<   r3   r4   
<listcomp>j   s    z:ArrowRow.__getitem__.<locals>.get_item.<locals>.<listcomp>c                 S   s   g | ]}|d  qS r   r3   rF   colr3   r3   r4   rH   v       c                 S   s   g | ]}|  qS r3   as_py)rF   itemr3   r3   r4   rH   y   rL   )
r;   schema
isinstancefieldtypetupleselectlencolumnsAttributeError)rA   rP   tableitemsr<   tensor_arrow_extension_typesr3   r4   get_iteme   s    
	z&ArrowRow.__getitem__.<locals>.get_item)ray.data.extensionsr@   r   strr   rQ   )r<   r?   r@   r]   is_single_itemrA   rZ   r3   r[   r4   __getitem__`   s   
zArrowRow.__getitem__c                 c   s    | j jD ]}|V  qd S r.   )r;   column_names)r<   kr3   r3   r4   __iter__   s   zArrowRow.__iter__c                 C      | j jS r.   )r;   num_columnsrG   r3   r3   r4   __len__      zArrowRow.__len__c                 C   s   t |  S r.   )dictrZ   rG   r3   r3   r4   	as_pydict      zArrowRow.as_pydictc                 C      t | S r.   )r   rG   r3   r3   r4   __str__   rh   zArrowRow.__str__c                 C   rl   r.   )r   rG   r3   r3   r4   __repr__   rh   zArrowRow.__repr__c                 C   s   t | ||S r.   )r   )r<   pcycler3   r3   r4   _repr_pretty_   rk   zArrowRow._repr_pretty_N)__name__
__module____qualname____doc__r   r=   r   r_   r   ra   r   rd   rg   r   rj   rm   rn   rq   r3   r3   r3   r4   r9   X   s    ,r9   c                       s   e Zd Z fddZedeeee f de	fddZ
edee	 de	fdd	Zedefd
dZedddZdefddZ  ZS )ArrowBlockBuilderc                    s&   t d u rtdt t jtf d S Nz+Run `pip install pyarrow` for Arrow support)pyarrowImportErrorsuperr=   TablebytesrG   	__class__r3   r4   r=      s   zArrowBlockBuilder.__init__rW   r,   c                 C   s   t dd |  D S )Nc                 S   s   i | ]
\}}|t ||qS r3   )r   )rF   column_namecolumn_valuesr3   r3   r4   
<dictcomp>   s    
z8ArrowBlockBuilder._table_from_pydict.<locals>.<dictcomp>)r   rZ   )rW   r3   r3   r4   _table_from_pydict   s
   z$ArrowBlockBuilder._table_from_pydicttablesc                 C   s"   t | dkrtj| ddS | d S )N   Tpromote_typesr   )rV   r   concat)r   r3   r3   r4   _combine_tables   s   z!ArrowBlockBuilder._combine_tablesc                   C   s   dS )NFr3   r3   r3   r3   r4   _concat_would_copy   s   z$ArrowBlockBuilder._concat_would_copypyarrow.Tablec                   C   s   t i S r.   )r   r3   r3   r3   r4   _empty_table      zArrowBlockBuilder._empty_tablec                 C      t jS r.   r!   ARROWrG   r3   r3   r4   
block_type      zArrowBlockBuilder.block_typer,   r   )rr   rs   rt   r=   staticmethodr   r_   r   r   r   r   r   boolr   r   r!   r   __classcell__r3   r3   r}   r4   rv      s     rv   rY   r   max_chunk_size_bytesc                 C   s,   | j dkrdS | j | j }tdt|| S )aP  
    Calculate the max chunk size in rows for Arrow to Batches conversion in
    ArrowBlockAccessor.iter_rows().
    Args:
        table: The pyarrow table to calculate the max chunk size for.
        max_chunk_size_bytes: The max chunk size in bytes.
    Returns:
        The max chunk size in rows, or None if the table is empty.
    r   Nr   )nbytesnum_rowsmaxint)rY   r   avg_row_sizer3   r3   r4   _get_max_chunk_size   s   
r   c                
       s  e Zd ZeZda fddZdedefddZdee	 fd	d
Z
de	dedefddZededd fddZeefdedede	dejfddZdbdedededdfddZdee ddfddZdcd!d"Zddd$d%Z	&ded'eee	ee	 f  deejee	ejf f fd(d)Zdfd*d+Zdefd,d-Z defd.d/Z!d0e"dd1fd2d3Z#d4e	d5e$ddfd6d7Z%ede&fd8d9Z'edfd:d;Z(d<eee d=d>f ddfd?d@Z)d'ee	 defdAdBZ*d'ee	 ddfdCdDZ+dEee	e	f ddfdFdGZ,dgdIdJZ-dKedLdMddfdNdOZ.dLdMdefdPdQZ/dRee0 dLdMded1 fdSdTZ1edUee dLdMde2ee3f fdVdWZ4de5fdXdYZ6dZede7ee8ejf  fd[d\Z9dhd_d`Z:  Z;S )irD   rY   r   c                    s&   t d u rtdt | d | _d S rw   )rx   ry   rz   r=   _max_chunk_size)r<   rY   r}   r3   r4   r=      s   
zArrowBlockAccessor.__init__indexr,   c                 C   s   | j ||d dd}t|S )Nr   F)copy)slicer9   )r<   r   base_rowr3   r3   r4   _get_row   s   zArrowBlockAccessor._get_rowc                 C   re   r.   )_tablerb   rG   r3   r3   r4   rb      rh   zArrowBlockAccessor.column_namesnamevaluec                 C   sv   dd l m} t|tjtjfr| ||S t|tjr|j}nt	|g}tj
t| j|d}|||}| ||S )Nr   )rS   )pyarrow.computecomputerQ   rx   ArrayChunkedArrayupsert_columnScalarrS   
infer_typenullsrV   r   	fill_null)r<   r   r   pcrS   arrayr3   r3   r4   fill_column   s   zArrowBlockAccessor.fill_columndatac                 C   s   t j|}| | S r.   )rx   ipcopen_streamread_all)clsr   readerr3   r3   r4   
from_bytes   s   zArrowBlockAccessor.from_bytesr:   rC   rB   c                 C   s0   | | | }|  }t|tjsJ t||S r.   )rN   rQ   npndarrayrS   )r:   rC   rB   elementarrr3   r3   r4   rE      s   z$ArrowBlockAccessor._build_tensor_rowFstartendr   c                 C   s&   | j ||| }|rt||}|S r.   )r   r   r   combine_chunks)r<   r   r   r   viewr3   r3   r4   r     s   zArrowBlockAccessor.slicerandom_seedc                 C   s   t | j|S r.   )r   r   )r<   r   r3   r3   r4   random_shuffle  rk   z!ArrowBlockAccessor.random_shufflepyarrow.lib.Schemac                 C   re   r.   )r   rP   rG   r3   r3   r4   rP     rh   zArrowBlockAccessor.schemapandas.DataFramec                 C   s6   ddl m} t }| jj|jd}|jr||}|S )Nr   ) _cast_tensor_columns_to_ndarrays)ignore_metadata)"ray.air.util.data_batch_conversionr   r$   get_currentr   	to_pandaspandas_block_ignore_metadataenable_tensor_extension_casting)r<   r   ctxdfr3   r3   r4   r     s   zArrowBlockAccessor.to_pandasNrW   c           	      C   s   |d u r| j j}d}nt|trd}n|g}d}t| j j}|D ]}||vr0td| d| q g }|D ]}| j | }t|}|tj	|dd q5|rZt
|dksVJ |d S tt||S )NFTzCannot find column z, available columns: zero_copy_onlyr   r   )r   rb   rQ   listset
ValueErrorr   combine_chunked_arrayappendto_numpyrV   ri   zip)	r<   rW   should_be_single_ndarraycolumn_names_setcolumncolumn_values_ndarraysrB   rK   combined_arrayr3   r3   r4   r     s6   


zArrowBlockAccessor.to_numpyc                 C      | j S r.   )r   rG   r3   r3   r4   to_arrowD  r   zArrowBlockAccessor.to_arrowc                 C   s   | j jdkr
| j jS dS Nr   )r   rf   r   rG   r3   r3   r4   r   G  s   zArrowBlockAccessor.num_rowsc                 C   re   r.   )r   r   rG   r3   r3   r4   
size_bytesL  rh   zArrowBlockAccessor.size_bytesaccr   c                 C   st   |   }|  }|jD ],}||}||jv r1d}|}||jv r/d||}|d7 }||jv s |}|||}q|S )Nr   z{}_{})r   rb   r   formatappend_column)r<   r   rsrB   rK   inew_namer3   r3   r4   _zipO  s   




zArrowBlockAccessor._zipr   column_datac                 C   sX   t |tjtjfsJ dt| | jj|}|dkr$| j||S | j	|||S )Nz>Expected either a pyarrow.Array or pyarrow.ChunkedArray, got: )
rQ   rx   r   r   rS   r   rP   get_field_indexr   
set_column)r<   r   r   
column_idxr3   r3   r4   r   _  s   z ArrowBlockAccessor.upsert_columnc                   C   s   t  S r.   )rv   r3   r3   r3   r4   builderl  s   zArrowBlockAccessor.builderc                   C   s   t  S r.   )rv   r   r3   r3   r3   r4   r   p  r   zArrowBlockAccessor._empty_tableindicespyarrow.Arraypyarrow.ChunkedArrayc                 C   s   t | j|S )zSelect rows from the underlying table.

        This method is an alternative to pyarrow.Table.take(), which breaks for
        extension arrays.
        )r   
take_tabler   )r<   r   r3   r3   r4   taket  s   	zArrowBlockAccessor.takec                 C      | j |S r.   )r   dropr<   rW   r3   r3   r4   r     rk   zArrowBlockAccessor.dropc                 C   sF   t dd |D std| dt|dkr| td S | j|S )Nc                 s   s    | ]}t |tV  qd S r.   )rQ   r_   rJ   r3   r3   r4   	<genexpr>  s    z,ArrowBlockAccessor.select.<locals>.<genexpr>zYColumns must be a list of column name strings when aggregating on Arrow blocks, but got: .r   )allr   rV   r   $_BATCH_SIZE_PRESERVING_STUB_COL_NAMEr   rU   r   r3   r3   r4   rU     s   zArrowBlockAccessor.selectcolumns_renamec                 C   r   r.   )r   rename_columns)r<   r   r3   r3   r4   r     rk   z!ArrowBlockAccessor.rename_columnsother_blockc                 C   s.   | j }t|j|jD ]
\}}|||}q
|S r.   )r   r   rb   rW   r   )r<   r   result_tabler   r   r3   r3   r4   hstack  s   zArrowBlockAccessor.hstack	n_samplessort_keyr&   c                 C   s0   t t| jj|}| j| }t||S r.   )	randomsampleranger   r   rU   get_columnsr   r   )r<   r   r   r   rY   r3   r3   r4   _sample  s   zArrowBlockAccessor._samplec                 C   sL   |  sJ d|   d| jjdkr|  S t }t|}|| j|S )Nz'Sorting columns couldn't be empty (got )r   )r  r   r   r   r$   r   r5   )r<   r   r+   r1   r3   r3   r4   r1     s   zArrowBlockAccessor.sort
boundariesc                    sV     |}|jdkr fddtt|d D S t|dkr"|gS t|||S )Nr   c                    s   g | ]}   qS r3   )r   )rF   _rG   r3   r4   rH     rL   z9ArrowBlockAccessor.sort_and_partition.<locals>.<listcomp>r   )r1   r   r  rV   r   	for_block_find_partitions_sorted)r<   r  r   rY   r3   rG   r4   sort_and_partition  s   


z%ArrowBlockAccessor.sort_and_partitionblocksc                 C   sj   t  }dd | D } t| dkrt }nt| tj} t	t
 }|| |dd}|tj|| dfS )Nc                 S   s   g | ]	}|j d kr|qS rI   )r   )rF   br3   r3   r4   rH     s    z:ArrowBlockAccessor.merge_sorted_blocks.<locals>.<listcomp>r   Tr   )stats)r   r   rV   rD   r   r   normalize_block_typesr!   r   r8   r$   r   r    
from_blockbuild)r  r   r  retr7   r3   r3   r4   merge_sorted_blocks  s   
z&ArrowBlockAccessor.merge_sorted_blocksc                 C   r   r.   r   rG   r3   r3   r4   r     r   zArrowBlockAccessor.block_typepublic_row_formatc                 c   sn    | j }|r$| jd u rt|t| _|j| jdD ]	}| E d H  qd S |  }t|D ]}| |V  q,d S )N)max_chunksize)	r   r   r   ARROW_MAX_CHUNK_SIZE_BYTES
to_batches	to_pylistr   r  r   )r<   r  rY   batchr   r   r3   r3   r4   	iter_rows  s   
zArrowBlockAccessor.iter_rowspredicate_exprr%   c                 C   s6   | j jdkr	| j S ddlm} ||| j }| j |S )z,Filter rows based on a predicate expression.r   )	eval_expr)r   r   ?ray.data._internal.planner.plan_expression.expression_evaluatorr  filter)r<   r  r  maskr3   r3   r4   r    s
   zArrowBlockAccessor.filter)rY   r   F)r,   r   )r,   r   r.   r   )r   r   r,   r   )r  r%   r,   r   )<rr   rs   rt   r9   ROW_TYPEr=   r   r   r   r_   rb   r   r   r   classmethodr|   r   r   r   r   r   rE   r   r   r	   r   rP   r   r   r   r   r   r   r   r   r   r   r   rv   r   r   r   r   rU   r   r   r  r1   r'   r  r
   r    r  r!   r   r   r   r  r  r   r3   r3   r}   r4   rD      s    




'





rD   c                
       s  e Zd Zded f fddZdddeded	ee fd
dZdddeded	ee fddZ	dddeded	ee fddZ
dddeded	ee fddZdddeded	ee fddZ	d0dedee ded	ee fddZdddededed	ee fddZd	efddZd	eeeef  fddZd	efdd Zd	efd!d"Zd	efd#d$Zd	efd%d&Zd	ee fd'd(Zd1d*ed	ejfd+d,Zd	eee d-f fd.d/Z  Z S )2ArrowBlockColumnAccessorrK   )r   r   c                    s   t  | d S r.   )rz   r=   )r<   rK   r}   r3   r4   r=     s   z!ArrowBlockColumnAccessor.__init__TrM   ignore_nullsrN   r,   c                C   s4   dd l m} |j| j|rdndd}|r| S |S )Nr   
only_validr   )mode)r   r   count_columnrN   r<   r$  rN   pacresr3   r3   r4   r'    s   zArrowBlockColumnAccessor.countc                C   ,   dd l m} |j| j|d}|r| S |S Nr   
skip_nulls)r   r   sumr(  rN   r)  r3   r3   r4   r0       zArrowBlockColumnAccessor.sumc                C   r,  r-  )r   r   minr(  rN   r)  r3   r3   r4   r2    r1  zArrowBlockColumnAccessor.minc                C   r,  r-  )r   r   r   r(  rN   r)  r3   r3   r4   r     r1  zArrowBlockColumnAccessor.maxc                C   r,  r-  )r   r   meanr(  rN   r)  r3   r3   r4   r3    r1  zArrowBlockColumnAccessor.meanNr3  c                 C   s\   dd l m} |d u r| j|d}|d u rd S |j||| j|d|d}|r,| S |S )Nr   )r$     r.  )r   r   r3  r0  powersubtractr(  rN   )r<   r$  r3  rN   r*  r+  r3   r3   r4   sum_of_squared_diffs_from_mean  s   z7ArrowBlockColumnAccessor.sum_of_squared_diffs_from_meanqc                C   s6   dd l m} |j| j||d}|d }|r| S |S )Nr   )r8  r/  )r   r   quantiler(  rN   )r<   r8  r$  rN   r*  r   r+  r3   r3   r4   r9  "  s   z!ArrowBlockColumnAccessor.quantilec                 C   s<   dd l m} |  rdd l}|| j  S || jS r   )r   r   is_composed_of_listspolars
from_arrowr(  uniquer   )r<   r*  r;  r3   r3   r4   r=  ,  s
   zArrowBlockColumnAccessor.uniquec                 C   sF   dd l m} || j}t|dkrd S |d |d dS )Nr   valuescounts)r>  r?  )r   r   value_countsr(  rV   rR   r  )r<   r*  r@  r3   r3   r4   r@  9  s   z%ArrowBlockColumnAccessor.value_countsc                 C   s4   dd l }|d| ji}| j|jdd}| S )Nr   rK   T)wrap_numerical)r;  	DataFramer(  	hash_rowscastInt64r   )r<   plr   hashesr3   r3   r4   hashD  s   zArrowBlockColumnAccessor.hashc                 C      dd l m} || jS r   )r   r   list_flattenr(  r<   r*  r3   r3   r4   flattenK     z ArrowBlockColumnAccessor.flattenc                 C   rI  r   )r   r   	drop_nullr(  rK  r3   r3   r4   dropnaP  rM  zArrowBlockColumnAccessor.dropnac                 C   s   t jjt jjf}t| jj|S r.   )rx   libListTypeLargeListTyperQ   r(  rS   )r<   typesr3   r3   r4   r:  U  s   z-ArrowBlockColumnAccessor.is_composed_of_listsc                 C   s
   | j  S r.   )r(  r  rG   r3   r3   r4   r  Y  r>   z"ArrowBlockColumnAccessor.to_pylistFr   c                 C   s>   t  tk rt| jtjr| j S | jj|dS | jj|dS )Nr   )r   ,_MIN_PYARROW_VERSION_TO_NUMPY_ZERO_COPY_ONLYrQ   r(  rx   r   r   )r<   r   r3   r3   r4   r   \  s   

z!ArrowBlockColumnAccessor.to_numpyr   c                 C   r   r.   )r(  rG   r3   r3   r4   _as_arrow_compatibleg  r   z-ArrowBlockColumnAccessor._as_arrow_compatible)NTr   )!rr   rs   rt   r   r=   r   r	   r"   r'  r0  r2  r   r3  r7  floatr9  r   r=  r   r_   r   r@  rH  rL  rO  r:  r   r  r   r   r   rU  r   r3   r3   r}   r4   r#    sH         


"r#  )Jloggingr  typingr   r   r   r   r   r   r   r	   r
   r   r   numpyr   packaging.versionr   parse_versionray._private.arrow_utilsr   ray._private.ray_constantsr   ray.air.constantsr   $ray.air.util.tensor_extensions.arrowr   r   ray.data._internal.arrow_opsr   r   .ray.data._internal.arrow_ops.transform_pyarrowr   ray.data._internal.rowr   r   r   ray.data._internal.table_blockr   r   ray.data.blockr   r   r   r   r   r    r!   r"   ray.data.contextr#   r$   ray.data.expressionsr%   rx   ry   pandas2ray.data._internal.planner.exchange.sort_task_specr&   r'   	getLoggerrr   loggerrT  r   r   r  r5   r8   r9   rv   r   rD   r#  r3   r3   r3   r4   <module>   s^    4(


H"
  