o
    $isf                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dl m!Z!m"Z" d d	l#m$Z$ d d
l%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 erd dlZd dl1Z1d dl2m3Z3 d dl%m4Z4 edZ5dZ6e7e8Z9da:dd Z;G dd de	Z<G dd de)Z=G dd de"Z>e ?dddgZ@G dd de!ZAdS )    N)
TYPE_CHECKINGAnyDictIteratorListMappingOptionalTupleTypeVarUnion)is_object_dtype	is_scalaris_string_dtype)TENSOR_COLUMN_NAME)_should_convert_to_tensor)convert_to_numpy)row_reprrow_repr_prettyrow_str)TableBlockAccessorTableBlockBuilder)is_null)BlockBlockAccessorBlockColumnBlockColumnAccessorBlockExecStats	BlockTypeU)DataContext)Expr)SortKeyBlockMetadataWithSchemaT   c                  C   s   t d u r
dd l} | a t S Nr   )_pandaspandasr(    r*   \/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/data/_internal/pandas_block.pylazy_import_pandas7   s   r,   c                   @   s   e Zd ZdZdefddZdeeee f defddZ	de
fd	d
Zdd Zdeeef fddZdd Zdd Zdd ZdS )	PandasRowzF
    Row of a tabular Dataset backed by a Pandas DataFrame block.
    rowc                 C   s
   || _ d S N)_row)selfr.   r*   r*   r+   __init__E      
zPandasRow.__init__keyreturnc                    sd   ddl m  dtt dtf fdd}t|t}|r|gn|}||}|d u r*d S |r0|d S |S )Nr   TensorArrayElementkeysr5   c              
      s   j |  }t|dkrd S |jd }t|jd  r#tdd |D S z
tdd |D W S  ttfyN } ztjd| d|d |W  Y d }~S d }~ww )Nr   c                 s   s    | ]}|  V  qd S r/   to_numpy.0itemr*   r*   r+   	<genexpr>T   s    z:PandasRow.__getitem__.<locals>.get_item.<locals>.<genexpr>c                 s   s    | ]}|V  qd S r/   r*   r;   r*   r*   r+   r>   Y   s    zFailed to convert z to a tuple)exc_info)	r0   leniloc
isinstancetupleAttributeError
ValueErrorloggerwarning)r8   colitemser7   r1   r*   r+   get_itemK   s   

z'PandasRow.__getitem__.<locals>.get_item)ray.data.extensionsr7   r   strr   rB   )r1   r4   rL   is_single_itemr8   rI   r*   rK   r+   __getitem__H   s   
zPandasRow.__getitem__c                 c   s    | j jD ]}|V  qd S r/   )r0   columns)r1   kr*   r*   r+   __iter__n   s   zPandasRow.__iter__c                 C      | j jd S )N   )r0   shaper1   r*   r*   r+   __len__r      zPandasRow.__len__c                 C   s>   i }|   D ]\}}t|r|tju rd ||< q|||< q|S r/   )rI   r   pdNA)r1   pydictr4   valuer*   r*   r+   	as_pydictu   s   

zPandasRow.as_pydictc                 C      t | S r/   )r   rW   r*   r*   r+   __str__      zPandasRow.__str__c                 C   r_   r/   )r   rW   r*   r*   r+   __repr__   ra   zPandasRow.__repr__c                 C   s   t | ||S r/   )r   )r1   pcycler*   r*   r+   _repr_pretty_   rY   zPandasRow._repr_pretty_N)__name__
__module____qualname____doc__r   r2   r   rN   r   rP   r   rS   rX   r   r^   r`   rb   re   r*   r*   r*   r+   r-   @   s    &r-   c                
       s  e Zd Zd2 fddZdddeded	ee fd
dZdddeded	ee fddZdddeded	ee fddZ	dddeded	ee fddZ
dddeded	ee fddZdddededed	ee fddZd	eeeef  fddZd	efddZd	efddZd	efddZd	efdd Z	!	d3ded"ee ded	ee fd#d$Zd	ee fd%d&Zd4d(ed	ejfd)d*Zd	eee d+f fd,d-Zd.d/ Zd	efd0d1Z   Z!S )5PandasBlockColumnAccessorrH   pandas.Seriesc                       t  | d S r/   superr2   )r1   rH   	__class__r*   r+   r2         z"PandasBlockColumnAccessor.__init__T)as_pyignore_nullsrr   r5   c                C   s   |r| j  S t| j S r/   )_columncountr@   r1   rs   rr   r*   r*   r+   ru         zPandasBlockColumnAccessor.countc                C   s   |   rd S | jj|ddS )NrU   )skipna	min_count)_is_all_nullrt   sumrv   r*   r*   r+   r{      s   zPandasBlockColumnAccessor.sumc                C      |   rd S | jj|dS Nrx   )rz   rt   minrv   r*   r*   r+   r         zPandasBlockColumnAccessor.minc                C   r|   r}   )rz   rt   maxrv   r*   r*   r+   r      r   zPandasBlockColumnAccessor.maxc                C   s(   | j |d}t|s|| j|d S |S )Nrs   )r{   r   ru   )r1   rs   rr   sum_r*   r*   r+   mean   s
   zPandasBlockColumnAccessor.meanqc                C   s   | j j|dS )N)r   )rt   quantile)r1   r   rs   rr   r*   r*   r+   r      s   z"PandasBlockColumnAccessor.quantilec                 C   s0   | j  }t|dkrd S |j |j dS )Nr   )valuescounts)rt   value_countsr@   indextolistr   )r1   r   r*   r*   r+   r      s   
z&PandasBlockColumnAccessor.value_countsc                 C   sr   ddl m} tdd | jD d }t||r| jdd | _dd l}|| j }|	 j
|jdd}| S )	Nr   r6   c                 s       | ]	}|d ur|V  qd S r/   r*   r<   xr*   r*   r+   r>          z1PandasBlockColumnAccessor.hash.<locals>.<genexpr>c                 S      |   S r/   r9   r   r*   r*   r+   <lambda>       z0PandasBlockColumnAccessor.hash.<locals>.<lambda>T)wrap_numerical)%ray.air.util.tensor_extensions.pandasr7   nextrt   rB   applypolarsfrom_pandasto_frame	hash_rowscastInt64	to_pandas)r1   r7   first_non_nullpldfhashesr*   r*   r+   hash   s   
zPandasBlockColumnAccessor.hashc              
   C   s~   t  }z|  r| jdd }n| j}|| W S  ty> } zdt|v r9|| j  W  Y d }~S  d }~ww )Nc                 S   s   | d u r| S t | S r/   )rC   )lr*   r*   r+   r          z2PandasBlockColumnAccessor.unique.<locals>.<lambda>z buffer source array is read-only)	r,   is_composed_of_listsrt   mapSeriesuniquerE   rN   copy)r1   rZ   rH   rJ   r*   r*   r+   r      s    z PandasBlockColumnAccessor.uniquec                    sr   ddl m  tdd | jD d }t| s| j}n
| j fdd}|  r3|dd }|| }|jdd	S )
Nr   r6   c                 s   r   r/   r*   r   r*   r*   r+   r>      r   z4PandasBlockColumnAccessor.flatten.<locals>.<genexpr>c                    s   t |  r	|  S | S r/   )rB   r:   r   r6   r*   r+   r      s    z3PandasBlockColumnAccessor.flatten.<locals>.<lambda>c                 S   s   | d uo	t | dkS r&   )r@   r   r*   r*   r+   r      r   Tignore_index)r   r7   r   rt   rB   r   r   explode)r1   r   columnmaskr*   r6   r+   flatten   s   

z!PandasBlockColumnAccessor.flattenc                 C   
   | j  S r/   )rt   dropnarW   r*   r*   r+   r      r3   z PandasBlockColumnAccessor.dropnaNr   c                 C   s6   |d u r
| j |d}t|r|S | j| d j|dS )Nr      r~   )r   r   rt   r{   )r1   rs   r   rr   r*   r*   r+   sum_of_squared_diffs_from_mean   s
   z8PandasBlockColumnAccessor.sum_of_squared_diffs_from_meanc                 C   r   r/   )rt   to_listrW   r*   r*   r+   	to_pylist  r3   z#PandasBlockColumnAccessor.to_pylistFzero_copy_onlyc                 C   s   | j j| dS )zqNOTE: Unlike Arrow, specifying `zero_copy_only=True` isn't a guarantee
        that no copy will be made
        r   )rt   r:   )r1   r   r*   r*   r+   r:     s   z"PandasBlockColumnAccessor.to_numpyzpyarrow.Arrayc                 C   r   r/   )r   rW   r*   r*   r+   _as_arrow_compatible  ra   z.PandasBlockColumnAccessor._as_arrow_compatiblec                 C   s   | j    S r/   )rt   notnaanyrW   r*   r*   r+   rz     rq   z&PandasBlockColumnAccessor._is_all_nullc                 C   s8   ddl m} ttj|f}tdd | jD d }t||S )Nr   r6   c                 s   r   r/   r*   r   r*   r*   r+   r>   "  r   zAPandasBlockColumnAccessor.is_composed_of_lists.<locals>.<genexpr>)r   r7   listnpndarrayr   rt   rB   )r1   r7   typesr   r*   r*   r+   r     s   
z.PandasBlockColumnAccessor.is_composed_of_lists)rH   rk   NTF)"rf   rg   rh   r2   boolr   r   ru   r{   r   r   r   floatr   r   rN   r   r   r   r   r   r   r   r   r   r   r   r   r:   r   r   rz   r   __classcell__r*   r*   ro   r+   rj      sL         
	
rj   c                       s   e Zd Z fddZedeeee f ddfddZ	eded ddfd	d
Z
edefddZedddZdefddZ  ZS )PandasBlockBuilderc                    s   t  }t |j d S r/   )r,   rn   r2   	DataFrame)r1   r(   ro   r*   r+   r2   '  s   zPandasBlockBuilder.__init__rQ   r5   pandas.DataFramec                    s.   ddl m  t }| fdd|  D S )Nr   TensorArrayc                    s8   i | ]\}}|t |d krt||r t|n|qS r   )r@   r   r   )r<   column_namecolumn_valuesr   r*   r+   
<dictcomp>2  s    z9PandasBlockBuilder._table_from_pydict.<locals>.<dictcomp>)$ray.data.extensions.tensor_extensionr   r,   r   rI   )rQ   r(   r*   r   r+   _table_from_pydict+  s   
z%PandasBlockBuilder._table_from_pydicttablesc                 C   s^   t  }ddlm} t| dkr|j| dd}|jddd n| d }t }|jr-||}|S )Nr   ))_cast_ndarray_columns_to_tensor_extensionrU   Tr   dropinplace)	r,   "ray.air.util.data_batch_conversionr   r@   concatreset_indexr   get_currentenable_tensor_extension_casting)r   r(   r   r   ctxr*   r*   r+   _combine_tables=  s   z"PandasBlockBuilder._combine_tablesc                   C   s   dS r   r*   r*   r*   r*   r+   _concat_would_copyP  s   z%PandasBlockBuilder._concat_would_copyc                  C   s   t  } |  S r/   )r,   r   r)   r*   r*   r+   _empty_tableT  s   zPandasBlockBuilder._empty_tablec                 C      t jS r/   r   PANDASrW   r*   r*   r+   
block_typeY     zPandasBlockBuilder.block_typer5   r   )rf   rg   rh   r2   staticmethodr   rN   r   r   r   r   r   r   r   r   r   r   r*   r*   ro   r+   r   &  s     r   PandasBlockSchemanamesr   c                	       sb  e Zd ZeZdW fddZdedefddZdee	 fd	d
Z
de	dedefddZedededejfddZdXdedededdfddZdee ddfddZdee	 defddZdee	 ddfdd Zd!ee	e	f ddfd"d#Zd$e	d%eddfd&d'Zd(ee ddfd)d*Zdefd+d,ZdYd-d.Z	/dZdee e	ee	 f  de ejee	ejf f fd0d1Z!d[d3d4Z"defd5d6Z#defd7d8Z$d9e%ddfd:d;Z&ede'fd<d=Z(edYd>d?Z)d@edAdBddfdCdDZ*d\dEdFZ+dGee, dAdBdee fdHdIZ-edJee dAdBde.edKf fdLdMZ/de0fdNdOZ1dPede2e e3ejf  fdQdRZ4d]dUdVZ5  Z6S )^PandasBlockAccessortabler   c                    rl   r/   rm   )r1   r   ro   r*   r+   r2   e  rq   zPandasBlockAccessor.__init__r   r5   c                 C   s   | j ||d dd}t|S )NrU   Fr   )slicer-   )r1   r   base_rowr*   r*   r+   _get_rowh  s   zPandasBlockAccessor._get_rowc                 C   s   | j j S r/   )_tablerQ   r   rW   r*   r*   r+   column_namesl  rY   z PandasBlockAccessor.column_namesnamer]   c                 C   s4   t |tjtjfr| ||S | jjdi ||iS )Nr*   )rB   rZ   r   r   r   upsert_columnr   assign)r1   r   r]   r*   r*   r+   fill_columno  s   zPandasBlockAccessor.fill_columnr.   row_idxc                 C   s0   ddl m} | t j| }t||r| }|S )Nr   r6   )rM   r7   r   rA   rB   r:   )r.   r   r7   tensorr*   r*   r+   _build_tensor_rowv  s
   
z%PandasBlockAccessor._build_tensor_rowFstartendr   c                 C   s0   | j || }|jddd |r|jdd}|S )NTr   deep)r   r   r   )r1   r   r   r   viewr*   r*   r+   r     s
   zPandasBlockAccessor.sliceindicesc                 C   s   | j |}|jddd |S )NTr   )r   taker   )r1   r   r   r*   r*   r+   r     s   zPandasBlockAccessor.takerQ   c                 C   s   | j j|ddS )NrQ   )axis)r   r   r1   rQ   r*   r*   r+   r     rq   zPandasBlockAccessor.dropc                 C   s,   t dd |D std| d| j| S )Nc                 s   s    | ]}t |tV  qd S r/   rB   rN   )r<   rH   r*   r*   r+   r>     s    z-PandasBlockAccessor.select.<locals>.<genexpr>zZColumns must be a list of column name strings when aggregating on Pandas blocks, but got: .)allrE   r   r   r*   r*   r+   select  s   
zPandasBlockAccessor.selectcolumns_renamec                 C   s   | j j|dddS )NF)rQ   r   r   )r   rename)r1   r  r*   r*   r+   rename_columns  s   z"PandasBlockAccessor.rename_columnsr   column_datac                 C   s8   dd l }t||j|jfr| }| jjdi ||iS )Nr   r*   )pyarrowrB   ArrayChunkedArrayr   r   r   )r1   r   r  r  r*   r*   r+   r     s   z!PandasBlockAccessor.upsert_columnrandom_seedc                 C   s"   | j jd|d}|jddd |S )NrU   )fracrandom_stateTr   )r   sampler   )r1   r	  r   r*   r*   r+   random_shuffle  s   z"PandasBlockAccessor.random_shufflec                 C   sJ   | j j}t|j |j d}tdd |jD r#td|jd|S )N)r   r   c                 s   s    | ]	}t |t V  qd S r/   r   )r<   r   r*   r*   r+   r>     r   z-PandasBlockAccessor.schema.<locals>.<genexpr>zwA Pandas DataFrame with column names of non-str types is not supported by Ray Dataset. Column names of this DataFrame: r   )	r   dtypesr   r   r   r   r   r   rE   )r1   r  schemar*   r*   r+   r    s   zPandasBlockAccessor.schemac                 C   s,   ddl m} t }| j}|jr||}|S )Nr   ) _cast_tensor_columns_to_ndarrays)r   r  r   r   r   r   )r1   r  r   r   r*   r*   r+   r     s   zPandasBlockAccessor.to_pandasNc                 C   s   |d u r| j j }d}nt|trd}n|g}d}t| j j}|D ]}||vr6td| d| j j  q"g }|D ]}|| j |   q;|rP|d }|S t	t
||}|S )NFTzCannot find column z, available columns: r   )r   rQ   r   rB   r   setrE   appendr:   dictzip)r1   rQ   should_be_single_ndarraycolumn_names_setr   arraysr*   r*   r+   r:     s0   

zPandasBlockAccessor.to_numpypyarrow.Tablec           	      C   s   dd l }ddlm} |jj| jdd}i }t| jjD ]%\}}| j| }t|j	|r+q|
  s@|jt|| d|||f< q| D ]\\}}}||||}qE|S )Nr   )TensorDtypeF)preserve_index)type)r  r   r  Tabler   r   	enumeraterQ   rB   dtyper   r   nullsr@   nullrI   
set_column)	r1   par  arrow_tablenull_coerced_columnsidxcol_namerH   null_colr*   r*   r+   to_arrow  s    

zPandasBlockAccessor.to_arrowc                 C   rT   r&   )r   rV   rW   r*   r*   r+   num_rows  rY   zPandasBlockAccessor.num_rowsc                    sj  ddl m} ddlm m} t  fdd| jjddd}|f}t}| jj	D ]}| j| j
}t|s>t|s>t||rt| j| }t||}	|	dkrOq)| j| j|	d	j}
z5t|
|rmt|
d jtjrm|
j}ntfd
d}t||
}|||	  }||  t|7  < W q) ty } ztd| d|  W Y d }~q)d }~ww q)| }t|S )Nr   r   )r7   r  c                    s:  t  }d}t| g}|r| }t|ttttfr$t	
|}||7 }qt||v r+q|t| zt	
|}W n tyD   d}Y nw ||7 }t|tjrW||j| 7 }nBt|jrk||jddd | 7 }n.t|ttt fry|| n t|tr||  ||  nt| r||  |s|S )zhCalculates the memory size of objects,
            including nested objects using an iterative approach.r   Tr   r   )r  collectionsdequepoprB   rN   bytesintr   sys	getsizeofidadd	TypeErrorr   r   nbytesr   memory_usager{   r   rC   extendr  r8   r   r:   )objseen
total_sizeobjectscurrentsize)r7   rZ   r*   r+   get_deep_size  sB   


$z5PandasBlockAccessor.size_bytes.<locals>.get_deep_sizeTFr*  )nc                    s    | S r/   r*   r   )r>  r*   r+   r   `  r   z0PandasBlockAccessor.size_bytes.<locals>.<lambda>z#Error calculating size for column 'z': )r   r   rM   r7   r  r,   r   r6  #_PANDAS_SIZE_BYTES_MAX_SAMPLE_COUNTrQ   r  r   r   rB   r@   r   r  r   r   
issubdtypenumpy_dtypenumberr5  	vectorizer{   r/  	ExceptionrF   rG   )r1   r   r  r6  object_need_checkmax_sample_countr   r  r:  sample_sizesampled_datacolumn_memory_samplevectorized_size_calccolumn_memoryrJ   total_memory_usager*   )r7   r>  rZ   r+   
size_bytes  sL   /
""zPandasBlockAccessor.size_bytesaccc           	      C   sz   |   jdd}|  }|jD ]+}|| }t|j}||v r6d}|}||v r4d||}|d7 }||v s&|}|||< q|S )NFr   rU   z{}_{})r   r   rQ   r   format)	r1   rO  rsr&  rH   r   inew_namer*   r*   r+   _zipq  s   


zPandasBlockAccessor._zipc                   C   s   t  S r/   )r   r*   r*   r*   r+   builder  s   zPandasBlockAccessor.builderc                   C   s   t  S r/   )r   r   r*   r*   r*   r+   r     s   z PandasBlockAccessor._empty_table	n_samplessort_keyr!   c                 C   s   | j |  j|ddS )NTr   )r   get_columnsr  )r1   rW  rX  r*   r*   r+   _sample  rw   zPandasBlockAccessor._samplec                 C   sP   |  sJ d|   d| jjd dkr|  S | \}}| jj||dS )Nz'Sorting columns couldn't be empty (got )r   by	ascending)rY  r   rV   r   to_pandas_sort_argssort_values)r1   rX  rQ   r^  r*   r*   r+   sort  s   zPandasBlockAccessor.sort
boundariesc                    sZ     |}|jd dkr fddtt|d D S t|dkr$|gS t|||S )Nr   c                    s   g | ]}   qS r*   )r   )r<   _rW   r*   r+   
<listcomp>  r   z:PandasBlockAccessor.sort_and_partition.<locals>.<listcomp>rU   )ra  rV   ranger@   r   	for_block_find_partitions_sorted)r1   rb  rX  r   r*   rW   r+   sort_and_partition  s   

z&PandasBlockAccessor.sort_and_partitionblocksr#   c                 C   s   t  }t }dd | D } t| dkrt }nt| tj	} |j
| dd}| \}}|j||d}ddlm} ||j|| dfS )	Nc                 S   s   g | ]}|j d  d kr|qS r   )rV   )r<   br*   r*   r+   rd    s    z;PandasBlockAccessor.merge_sorted_blocks.<locals>.<listcomp>r   Tr   r\  r"   )stats)r,   r   rV  r@   r   r   r   normalize_block_typesr   r   r   r_  r`  ray.data.blockr#   
from_blockbuild)ri  rX  rZ   rk  retrQ   r^  r#   r*   r*   r+   merge_sorted_blocks  s   
z'PandasBlockAccessor.merge_sorted_blocksc                 C   r   r/   r   rW   r*   r*   r+   r     r   zPandasBlockAccessor.block_typepublic_row_formatc                 c   s<    |   }t|D ]}| |}|r| V  q	|V  q	d S r/   )r)  re  r   r^   )r1   rr  r)  rS  r.   r*   r*   r+   	iter_rows  s   
zPandasBlockAccessor.iter_rowspredicate_exprr    c                 C   s0   | j jr| j S ddlm} ||| j }| j | S )z,Filter rows based on a predicate expression.r   )	eval_expr)r   empty?ray.data._internal.planner.plan_expression.expression_evaluatorru  )r1   rt  ru  r   r*   r*   r+   filter  s
   
zPandasBlockAccessor.filter)r   r   r   r   r/   )r5   r  )rX  r!   )rt  r    r5   r   )7rf   rg   rh   r-   ROW_TYPEr2   r/  r   r   rN   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r  r   r   r   r  r   r  r   r   r:   r(  r)  rN  r   rU  r   rV  r   rZ  ra  r$   rh  r	   rq  r   r   r   r   rs  rx  r   r*   r*   ro   r+   r   b  sz    






(f



r   )Br+  loggingr0  typingr   r   r   r   r   r   r   r	   r
   r   numpyr   r(   rZ   pandas.api.typesr   r   r   ray.air.constantsr   $ray.air.util.tensor_extensions.utilsr    ray.data._internal.numpy_supportr   ray.data._internal.rowr   r   r   ray.data._internal.table_blockr   r   ray.data._internal.utilr   rm  r   r   r   r   r   r   r   ray.data.contextr   ray.data.expressionsr    r  2ray.data._internal.planner.exchange.sort_task_specr!   r#   r$   r@  	getLoggerrf   rF   r'   r,   r-   rj   r   
namedtupler   r   r*   r*   r*   r+   <module>   s@    0$	
	L 9