o
    `۷id                     @   s  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZmZmZ d dlmZ d dlmZmZmZ d dlmZmZ d dlm Z  d dl!m"Z" d d	l#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d d
l+m,Z, d dl-m.Z. erd dlZd dl/Z/d dl0m1Z1 d dl#m2Z2 edZ3dZ4e5e6Z7da8dd Z9G dd de	Z:G dd de'Z;G dd deZ<e =dddgZ>G dd deZ?dS )    N)
TYPE_CHECKINGAnyDictIteratorListMappingOptionalTupleTypeVarUnion)is_object_dtype	is_scalaris_string_dtype)convert_to_numpy)row_reprrow_repr_prettyrow_str)TableBlockAccessorTableBlockBuilder)_should_convert_to_tensor)is_null)BlockBlockAccessorBlockColumnBlockColumnAccessorBlockExecStats	BlockTypeU)DataContext)Expr)SortKeyBlockMetadataWithSchemaT   c                  C   s   t d u r
dd l} | a t S Nr   )_pandaspandasr'    r)   U/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/data/_internal/pandas_block.pylazy_import_pandas6   s   r+   c                   @   s   e Zd ZdZdefddZdeeee f defddZ	de
fd	d
Zdd Zdeeef fddZdd Zdd Zdd ZdS )	PandasRowzF
    Row of a tabular Dataset backed by a Pandas DataFrame block.
    rowc                 C   s
   || _ d S N)_row)selfr-   r)   r)   r*   __init__D      
zPandasRow.__init__keyreturnc                    sd   ddl m  dtt dtf fdd}t|t}|r|gn|}||}|d u r*d S |r0|d S |S )Nr   TensorArrayElementkeysr4   c              
      s   j |  }t|dkrd S |jd }t|jd  r#tdd |D S z
tdd |D W S  ttfyN } ztjd| d|d |W  Y d }~S d }~ww )Nr   c                 s   s    | ]}|  V  qd S r.   to_numpy.0itemr)   r)   r*   	<genexpr>S   s    z:PandasRow.__getitem__.<locals>.get_item.<locals>.<genexpr>c                 s   s    | ]}|V  qd S r.   r)   r:   r)   r)   r*   r=   X   s    zFailed to convert z to a tuple)exc_info)	r/   leniloc
isinstancetupleAttributeError
ValueErrorloggerwarning)r7   colitemser6   r0   r)   r*   get_itemJ   s   

z'PandasRow.__getitem__.<locals>.get_item)ray.data.extensionsr6   r   strr   rA   )r0   r3   rK   is_single_itemr7   rH   r)   rJ   r*   __getitem__G   s   
zPandasRow.__getitem__c                 c   s    | j jD ]}|V  qd S r.   )r/   columns)r0   kr)   r)   r*   __iter__m   s   zPandasRow.__iter__c                 C      | j jd S )N   )r/   shaper0   r)   r)   r*   __len__q      zPandasRow.__len__c                 C   s>   i }|   D ]\}}t|r|tju rd ||< q|||< q|S r.   )rH   r   pdNA)r0   pydictr3   valuer)   r)   r*   	as_pydictt   s   

zPandasRow.as_pydictc                 C      t | S r.   )r   rV   r)   r)   r*   __str__      zPandasRow.__str__c                 C   r^   r.   )r   rV   r)   r)   r*   __repr__   r`   zPandasRow.__repr__c                 C   s   t | ||S r.   )r   )r0   pcycler)   r)   r*   _repr_pretty_   rX   zPandasRow._repr_pretty_N)__name__
__module____qualname____doc__r   r1   r   rM   r   rO   r   rR   rW   r   r]   r_   ra   rd   r)   r)   r)   r*   r,   ?   s    &r,   c                
       s  e Zd Zd2 fddZdddeded	ee fd
dZdddeded	ee fddZdddeded	ee fddZ	dddeded	ee fddZ
dddeded	ee fddZdddededed	ee fddZd	eeeef  fddZd	efddZd	efddZd	efddZd	efdd Z	!	d3ded"ee ded	ee fd#d$Zd	ee fd%d&Zd4d(ed	ejfd)d*Zd	eee d+f fd,d-Zd.d/ Zd	efd0d1Z   Z!S )5PandasBlockColumnAccessorrG   pandas.Seriesc                       t  | d S r.   superr1   )r0   rG   	__class__r)   r*   r1         z"PandasBlockColumnAccessor.__init__T)as_pyignore_nullsrq   r4   c                C   s   |r| j  S t| j S r.   )_columncountr?   r0   rr   rq   r)   r)   r*   rt         zPandasBlockColumnAccessor.countc                C   s   |   rd S | jj|ddS )NrT   )skipna	min_count)_is_all_nullrs   sumru   r)   r)   r*   rz      s   zPandasBlockColumnAccessor.sumc                C      |   rd S | jj|dS Nrw   )ry   rs   minru   r)   r)   r*   r~         zPandasBlockColumnAccessor.minc                C   r{   r|   )ry   rs   maxru   r)   r)   r*   r      r   zPandasBlockColumnAccessor.maxc                C   s(   | j |d}t|s|| j|d S |S )Nrr   )rz   r   rt   )r0   rr   rq   sum_r)   r)   r*   mean   s
   zPandasBlockColumnAccessor.meanqc                C   s   | j j|dS )N)r   )rs   quantile)r0   r   rr   rq   r)   r)   r*   r      s   z"PandasBlockColumnAccessor.quantilec                 C   s0   | j  }t|dkrd S |j |j dS )Nr   )valuescounts)rs   value_countsr?   indextolistr   )r0   r   r)   r)   r*   r      s   
z&PandasBlockColumnAccessor.value_countsc                 C   sr   ddl m} tdd | jD d }t||r| jdd | _dd l}|| j }|	 j
|jdd}| S )	Nr   r5   c                 s       | ]	}|d ur|V  qd S r.   r)   r;   xr)   r)   r*   r=          z1PandasBlockColumnAccessor.hash.<locals>.<genexpr>c                 S      |   S r.   r8   r   r)   r)   r*   <lambda>       z0PandasBlockColumnAccessor.hash.<locals>.<lambda>T)wrap_numerical)+ray.data._internal.tensor_extensions.pandasr6   nextrs   rA   applypolarsfrom_pandasto_frame	hash_rowscastInt64	to_pandas)r0   r6   first_non_nullpldfhashesr)   r)   r*   hash   s   
zPandasBlockColumnAccessor.hashc              
   C   s~   t  }z|  r| jdd }n| j}|| W S  ty> } zdt|v r9|| j  W  Y d }~S  d }~ww )Nc                 S   s   | d u r| S t | S r.   )rB   )lr)   r)   r*   r          z2PandasBlockColumnAccessor.unique.<locals>.<lambda>z buffer source array is read-only)	r+   is_composed_of_listsrs   mapSeriesuniquerD   rM   copy)r0   rY   rG   rI   r)   r)   r*   r      s    z PandasBlockColumnAccessor.uniquec                    sr   ddl m  tdd | jD d }t| s| j}n
| j fdd}|  r3|dd }|| }|jdd	S )
Nr   r5   c                 s   r   r.   r)   r   r)   r)   r*   r=      r   z4PandasBlockColumnAccessor.flatten.<locals>.<genexpr>c                    s   t |  r	|  S | S r.   )rA   r9   r   r5   r)   r*   r      s    z3PandasBlockColumnAccessor.flatten.<locals>.<lambda>c                 S   s   | d uo	t | dkS r%   )r?   r   r)   r)   r*   r      r   Tignore_index)r   r6   r   rs   rA   r   r   explode)r0   r   columnmaskr)   r5   r*   flatten   s   

z!PandasBlockColumnAccessor.flattenc                 C   
   | j  S r.   )rs   dropnarV   r)   r)   r*   r      r2   z PandasBlockColumnAccessor.dropnaNr   c                 C   s6   |d u r
| j |d}t|r|S | j| d j|dS )Nr      r}   )r   r   rs   rz   )r0   rr   r   rq   r)   r)   r*   sum_of_squared_diffs_from_mean   s
   z8PandasBlockColumnAccessor.sum_of_squared_diffs_from_meanc                 C   r   r.   )rs   to_listrV   r)   r)   r*   	to_pylist  r2   z#PandasBlockColumnAccessor.to_pylistFzero_copy_onlyc                 C   s   | j j| dS )zqNOTE: Unlike Arrow, specifying `zero_copy_only=True` isn't a guarantee
        that no copy will be made
        r   )rs   r9   )r0   r   r)   r)   r*   r9     s   z"PandasBlockColumnAccessor.to_numpyzpyarrow.Arrayc                 C   r   r.   )r   rV   r)   r)   r*   _as_arrow_compatible  r`   z.PandasBlockColumnAccessor._as_arrow_compatiblec                 C   s   | j    S r.   )rs   notnaanyrV   r)   r)   r*   ry     rp   z&PandasBlockColumnAccessor._is_all_nullc                 C   s8   ddl m} ttj|f}tdd | jD d }t||S )Nr   r5   c                 s   r   r.   r)   r   r)   r)   r*   r=   !  r   zAPandasBlockColumnAccessor.is_composed_of_lists.<locals>.<genexpr>)r   r6   listnpndarrayr   rs   rA   )r0   r6   typesr   r)   r)   r*   r     s   
z.PandasBlockColumnAccessor.is_composed_of_lists)rG   rj   NTF)"re   rf   rg   r1   boolr   r   rt   rz   r~   r   r   floatr   r   rM   r   r   r   r   r   r   r   r   r   r   r   r   r9   r   r   ry   r   __classcell__r)   r)   rn   r*   ri      sL         
	
ri   c                       s   e Zd Z fddZedeeee f ddfddZ	eded ddfd	d
Z
edefddZedddZdefddZ  ZS )PandasBlockBuilderc                    s   t  }t |j d S r.   )r+   rm   r1   	DataFrame)r0   r'   rn   r)   r*   r1   &  s   zPandasBlockBuilder.__init__rP   r4   pandas.DataFramec                    s.   ddl m  t }| fdd|  D S )Nr   TensorArrayc                    s8   i | ]\}}|t |d krt||r t|n|qS r   )r?   r   r   )r;   column_namecolumn_valuesr   r)   r*   
<dictcomp>1  s    z9PandasBlockBuilder._table_from_pydict.<locals>.<dictcomp>)$ray.data.extensions.tensor_extensionr   r+   r   rH   )rP   r'   r)   r   r*   _table_from_pydict*  s   
z%PandasBlockBuilder._table_from_pydicttablesc                 C   s^   t  }ddlm} t| dkr|j| dd}|jddd n| d }t }|jr-||}|S )Nr   ))_cast_ndarray_columns_to_tensor_extensionrT   Tr   dropinplace)	r+   #ray.data.util.data_batch_conversionr   r?   concatreset_indexr   get_currentenable_tensor_extension_casting)r   r'   r   r   ctxr)   r)   r*   _combine_tables<  s   z"PandasBlockBuilder._combine_tablesc                   C   s   dS r   r)   r)   r)   r)   r*   _concat_would_copyO  s   z%PandasBlockBuilder._concat_would_copyc                  C   s   t  } |  S r.   )r+   r   r(   r)   r)   r*   _empty_tableS  s   zPandasBlockBuilder._empty_tablec                 C      t jS r.   r   PANDASrV   r)   r)   r*   
block_typeX     zPandasBlockBuilder.block_typer4   r   )re   rf   rg   r1   staticmethodr   rM   r   r   r   r   r   r   r   r   r   r   r)   r)   rn   r*   r   %  s     r   PandasBlockSchemanamesr   c                	       sF  e Zd ZeZdS fddZdedefddZdee	 fd	d
Z
de	dedefddZdTdedededdfddZdee ddfddZdee	 defddZdee	 ddfddZdee	e	f ddfddZd e	d!eddfd"d#Zd$ee ddfd%d&Zdefd'd(ZdUd)d*Z	+dVdeee	ee	 f  deejee	ejf f fd,d-ZdWd/d0Z defd1d2Z!defd3d4Z"d5e#ddfd6d7Z$e%de&fd8d9Z'e%dUd:d;Z(d<ed=d>ddfd?d@Z)dXdAdBZ*dCee+ d=d>dee fdDdEZ,e%dFee d=d>de-edGf fdHdIZ.de/fdJdKZ0dLede1ee2ejf  fdMdNZ3dYdQdRZ4  Z5S )ZPandasBlockAccessortabler   c                    rk   r.   rl   )r0   r   rn   r)   r*   r1   d  rp   zPandasBlockAccessor.__init__r   r4   c                 C   s   | j ||d dd}t|S )NrT   Fr   )slicer,   )r0   r   base_rowr)   r)   r*   _get_rowg  s   zPandasBlockAccessor._get_rowc                 C   s   | j j S r.   )_tablerP   r   rV   r)   r)   r*   column_namesk  rX   z PandasBlockAccessor.column_namesnamer\   c                 C   s4   t |tjtjfr| ||S | jjdi ||iS )Nr)   )rA   rY   r   r   r   upsert_columnr   assign)r0   r   r\   r)   r)   r*   fill_columnn  s   zPandasBlockAccessor.fill_columnFstartendr   c                 C   s0   | j || }|jddd |r|jdd}|S )NTr   deep)r   r   r   )r0   r   r   r   viewr)   r)   r*   r   u  s
   zPandasBlockAccessor.sliceindicesc                 C   s   | j |}|jddd |S )NTr   )r   taker   )r0   r   r   r)   r)   r*   r   |  s   zPandasBlockAccessor.takerP   c                 C   s   | j j|ddS )NrP   )axis)r   r   r0   rP   r)   r)   r*   r     rp   zPandasBlockAccessor.dropc                 C   s,   t dd |D std| d| j| S )Nc                 s   s    | ]}t |tV  qd S r.   rA   rM   )r;   rG   r)   r)   r*   r=     s    z-PandasBlockAccessor.select.<locals>.<genexpr>zZColumns must be a list of column name strings when aggregating on Pandas blocks, but got: .)allrD   r   r   r)   r)   r*   select  s   
zPandasBlockAccessor.selectcolumns_renamec                 C   s   | j j|dddS )NF)rP   r   r   )r   rename)r0   r   r)   r)   r*   rename_columns  s   z"PandasBlockAccessor.rename_columnsr   column_datac                 C   s8   dd l }t||j|jfr| }| jjdi ||iS )Nr   r)   )pyarrowrA   ArrayChunkedArrayr   r   r   )r0   r   r  r  r)   r)   r*   r     s   z!PandasBlockAccessor.upsert_columnrandom_seedc                 C   s"   | j jd|d}|jddd |S )NrT   )fracrandom_stateTr   )r   sampler   )r0   r  r   r)   r)   r*   random_shuffle  s   z"PandasBlockAccessor.random_shufflec                 C   sJ   | j j}t|j |j d}tdd |jD r#td|jd|S )N)r   r   c                 s   s    | ]	}t |t V  qd S r.   r   )r;   r   r)   r)   r*   r=     r   z-PandasBlockAccessor.schema.<locals>.<genexpr>zwA Pandas DataFrame with column names of non-str types is not supported by Ray Dataset. Column names of this DataFrame: r   )	r   dtypesr   r   r   r   r   r   rD   )r0   r
  schemar)   r)   r*   r    s   zPandasBlockAccessor.schemac                 C   s,   ddl m} t }| j}|jr||}|S )Nr   ) _cast_tensor_columns_to_ndarrays)r   r  r   r   r   r   )r0   r  r   r   r)   r)   r*   r     s   zPandasBlockAccessor.to_pandasNc                 C   s   |d u r| j j }d}nt|trd}n|g}d}t| j j}|D ]}||vr6td| d| j j  q"g }|D ]}|| j |   q;|rP|d }|S t	t
||}|S )NFTzCannot find column z, available columns: r   )r   rP   r   rA   r   setrD   appendr9   dictzip)r0   rP   should_be_single_ndarraycolumn_names_setr   arraysr)   r)   r*   r9     s0   

zPandasBlockAccessor.to_numpypyarrow.Tablec           	      C   s   dd l }ddlm} |jj| jdd}i }t| jjD ]%\}}| j| }t|j	|r+q|
  s@|jt|| d|||f< q| D ]\\}}}||||}qE|S )Nr   )TensorDtypeF)preserve_index)type)r  r   r  Tabler   r   	enumeraterP   rA   dtyper   r   nullsr?   nullrH   
set_column)	r0   par  arrow_tablenull_coerced_columnsidxcol_namerG   null_colr)   r)   r*   to_arrow  s    

zPandasBlockAccessor.to_arrowc                 C   rS   r%   )r   rU   rV   r)   r)   r*   num_rows  rX   zPandasBlockAccessor.num_rowsc                    sj  ddl m} ddlm m} t  fdd| jjddd}|f}t}| jj	D ]}| j| j
}t|s>t|s>t||rt| j| }t||}	|	dkrOq)| j| j|	d	j}
z5t|
|rmt|
d jtjrm|
j}ntfd
d}t||
}|||	  }||  t|7  < W q) ty } ztd| d|  W Y d }~q)d }~ww q)| }t|S )Nr   r   )r6   r  c                    s:  t  }d}t| g}|r| }t|ttttfr$t	
|}||7 }qt||v r+q|t| zt	
|}W n tyD   d}Y nw ||7 }t|tjrW||j| 7 }nBt|jrk||jddd | 7 }n.t|ttt fry|| n t|tr||  ||  nt| r||  |s|S )zhCalculates the memory size of objects,
            including nested objects using an iterative approach.r   Tr   r   )r  collectionsdequepoprA   rM   bytesintr   sys	getsizeofidadd	TypeErrorr   r   nbytesr   memory_usagerz   r   rB   extendr  r7   r   r9   )objseen
total_sizeobjectscurrentsize)r6   rY   r)   r*   get_deep_size  sB   


$z5PandasBlockAccessor.size_bytes.<locals>.get_deep_sizeTFr&  )nc                    s    | S r.   r)   r   )r:  r)   r*   r   T  r   z0PandasBlockAccessor.size_bytes.<locals>.<lambda>z#Error calculating size for column 'z': )r   r   rL   r6   r  r+   r   r2  #_PANDAS_SIZE_BYTES_MAX_SAMPLE_COUNTrP   r  r   r   rA   r?   r~   r  r   r   
issubdtypenumpy_dtypenumberr1  	vectorizerz   r+  	ExceptionrE   rF   )r0   r   r  r2  object_need_checkmax_sample_countr   r  r6  sample_sizesampled_datacolumn_memory_samplevectorized_size_calccolumn_memoryrI   total_memory_usager)   )r6   r:  rY   r*   
size_bytes  sL   /
""zPandasBlockAccessor.size_bytesaccc           	      C   sz   |   jdd}|  }|jD ]+}|| }t|j}||v r6d}|}||v r4d||}|d7 }||v s&|}|||< q|S )NFr   rT   z{}_{})r   r   rP   r   format)	r0   rK  rsr"  rG   r   inew_namer)   r)   r*   _zipe  s   


zPandasBlockAccessor._zipc                   C   s   t  S r.   )r   r)   r)   r)   r*   builderv  s   zPandasBlockAccessor.builderc                   C   s   t  S r.   )r   r   r)   r)   r)   r*   r   z  s   z PandasBlockAccessor._empty_table	n_samplessort_keyr    c                 C   s   | j |  j|ddS )NTr   )r   get_columnsr  )r0   rS  rT  r)   r)   r*   _sample~  rv   zPandasBlockAccessor._samplec                 C   sP   |  sJ d|   d| jjd dkr|  S | \}}| jj||dS )Nz'Sorting columns couldn't be empty (got )r   by	ascending)rU  r   rU   r   to_pandas_sort_argssort_values)r0   rT  rP   rZ  r)   r)   r*   sort  s   zPandasBlockAccessor.sort
boundariesc                    sZ     |}|jd dkr fddtt|d D S t|dkr$|gS t|||S )Nr   c                    s   g | ]}   qS r)   )r   )r;   _rV   r)   r*   
<listcomp>  r   z:PandasBlockAccessor.sort_and_partition.<locals>.<listcomp>rT   )r]  rU   ranger?   r   	for_block_find_partitions_sorted)r0   r^  rT  r   r)   rV   r*   sort_and_partition  s   

z&PandasBlockAccessor.sort_and_partitionblocksr"   c                 C   s   t  }t }dd | D } t| dkrt }nt| tj	} |j
| dd}| \}}|j||d}ddlm} ||j|| dfS )	Nc                 S   s   g | ]}|j d  d kr|qS r   )rU   )r;   br)   r)   r*   r`    s    z;PandasBlockAccessor.merge_sorted_blocks.<locals>.<listcomp>r   Tr   rX  r!   )stats)r+   r   rR  r?   r   r   r   normalize_block_typesr   r   r   r[  r\  ray.data.blockr"   
from_blockbuild)re  rT  rY   rg  retrP   rZ  r"   r)   r)   r*   merge_sorted_blocks  s   
z'PandasBlockAccessor.merge_sorted_blocksc                 C   r   r.   r   rV   r)   r)   r*   r     r   zPandasBlockAccessor.block_typepublic_row_formatc                 c   s<    |   }t|D ]}| |}|r| V  q	|V  q	d S r.   )r%  ra  r   r]   )r0   rn  r%  rO  r-   r)   r)   r*   	iter_rows  s   
zPandasBlockAccessor.iter_rowspredicate_exprr   c                 C   s0   | j jr| j S ddlm} ||| j }| j | S )z,Filter rows based on a predicate expression.r   )	eval_expr)r   empty?ray.data._internal.planner.plan_expression.expression_evaluatorrq  )r0   rp  rq  r   r)   r)   r*   filter  s
   
zPandasBlockAccessor.filter)r   r   r   r   r.   )r4   r  )rT  r    )rp  r   r4   r   )6re   rf   rg   r,   ROW_TYPEr1   r+  r   r   rM   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	  r   r  r   r   r   r   r9   r$  r%  rJ  r   rQ  r   r   rR  r   rV  r]  r#   rd  r	   rm  r   r   r   r   ro  rt  r   r)   r)   rn   r*   r   a  sv    





(f



r   )@r'  loggingr,  typingr   r   r   r   r   r   r   r	   r
   r   numpyr   r'   rY   pandas.api.typesr   r   r    ray.data._internal.numpy_supportr   ray.data._internal.rowr   r   r   ray.data._internal.table_blockr   r   *ray.data._internal.tensor_extensions.utilsr   ray.data._internal.utilr   ri  r   r   r   r   r   r   r   ray.data.contextr   ray.data.expressionsr   r  2ray.data._internal.planner.exchange.sort_task_specr    r"   r#   r<  	getLoggerre   rE   r&   r+   r,   ri   r   
namedtupler   r   r)   r)   r)   r*   <module>   s>    0$	
	L 9