o
    ci                     @   sh   d dl Z d dlmZmZ d dlZd dlZd dlm	Z	 d dl
mZ e eZeddG dd de	ZdS )	    N)ListOptional)Preprocessor)	PublicAPIalpha)	stabilityc                   @   sr   e Zd ZdZdZ				ddee dedeej	 de
d	e
f
d
dZdejddfddZdejfddZdd ZdS )Concatenatora  Combine numeric columns into a column of type
    :class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`. Only columns
    specified in ``columns`` will be concatenated.

    This preprocessor concatenates numeric columns and stores the result in a new
    column. The new column contains
    :class:`~ray.air.util.tensor_extensions.pandas.TensorArrayElement` objects of
    shape :math:`(m,)`, where :math:`m` is the number of columns concatenated.
    The :math:`m` concatenated columns are dropped after concatenation.
    The preprocessor preserves the order of the columns provided in the ``colummns``
    argument and will use that order when calling ``transform()`` and ``transform_batch()``.

    Examples:
        >>> import numpy as np
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Concatenator

        :py:class:`Concatenator` combines numeric columns into a column of
        :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`.

        >>> df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> concatenator = Concatenator(columns=["X0", "X1"])
        >>> concatenator.transform(ds).to_pandas()  # doctest: +SKIP
           concat_out
        0  [0.0, 0.5]
        1  [3.0, 0.2]
        2  [1.0, 0.9]

        By default, the created column is called `"concat_out"`, but you can specify
        a different name.

        >>> concatenator = Concatenator(columns=["X0", "X1"], output_column_name="tensor")
        >>> concatenator.transform(ds).to_pandas()  # doctest: +SKIP
               tensor
        0  [0.0, 0.5]
        1  [3.0, 0.2]
        2  [1.0, 0.9]

        >>> concatenator = Concatenator(columns=["X0", "X1"], dtype=np.float32)
        >>> concatenator.transform(ds)  # doctest: +SKIP
        Dataset(num_rows=3, schema={Y: object, concat_out: TensorDtype(shape=(2,), dtype=float32)})

        When ``flatten=True``, nested vectors in the columns will be flattened during concatenation:

        >>> df = pd.DataFrame({"X0": [[1, 2], [3, 4]], "X1": [0.5, 0.2]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> concatenator = Concatenator(columns=["X0", "X1"], flatten=True)
        >>> concatenator.transform(ds).to_pandas()  # doctest: +SKIP
           concat_out
        0  [1.0, 2.0, 0.5]
        1  [3.0, 4.0, 0.2]

    Args:
        columns: A list of columns to concatenate. The provided order of the columns
             will be retained during concatenation.
        output_column_name: The desired name for the new column.
            Defaults to ``"concat_out"``.
        dtype: The ``dtype`` to convert the output tensors to. If unspecified,
            the ``dtype`` is determined by standard coercion rules.
        raise_if_missing: If ``True``, an error is raised if any
            of the columns in ``columns`` don't exist.
            Defaults to ``False``.
        flatten: If ``True``, nested vectors in the columns will be flattened during
            concatenation. Defaults to ``False``.

    Raises:
        ValueError: if `raise_if_missing` is `True` and a column in `columns` or
            doesn't exist in the dataset.
    F
concat_outNcolumnsoutput_column_namedtyperaise_if_missingflattenc                 C   s"   || _ || _|| _|| _|| _d S N)r
   r   r   r   r   )selfr
   r   r   r   r    r   W/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/preprocessors/concatenator.py__init__Y   s
   
zConcatenator.__init__dfreturnc                 C   sH   t | jt | }|r"d| j d| }| jrt|t| d S d S )NzMissing columns specified in 'z': )setr
   r   
ValueErrorloggerwarning)r   r   missing_columnsmessager   r   r   	_validateh   s   zConcatenator._validatec                    sv     |  jr| j  } fdd|D }n
| j j jd}|j jd}tt||j	d d  j
f< |S )Nc                    s$   g | ]}t  fd d|D qS )c                    s2   g | ]} j d u rt|nt| j qS r   )r   np
atleast_1dastype).0elemr   r   r   
<listcomp>z   s    

z=Concatenator._transform_pandas.<locals>.<listcomp>.<listcomp>)r   concatenate)r    rowr"   r   r   r#   x   s    	
z2Concatenator._transform_pandas.<locals>.<listcomp>)r   )r
   )r   r   r
   to_numpyr   droppdSerieslistlocr   )r   r   concatenatedr   r"   r   _transform_pandass   s   

	zConcatenator._transform_pandasc                 C   sf   dd d ddd}g }|  D ]\}}t| |}||kr%|| d|  q| jj dd| dS )Nr	   F)r   r
   r   r   r   =(z, ))itemsgetattrappend	__class____name__join)r   default_valuesnon_default_arguments	parameterdefault_valuevaluer   r   r   __repr__   s   
zConcatenator.__repr__)r	   NFF)r5   
__module____qualname____doc___is_fittabler   strr   r   r   boolr   r(   	DataFramer   r-   r<   r   r   r   r   r      s*    H
r   )loggingtypingr   r   numpyr   pandasr(   ray.data.preprocessorr   ray.util.annotationsr   	getLoggerr5   r   r   r   r   r   r   <module>   s    
