o
    cit                     @   s~  d dl mZmZ d dlmZ d dlmZmZmZ d dl	Z
d dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ ed	d
G dd deZed	d
G dd deZed	d
G dd deZed	d
G dd deZed	d
G dd deZ				d'dedee dededeeeef  dedeeeeef f fdd Zd!ej deddfd"d#Z!d$ej"defd%d&Z#dS )(    )CounterOrderedDict)partial)DictListOptionalN)BatchFormat)Dataset)PreprocessorPreprocessorNotFittedException)	PublicAPIalpha)	stabilityc                	   @   sd   e Zd ZdZddddee dedeee  fdd	Zd
e	de
fddZdejfddZdd ZdS )OrdinalEncodera  Encode values within columns as ordered integer values.

    :class:`OrdinalEncoder` encodes categorical features as integers that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of categories.

    If you transform a value that isn't in the fitted datset, then the value is encoded
    as ``float("nan")``.

    Columns must contain either hashable values or lists of hashable values. Also, you
    can't have both scalars and lists in the same column.

    Examples:
        Use :class:`OrdinalEncoder` to encode categorical features as integers.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OrdinalEncoder
        >>> df = pd.DataFrame({
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["sex", "level"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    1      1
        1    0      2
        2    1      0
        3    0      1

        :class:`OrdinalEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OrdinalEncoder(columns=["sex", "level"], output_columns=["sex_encoded", "level_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level  sex_encoded  level_encoded
        0    male    L4            1              1
        1  female    L5            0              2
        2    male    L3            1              0
        3  female    L4            0              1


        If you transform a value not present in the original dataset, then the value
        is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({"sex": ["female"], "level": ["L6"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    0    NaN

        :class:`OrdinalEncoder` can also encode categories in a list.

        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [2, 0, 4]
        1                          Moana  [1, 2, 0]
        2  The Smartest Guys in the Room        [3]

    Args:
        columns: The columns to separately encode.
        encode_lists: If ``True``, encode list elements.  If ``False``, encode
            whole lists (i.e., replace each list with an integer). ``True``
            by default.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            Another preprocessor that encodes categorical data.
    TN)encode_listsoutput_columnscolumnsr   r   c                C      || _ || _t||| _d S N)r   r   r
   #_derive_and_validate_output_columnsr   )selfr   r   r    r   R/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/preprocessors/encoder.py__init__f   
   
zOrdinalEncoder.__init__datasetreturnc                 C   s   t || j| jd| _| S )Nr   )_get_unique_value_indicesr   r   stats_r   r   r   r   r   _fitt   s   
zOrdinalEncoder._fitdfc                    sX   t |gjR   dtdtffdd dtjf fdd}|j ||j< |S )Nelementnamec                   s    fdd| D S )Nc                    s$   g | ]}j d   d |qS )unique_values())r   get.0x)r$   r   r   r   
<listcomp>~   s   $ zIOrdinalEncoder._transform_pandas.<locals>.encode_list.<locals>.<listcomp>r   )r#   r$   r   r$   r   encode_list}   s   z5OrdinalEncoder._transform_pandas.<locals>.encode_listsc                    sX   t  rjr t jdS  fdd} |S jd j d } |S )Nr-   c                    s"   t | } jd j d | S Nr%   r&   )tupler   r$   r'   r#   )r/   r   r   r   list_as_category   s   zZOrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder.<locals>.list_as_categoryr%   r&   )_is_series_composed_of_listsr   mapr   r$   applyr   )r/   r3   s_valuesr.   r   )r/   r   column_ordinal_encoder   s   

z@OrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder)_validate_dfr   liststrpdSeriesr6   r   )r   r"   r9   r   r8   r   _transform_pandasz   s
   z OrdinalEncoder._transform_pandasc                 C   &   | j j d| jd| jd| jdS )N	(columns=z, encode_lists=, output_columns=r&   )	__class____name__r   r   r   r,   r   r   r   __repr__      zOrdinalEncoder.__repr__)rD   
__module____qualname____doc__r   r<   boolr   r   r	   r
   r!   r=   	DataFramer?   rE   r   r   r   r   r      s    Y

r   c                	   @   p   e Zd ZdZddddee deeeef  deee  fddZ	d	e
d
efddZdejfddZdd ZdS )OneHotEncodera-  `One-hot encode <https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics>`_
    categorical data.

    This preprocessor transforms each specified column into a one-hot encoded vector.
    Each element in the vector corresponds to a unique category in the column, with a
    value of 1 if the category matches and 0 otherwise.

    If a category is infrequent (based on ``max_categories``) or not present in the
    fitted dataset, it is encoded as all 0s.

    Columns must contain hashable objects or lists of hashable objects.

    .. note::
        Lists are treated as categories. If you want to encode individual list
        elements, use :class:`MultiHotEncoder`.

    Example:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OneHotEncoder
        >>>
        >>> df = pd.DataFrame({"color": ["red", "green", "red", "red", "blue", "green"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OneHotEncoder(columns=["color"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
               color
        0  [0, 0, 1]
        1  [0, 1, 0]
        2  [0, 0, 1]
        3  [0, 0, 1]
        4  [1, 0, 0]
        5  [0, 1, 0]

        OneHotEncoder can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OneHotEncoder(columns=["color"], output_columns=["color_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           color color_encoded
        0    red     [0, 0, 1]
        1  green     [0, 1, 0]
        2    red     [0, 0, 1]
        3    red     [0, 0, 1]
        4   blue     [1, 0, 0]
        5  green     [0, 1, 0]

        If you one-hot encode a value that isn't in the fitted dataset, then the
        value is encoded with zeros.

        >>> df = pd.DataFrame({"color": ["yellow"]})
        >>> batch = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(batch).to_pandas()  # doctest: +SKIP
            color color_encoded
        0  yellow     [0, 0, 0]

        Likewise, if you one-hot encode an infrequent value, then the value is encoded
        with zeros.

        >>> encoder = OneHotEncoder(columns=["color"], max_categories={"color": 2})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
            color
        0  [1, 0]
        1  [0, 1]
        2  [1, 0]
        3  [1, 0]
        4  [0, 0]
        5  [0, 1]

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`MultiHotEncoder`
            If you want to encode individual list elements, use
            :class:`MultiHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.
    Nmax_categoriesr   r   rO   r   c                C   r   r   r   rO   r
   r   r   r   r   rO   r   r   r   r   r      r   zOneHotEncoder.__init__r   r   c                 C      t || j| jdd| _| S )NFrO   r   r   r   rO   r   r    r   r   r   r!        zOneHotEncoder._fitr"   c           	      C   s   t |g| jR   t| j| jD ]R\}}t|| r#|| t||< | jd| d }t|}t	j
t||ftd}|| |fdd }|dk}d|t	|d || tf< | ||< q|S )	Nr%   r&   )dtypec                 S   s   | | dS )Nr'   )vmr   r   r   <lambda>  s    z1OneHotEncoder._transform_pandas.<locals>.<lambda>rW      r   )r:   r   zipr   r4   r5   r1   r   lennpzerosintr6   to_numpynonzeroastypetolist)	r   r"   columnoutput_columnstatsnum_categoriesone_hotcodes
valid_rowsr   r   r   r?     s    zOneHotEncoder._transform_pandasc                 C   r@   NrA   z, max_categories=rB   r&   rC   rD   r   rO   r   r,   r   r   r   rE      rF   zOneHotEncoder.__repr__rD   rG   rH   rI   r   r<   r   r   ra   r   r	   r
   r!   r=   rK   r?   rE   r   r   r   r   rM      s    ]

	rM   c                	   @   rL   )MultiHotEncodera  Multi-hot encode categorical data.

    This preprocessor replaces each list of categories with an :math:`m`-length binary
    list, where :math:`m` is the number of unique categories in the column or the value
    specified in ``max_categories``. The :math:`i\\text{-th}` element of the binary list
    is :math:`1` if category :math:`i` is in the input list and :math:`0` otherwise.

    Columns must contain hashable objects or lists of hashable objects.
    Also, you can't have both types in the same column.

    .. note::
        The logic is similar to scikit-learn's [MultiLabelBinarizer][1]

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MultiHotEncoder
        >>>
        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> encoder = MultiHotEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name            genre
        0                 Shaolin Soccer  [1, 0, 1, 0, 1]
        1                          Moana  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room  [0, 0, 0, 1, 0]

        :class:`MultiHotEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = MultiHotEncoder(columns=["genre"], output_columns=["genre_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name                        genre    genre_encoded
        0                 Shaolin Soccer     [comedy, action, sports]  [1, 0, 1, 0, 1]
        1                          Moana  [animation, comedy, action]  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room                [documentary]  [0, 0, 0, 1, 0]

        If you specify ``max_categories``, then :class:`MultiHotEncoder`
        creates features for only the most frequent categories.

        >>> encoder = MultiHotEncoder(columns=["genre"], max_categories={"genre": 3})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [1, 1, 1]
        1                          Moana  [1, 1, 0]
        2  The Smartest Guys in the Room  [0, 0, 0]
        >>> encoder.stats_  # doctest: +SKIP
        OrderedDict([('unique_values(genre)', {'comedy': 0, 'action': 1, 'sports': 2})])

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every unique category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            If you're encoding individual categories instead of lists of
            categories, use :class:`OneHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.

    [1]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
    NrN   r   rO   r   c                C   r   r   rP   rQ   r   r   r   r   z  r   zMultiHotEncoder.__init__r   r   c                 C   rR   )NTrS   rT   r    r   r   r   r!     rU   zMultiHotEncoder._fitr"   c                    s^   t |g jR   dtdtf fdd}t j jD ]\}}|| t||d||< q|S )Nr#   r$   c                   sR   t | tjr|  } nt | ts| g} jd| d }t|   fdd|D S )Nr%   r&   c                    s   g | ]}  |d qS r   rX   r(   counterr   r   r+     s    zJMultiHotEncoder._transform_pandas.<locals>.encode_list.<locals>.<listcomp>)
isinstancer_   ndarrayre   r;   r   r   )r#   r$   rh   r,   rr   r   r.     s   

z6MultiHotEncoder._transform_pandas.<locals>.encode_listr-   )r:   r   r;   r<   r]   r   r5   r   )r   r"   r.   rf   rg   r   r,   r   r?     s
   	z!MultiHotEncoder._transform_pandasc                 C   s&   | j j d| jd| jd| j dS rm   rn   r,   r   r   r   rE     rF   zMultiHotEncoder.__repr__ro   r   r   r   r   rp   (  s    T

	rp   c                   @   sp   e Zd ZdZdddedee fddZded	efd
dZ	de
jfddZdddZde
jfddZdd ZdS )LabelEncodera
  Encode labels as integer targets.

    :class:`LabelEncoder` encodes labels as integer targets that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of unique labels.

    If you transform a label that isn't in the fitted datset, then the label is encoded
    as ``float("nan")``.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({
        ...     "sepal_width": [5.1, 7, 4.9, 6.2],
        ...     "sepal_height": [3.5, 3.2, 3, 3.4],
        ...     "species": ["setosa", "versicolor", "setosa", "virginica"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> from ray.data.preprocessors import LabelEncoder
        >>> encoder = LabelEncoder(label_column="species")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          5.1           3.5        0
        1          7.0           3.2        1
        2          4.9           3.0        0
        3          6.2           3.4        2

        You can also provide the name of the output column that should hold the encoded
        labels if you want to use :class:`LabelEncoder` in append mode.

        >>> encoder = LabelEncoder(label_column="species", output_column="species_encoded")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height     species  species_encoded
        0          5.1           3.5      setosa                0
        1          7.0           3.2  versicolor                1
        2          4.9           3.0      setosa                0
        3          6.2           3.4   virginica                2

        If you transform a label not present in the original dataset, then the new
        label is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({
        ...     "sepal_width": [4.2],
        ...     "sepal_height": [2.7],
        ...     "species": ["bracteata"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          4.2           2.7      NaN

    Args:
        label_column: A column containing labels that you want to encode.
        output_column: The name of the column that will contain the encoded
            labels. If None, the output column will have the same name as the
            input column.

    .. seealso::

        :class:`OrdinalEncoder`
            If you're encoding ordered features, use :class:`OrdinalEncoder` instead of
            :class:`LabelEncoder`.
    N)rg   label_columnrg   c                C   s   || _ |p|| _d S r   )rw   rg   )r   rw   rg   r   r   r   r     s   zLabelEncoder.__init__r   r   c                 C   s   t || jg| _| S r   )r   rw   r   r    r   r   r   r!     s   zLabelEncoder._fitr"   c                    s:   t | j dtjf fdd}| j || j< |S )Nr/   c                    s    j d| j d }| |S r0   )r   r$   r5   )r/   r7   r,   r   r   column_label_encoder  s   
z<LabelEncoder._transform_pandas.<locals>.column_label_encoder)r:   rw   r=   r>   	transformrg   )r   r"   rx   r   r,   r   r?     s   zLabelEncoder._transform_pandasdsr	   c                 C   sF   |   }|tjjtjjfv rtd|  }|j| jfdt	j
i|S )a/  Inverse transform the given dataset.

        Args:
            ds: Input Dataset that has been fitted and/or transformed.

        Returns:
            ray.data.Dataset: The inverse transformed Dataset.

        Raises:
            PreprocessorNotFittedException: if ``fit`` is not called yet.
        z1`fit` must be called before `inverse_transform`, batch_format)
fit_statusr
   	FitStatusPARTIALLY_FITTED
NOT_FITTEDr   _get_transform_configmap_batches_inverse_transform_pandasr   PANDAS)r   rz   r|   kwargsr   r   r   inverse_transform  s    zLabelEncoder.inverse_transformc                    s.   dt jf fdd}| j || j< |S )Nr/   c                    s,   dd  j d j d  D }| |S )Nc                 S      i | ]\}}||qS r   r   )r)   keyvaluer   r   r   
<dictcomp>  s    zXLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder.<locals>.<dictcomp>r%   r&   )r   rw   itemsr5   )r/   inverse_valuesr,   r   r   column_label_decoder  s   
zDLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder)r=   r>   rg   ry   rw   )r   r"   r   r   r,   r   r     s   	z&LabelEncoder._inverse_transform_pandasc                 C   s   | j j d| jd| jdS )Nz(label_column=z, output_column=r&   )rC   rD   rw   rg   r,   r   r   r   rE   (  s   zLabelEncoder.__repr__)rz   r	   r   r	   )rD   rG   rH   rI   r<   r   r   r	   r
   r!   r=   rK   r?   r   r   rE   r   r   r   r   rv     s    @

rv   c                	   @   sp   e Zd ZdZ		ddee deeeej	f  deee  fddZ
ded	efd
dZdejfddZdd ZdS )Categorizera^
  Convert columns to ``pd.CategoricalDtype``.

    Use this preprocessor with frameworks that have built-in support for
    ``pd.CategoricalDtype`` like LightGBM.

    .. warning::

        If you don't specify ``dtypes``, fit this preprocessor before splitting
        your dataset into train and test splits. This ensures categories are
        consistent across splits.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Categorizer
        >>>
        >>> df = pd.DataFrame(
        ... {
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> categorizer = Categorizer(columns=["sex", "level"])
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5'], ordered=False)]

        :class:`Categorizer` can also be used in append mode by providing the
        name of the output_columns that should hold the categorized values.

        >>> categorizer = Categorizer(columns=["sex", "level"], output_columns=["sex_cat", "level_cat"])
        >>> categorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level sex_cat level_cat
        0    male    L4    male        L4
        1  female    L5  female        L5
        2    male    L3    male        L3
        3  female    L4  female        L4

        If you know the categories in advance, you can specify the categories with the
        ``dtypes`` parameter.

        >>> categorizer = Categorizer(
        ...     columns=["sex", "level"],
        ...     dtypes={"level": pd.CategoricalDtype(["L3", "L4", "L5", "L6"], ordered=True)},
        ... )
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5', 'L6'], ordered=True)]

    Args:
        columns: The columns to convert to ``pd.CategoricalDtype``.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects. If you don't include a column in ``dtypes``, the categories
            are inferred.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Nr   dtypesr   c                 C   s&   |si }|| _ || _t||| _d S r   )r   r   r
   r   r   )r   r   r   r   r   r   r   r   i  s   
zCategorizer.__init__r   r   c                    sX    fdd j D }|rt||ddd}dd | D }ni }i  j|}| _ S )Nc                    s   g | ]}|t  jvr|qS r   )setr   r)   rf   r,   r   r   r+   y  s    z$Categorizer._fit.<locals>.<listcomp>Tz{0})drop_na_values
key_formatc                 S   s    i | ]\}}|t | qS r   )r=   CategoricalDtypekeys)r)   rf   values_indicesr   r   r   r     s    z$Categorizer._fit.<locals>.<dictcomp>)r   r   r   r   r   )r   r   columns_to_getunique_indicesr   r,   r   r!   x  s   
zCategorizer._fitr"   c                 C   s   || j  | j|| j< |S r   )r   rd   r   r   )r   r"   r   r   r   r?     s   zCategorizer._transform_pandasc                 C   r@   )NrA   z	, dtypes=rB   r&   )rC   rD   r   r   r   r,   r   r   r   rE     s   zCategorizer.__repr__)NN)rD   rG   rH   rI   r   r<   r   r   r=   r   r   r	   r
   r!   rK   r?   rE   r   r   r   r   r   ,  s    >

r   Funique_values({0})Tr   r   r   r   rO   r   r   c                    s  |du ri }t  }|D ]}||vrtd| d  dqdtjffdddtjd	ttttf  f fd
d}| j	|dd}	dd  D }
|	j
ddD ]#}| D ]\}}|D ]}dd | D }|
|  t|7  < q[qUqO D ]/}|r|
| }t|}dd | D }t||
|< qutdd |
| D rtd| dqut } D ]2}||v rdd t|
| || D |||< qdd ttt|
|  D |||< q|S )z8If drop_na_values is True, will silently drop NA values.NzYou set `max_categories` for z, which is not present in .colc                    sN   t | rrt   fdd}| |  S | dd } t| jdd S )Nc                    s     |  | S r   )updater2   rr   r   r   update_counter  s   
zY_get_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.update_counterc                 S   s   t | S r   )r1   )r*   r   r   r   r[     s    zS_get_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.<lambda>F)dropna)r4   r   r5   value_countsto_dict)r   r   r   rr   r   get_pd_value_counts_per_column  s   
zA_get_unique_value_indices.<locals>.get_pd_value_counts_per_columnr"   r   c                    sJ   | j  }i } D ]}||v r| | g||< q	td| d| |S )NzColumn 'z2' does not exist in DataFrame, which has columns: )r   re   
ValueError)r"   
df_columnsresultr   )r   r   r   r   get_pd_value_counts  s   
z6_get_unique_value_indices.<locals>.get_pd_value_countspandas)r{   c                 S   s   i | ]}|t  qS r   )r   )r)   r   r   r   r   r     s    z-_get_unique_value_indices.<locals>.<dictcomp>)
batch_sizec                 S   s   i | ]\}}|d ur||qS r   r   r)   krY   r   r   r   r     s    c                 S   s    i | ]\}}t |s||qS r   r=   isnullr   r   r   r   r     s     c                 s   s    | ]}t |V  qd S r   r   )r)   r   r   r   r   	<genexpr>  s    z,_get_unique_value_indices.<locals>.<genexpr>zUnable to fit column 'zJ' because it contains null values. Consider imputing missing values first.c                 S   s   i | ]	\}}|d  |qS rq   r   r)   jr   r   r   r   r     s    c                 S   r   r   r   r   r   r   r   r     s    
)r   r   r=   r>   rK   r   r   r<   r   r   iter_batchesr   dictanyr   	enumeratemost_commonformatsortedr   )r   r   r   r   rO   r   columns_setrf   r   r   final_countersbatchr   countersrs   counter_dictsanitized_dictunique_values_with_indicesr   )r   r   r   r   r     sZ   
&
r   r"   c                    s*    fdd|D }|rt d| dd S )Nc                    s"   g | ]} |   j r|qS r   )r   valuesr   r   r"   r   r   r+     s   " z _validate_df.<locals>.<listcomp>zUnable to transform columns zJ because they contain null values. Consider imputing missing values first.)r   )r"   r   null_columnsr   r   r   r:     s   
r:   seriesc                 C   s4   t dd | D d }tjj| jot|ttj	fS )Nc                 s   s    | ]	}|d ur|V  qd S r   r   )r)   r#   r   r   r   r     s    z/_is_series_composed_of_lists.<locals>.<genexpr>)
nextr   apitypesis_object_dtyperV   rt   r;   r_   ru   )r   first_not_none_elementr   r   r   r4     s   
r4   )Fr   NT)$collectionsr   r   	functoolsr   typingr   r   r   numpyr_   r   r=   pandas.api.types"ray.air.util.data_batch_conversionr   ray.datar	   ray.data.preprocessorr
   r   ray.util.annotationsr   r   rM   rp   rv   r   r<   rJ   ra   r   rK   r:   r>   r4   r   r   r   r   <module>   sZ        k
Y	