o
    $i                      @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lm Z  er`d dl!m"Z" e #e$Z%e ddedddG dd deZ&e ddedddG dd deZ'e ddedddG dd deZ(e ddedddG dd deZ)e ddedddG dd deZ*ddd d!d"d#ee+ d$ed%e,d&ee	e+e-f  f
d'd(Z.d4d*e,d+eege	e+e-f f fd,d-Z/d.ej0d#e+d+dfd/d0Z1d1ej2d+e,fd2d3Z3dS )5    N)Counter)partial)TYPE_CHECKINGAnyCallableDictHashableListOptionalSet)BatchFormatis_null)PreprocessorPreprocessorNotFittedExceptionSerializablePreprocessorBase)make_post_processor)SerializablePreprocessor)	PublicAPI)Datasetalpha)	stability   z$io.ray.preprocessors.ordinal_encoder)version
identifierc                	       s   e Zd ZdZddddee dedeee  f fdd	Zd
dde	fddZ
dedefddZdejfddZdeeef fddZdeeef defddZdd Z  ZS )OrdinalEncodera  Encode values within columns as ordered integer values.

    :class:`OrdinalEncoder` encodes categorical features as integers that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of categories.

    If you transform a value that isn't in the fitted datset, then the value is encoded
    as ``float("nan")``.

    Columns must contain either hashable values or lists of hashable values. Also, you
    can't have both scalars and lists in the same column.

    Examples:
        Use :class:`OrdinalEncoder` to encode categorical features as integers.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OrdinalEncoder
        >>> df = pd.DataFrame({
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["sex", "level"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    1      1
        1    0      2
        2    1      0
        3    0      1

        :class:`OrdinalEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OrdinalEncoder(columns=["sex", "level"], output_columns=["sex_encoded", "level_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level  sex_encoded  level_encoded
        0    male    L4            1              1
        1  female    L5            0              2
        2    male    L3            1              0
        3  female    L4            0              1


        If you transform a value not present in the original dataset, then the value
        is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({"sex": ["female"], "level": ["L6"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sex  level
        0    0    NaN

        :class:`OrdinalEncoder` can also encode categories in a list.

        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OrdinalEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [2, 0, 4]
        1                          Moana  [1, 2, 0]
        2  The Smartest Guys in the Room        [3]

    Args:
        columns: The columns to separately encode.
        encode_lists: If ``True``, encode list elements.  If ``False``, encode
            whole lists (i.e., replace each list with an integer). ``True``
            by default.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            Another preprocessor that encodes categorical data.
    TN)encode_listsoutput_columnscolumnsr   r   c                   s(   t    || _|| _t||| _d S N)super__init__r   r   r   #_derive_and_validate_output_columnsr   )selfr   r   r   	__class__ [/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/data/preprocessors/encoder.pyr!   t   s   

zOrdinalEncoder.__init__datasetr   returnc                    0   j j fddt dd dd jd S )Nc                    s   t  jj| dS )N)r(   r   r   key_gen)compute_unique_value_indicesr   r   r+   r(   r#   r&   r'   <lambda>   s    z%OrdinalEncoder._fit.<locals>.<lambda>c                 S      d|  dS Nzunique()r&   colr&   r&   r'   r/          c                 S   r0   Nunique_values(r2   r&   r3   r&   r&   r'   r/      r5   stat_fnpost_process_fnstat_key_fnpost_key_fnr   stat_computation_planadd_callable_statunique_post_fnr   r#   r(   r&   r.   r'   _fit   s   zOrdinalEncoder._fitelementcolumn_namec                   s8   | j d| d  | jr fdd|D S  t|S )Nr7   r2   c                    s   g | ]}  |qS r&   get.0xordinal_mapr&   r'   
<listcomp>   s    z7OrdinalEncoder._encode_list_element.<locals>.<listcomp>)stats_r   rF   tuple)r#   rC   rD   r&   rJ   r'   _encode_list_element   s   z#OrdinalEncoder._encode_list_elementdfc                    s@   t |g jR   dtjf fdd}| j || j< |S )Nsc                    s:   t  r  fddS jd j d } |S )Nc                    s   j |  jdS )N)rD   )rO   name)elem)rQ   r#   r&   r'   r/      s    zROrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder.<locals>.<lambda>r7   r2   )_is_series_composed_of_listsmaprM   rR   rQ   s_valuesr#   )rQ   r'   column_ordinal_encoder   s   
z@OrdinalEncoder._transform_pandas.<locals>.column_ordinal_encoder)_validate_dfr   pdSeriesapplyr   )r#   rP   rY   r&   rX   r'   _transform_pandas   s   	z OrdinalEncoder._transform_pandasc                 C      | j | j| jt| dd dS )N_fitted)r   r   r   r`   )r   r   r   getattrrX   r&   r&   r'   _get_serializable_fields   
   
z'OrdinalEncoder._get_serializable_fieldsfieldsr   c                 C   .   |d | _ |d | _|d | _|d| _d S )Nr   r   r   r`   )r   r   r   rF   r`   r#   rd   r   r&   r&   r'   _set_serializable_fields      


z'OrdinalEncoder._set_serializable_fieldsc                 C   &   | j j d| jd| jd| jdS )N	(columns=z, encode_lists=, output_columns=r2   )r%   __name__r   r   r   rX   r&   r&   r'   __repr__      zOrdinalEncoder.__repr__)rl   
__module____qualname____doc__r	   strboolr
   r!   r   rB   listrO   r[   	DataFramer^   r   r   rb   intrg   rm   __classcell__r&   r&   r$   r'   r      s"    Y
	r   z$io.ray.preprocessors.one_hot_encoderc                	       s   e Zd ZdZddddee deeeef  deee  f fddZ	d	d
de
fddZdedeeef fddZdejfddZdeeef fddZdeeef defddZdd Z  ZS )OneHotEncodera-  `One-hot encode <https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics>`_
    categorical data.

    This preprocessor transforms each specified column into a one-hot encoded vector.
    Each element in the vector corresponds to a unique category in the column, with a
    value of 1 if the category matches and 0 otherwise.

    If a category is infrequent (based on ``max_categories``) or not present in the
    fitted dataset, it is encoded as all 0s.

    Columns must contain hashable objects or lists of hashable objects.

    .. note::
        Lists are treated as categories. If you want to encode individual list
        elements, use :class:`MultiHotEncoder`.

    Example:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import OneHotEncoder
        >>>
        >>> df = pd.DataFrame({"color": ["red", "green", "red", "red", "blue", "green"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder = OneHotEncoder(columns=["color"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
               color
        0  [0, 0, 1]
        1  [0, 1, 0]
        2  [0, 0, 1]
        3  [0, 0, 1]
        4  [1, 0, 0]
        5  [0, 1, 0]

        OneHotEncoder can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = OneHotEncoder(columns=["color"], output_columns=["color_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           color color_encoded
        0    red     [0, 0, 1]
        1  green     [0, 1, 0]
        2    red     [0, 0, 1]
        3    red     [0, 0, 1]
        4   blue     [1, 0, 0]
        5  green     [0, 1, 0]

        If you one-hot encode a value that isn't in the fitted dataset, then the
        value is encoded with zeros.

        >>> df = pd.DataFrame({"color": ["yellow"]})
        >>> batch = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(batch).to_pandas()  # doctest: +SKIP
            color color_encoded
        0  yellow     [0, 0, 0]

        Likewise, if you one-hot encode an infrequent value, then the value is encoded
        with zeros.

        >>> encoder = OneHotEncoder(columns=["color"], max_categories={"color": 2})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
            color
        0  [1, 0]
        1  [0, 1]
        2  [1, 0]
        3  [1, 0]
        4  [0, 0]
        5  [0, 1]

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`MultiHotEncoder`
            If you want to encode individual list elements, use
            :class:`MultiHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.
    Nmax_categoriesr   r   rz   r   c                   ,   t    || _|pi | _t||| _d S r   r    r!   r   rz   r   r"   r   r#   r   rz   r   r$   r&   r'   r!        


zOneHotEncoder.__init__r(   r   r)   c                    r*   )Nc                       t  jd| jdS )NFr(   r   r   r+   rz   r,   r   rz   r-   r.   r&   r'   r/   /      z$OneHotEncoder._fit.<locals>.<lambda>c                 S   r0   r1   r&   r3   r&   r&   r'   r/   7  r5   c                 S   r0   r6   r&   r3   r&   r&   r'   r/   8  r5   r8   r=   rA   r&   r.   r'   rB   -     zOneHotEncoder._fitvstatsc                 C   s2   t |ttjfrt|}t |tr||dS dS )N)
isinstancert   npndarrayrN   r   rF   )r#   r   r   r&   r&   r'   safe_get=  s
   
zOneHotEncoder.safe_getrP   c           	         s   t |g jR   t j jD ]D\}} jd| d t}tjt||ftjd}|| 	 fdd
 }|dk}t|d }d|||| f< | ||< q|S )	Nr7   r2   )dtypec                    s     | S r   )r   )r   r#   r   r&   r'   r/   N  r5   z1OneHotEncoder._transform_pandas.<locals>.<lambda>r   r   r   )rZ   r   zipr   rM   lenr   zerosuint8r]   to_numpynonzerotolist)	r#   rP   columnoutput_columnnum_categoriesone_hotcodesvalid_category_masknon_zero_indicesr&   r   r'   r^   E  s    zOneHotEncoder._transform_pandasc                 C   r_   Nr`   )r   r   rz   r`   r   r   rz   ra   rX   r&   r&   r'   rb   \  rc   z&OneHotEncoder._get_serializable_fieldsrd   r   c                 C   re   Nr   r   rz   r`   r   r   rz   rF   r`   rf   r&   r&   r'   rg   d  rh   z&OneHotEncoder._set_serializable_fieldsc                 C   ri   Nrj   z, max_categories=rk   r2   r%   rl   r   rz   r   rX   r&   r&   r'   rm   l  rn   zOneHotEncoder.__repr__)rl   ro   rp   rq   r	   rr   r
   r   rv   r!   r   rB   r   r   r[   ru   r^   rb   rg   rm   rw   r&   r&   r$   r'   rx      s"    ]
rx   z&io.ray.preprocessors.multi_hot_encoderc                	       s   e Zd ZdZddddee deeeef  deee  f fddZ	d	d
de
fddZdejfddZdeeef fddZdeeef defddZdd Z  ZS )MultiHotEncodera  Multi-hot encode categorical data.

    This preprocessor replaces each list of categories with an :math:`m`-length binary
    list, where :math:`m` is the number of unique categories in the column or the value
    specified in ``max_categories``. The :math:`i\\text{-th}` element of the binary list
    is :math:`1` if category :math:`i` is in the input list and :math:`0` otherwise.

    Columns must contain hashable objects or lists of hashable objects.
    Also, you can't have both types in the same column.

    .. note::
        The logic is similar to scikit-learn's [MultiLabelBinarizer][1]

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import MultiHotEncoder
        >>>
        >>> df = pd.DataFrame({
        ...     "name": ["Shaolin Soccer", "Moana", "The Smartest Guys in the Room"],
        ...     "genre": [
        ...         ["comedy", "action", "sports"],
        ...         ["animation", "comedy",  "action"],
        ...         ["documentary"],
        ...     ],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> encoder = MultiHotEncoder(columns=["genre"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name            genre
        0                 Shaolin Soccer  [1, 0, 1, 0, 1]
        1                          Moana  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room  [0, 0, 0, 1, 0]

        :class:`MultiHotEncoder` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> encoder = MultiHotEncoder(columns=["genre"], output_columns=["genre_encoded"])
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name                        genre    genre_encoded
        0                 Shaolin Soccer     [comedy, action, sports]  [1, 0, 1, 0, 1]
        1                          Moana  [animation, comedy, action]  [1, 1, 1, 0, 0]
        2  The Smartest Guys in the Room                [documentary]  [0, 0, 0, 1, 0]

        If you specify ``max_categories``, then :class:`MultiHotEncoder`
        creates features for only the most frequent categories.

        >>> encoder = MultiHotEncoder(columns=["genre"], max_categories={"genre": 3})
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
                                    name      genre
        0                 Shaolin Soccer  [1, 1, 1]
        1                          Moana  [1, 1, 0]
        2  The Smartest Guys in the Room  [0, 0, 0]
        >>> encoder.stats_  # doctest: +SKIP
        OrderedDict([('unique_values(genre)', {'comedy': 0, 'action': 1, 'sports': 2})])

    Args:
        columns: The columns to separately encode.
        max_categories: The maximum number of features to create for each column.
            If a value isn't specified for a column, then a feature is created
            for every unique category in that column.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`OneHotEncoder`
            If you're encoding individual categories instead of lists of
            categories, use :class:`OneHotEncoder`.

        :class:`OrdinalEncoder`
            If your categories are ordered, you may want to use
            :class:`OrdinalEncoder`.

    [1]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html
    Nry   r   rz   r   c                   r{   r   r|   r}   r$   r&   r'   r!     r~   zMultiHotEncoder.__init__r(   r   r)   c                    r*   )Nc                    r   )NTr   r   r-   r.   r&   r'   r/     r   z&MultiHotEncoder._fit.<locals>.<lambda>c                 S   r0   r1   r&   r3   r&   r&   r'   r/     r5   c                 S   r0   r6   r&   r3   r&   r&   r'   r/     r5   r8   r=   rA   r&   r.   r'   rB     r   zMultiHotEncoder._fitrP   c                    s^   t |g jR   dtdtf fdd}t j jD ]\}}|| t||d||< q|S )NrC   rR   c                   sR   t | tjr|  } nt | ts| g} jd| d }t|   fdd|D S )Nr7   r2   c                    s   g | ]}  |d qS )r   rE   rG   counterr&   r'   rL         zJMultiHotEncoder._transform_pandas.<locals>.encode_list.<locals>.<listcomp>)r   r   r   r   rt   rM   r   )rC   rR   r   rX   r   r'   encode_list  s   

z6MultiHotEncoder._transform_pandas.<locals>.encode_list)rR   )rZ   r   rt   rr   r   r   rU   r   )r#   rP   r   r   r   r&   rX   r'   r^     s
   	z!MultiHotEncoder._transform_pandasc                 C   r_   r   r   rX   r&   r&   r'   rb     rc   z(MultiHotEncoder._get_serializable_fieldsrd   r   c                 C   re   r   r   rf   r&   r&   r'   rg     rh   z(MultiHotEncoder._set_serializable_fieldsc                 C   s&   | j j d| jd| jd| j dS r   r   rX   r&   r&   r'   rm   	  rn   zMultiHotEncoder.__repr__)rl   ro   rp   rq   r	   rr   r
   r   rv   r!   r   rB   r[   ru   r^   r   rb   rg   rm   rw   r&   r&   r$   r'   r   t  s     T
r   z"io.ray.preprocessors.label_encoderc                       s   e Zd ZdZdddedee f fddZdd	d
efddZde	j
fddZd!ddZde	j
fddZd
ee fddZd
ee fddZd
eeef fddZdeeef defddZdd  Z  ZS )"LabelEncodera
  Encode labels as integer targets.

    :class:`LabelEncoder` encodes labels as integer targets that range from
    :math:`0` to :math:`n - 1`, where :math:`n` is the number of unique labels.

    If you transform a label that isn't in the fitted datset, then the label is encoded
    as ``float("nan")``.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({
        ...     "sepal_width": [5.1, 7, 4.9, 6.2],
        ...     "sepal_height": [3.5, 3.2, 3, 3.4],
        ...     "species": ["setosa", "versicolor", "setosa", "virginica"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>>
        >>> from ray.data.preprocessors import LabelEncoder
        >>> encoder = LabelEncoder(label_column="species")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          5.1           3.5        0
        1          7.0           3.2        1
        2          4.9           3.0        0
        3          6.2           3.4        2

        You can also provide the name of the output column that should hold the encoded
        labels if you want to use :class:`LabelEncoder` in append mode.

        >>> encoder = LabelEncoder(label_column="species", output_column="species_encoded")
        >>> encoder.fit_transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height     species  species_encoded
        0          5.1           3.5      setosa                0
        1          7.0           3.2  versicolor                1
        2          4.9           3.0      setosa                0
        3          6.2           3.4   virginica                2

        If you transform a label not present in the original dataset, then the new
        label is encoded as ``float("nan")``.

        >>> df = pd.DataFrame({
        ...     "sepal_width": [4.2],
        ...     "sepal_height": [2.7],
        ...     "species": ["bracteata"]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> encoder.transform(ds).to_pandas()  # doctest: +SKIP
           sepal_width  sepal_height  species
        0          4.2           2.7      NaN

    Args:
        label_column: A column containing labels that you want to encode.
        output_column: The name of the column that will contain the encoded
            labels. If None, the output column will have the same name as the
            input column.

    .. seealso::

        :class:`OrdinalEncoder`
            If you're encoding ordered features, use :class:`OrdinalEncoder` instead of
            :class:`LabelEncoder`.
    Nr   label_columnr   c                   s   t    || _|p|| _d S r   )r    r!   r   r   )r#   r   r   r$   r&   r'   r!   T  s   
zLabelEncoder.__init__r(   r   r)   c                    s2   j j fddt dd dd jgd S )Nc                    s   t  jg| dS N)r(   r   r+   )r,   r   r-   r.   r&   r'   r/   [  s
    z#LabelEncoder._fit.<locals>.<lambda>c                 S   r0   r1   r&   r3   r&   r&   r'   r/   a  r5   c                 S   r0   r6   r&   r3   r&   r&   r'   r/   b  r5   r8   )r>   r?   r@   r   rA   r&   r.   r'   rB   Y  s   zLabelEncoder._fitrP   c                    s:   t | j dtjf fdd}| j || j< |S )NrQ   c                    s    j d| j d }| |S r6   )rM   rR   rU   rV   rX   r&   r'   column_label_encoderj  s   
z<LabelEncoder._transform_pandas.<locals>.column_label_encoder)rZ   r   r[   r\   	transformr   )r#   rP   r   r&   rX   r'   r^   g  s   zLabelEncoder._transform_pandasdsc                 C   sF   |   }|tjjtjjfv rtd|  }|j| jfdt	j
i|S )a/  Inverse transform the given dataset.

        Args:
            ds: Input Dataset that has been fitted and/or transformed.

        Returns:
            ray.data.Dataset: The inverse transformed Dataset.

        Raises:
            PreprocessorNotFittedException: if ``fit`` is not called yet.
        z1`fit` must be called before `inverse_transform`, batch_format)
fit_statusr   	FitStatusPARTIALLY_FITTED
NOT_FITTEDr   _get_transform_configmap_batches_inverse_transform_pandasr   PANDAS)r#   r   r   kwargsr&   r&   r'   inverse_transformq  s    zLabelEncoder.inverse_transformc                    s.   dt jf fdd}| j || j< |S )NrQ   c                    s,   dd  j d j d  D }| |S )Nc                 S   s   i | ]\}}||qS r&   r&   )rH   keyvaluer&   r&   r'   
<dictcomp>  s    zXLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder.<locals>.<dictcomp>r7   r2   )rM   r   itemsrU   )rQ   inverse_valuesrX   r&   r'   column_label_decoder  s   
zDLabelEncoder._inverse_transform_pandas.<locals>.column_label_decoder)r[   r\   r   r   r   )r#   rP   r   r&   rX   r'   r     s   	z&LabelEncoder._inverse_transform_pandasc                 C      | j gS r   )r   rX   r&   r&   r'   get_input_columns     zLabelEncoder.get_input_columnsc                 C   r   r   r   rX   r&   r&   r'   get_output_columns  r   zLabelEncoder.get_output_columnsc                 C   s   | j | jt| dd dS )Nr`   )r   r   r`   )r   r   ra   rX   r&   r&   r'   rb     s   
z%LabelEncoder._get_serializable_fieldsrd   r   c                 C   s$   |d | _ |d | _|d| _d S )Nr   r   r`   )r   r   rF   r`   rf   r&   r&   r'   rg     s   

z%LabelEncoder._set_serializable_fieldsc                 C   s   | j j d| jd| jdS )Nz(label_column=z, output_column=r2   )r%   rl   r   r   rX   r&   r&   r'   rm     s   zLabelEncoder.__repr__)r   r   r)   r   )rl   ro   rp   rq   rr   r
   r!   r   rB   r[   ru   r^   r   r   r	   r   r   r   r   rb   rv   rg   rm   rw   r&   r&   r$   r'   r     s     @

r   z io.ray.preprocessors.categorizerc                	       s   e Zd ZdZ		ddee deeeej	f  deee  f fddZ
dd	d
efddZdejfddZd
eeef fddZdeeef defddZdd Z  ZS )Categorizera^
  Convert columns to ``pd.CategoricalDtype``.

    Use this preprocessor with frameworks that have built-in support for
    ``pd.CategoricalDtype`` like LightGBM.

    .. warning::

        If you don't specify ``dtypes``, fit this preprocessor before splitting
        your dataset into train and test splits. This ensures categories are
        consistent across splits.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import Categorizer
        >>>
        >>> df = pd.DataFrame(
        ... {
        ...     "sex": ["male", "female", "male", "female"],
        ...     "level": ["L4", "L5", "L3", "L4"],
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> categorizer = Categorizer(columns=["sex", "level"])
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5'], ordered=False)]

        :class:`Categorizer` can also be used in append mode by providing the
        name of the output_columns that should hold the categorized values.

        >>> categorizer = Categorizer(columns=["sex", "level"], output_columns=["sex_cat", "level_cat"])
        >>> categorizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
              sex level sex_cat level_cat
        0    male    L4    male        L4
        1  female    L5  female        L5
        2    male    L3    male        L3
        3  female    L4  female        L4

        If you know the categories in advance, you can specify the categories with the
        ``dtypes`` parameter.

        >>> categorizer = Categorizer(
        ...     columns=["sex", "level"],
        ...     dtypes={"level": pd.CategoricalDtype(["L3", "L4", "L5", "L6"], ordered=True)},
        ... )
        >>> categorizer.fit_transform(ds).schema().types  # doctest: +SKIP
        [CategoricalDtype(categories=['female', 'male'], ordered=False), CategoricalDtype(categories=['L3', 'L4', 'L5', 'L6'], ordered=True)]

    Args:
        columns: The columns to convert to ``pd.CategoricalDtype``.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects. If you don't include a column in ``dtypes``, the categories
            are inferred.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Nr   dtypesr   c                    s0   t    |s	i }|| _|| _t||| _d S r   )r    r!   r   r   r   r"   r   )r#   r   r   r   r$   r&   r'   r!     s   

zCategorizer.__init__r(   r   r)   c                    s   fddj D   jjO  _ sS dtttf dtjfdd}jj fddt	t
d	d
|gddd dd  d S )Nc                    s   g | ]	}| j vr|qS r&   )r   rH   r   rX   r&   r'   rL     s    z$Categorizer._fit.<locals>.<listcomp>unique_indicesr)   c                 S   s   t |  S r   )r[   CategoricalDtypekeys)r   r&   r&   r'   callback	  s   z"Categorizer._fit.<locals>.callbackc                    s   t  | dS r   )r,   r-   )columns_to_getr(   r&   r'   r/     s
    z"Categorizer._fit.<locals>.<lambda>Tdrop_na_values)base_fn	callbacksc                 S   r0   r1   r&   r3   r&   r&   r'   r/     r5   c                 S   s   | S r   r&   r3   r&   r&   r'   r/     s    r8   )r   rM   r   r   rr   r[   r   r>   r?   r   r@   )r#   r(   r   r&   )r   r(   r#   r'   rB     s$   
zCategorizer._fitrP   c                 C   s   || j  | j|| j< |S r   )r   astyperM   r   )r#   rP   r&   r&   r'   r^     s   zCategorizer._transform_pandasc                 C   sB   | j | jt| dd t| dr| jrdd | j D dS d dS )Nr`   r   c                 S   s$   i | ]\}}|t |j|jd qS )
categoriesordered)rt   r   r   )rH   r4   r   r&   r&   r'   r   %  s    z8Categorizer._get_serializable_fields.<locals>.<dictcomp>)r   r   r`   r   )r   r   ra   hasattrr   r   rX   r&   r&   r'   rb      s   
	z$Categorizer._get_serializable_fieldsrd   r   c                 C   sJ   | drdd |d  D ni | _|d | _|d | _| d| _d S )Nr   c                 S   s(   i | ]\}}|t j|d  |d dqS )r   r   r   )r[   r   )rH   r4   
dtype_datar&   r&   r'   r   1  s    z8Categorizer._set_serializable_fields.<locals>.<dictcomp>r   r   r`   )rF   r   r   r   r   r`   rf   r&   r&   r'   rg   -  s   



z$Categorizer._set_serializable_fieldsc                 C   ri   )Nrj   z	, dtypes=rk   r2   )r%   rl   r   r   r   rX   r&   r&   r'   rm   @  s   zCategorizer.__repr__)NN)rl   ro   rp   rq   r	   rr   r
   r   r[   r   r!   r   rB   ru   r^   r   rb   rv   rg   rm   rw   r&   r&   r$   r'   r     s     >
r   T)r   rz   r(   r   r   r+   r   rz   c                    s  |d u ri }t  }|D ]}||vrtd| d  dqdtjdtffdddtjdtttt f f fd	d
}| j|dd}fdd D }	|j	d dD ]3}
|

 D ],\}}|D ]%}dd |
 D }||v rytt||| }|	| |  q_qYqS|	S )NzYou set `max_categories` for z, which is not present in .r4   r)   c                    sN   t | rrt   fdd}| |  S | dd } t| jdd S )Nc                    s     |  | S r   )update)rC   r   r&   r'   update_counter`  s   
z\compute_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.update_counterc                 S   s   t | S r   )rN   )rI   r&   r&   r'   r/   h  s    zVcompute_unique_value_indices.<locals>.get_pd_value_counts_per_column.<locals>.<lambda>F)dropna)rT   r   rU   value_countsto_dict)r4   r   )r   r   r'   get_pd_value_counts_per_columnY  s   
zDcompute_unique_value_indices.<locals>.get_pd_value_counts_per_columnrP   c                    sJ   | j  }i } D ]}||v r| | g||< q	td| d| |S )NzColumn 'z2' does not exist in DataFrame, which has columns: )r   r   
ValueError)rP   
df_columnsresultr4   )r   r   r&   r'   get_pd_value_countsk  s   
z9compute_unique_value_indices.<locals>.get_pd_value_countspandas)r   c                    s   i | ]} |t  qS r&   )set)rH   r4   r-   r&   r'   r   y  r   z0compute_unique_value_indices.<locals>.<dictcomp>)
batch_sizec                 S   s   i | ]\}}|d ur||qS r   r&   )rH   kr   r&   r&   r'   r   }  s    )r   r   r[   r\   r   ru   rr   r	   r   iter_batchesr   dictr   most_commonr   r   )r(   r   r+   r   rz   columns_setr   r   value_counts_dsunique_values_by_colbatchr4   countersr   r&   )r   r   r   r+   r'   r,   G  s:   &r,   Fr   r)   c                    s"   dt dtttf f fdd}|S )a  
    Returns a post-processing function that generates an encoding map by
    sorting the unique values produced during aggregation or stats computation.

    Args:
        drop_na_values: If True, NA/null values will be silently dropped from the
            encoding map. If False, raises an error if any NA/null values are present.

    Returns:
        A callable that takes a set of unique values and returns a dictionary
        mapping each value to a unique integer index.
    valuesr)   c                    sB   t dd | D r stddd | D }dd tt|D S )Nc                 s   s    | ]}t |V  qd S r   r   rH   r   r&   r&   r'   	<genexpr>  s    z:unique_post_fn.<locals>.gen_value_index.<locals>.<genexpr>z]Unable to fit column because it contains null values. Consider imputing missing values first.c                 S   s   g | ]}t |s|qS r&   r   r   r&   r&   r'   rL     r   z;unique_post_fn.<locals>.gen_value_index.<locals>.<listcomp>c                 S   s(   i | ]\}}t |ts|nt||qS r&   )r   rt   rN   )rH   ir   r&   r&   r'   r     s    z;unique_post_fn.<locals>.gen_value_index.<locals>.<dictcomp>)anyr   	enumeratesorted)r   non_null_valuesr   r&   r'   gen_value_index  s   
z'unique_post_fn.<locals>.gen_value_index)r	   r   r   rv   )r   r   r&   r   r'   r@     s   r@   rP   c                    s*    fdd|D }|rt d| dd S )Nc                    s"   g | ]} |   j r|qS r&   )isnullr   r   r   rP   r&   r'   rL     s   " z _validate_df.<locals>.<listcomp>zUnable to transform columns zJ because they contain null values. Consider imputing missing values first.)r   )rP   r   null_columnsr&   r   r'   rZ     s   
rZ   seriesc                 C   s4   t dd | D d }tjj| jot|ttj	fS )Nc                 s   s    | ]	}|d ur|V  qd S r   r&   )rH   rC   r&   r&   r'   r     s    z/_is_series_composed_of_lists.<locals>.<genexpr>)
nextr   apitypesis_object_dtyper   r   rt   r   r   )r   first_not_none_elementr&   r&   r'   rT     s   
rT   )F)4loggingcollectionsr   	functoolsr   typingr   r   r   r   r   r	   r
   r   numpyr   r   r[   pandas.api.types"ray.air.util.data_batch_conversionr   ray.data._internal.utilr   ray.data.preprocessorr   r   r   ray.data.preprocessors.utilsr   &ray.data.preprocessors.version_supportr   ray.util.annotationsr   ray.data.datasetr   	getLoggerrl   loggerr   rx   r   r   r   rr   rs   rv   r,   r@   ru   rZ   r\   rT   r&   r&   r&   r'   <module>   sl    (

 %
 1 
 !
 
&C"	