o
    ci2?                     @   s   d dl mZmZmZmZmZmZ d dlZd dl	Z
d dlmZ d dlmZmZ d dlmZ d dlmZ G dd deZed	d
G dd deZed	d
G dd deZdededededee f
ddZdS )    )DictIterableListOptionalTypeUnionN)Dataset)MaxMin)Preprocessor)	PublicAPIc                   @   s0   e Zd ZdZdejfddZdd Zdd Zd	S )
_AbstractKBinsDiscretizerzAbstract base class for all KBinsDiscretizers.

    Essentially a thin wraper around ``pd.cut``.

    Expects either ``self.stats_`` or ``self.bins`` to be set and
    contain {column:list_of_bin_intervals}.
    dfc                    s<   dt jdt jf fdd}|j|dd}| j | j< |S )Nsreturnc              
      s   | j  jvr| S  jr j| j nd}d}|r)t|tjr'|j}t|j	}nd} j
r/ jn j}tj| t|tr?|| j  n| j||d j jdS )NFT)rightlabelsorderedretbinsinclude_lowest
duplicates)namecolumnsdtypesget
isinstancepdCategoricalDtyper   list
categories_is_fittablestats_binscutdictr   r   r   )r   r   r   r"   self V/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/preprocessors/discretizer.py
bin_values   s(   z?_AbstractKBinsDiscretizer._transform_pandas.<locals>.bin_valuesr   )axis)r   Seriesapplyr   output_columns)r&   r   r)   	binned_dfr'   r%   r(   _transform_pandas   s   z+_AbstractKBinsDiscretizer._transform_pandasc                    s4   t  jtrt fdd jD stdd S d S )Nc                 3   s    | ]}| j v V  qd S N)r"   ).0colr%   r'   r(   	<genexpr>3   s    

zC_AbstractKBinsDiscretizer._validate_bins_columns.<locals>.<genexpr>zKIf `bins` is a dictionary, all elements of `columns` must be present in it.)r   r"   r$   allr   
ValueErrorr%   r'   r%   r(   _validate_bins_columns2   s   z0_AbstractKBinsDiscretizer._validate_bins_columnsc                 C   s0   d dd t|  D }| jj d| dS )Nz, c                 S   s(   g | ]\}}| d s| d|qS )_=)
startswith)r1   	attr_name
attr_valuer'   r'   r(   
<listcomp>=   s    z6_AbstractKBinsDiscretizer.__repr__.<locals>.<listcomp>())joinvarsitems	__class____name__)r&   attr_strr'   r'   r(   __repr__;   s   
z"_AbstractKBinsDiscretizer.__repr__N)	rC   
__module____qualname____doc__r   	DataFramer/   r6   rE   r'   r'   r'   r(   r      s
    	r   alpha)	stabilityc                   @   s   e Zd ZdZdddddddee deee e	j
eeeee e	j
f f f d	ed
ededeeeee	jeej f f  deee  fddZdZdS )CustomKBinsDiscretizerak  Bin values into discrete intervals using custom bin edges.

    Columns must contain numerical values.

    Examples:
        Use :class:`CustomKBinsDiscretizer` to bin continuous features.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import CustomKBinsDiscretizer
        >>> df = pd.DataFrame({
        ...     "value_1": [0.2, 1.4, 2.5, 6.2, 9.7, 2.1],
        ...     "value_2": [10, 15, 13, 12, 23, 25],
        ... })
        >>> ds = ray.data.from_pandas(df)
        >>> discretizer = CustomKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins=[0, 1, 4, 10, 25]
        ... )
        >>> discretizer.transform(ds).to_pandas()
           value_1  value_2
        0        0        2
        1        1        3
        2        1        3
        3        2        3
        4        2        3
        5        1        3

        :class:`CustomKBinsDiscretizer` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> discretizer = CustomKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins=[0, 1, 4, 10, 25],
        ...     output_columns=["value_1_discretized", "value_2_discretized"]
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
           value_1  value_2  value_1_discretized  value_2_discretized
        0      0.2       10                    0                    2
        1      1.4       15                    1                    3
        2      2.5       13                    1                    3
        3      6.2       12                    2                    3
        4      9.7       23                    2                    3
        5      2.1       25                    1                    3

        You can also specify different bin edges per column.

        >>> discretizer = CustomKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins={"value_1": [0, 1, 4], "value_2": [0, 18, 35, 70]},
        ... )
        >>> discretizer.transform(ds).to_pandas()
           value_1  value_2
        0      0.0        0
        1      1.0        0
        2      1.0        0
        3      NaN        0
        4      NaN        1
        5      1.0        1


    Args:
        columns: The columns to discretize.
        bins: Defines custom bin edges. Can be an iterable of numbers,
            a ``pd.IntervalIndex``, or a dict mapping columns to either of them.
            Note that ``pd.IntervalIndex`` for bins must be non-overlapping.
        right: Indicates whether bins include the rightmost edge.
        include_lowest: Indicates whether the first interval should be left-inclusive.
        duplicates: Can be either 'raise' or 'drop'. If bin edges are not unique,
            raise ``ValueError`` or drop non-uniques.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects or ``np.integer`` types. If you don't include a column in ``dtypes``
            or specify it as an integer dtype, the outputted column will consist of
            ordered integers corresponding to bins. If you use a
            ``pd.CategoricalDtype``, the outputted column will be a
            ``pd.CategoricalDtype`` with the categories being mapped to bins.
            You can use ``pd.CategoricalDtype(categories, ordered=True)`` to
            preserve information about bin order.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`UniformKBinsDiscretizer`
            If you want to bin data into uniform width bins.
    TFraiseNr   r   r   r   r-   r   r"   r   r   r   r   r-   c                C   s>   || _ || _|| _|| _|| _|| _t||| _| 	  d S r0   )
r   r"   r   r   r   r   r   #_derive_and_validate_output_columnsr-   r6   r&   r   r"   r   r   r   r   r-   r'   r'   r(   __init__   s   zCustomKBinsDiscretizer.__init__)rC   rF   rG   rH   r   strr   r   floatr   IntervalIndexr   boolr   r   r   npintegerrQ   r    r'   r'   r'   r(   rL   F   s<    b	


rL   c                   @   s   e Zd ZdZdddddddee deeeeef f d	e	d
e	dede
eeeejeej f f  de
ee  fddZdedefddZdd ZdefddZdS )UniformKBinsDiscretizerar  Bin values into discrete intervals (bins) of uniform width.

    Columns must contain numerical values.

    Examples:
        Use :class:`UniformKBinsDiscretizer` to bin continuous features.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import UniformKBinsDiscretizer
        >>> df = pd.DataFrame({
        ...     "value_1": [0.2, 1.4, 2.5, 6.2, 9.7, 2.1],
        ...     "value_2": [10, 15, 13, 12, 23, 25],
        ... })
        >>> ds = ray.data.from_pandas(df)
        >>> discretizer = UniformKBinsDiscretizer(
        ...     columns=["value_1", "value_2"], bins=4
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()
           value_1  value_2
        0        0        0
        1        0        1
        2        0        0
        3        2        0
        4        3        3
        5        0        3

        :class:`UniformKBinsDiscretizer` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> discretizer = UniformKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins=4,
        ...     output_columns=["value_1_discretized", "value_2_discretized"]
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
           value_1  value_2  value_1_discretized  value_2_discretized
        0      0.2       10                    0                    0
        1      1.4       15                    0                    1
        2      2.5       13                    0                    0
        3      6.2       12                    2                    0
        4      9.7       23                    3                    3
        5      2.1       25                    0                    3

        You can also specify different number of bins per column.

        >>> discretizer = UniformKBinsDiscretizer(
        ...     columns=["value_1", "value_2"], bins={"value_1": 4, "value_2": 3}
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()
           value_1  value_2
        0        0        0
        1        0        0
        2        0        0
        3        2        0
        4        3        2
        5        0        2


    Args:
        columns: The columns to discretize.
        bins: Defines the number of equal-width bins.
            Can be either an integer (which will be applied to all columns),
            or a dict that maps columns to integers.
            The range is extended by .1% on each side to include
            the minimum and maximum values.
        right: Indicates whether bins includes the rightmost edge or not.
        include_lowest: Whether the first interval should be left-inclusive
            or not.
        duplicates: Can be either 'raise' or 'drop'. If bin edges are not unique,
            raise ``ValueError`` or drop non-uniques.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects or ``np.integer`` types. If you don't include a column in ``dtypes``
            or specify it as an integer dtype, the outputted column will consist of
            ordered integers corresponding to bins. If you use a
            ``pd.CategoricalDtype``, the outputted column will be a
            ``pd.CategoricalDtype`` with the categories being mapped to bins.
            You can use ``pd.CategoricalDtype(categories, ordered=True)`` to
            preserve information about bin order.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`CustomKBinsDiscretizer`
            If you want to specify your own bin edges.
    TFrM   NrN   r   r"   r   r   r   r   r-   c                C   s6   || _ || _|| _|| _|| _|| _t||| _d S r0   )	r   r"   r   r   r   r   r   rO   r-   rP   r'   r'   r(   rQ     s   
z UniformKBinsDiscretizer.__init__datasetr   c                 C   s   |    i }g }t| jtr| j }n| j}|D ]
}|| | q|j| }i }i }|	 D ]\}	}
|	dd }|	
drD|
||< |	
drM|
||< q1| D ]}t| jtr_| j| n| j}t|| || || j||< qR|| _| S )N   minmax)_validate_on_fitr   r"   r$   keysr   extend._fit_uniform_covert_bin_to_aggregate_if_needed	aggregaterA   r9   ._translate_min_max_number_of_bins_to_bin_edgesr   r!   )r&   rY   stats
aggregatesr   columnaggregate_statsminsmaxeskeyvaluecolumn_namer"   r'   r'   r(   _fit4  s6   



zUniformKBinsDiscretizer._fitc                 C   s   |    d S r0   )r6   r%   r'   r'   r(   r^   U  s   z(UniformKBinsDiscretizer._validate_on_fitrf   c                 C   sD   t | jtr| j| n| j}t |trt|t|fS td| )Nz5`bins` must be an integer or a dict of integers, got )r   r"   r$   intr
   r	   	TypeError)r&   rf   r"   r'   r'   r(   ra   X  s   
zFUniformKBinsDiscretizer._fit_uniform_covert_bin_to_aggregate_if_needed)rC   rF   rG   rH   r   rR   r   rn   r   rU   r   r   r   r   rV   rW   rQ   r   r   rm   r^   ra   r'   r'   r'   r(   rX      s6    _

!rX   mnmxr"   r   r   c                 C   s   | |f}dd |D \} }t | st |rtd| |krH| | dkr*dt|  nd8 } ||dkr8dt| nd7 }t j| ||d dd}|S t j| ||d dd}||  d }|re|d  |8  < |S |d	  |7  < |S )
zETranslates a range and desired number of bins into list of bin edges.c                 s   s    | ]}|d  V  qdS )g        Nr'   )r1   mir'   r'   r(   r3     s    zA_translate_min_max_number_of_bins_to_bin_edges.<locals>.<genexpr>z@Cannot specify integer `bins` when input data contains infinity.r   gMbP?   T)endpointr[   )rV   isinfr5   abslinspace)rp   rq   r"   r   rngadjr'   r'   r(   rc     s$   rc   )typingr   r   r   r   r   r   numpyrV   pandasr   ray.datar   ray.data.aggregater	   r
   ray.data.preprocessorr   ray.util.annotationsr   r   rL   rX   rS   rn   rU   rc   r'   r'   r'   r(   <module>   s0     :z D