o
    `۷i@                     @   s   d dl mZmZmZmZmZmZmZ d dlZ	d dl
Zd dlmZmZ d dlmZ d dlmZ er6d dlmZ G dd deZed	d
G dd deZed	d
G dd deZdedeeef defddZdededededee f
ddZdS )    )TYPE_CHECKINGDictIterableListOptionalTypeUnionN)MaxMin)Preprocessor)	PublicAPI)Datasetc                   @   s0   e Zd ZdZdejfddZdd Zdd Zd	S )
_AbstractKBinsDiscretizerzAbstract base class for all KBinsDiscretizers.

    Essentially a thin wraper around ``pd.cut``.

    Expects either ``self.stats_`` or ``self.bins`` to be set and
    contain {column:list_of_bin_intervals}.
    dfc                    s<   dt jdt jf fdd}|j|dd}| j | j< |S )Nsreturnc              
      s   | j  jvr| S  jr j| j nd}d}|r)t|tjr'|j}t|j	}nd} j
r/ jn j}tj| t|tr?|| j  n| j||d j jdS )NFT)rightlabelsorderedretbinsinclude_lowest
duplicates)namecolumnsdtypesget
isinstancepdCategoricalDtyper   list
categories_is_fittablestats_binscutdictr   r   r   )r   r   r   r#   self X/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/data/preprocessors/discretizer.py
bin_values   s(   z?_AbstractKBinsDiscretizer._transform_pandas.<locals>.bin_valuesr   )axis)r   Seriesapplyr   output_columns)r'   r   r*   	binned_dfr(   r&   r)   _transform_pandas   s   z+_AbstractKBinsDiscretizer._transform_pandasc                    s4   t  jtrt fdd jD stdd S d S )Nc                 3   s    | ]}| j v V  qd S N)r#   ).0colr&   r(   r)   	<genexpr>5   s    

zC_AbstractKBinsDiscretizer._validate_bins_columns.<locals>.<genexpr>zKIf `bins` is a dictionary, all elements of `columns` must be present in it.)r   r#   r%   allr   
ValueErrorr&   r(   r&   r)   _validate_bins_columns4   s   z0_AbstractKBinsDiscretizer._validate_bins_columnsc                 C   s0   d dd t|  D }| jj d| dS )Nz, c                 S   s(   g | ]\}}| d s| d|qS )_=)
startswith)r2   	attr_name
attr_valuer(   r(   r)   
<listcomp>?   s    z6_AbstractKBinsDiscretizer.__repr__.<locals>.<listcomp>())joinvarsitems	__class____name__)r'   attr_strr(   r(   r)   __repr__=   s   
z"_AbstractKBinsDiscretizer.__repr__N)	rD   
__module____qualname____doc__r   	DataFramer0   r7   rF   r(   r(   r(   r)   r      s
    	r   alpha)	stabilityc                   @   s   e Zd ZdZdddddddee deee e	j
eeeee e	j
f f f d	ed
ededeeeee	jeej f f  deee  fddZdZdS )CustomKBinsDiscretizerak  Bin values into discrete intervals using custom bin edges.

    Columns must contain numerical values.

    Examples:
        Use :class:`CustomKBinsDiscretizer` to bin continuous features.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import CustomKBinsDiscretizer
        >>> df = pd.DataFrame({
        ...     "value_1": [0.2, 1.4, 2.5, 6.2, 9.7, 2.1],
        ...     "value_2": [10, 15, 13, 12, 23, 25],
        ... })
        >>> ds = ray.data.from_pandas(df)
        >>> discretizer = CustomKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins=[0, 1, 4, 10, 25]
        ... )
        >>> discretizer.transform(ds).to_pandas()
           value_1  value_2
        0        0        2
        1        1        3
        2        1        3
        3        2        3
        4        2        3
        5        1        3

        :class:`CustomKBinsDiscretizer` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> discretizer = CustomKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins=[0, 1, 4, 10, 25],
        ...     output_columns=["value_1_discretized", "value_2_discretized"]
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
           value_1  value_2  value_1_discretized  value_2_discretized
        0      0.2       10                    0                    2
        1      1.4       15                    1                    3
        2      2.5       13                    1                    3
        3      6.2       12                    2                    3
        4      9.7       23                    2                    3
        5      2.1       25                    1                    3

        You can also specify different bin edges per column.

        >>> discretizer = CustomKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins={"value_1": [0, 1, 4], "value_2": [0, 18, 35, 70]},
        ... )
        >>> discretizer.transform(ds).to_pandas()
           value_1  value_2
        0      0.0        0
        1      1.0        0
        2      1.0        0
        3      NaN        0
        4      NaN        1
        5      1.0        1


    Args:
        columns: The columns to discretize.
        bins: Defines custom bin edges. Can be an iterable of numbers,
            a ``pd.IntervalIndex``, or a dict mapping columns to either of them.
            Note that ``pd.IntervalIndex`` for bins must be non-overlapping.
        right: Indicates whether bins include the rightmost edge.
        include_lowest: Indicates whether the first interval should be left-inclusive.
        duplicates: Can be either 'raise' or 'drop'. If bin edges are not unique,
            raise ``ValueError`` or drop non-uniques.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects or ``np.integer`` types. If you don't include a column in ``dtypes``
            or specify it as an integer dtype, the outputted column will consist of
            ordered integers corresponding to bins. If you use a
            ``pd.CategoricalDtype``, the outputted column will be a
            ``pd.CategoricalDtype`` with the categories being mapped to bins.
            You can use ``pd.CategoricalDtype(categories, ordered=True)`` to
            preserve information about bin order.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`UniformKBinsDiscretizer`
            If you want to bin data into uniform width bins.
    TFraiseNr   r   r   r   r.   r   r#   r   r   r   r   r.   c                C   s>   || _ || _|| _|| _|| _|| _t||| _| 	  d S r1   )
r   r#   r   r   r   r   r   #_derive_and_validate_output_columnsr.   r7   r'   r   r#   r   r   r   r   r.   r(   r(   r)   __init__   s   zCustomKBinsDiscretizer.__init__)rD   rG   rH   rI   r   strr   r   floatr   IntervalIndexr   boolr   r   r   npintegerrR   r!   r(   r(   r(   r)   rM   H   s<    b	


rM   c                       s   e Zd ZdZdddddddee deeeeef f d	e	d
e	dede
eeeejeej f f  de
ee  f fddZdddefddZdd ZdddZ  ZS )UniformKBinsDiscretizerar  Bin values into discrete intervals (bins) of uniform width.

    Columns must contain numerical values.

    Examples:
        Use :class:`UniformKBinsDiscretizer` to bin continuous features.

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import UniformKBinsDiscretizer
        >>> df = pd.DataFrame({
        ...     "value_1": [0.2, 1.4, 2.5, 6.2, 9.7, 2.1],
        ...     "value_2": [10, 15, 13, 12, 23, 25],
        ... })
        >>> ds = ray.data.from_pandas(df)
        >>> discretizer = UniformKBinsDiscretizer(
        ...     columns=["value_1", "value_2"], bins=4
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()
           value_1  value_2
        0        0        0
        1        0        1
        2        0        0
        3        2        0
        4        3        3
        5        0        3

        :class:`UniformKBinsDiscretizer` can also be used in append mode by providing the
        name of the output_columns that should hold the encoded values.

        >>> discretizer = UniformKBinsDiscretizer(
        ...     columns=["value_1", "value_2"],
        ...     bins=4,
        ...     output_columns=["value_1_discretized", "value_2_discretized"]
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()  # doctest: +SKIP
           value_1  value_2  value_1_discretized  value_2_discretized
        0      0.2       10                    0                    0
        1      1.4       15                    0                    1
        2      2.5       13                    0                    0
        3      6.2       12                    2                    0
        4      9.7       23                    3                    3
        5      2.1       25                    0                    3

        You can also specify different number of bins per column.

        >>> discretizer = UniformKBinsDiscretizer(
        ...     columns=["value_1", "value_2"], bins={"value_1": 4, "value_2": 3}
        ... )
        >>> discretizer.fit_transform(ds).to_pandas()
           value_1  value_2
        0        0        0
        1        0        0
        2        0        0
        3        2        0
        4        3        2
        5        0        2


    Args:
        columns: The columns to discretize.
        bins: Defines the number of equal-width bins.
            Can be either an integer (which will be applied to all columns),
            or a dict that maps columns to integers.
            The range is extended by .1% on each side to include
            the minimum and maximum values.
        right: Indicates whether bins includes the rightmost edge or not.
        include_lowest: Whether the first interval should be left-inclusive
            or not.
        duplicates: Can be either 'raise' or 'drop'. If bin edges are not unique,
            raise ``ValueError`` or drop non-uniques.
        dtypes: An optional dictionary that maps columns to ``pd.CategoricalDtype``
            objects or ``np.integer`` types. If you don't include a column in ``dtypes``
            or specify it as an integer dtype, the outputted column will consist of
            ordered integers corresponding to bins. If you use a
            ``pd.CategoricalDtype``, the outputted column will be a
            ``pd.CategoricalDtype`` with the categories being mapped to bins.
            You can use ``pd.CategoricalDtype(categories, ordered=True)`` to
            preserve information about bin order.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    .. seealso::

        :class:`CustomKBinsDiscretizer`
            If you want to specify your own bin edges.
    TFrN   NrO   r   r#   r   r   r   r   r.   c                   s@   t    || _|| _|| _|| _|| _|| _t	||| _
d S r1   )superrR   r   r#   r   r   r   r   r   rP   r.   rQ   rC   r(   r)   rR     s   

z UniformKBinsDiscretizer.__init__datasetr   r   c                 C   s   |    t| jtr| j }n| j}|D ]}t| jtr"| j| n| j}t|ts1td| q| jj	t
|d | jj	t|d | S )Nz5`bins` must be an integer or a dict of integers, got )aggregator_fnr   )_validate_on_fitr   r#   r%   keysr   int	TypeErrorstat_computation_planadd_aggregatorr
   r	   )r'   r\   r   columnr#   r(   r(   r)   _fit7  s(   
zUniformKBinsDiscretizer._fitc                 C   s   |    d S r1   )r7   r&   r(   r(   r)   r^   Q  s   z(UniformKBinsDiscretizer._validate_on_fitc                 C   s"   | j |}t|| j| j| _| S r1   )rb   computepost_fit_processorr#   r   r"   )r'   r\   statsr(   r(   r)   _fit_executeT  s   z$UniformKBinsDiscretizer._fit_execute)r\   r   )rD   rG   rH   rI   r   rS   r   r`   r   rV   r   r   r   r   rW   rX   rR   r   re   r^   ri   __classcell__r(   r(   r[   r)   rY      s6    _
rY   aggregate_statsr#   r   c           
      C   s   i i i }}}|   D ]\}}|dd }|dr|||< |dr(|||< q| D ]}	t||	 ||	 t|tr?||	 n||d||	< q-|S )N   minmax)mnmxr#   r   )rB   r:   r_   ._translate_min_max_number_of_bins_to_bin_edgesr   r%   )
rk   r#   r   minsmaxesrh   keyvaluecolumn_namerd   r(   r(   r)   rg   Z  s    

rg   rp   rq   r   c                 C   s   | |f}dd |D \} }t | st |rtd| |krH| | dkr*dt|  nd8 } ||dkr8dt| nd7 }t j| ||d dd}|S t j| ||d dd}||  d }|re|d  |8  < |S |d	  |7  < |S )
zETranslates a range and desired number of bins into list of bin edges.c                 s   s    | ]}|d  V  qdS )g        Nr(   )r2   mir(   r(   r)   r4     s    zA_translate_min_max_number_of_bins_to_bin_edges.<locals>.<genexpr>z@Cannot specify integer `bins` when input data contains infinity.r   gMbP?   T)endpointrm   )rW   isinfr6   abslinspace)rp   rq   r#   r   rngadjr(   r(   r)   rr     s$   rr   )typingr   r   r   r   r   r   r   numpyrW   pandasr   ray.data.aggregater	   r
   ray.data.preprocessorr   ray.util.annotationsr   ray.data.datasetr   r   rM   rY   r%   rS   rV   rg   rT   r`   rr   r(   r(   r(   r)   <module>   s4   $ :z 7