o
    `۷i#                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ erPd d
lmZ e eZ eddedddG dd deZ!ddde
e" dee"ge"f de	e"ee"ef f fddZ#dS )    NCounter)Number)TYPE_CHECKINGAnyCallableDictListOptionalUnion)is_categorical_dtype)Mean)SerializablePreprocessorBase)SerializablePreprocessor)	PublicAPI)Datasetalpha)	stability   z#io.ray.preprocessors.simple_imputer)version
identifierc                       s   e Zd ZdZg dZ		ddddee dedeeee	f  d	eee  f fd
dZ
dddefddZdejfddZdd Zdd Zdeeef fddZdeeef defddZ  ZS )SimpleImputera  Replace missing values with imputed values. If the column is missing from a
    batch, it will be filled with the imputed value.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import SimpleImputer
        >>> df = pd.DataFrame({"X": [0, None, 3, 3], "Y": [None, "b", "c", "c"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  NaN     b
        2  3.0     c
        3  3.0     c

        The `"mean"` strategy imputes missing values with the mean of non-missing
        values. This strategy doesn't work with categorical data.

        >>> preprocessor = SimpleImputer(columns=["X"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  2.0     b
        2  3.0     c
        3  3.0     c

        The `"most_frequent"` strategy imputes missing values with the most frequent
        value in each column.

        >>> preprocessor = SimpleImputer(columns=["X", "Y"], strategy="most_frequent")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  c
        1  3.0  b
        2  3.0  c
        3  3.0  c

        The `"constant"` strategy imputes missing values with the value specified by
        `fill_value`.

        >>> preprocessor = SimpleImputer(
        ...     columns=["Y"],
        ...     strategy="constant",
        ...     fill_value="?",
        ... )
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  ?
        1  NaN  b
        2  3.0  c
        3  3.0  c

        :class:`SimpleImputer` can also be used in append mode by providing the
        name of the output_columns that should hold the imputed values.

        >>> preprocessor = SimpleImputer(columns=["X"], output_columns=["X_imputed"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y  X_imputed
        0  0.0  None        0.0
        1  NaN     b        2.0
        2  3.0     c        3.0
        3  3.0     c        3.0

    Args:
        columns: The columns to apply imputation to.
        strategy: How imputed values are chosen.

            * ``"mean"``: The mean of non-missing values. This strategy only works with numeric columns.
            * ``"most_frequent"``: The most common value.
            * ``"constant"``: The value passed to ``fill_value``.

        fill_value: The value to use when ``strategy`` is ``"constant"``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Raises:
        ValueError: if ``strategy`` is not ``"mean"``, ``"most_frequent"``, or
            ``"constant"``.
    )meanmost_frequentconstantr   N)output_columnscolumnsstrategy
fill_valuer   c                   sl   t    || _|| _|| _|| jvrtd| d| j |dkr-d| _|d u r-tdt	||| _
d S )N	Strategy z( is not supported.Supported values are: r   Fz8`fill_value` must be set when using "constant" strategy.)super__init__r   r   r   _valid_strategies
ValueError_is_fittabler   #_derive_and_validate_output_columnsr   )selfr   r   r   r   	__class__ T/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/data/preprocessors/imputer.pyr!   p   s(   

zSimpleImputer.__init__datasetr   returnc                    sP   j dkrjjtjd S j dkr&jj fdddd jd S )Nr   )aggregator_fnr   r   c                    s   t  j| dS )N)r+   r   key_gen)_get_most_frequent_valuesr   )r.   r+   r&   r)   r*   <lambda>   s
    z$SimpleImputer._fit.<locals>.<lambda>c                 S   s   d|  dS )Nmost_frequent()r)   )colr)   r)   r*   r1      s    )stat_fnstat_key_fnr   )r   stat_computation_planadd_aggregatorr   r   add_callable_stat)r&   r+   r)   r0   r*   _fit   s   


zSimpleImputer._fitdfc                 C   s   t | j| jD ]X\}}| |}|d u rtd| d||jvr&|||< qt|j| r8|| j|g||< ||ksLt	|| j
tjrV|| j
jjsV|| jdd||< |j||idd q|S )NzColumn zA has no fill value. Check the data used to fit the SimpleImputer.T)deep)inplace)zipr   r   _get_fill_valuer#   r   dtypescatadd_categories
isinstancevaluesnpndarrayflags	writeablecopyfillna)r&   r;   columnoutput_columnvaluer)   r)   r*   _transform_pandas   s$   



zSimpleImputer._transform_pandasc                 C   sZ   | j dkr| jd| d S | j dkr| jd| d S | j dkr$| jS td| j  d)	Nr   zmean(r3   r   r2   r   r   zA is not supported. Supported values are: {self._valid_strategies})r   stats_r   r#   )r&   rK   r)   r)   r*   r?      s   


zSimpleImputer._get_fill_valuec              
   C   s.   | j j d| jd| jd| jd| jd
S )Nz	(columns=z, strategy=z, fill_value=z, output_columns=r3   )r(   __name__r   r   r   r   r&   r)   r)   r*   __repr__   s   zSimpleImputer.__repr__c                 C   s&   | j | jt| dd | jt| dd dS )N_fittedr   )r   r   rS   r   r   )r   r   getattrr   rQ   r)   r)   r*   _get_serializable_fields   s   

z&SimpleImputer._get_serializable_fieldsfieldsr   c                 C   sN   |d | _ |d | _|d | _|d| _|d| _| jdkr%d| _d S d S )Nr   r   r   rS   r   r   F)r   r   r   getrS   r   r$   )r&   rV   r   r)   r)   r*   _set_serializable_fields   s   




z&SimpleImputer._set_serializable_fields)r   N)rP   
__module____qualname____doc__r"   r	   strr
   r   r   r!   r   r:   pd	DataFramerN   r?   rR   r   r   rU   intrX   __classcell__r)   r)   r'   r*   r      s,    S
! "	r   r+   r   r   r.   r,   c           	         s   dt jdtttt f f fdd}| j|dd}dd  D |jd d	D ]}| D ]\}}|D ]
}|  |7  < q2q,q&fd
d D S )Nr;   r,   c                    s    fddD S )Nc                    s$   i | ]}|t  |   gqS r)   )r   value_countsto_dict.0r4   r;   r)   r*   
<dictcomp>   s   $ zJ_get_most_frequent_values.<locals>.get_pd_value_counts.<locals>.<dictcomp>r)   re   )r   re   r*   get_pd_value_counts   s   z6_get_most_frequent_values.<locals>.get_pd_value_countspandas)batch_formatc                 S   s   i | ]}|t  qS r)   r   rc   r)   r)   r*   rf      s    z-_get_most_frequent_values.<locals>.<dictcomp>)
batch_sizec                    s(   i | ]}| |  d d d qS )r   r   )most_common)rd   rK   )final_countersr.   r)   r*   rf      s    )	r]   r^   r   r\   r	   r   map_batchesiter_batchesitems)	r+   r   r.   rg   ra   batchr4   counterscounterr)   )r   rl   r.   r*   r/      s   $r/   )$loggingcollectionsr   numbersr   typingr   r   r   r   r	   r
   r   numpyrE   rh   r]   pandas.api.typesr   ray.data.aggregater   ray.data.preprocessorr   &ray.data.preprocessors.version_supportr   Serializableray.util.annotationsr   ray.data.datasetr   	getLoggerrP   loggerr   r\   r/   r)   r)   r)   r*   <module>   s6    $

 T