o
    ci                  
   @   s   d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ ed
dG dd deZdededeeeeef f fddZdS )    Counter)Number)DictListOptionalUnionN)is_categorical_dtype)DatasetMean)Preprocessor)	PublicAPIalpha)	stabilityc                   @   s   e Zd ZdZg dZ		ddddee dedeeee	f  d	eee  fd
dZ
dedefddZdejfddZdd Zdd ZdS )SimpleImputera  Replace missing values with imputed values. If the column is missing from a
    batch, it will be filled with the imputed value.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import SimpleImputer
        >>> df = pd.DataFrame({"X": [0, None, 3, 3], "Y": [None, "b", "c", "c"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP
        >>> ds.to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  NaN     b
        2  3.0     c
        3  3.0     c

        The `"mean"` strategy imputes missing values with the mean of non-missing
        values. This strategy doesn't work with categorical data.

        >>> preprocessor = SimpleImputer(columns=["X"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y
        0  0.0  None
        1  2.0     b
        2  3.0     c
        3  3.0     c

        The `"most_frequent"` strategy imputes missing values with the most frequent
        value in each column.

        >>> preprocessor = SimpleImputer(columns=["X", "Y"], strategy="most_frequent")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  c
        1  3.0  b
        2  3.0  c
        3  3.0  c

        The `"constant"` strategy imputes missing values with the value specified by
        `fill_value`.

        >>> preprocessor = SimpleImputer(
        ...     columns=["Y"],
        ...     strategy="constant",
        ...     fill_value="?",
        ... )
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X  Y
        0  0.0  ?
        1  NaN  b
        2  3.0  c
        3  3.0  c

        :class:`SimpleImputer` can also be used in append mode by providing the
        name of the output_columns that should hold the imputed values.

        >>> preprocessor = SimpleImputer(columns=["X"], output_columns=["X_imputed"], strategy="mean")
        >>> preprocessor.fit_transform(ds).to_pandas()  # doctest: +SKIP
             X     Y  X_imputed
        0  0.0  None        0.0
        1  NaN     b        2.0
        2  3.0     c        3.0
        3  3.0     c        3.0

    Args:
        columns: The columns to apply imputation to.
        strategy: How imputed values are chosen.

            * ``"mean"``: The mean of non-missing values. This strategy only works with numeric columns.
            * ``"most_frequent"``: The most common value.
            * ``"constant"``: The value passed to ``fill_value``.

        fill_value: The value to use when ``strategy`` is ``"constant"``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.

    Raises:
        ValueError: if ``strategy`` is not ``"mean"``, ``"most_frequent"``, or
            ``"constant"``.
    )meanmost_frequentconstantr   N)output_columnscolumnsstrategy
fill_valuer   c                C   sb   || _ || _|| _|| jvrtd| d| j |dkr(d| _|d u r(tdt||| _d S )N	Strategy z( is not supported.Supported values are: r   Fz8`fill_value` must be set when using "constant" strategy.)	r   r   r   _valid_strategies
ValueError_is_fittabler   #_derive_and_validate_output_columnsr   )selfr   r   r   r    r   R/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/preprocessors/imputer.py__init__f   s$   

zSimpleImputer.__init__datasetreturnc                 C   sL   | j dkrdd | jD }|j| | _| S | j dkr$t|g| jR  | _| S )Nr   c                 S   s   g | ]}t |qS r   r   .0colr   r   r    
<listcomp>       z&SimpleImputer._fit.<locals>.<listcomp>r   )r   r   	aggregatestats__get_most_frequent_values)r   r"   
aggregatesr   r   r    _fit   s   

zSimpleImputer._fitdfc                 C   s   t | j| jD ]X\}}| |}|d u rtd| d||jvr&|||< qt|j| r8|| j|g||< ||ksLt	|| j
tjrV|| j
jjsV|| jdd||< || j|dd q|S )NzColumn zA has no fill value. Check the data used to fit the SimpleImputer.T)deep)inplace)zipr   r   _get_fill_valuer   r	   dtypescatadd_categories
isinstancevaluesnpndarrayflags	writeablecopyfillna)r   r.   columnoutput_columnvaluer   r   r    _transform_pandas   s$   



zSimpleImputer._transform_pandasc                 C   sZ   | j dkr| jd| d S | j dkr| jd| d S | j dkr$| jS td| j  d)	Nr   zmean()r   most_frequent(r   r   zA is not supported. Supported values are: {self._valid_strategies})r   r*   r   r   )r   r>   r   r   r    r2      s   


zSimpleImputer._get_fill_valuec              
   C   s.   | j j d| jd| jd| jd| jd
S )Nz	(columns=z, strategy=z, fill_value=z, output_columns=rB   )	__class____name__r   r   r   r   )r   r   r   r    __repr__   s   zSimpleImputer.__repr__)r   N)rE   
__module____qualname____doc__r   r   strr   r   r   r!   r
   r   r-   pd	DataFramerA   r2   rF   r   r   r   r    r      s(    S

	 r   r"   r   r#   c                    s   t   dtjdttttf  f fdd}| j|dd}dd  D |jd d	D ]}|	 D ]\}}|D ]
}|  |7  < q6q0q*fd
d D S )Nr.   r#   c                    s    fddD S )Nc                    s$   i | ]}|t  |   gqS r   )r   value_countsto_dictr$   r.   r   r    
<dictcomp>   s   $ zJ_get_most_frequent_values.<locals>.get_pd_value_counts.<locals>.<dictcomp>r   rO   )r   rO   r    get_pd_value_counts   s   z6_get_most_frequent_values.<locals>.get_pd_value_countspandas)batch_formatc                 S   s   i | ]}|t  qS r   r   r$   r   r   r    rP      r(   z-_get_most_frequent_values.<locals>.<dictcomp>)
batch_sizec                    s,   i | ]}d | d |  dd d qS )rC   rB      r   )most_common)r%   r>   )final_countersr   r    rP      s    )
listrK   rL   r   r   rJ   r   map_batchesiter_batchesitems)r"   r   rQ   rM   batchr&   counterscounterr   )r   rW   r    r+      s   $
r+   )collectionsr   numbersr   typingr   r   r   r   numpyr8   rR   rK   pandas.api.typesr	   ray.datar
   ray.data.aggregater   ray.data.preprocessorr   ray.util.annotationsr   r   rJ   r+   r   r   r   r    <module>   s(     3