o
    bis`                     @   s  d dl mZ d dlmZmZmZmZmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZmZmZmZmZ d dl m!Z! d d	l"m#Z# d d
l$m%Z% dZ&dZ'G dd dZ(deedgef dedee) de	e) dededee fddZ*e(Z+dS )    )partial)	AnyCallableDictIterableIteratorListOptionalTupleUnion)ComputeStrategy)LogicalPlan)	Aggregate)AggregateFnCountMaxMeanMinStdSum)BlockBlockAccessorCallableClass	DataBatchUserDefinedFunction)ShuffleStrategy)Dataset)	PublicAPIz!Computations or Descriptive StatszFunction Applicationc                !   @   sj  e Zd ZdZdedeeeee f  dee	 fddZ
defdd	Zeed
dedefddZdedeeee f fddZeed
ddddddddddddddeeef dedeeef dee deee  deeeef  deee  deeeef  dee dee d ee d!eee	ee	e	f f  d"eeg eeef f  dd#fd$d%Zeed
defd&d'Zeed
	(d6deeee f d)edefd*d+Zeed
	(d6deeee f d)edefd,d-Z eed
	(d6deeee f d)edefd.d/Z!eed
	(d6deeee f d)edefd0d1Z"eed
		2	(d7deeee f d3e	d)edefd4d5Z#dS )8GroupedDatazRepresents a grouped dataset created by calling ``Dataset.groupby()``.

    The actual groupby is deferred until an aggregation is applied.
    datasetkeynum_partitionsc                C   s   || _ || _|| _dS )zConstruct a dataset grouped by key (internal API).

        The constructor is not part of the GroupedData API.
        Use the ``Dataset.groupby()`` method to construct one.
        N)_dataset_key_num_partitions)selfr   r    r!    r&   I/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/grouped_data.py__init__   s   
zGroupedData.__init__returnc                 C   s   | j j d| j d| jdS )Nz	(dataset=z, key=))	__class____name__r"   r#   r%   r&   r&   r'   __repr__-   s   zGroupedData.__repr__)	api_groupaggsc                 G   s>   | j j }t| j jj| j|| jd}t|| j j	}t
||S )a  Implements an accumulator-based aggregation.

        Args:
            aggs: Aggregations to do.

        Returns:
            The output is an dataset of ``n + 1`` columns where the first column
            is the groupby key and the second through ``n + 1`` columns are the
            results of the aggregations.
            If groupby key is ``None`` then the key part of return is omitted.
        )r    r0   r!   )r"   _plancopyr   _logical_plandagr#   r$   r   contextr   )r%   r0   planoplogical_planr&   r&   r'   	aggregate2   s   zGroupedData.aggregateagg_clsonc                 O   s,   | j j||g|R d| ji|}| j| S )aV  Helper for aggregating on a particular subset of the dataset.

        This validates the `on` argument, and converts a list of column names
        to a multi-aggregation. A null `on` results in a
        multi-aggregation on all columns for an Arrow Dataset, and a single
        aggregation on the entire row for a simple Dataset.
        	skip_cols)r"   _build_multicolumn_aggsr#   r9   )r%   r:   r;   argskwargsr0   r&   r&   r'   _aggregate_onM   s   
zGroupedData._aggregate_onFNdefault)zero_copy_batchcomputebatch_formatfn_args	fn_kwargsfn_constructor_argsfn_constructor_kwargsnum_cpusnum_gpusmemoryconcurrencyray_remote_args_fnfnrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   r   c                   s&  | j du r| jd}n"| jjjtjkr'| jp| jjj}| jj|| j dd}n| j	| j }| j du r6g nt
| j trA| j gnt
| j trK| j n	td| j  dt
treG  fddd}n fd	d
}t
trxjj|_nj|_|j|fd|d||||||	|
|||d|S )a@  Apply the given function to each group of records of this dataset.

        While map_groups() is very flexible, note that it comes with downsides:

        * It may be slower than using more specific methods such as min(), max().
        * It requires that each group fits in memory on a single node.

        In general, prefer to use `aggregate()` instead of `map_groups()`.

        .. warning::
            Specifying both ``num_cpus`` and ``num_gpus`` for map tasks is experimental,
            and may result in scheduling or stability issues. Please
            `report any issues <https://github.com/ray-project/ray/issues/new/choose>`_
            to the Ray team.

        Examples:
            >>> # Return a single record per group (list of multiple records in,
            >>> # list of a single record out).
            >>> import ray
            >>> import pandas as pd
            >>> import numpy as np
            >>> # Get first value per group.
            >>> ds = ray.data.from_items([ # doctest: +SKIP
            ...     {"group": 1, "value": 1},
            ...     {"group": 1, "value": 2},
            ...     {"group": 2, "value": 3},
            ...     {"group": 2, "value": 4}])
            >>> ds.groupby("group").map_groups( # doctest: +SKIP
            ...     lambda g: {"result": np.array([g["value"][0]])})

            >>> # Return multiple records per group (dataframe in, dataframe out).
            >>> df = pd.DataFrame(
            ...     {"A": ["a", "a", "b"], "B": [1, 1, 3], "C": [4, 6, 5]}
            ... )
            >>> ds = ray.data.from_pandas(df) # doctest: +SKIP
            >>> grouped = ds.groupby("A") # doctest: +SKIP
            >>> grouped.map_groups( # doctest: +SKIP
            ...     lambda g: g.apply(
            ...         lambda c: c / g[c.name].sum() if c.name in ["B", "C"] else c
            ...     )
            ... ) # doctest: +SKIP

        Args:
            fn: The function to apply to each group of records, or a class type
                that can be instantiated to create such a callable. It takes as
                input a batch of all records from a single group, and returns a
                batch of zero or more records, similar to map_batches().
            zero_copy_batch: If True, each group of rows (batch) will be provided w/o
                making an additional copy.
            compute: This argument is deprecated. Use ``concurrency`` argument.
            batch_format: Specify ``"default"`` to use the default block format
                (NumPy), ``"pandas"`` to select ``pandas.DataFrame``, "pyarrow" to
                select ``pyarrow.Table``, or ``"numpy"`` to select
                ``Dict[str, numpy.ndarray]``, or None to return the underlying block
                exactly as is with no additional formatting.
            fn_args: Arguments to `fn`.
            fn_kwargs: Keyword arguments to `fn`.
            fn_constructor_args: Positional arguments to pass to ``fn``'s constructor.
                You can only provide this if ``fn`` is a callable class. These arguments
                are top-level arguments in the underlying Ray actor construction task.
            fn_constructor_kwargs: Keyword arguments to pass to ``fn``'s constructor.
                This can only be provided if ``fn`` is a callable class. These arguments
                are top-level arguments in the underlying Ray actor construction task.
            num_cpus: The number of CPUs to reserve for each parallel map worker.
            num_gpus: The number of GPUs to reserve for each parallel map worker. For
                example, specify `num_gpus=1` to request 1 GPU for each parallel map
                worker.
            memory: The heap memory in bytes to reserve for each parallel map worker.
            ray_remote_args_fn: A function that returns a dictionary of remote args
                passed to each map worker. The purpose of this argument is to generate
                dynamic arguments for each actor or task, and will be called each time prior
                to initializing the worker. Args returned from this dict will always
                override the args in ``ray_remote_args``. Note: this is an advanced,
                experimental feature.
            concurrency: The semantics of this argument depend on the type of ``fn``:

                * If ``fn`` is a function and ``concurrency`` isn't set (default), the
                  actual concurrency is implicitly determined by the available
                  resources and number of input blocks.

                * If ``fn`` is a function and ``concurrency`` is an  int ``n``, Ray Data
                  launches *at most* ``n`` concurrent tasks.

                * If ``fn`` is a class and ``concurrency`` is an int ``n``, Ray Data
                  uses an actor  pool with *exactly* ``n`` workers.

                * If ``fn`` is a class and  ``concurrency`` is a tuple ``(m, n)``, Ray
                  Data uses an autoscaling actor pool from ``m`` to ``n`` workers.

                * If ``fn`` is a class and ``concurrency`` isn't set (default), this
                  method raises an error.

            ray_remote_args: Additional resource requirements to request from
                Ray (e.g., num_gpus=1 to request GPUs for the map tasks). See
                :func:`ray.remote` for details.

        Returns:
            The return type is determined by the return type of ``fn``, and the return
            value is combined from results of all groups.

        .. seealso::

            :meth:`GroupedData.aggregate`
                Use this method for common aggregation use cases.
        N   T)keyssortzYGroup-by keys are expected to either be a single column (str) or a list of columns (got 'z')c                       s&   e Zd ZfddZ fddZdS )*GroupedData.map_groups.<locals>.wrapped_fnc                    s    |i || _ d S NrN   )r%   r>   r?   rT   r&   r'   r(     s   z3GroupedData.map_groups.<locals>.wrapped_fn.__init__c                 ?   s*    t | j| g|R i |E d H  d S rS   )_apply_udf_to_groupsrN   )r%   batchr>   r?   )rD   rP   r&   r'   __call__  s   
z3GroupedData.map_groups.<locals>.wrapped_fn.__call__N)r,   
__module____qualname__r(   rW   r&   rD   rN   rP   r&   r'   
wrapped_fn
  s    r[   c                 ?   s(    t |  g|R i |E d H  d S rS   )rU   )rV   r>   r?   rZ   r&   r'   r[     s   rR   )
batch_sizerC   rD   rB   rE   rF   rG   rH   rI   rJ   rK   rL   rM   )r#   r"   repartitionr5   shuffle_strategyr   HASH_SHUFFLEr$    default_hash_shuffle_parallelismrQ   
isinstancestrr   
ValueErrorr   r   funcr,   *_map_batches_without_batch_size_validation)r%   rN   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   ray_remote_argsshuffled_dsr!   r[   r&   rZ   r'   
map_groups`   sb    



	
zGroupedData.map_groupsc                 C   s   |  t S )a  Compute count aggregation.

        Examples:
            >>> import ray
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": x % 3, "B": x} for x in range(100)]).groupby( # doctest: +SKIP
            ...     "A").count() # doctest: +SKIP

        Returns:
            A dataset of ``[k, v]`` columns where ``k`` is the groupby key and
            ``v`` is the number of rows with that key.
            If groupby key is ``None`` then the key part of return is omitted.
        )r9   r   r-   r&   r&   r'   count8  s   zGroupedData.countTignore_nullsc                 C      | j t||dS )a  Compute grouped sum aggregation.

        Examples:
            >>> import ray
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     (i % 3, i, i**2) # doctest: +SKIP
            ...     for i in range(100)])  # doctest: +SKIP
            ...     .groupby(lambda x: x[0] % 3)  # doctest: +SKIP
            ...     .sum(lambda x: x[2]) # doctest: +SKIP
            >>> ray.data.range(100).groupby("id").sum() # doctest: +SKIP
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP
            ...     for i in range(100)])  # doctest: +SKIP
            ...     .groupby("A")  # doctest: +SKIP
            ...     .sum(["B", "C"]) # doctest: +SKIP

        Args:
            on: a column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values. If ``True``, null
                values will be ignored when computing the sum; if ``False``,
                if a null value is encountered, the output will be null.
                We consider np.nan, None, and pd.NaT to be null values.
                Default is ``True``.

        Returns:
            The sum result.

            For different values of ``on``, the return varies:

            - ``on=None``: a dataset containing a groupby key column,
              ``"k"``, and a column-wise sum column for each original column
              in the dataset.
            - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1``
              columns where the first column is the groupby key and the second
              through ``n + 1`` columns are the results of the aggregations.

            If groupby key is ``None`` then the key part of return is omitted.
        rj   )r@   r   r%   r;   rj   r&   r&   r'   sumI  s   *zGroupedData.sumc                 C   rk   )a  Compute grouped min aggregation.

        Examples:
            >>> import ray
            >>> ray.data.le(100).groupby("value").min() # doctest: +SKIP
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP
            ...     for i in range(100)])  # doctest: +SKIP
            ...     .groupby("A")  # doctest: +SKIP
            ...     .min(["B", "C"]) # doctest: +SKIP

        Args:
            on: a column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values. If ``True``, null
                values will be ignored when computing the min; if ``False``,
                if a null value is encountered, the output will be null.
                We consider np.nan, None, and pd.NaT to be null values.
                Default is ``True``.

        Returns:
            The min result.

            For different values of ``on``, the return varies:

            - ``on=None``: a dataset containing a groupby key column,
              ``"k"``, and a column-wise min column for each original column in
              the dataset.
            - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1``
              columns where the first column is the groupby key and the second
              through ``n + 1`` columns are the results of the aggregations.

            If groupby key is ``None`` then the key part of return is omitted.
        rl   )r@   r   rm   r&   r&   r'   minu     %zGroupedData.minc                 C   rk   )a  Compute grouped max aggregation.

        Examples:
            >>> import ray
            >>> ray.data.le(100).groupby("value").max() # doctest: +SKIP
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP
            ...     for i in range(100)])  # doctest: +SKIP
            ...     .groupby("A")  # doctest: +SKIP
            ...     .max(["B", "C"]) # doctest: +SKIP

        Args:
            on: a column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values. If ``True``, null
                values will be ignored when computing the max; if ``False``,
                if a null value is encountered, the output will be null.
                We consider np.nan, None, and pd.NaT to be null values.
                Default is ``True``.

        Returns:
            The max result.

            For different values of ``on``, the return varies:

            - ``on=None``: a dataset containing a groupby key column,
              ``"k"``, and a column-wise max column for each original column in
              the dataset.
            - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1``
              columns where the first column is the groupby key and the second
              through ``n + 1`` columns are the results of the aggregations.

            If groupby key is ``None`` then the key part of return is omitted.
        rl   )r@   r   rm   r&   r&   r'   max  rp   zGroupedData.maxc                 C   rk   )a  Compute grouped mean aggregation.

        Examples:
            >>> import ray
            >>> ray.data.le(100).groupby("value").mean() # doctest: +SKIP
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP
            ...     for i in range(100)])  # doctest: +SKIP
            ...     .groupby("A")  # doctest: +SKIP
            ...     .mean(["B", "C"]) # doctest: +SKIP

        Args:
            on: a column name or a list of column names to aggregate.
            ignore_nulls: Whether to ignore null values. If ``True``, null
                values will be ignored when computing the mean; if ``False``,
                if a null value is encountered, the output will be null.
                We consider np.nan, None, and pd.NaT to be null values.
                Default is ``True``.

        Returns:
            The mean result.

            For different values of ``on``, the return varies:

            - ``on=None``: a dataset containing a groupby key column,
              ``"k"``, and a column-wise mean column for each original column
              in the dataset.
            - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1``
              columns where the first column is the groupby key and the second
              through ``n + 1`` columns are the results of the aggregations.

            If groupby key is ``None`` then the key part of return is omitted.
        rl   )r@   r   rm   r&   r&   r'   mean  rp   zGroupedData.meanrO   ddofc                 C   s   | j t|||dS )ah  Compute grouped standard deviation aggregation.

        Examples:
            >>> import ray
            >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP
            >>> ray.data.from_items([ # doctest: +SKIP
            ...     {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP
            ...     for i in range(100)])  # doctest: +SKIP
            ...     .groupby("A")  # doctest: +SKIP
            ...     .std(["B", "C"]) # doctest: +SKIP

        NOTE: This uses Welford's online method for an accumulator-style
        computation of the standard deviation. This method was chosen due to
        it's numerical stability, and it being computable in a single pass.
        This may give different (but more accurate) results than NumPy, Pandas,
        and sklearn, which use a less numerically stable two-pass algorithm.
        See
        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm

        Args:
            on: a column name or a list of column names to aggregate.
            ddof: Delta Degrees of Freedom. The divisor used in calculations
                is ``N - ddof``, where ``N`` represents the number of elements.
            ignore_nulls: Whether to ignore null values. If ``True``, null
                values will be ignored when computing the std; if ``False``,
                if a null value is encountered, the output will be null.
                We consider np.nan, None, and pd.NaT to be null values.
                Default is ``True``.

        Returns:
            The standard deviation result.

            For different values of ``on``, the return varies:

            - ``on=None``: a dataset containing a groupby key column,
              ``"k"``, and a column-wise std column for each original column in
              the dataset.
            - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1``
              columns where the first column is the groupby key and the second
              through ``n + 1`` columns are the results of the aggregations.

            If groupby key is ``None`` then the key part of return is omitted.
        )rj   rs   )r@   r   )r%   r;   rs   rj   r&   r&   r'   std  s   2zGroupedData.std)NT)NrO   T)$r,   rX   rY   __doc__r   r	   r   rb   r   intr(   r.   r   FA_API_GROUPr   r9   typer@   r   r   boolr   r   r   r   floatr
   r   rh   CDS_API_GROUPri   rn   ro   rq   rr   rt   r&   r&   r&   r'   r      s    





	
 X+&&&r   udf.blockrP   rD   r>   r?   r)   c                 o   sv    t |}||}t|dd |dd D ] \}}	|j||	dd}
t |
}| ||g|R i |V  qdS )zApply UDF to groups of rows having the same set of values of the specified
    columns (keys).

    NOTE: This function is defined at module level to avoid capturing closures and make it serializable.NrO   F)r2   )r   	for_block_get_group_boundaries_sortedzipsliceto_batch_format)r|   r}   rP   rD   r>   r?   block_accessor
boundariesstartendgroup_blockgroup_block_accessorr&   r&   r'   rU     s   

"
 rU   N),	functoolsr   typingr   r   r   r   r   r   r	   r
   r   ray.data._internal.computer   %ray.data._internal.logical.interfacesr   8ray.data._internal.logical.operators.all_to_all_operatorr   ray.data.aggregater   r   r   r   r   r   r   ray.data.blockr   r   r   r   r   ray.data.contextr   ray.data.datasetr   ray.util.annotationsr   r{   rw   r   rb   rU   GroupedDatasetr&   r&   r&   r'   <module>   sB    ,$    
