o
    cid                     @   s^   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 e
ddG dd	 d	eZdS )
    N)List)Preprocessor)simple_hash)	PublicAPIalpha)	stabilityc                   @   sF   e Zd ZdZdZdee dedefddZde	j
fd	d
Zdd ZdS )FeatureHashera  Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
    table that describes token frequencies.

    :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
    where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
    ``hash_{index}`` describes the frequency of tokens that hash to ``index``.

    Distinct tokens can correspond to the same index. However, if ``num_features`` is
    large enough, then columns probably correspond to a unique token.

    This preprocessor is memory efficient and quick to pickle. However, given a
    transformed column, you can't know which tokens correspond to it. This might make it
    hard to determine which tokens are important to your model.

    .. warning::
        Sparse matrices aren't supported. If you use a large ``num_features``, this
        preprocessor might behave poorly.

    Examples:

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import FeatureHasher

        The data below describes the frequencies of tokens in ``"I like Python"`` and
        ``"I dislike Python"``.

        >>> df = pd.DataFrame({
        ...     "I": [1, 1],
        ...     "like": [1, 0],
        ...     "dislike": [0, 1],
        ...     "Python": [1, 1]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        :class:`FeatureHasher` hashes each token to determine its index. For example,
        the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.

        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column = "hashed")
        >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy()  # doctest: +SKIP
        array([[0, 0, 0, 2, 0, 1, 0, 0],
               [0, 0, 0, 1, 0, 1, 1, 0]])

        Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
        :math:`3`. You can avoid hash collisions like these by increasing
        ``num_features``.

    Args:
        columns: The columns to apply the hashing trick to. Each column should describe
            the frequency of a token.
        num_features: The number of features used to represent the vocabulary. You
            should choose a value large enough to prevent hash collisions between
            distinct tokens.
        output_column: The name of the column that contains the hashed features.

    .. seealso::
        :class:`~ray.data.preprocessors.CountVectorizer`
            Use this preprocessor to generate inputs for :class:`FeatureHasher`.

        :class:`ray.data.preprocessors.HashingVectorizer`
            If your input data describes documents rather than token frequencies,
            use :class:`~ray.data.preprocessors.HashingVectorizer`.
    Fcolumnsnum_featuresoutput_columnc                 C   s   || _ || _|| _d S )N)r	   r
   r   )selfr	   r
   r    r   Q/home/ubuntu/.local/lib/python3.10/site-packages/ray/data/preprocessors/hasher.py__init__O   s   
zFeatureHasher.__init__dfc                    sn    fdd}|j d d  jf j|ddd}dd t jD }||  }tt||j d d  j	f< |S )Nc                    sN   t t jD ]}t|j} |  | | 7  < q fddtjD S )Nc                    s   i | ]
}d |  | qS hash_r   .0ihash_countsr   r   
<dictcomp>b   s    zOFeatureHasher._transform_pandas.<locals>.row_feature_hasher.<locals>.<dictcomp>)collectionsdefaultdictintr	   r   r
   range)rowcolumnhashed_valuer   r   r   row_feature_hasher]   s
   

z;FeatureHasher._transform_pandas.<locals>.row_feature_hasher   expand)axisresult_typec                 S   s   g | ]}d | qS r   r   r   r   r   r   
<listcomp>i   s    z3FeatureHasher._transform_pandas.<locals>.<listcomp>)
locr	   applyr   r
   to_numpypdSerieslistr   )r   r   r!   feature_columnshash_columnsconcatenatedr   r    r   _transform_pandas[   s   zFeatureHasher._transform_pandasc                 C   s&   | j j d| jd| jd| jdS )Nz	(columns=z, num_features=z, output_column=))	__class____name__r	   r
   r   r    r   r   r   __repr__q   s   zFeatureHasher.__repr__N)r3   
__module____qualname____doc___is_fittabler   strr   r   r*   	DataFramer0   r4   r   r   r   r   r      s    @
r   )r   typingr   pandasr*   ray.data.preprocessorr   ray.data.preprocessors.utilsr   ray.util.annotationsr   r   r   r   r   r   <module>   s    