o
    `۷i                     @   s^   d dl Z d dlmZ d dlZd dlmZ d dlmZ d dl	m
Z
 e
ddG dd	 d	eZdS )
    N)List)Preprocessor)simple_hash)	PublicAPIalpha)	stabilityc                       sr   e Zd ZdZdZdee dedef fddZde	j
fd	d
Zdee fddZdee fddZdd Z  ZS )FeatureHashera  Apply the `hashing trick <https://en.wikipedia.org/wiki/Feature_hashing>`_ to a
    table that describes token frequencies.

    :class:`FeatureHasher` creates ``num_features`` columns named ``hash_{index}``,
    where ``index`` ranges from :math:`0` to ``num_features``:math:`- 1`. The column
    ``hash_{index}`` describes the frequency of tokens that hash to ``index``.

    Distinct tokens can correspond to the same index. However, if ``num_features`` is
    large enough, then columns probably correspond to a unique token.

    This preprocessor is memory efficient and quick to pickle. However, given a
    transformed column, you can't know which tokens correspond to it. This might make it
    hard to determine which tokens are important to your model.

    .. warning::
        Sparse matrices aren't supported. If you use a large ``num_features``, this
        preprocessor might behave poorly.

    Examples:

        >>> import pandas as pd
        >>> import ray
        >>> from ray.data.preprocessors import FeatureHasher

        The data below describes the frequencies of tokens in ``"I like Python"`` and
        ``"I dislike Python"``.

        >>> df = pd.DataFrame({
        ...     "I": [1, 1],
        ...     "like": [1, 0],
        ...     "dislike": [0, 1],
        ...     "Python": [1, 1]
        ... })
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        :class:`FeatureHasher` hashes each token to determine its index. For example,
        the index of ``"I"`` is :math:`hash(\\texttt{"I"}) \pmod 8 = 5`.

        >>> hasher = FeatureHasher(columns=["I", "like", "dislike", "Python"], num_features=8, output_column = "hashed")
        >>> hasher.fit_transform(ds)["hashed"].to_pandas().to_numpy()  # doctest: +SKIP
        array([[0, 0, 0, 2, 0, 1, 0, 0],
               [0, 0, 0, 1, 0, 1, 1, 0]])

        Notice the hash collision: both ``"like"`` and ``"Python"`` correspond to index
        :math:`3`. You can avoid hash collisions like these by increasing
        ``num_features``.

    Args:
        columns: The columns to apply the hashing trick to. Each column should describe
            the frequency of a token.
        num_features: The number of features used to represent the vocabulary. You
            should choose a value large enough to prevent hash collisions between
            distinct tokens.
        output_column: The name of the column that contains the hashed features.

    .. seealso::
        :class:`~ray.data.preprocessors.CountVectorizer`
            Use this preprocessor to generate inputs for :class:`FeatureHasher`.

        :class:`ray.data.preprocessors.HashingVectorizer`
            If your input data describes documents rather than token frequencies,
            use :class:`~ray.data.preprocessors.HashingVectorizer`.
    Fcolumnsnum_featuresoutput_columnc                    s    t    || _|| _|| _d S N)super__init__r	   r
   r   )selfr	   r
   r   	__class__ S/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/data/preprocessors/hasher.pyr   O   s   

zFeatureHasher.__init__dfc                    sn    fdd}|j d d  jf j|ddd}dd t jD }||  }tt||j d d  j	f< |S )Nc                    sN   t t jD ]}t|j} |  | | 7  < q fddtjD S )Nc                    s   i | ]
}d |  | qS hash_r   .0ihash_countsr   r   
<dictcomp>c   s    zOFeatureHasher._transform_pandas.<locals>.row_feature_hasher.<locals>.<dictcomp>)collectionsdefaultdictintr	   r   r
   range)rowcolumnhashed_valuer   r   r   row_feature_hasher^   s
   

z;FeatureHasher._transform_pandas.<locals>.row_feature_hasher   expand)axisresult_typec                 S   s   g | ]}d | qS r   r   r   r   r   r   
<listcomp>j   s    z3FeatureHasher._transform_pandas.<locals>.<listcomp>)
locr	   applyr    r
   to_numpypdSerieslistr   )r   r   r%   feature_columnshash_columnsconcatenatedr   r$   r   _transform_pandas\   s   zFeatureHasher._transform_pandasreturnc                 C   s   | j S r   )r	   r$   r   r   r   get_input_columnsr   s   zFeatureHasher.get_input_columnsc                 C   s   | j gS r   )r   r$   r   r   r   get_output_columnsu   s   z FeatureHasher.get_output_columnsc                 C   s&   | j j d| jd| jd| jdS )Nz	(columns=z, num_features=z, output_column=))r   __name__r	   r
   r   r$   r   r   r   __repr__x   s   zFeatureHasher.__repr__)r9   
__module____qualname____doc___is_fittabler   strr   r   r.   	DataFramer4   r6   r7   r:   __classcell__r   r   r   r   r      s    @r   )r   typingr   pandasr.   ray.data.preprocessorr   ray.data.preprocessors.utilsr   ray.util.annotationsr   r   r   r   r   r   <module>   s    