o
    $i                     @   s^   d dl mZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ eddG dd	 d	eZdS )
    )CallableListOptionalN)Preprocessor)simple_split_tokenizer)	PublicAPIalpha)	stabilityc                	       sn   e Zd ZdZdZ		ddee deeegee f  deee  f fddZ	d	e
jfd
dZdd Z  ZS )	Tokenizera  Replace each string with a list of tokens.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> df = pd.DataFrame({"text": ["Hello, world!", "foo bar\nbaz"]})
        >>> ds = ray.data.from_pandas(df)  # doctest: +SKIP

        The default ``tokenization_fn`` delimits strings using the space character.

        >>> from ray.data.preprocessors import Tokenizer
        >>> tokenizer = Tokenizer(columns=["text"])
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                       text
        0  [Hello,, world!]
        1   [foo, bar\nbaz]

        If the default logic isn't adequate for your use case, you can specify a
        custom ``tokenization_fn``.

        >>> import string
        >>> def tokenization_fn(s):
        ...     for character in string.punctuation:
        ...         s = s.replace(character, "")
        ...     return s.split()
        >>> tokenizer = Tokenizer(columns=["text"], tokenization_fn=tokenization_fn)
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                      text
        0   [Hello, world]
        1  [foo, bar, baz]

        :class:`Tokenizer` can also be used in append mode by providing the
        name of the output_columns that should hold the tokenized values.

        >>> tokenizer = Tokenizer(columns=["text"], output_columns=["text_tokenized"])
        >>> tokenizer.transform(ds).to_pandas()  # doctest: +SKIP
                    text    text_tokenized
        0  Hello, world!  [Hello,, world!]
        1   foo bar\nbaz   [foo, bar\nbaz]

    Args:
        columns: The columns to tokenize.
        tokenization_fn: The function used to generate tokens. This function
            should accept a string as input and return a list of tokens as
            output. If unspecified, the tokenizer uses a function equivalent to
            ``lambda s: s.split(" ")``.
        output_columns: The names of the transformed columns. If None, the transformed
            columns will be the same as the input columns. If not None, the length of
            ``output_columns`` must match the length of ``columns``, othwerwise an error
            will be raised.
    FNcolumnstokenization_fnoutput_columnsc                    s,   t    || _|pt| _t||| _d S N)super__init__r   r   r   r   #_derive_and_validate_output_columnsr   )selfr   r   r   	__class__ ]/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/ray/data/preprocessors/tokenizer.pyr   B   s   


zTokenizer.__init__dfc                    s8   dt jf fdd}|jd d  jf || j< |S )Nsc                    s   |   jS r   )mapr   )r   r   r   r   column_tokenizerQ   s   z5Tokenizer._transform_pandas.<locals>.column_tokenizer)pdSerieslocr   	transformr   )r   r   r   r   r   r   _transform_pandasP   s    zTokenizer._transform_pandasc                 C   s4   t | jd| j}| jj d| jd| d| jdS )N__name__z	(columns=z, tokenization_fn=z, output_columns=))getattrr   r   r!   r   r   )r   namer   r   r   __repr__W   s   zTokenizer.__repr__)NN)r!   
__module____qualname____doc___is_fittabler   strr   r   r   r   	DataFramer    r%   __classcell__r   r   r   r   r
   
   s    4
r
   )typingr   r   r   pandasr   ray.data.preprocessorr   ray.data.preprocessors.utilsr   ray.util.annotationsr   r
   r   r   r   r   <module>   s    