o
    `۷i6                     @  s   d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZ ddlmZmZ er<ddlmZmZ dddZeG dd dZdS )zCString namespace for expression operations on string-typed columns.    )annotations)	dataclass)TYPE_CHECKINGAnyCallableLiteralN)DataType)_create_pyarrow_compute_udfpyarrow_udf)ExprUDFExprpc_funcCallable[..., pyarrow.Array]return_dtyper   returnCallable[..., 'UDFExpr']c                 C  s   t | |dS )a  Helper to create a string UDF that wraps a PyArrow compute function.

    This helper handles all types of PyArrow compute operations:
    - Unary operations (no args): upper(), lower(), reverse()
    - Pattern operations (pattern + args): starts_with(), contains()
    - Multi-argument operations: replace(), replace_slice()

    Args:
        pc_func: PyArrow compute function that takes (array, *positional, **kwargs)
        return_dtype: The return data type

    Returns:
        A callable that creates UDFExpr instances
    r   )r	   )r   r    r   e/home/ubuntu/vllm_env/lib/python3.10/site-packages/ray/data/namespace_expressions/string_namespace.py_create_str_udf   s   r   c                   @  s  e Zd ZU dZded< drddZdrdd	Zdrd
dZdrddZdrddZ	drddZ
drddZdrddZdrddZdrddZdrddZdrddZdrddZdrd d!Zdrd"d#Zdrd$d%Zdrd&d'Zdrd(d)Zdsd/d0Zdsd1d2Zdsd3d4Zdsd5d6Zdsd7d8Zdsd9d:Zdsd;d<Zdsd=d>Zdsd?d@ZdrdAdBZ dtdCdDZ!dudFdGZ"dudHdIZ#dvdMdNZ$dsdOdPZ%dsdQdRZ&dtdSdTZ'dsdUdVZ(dwdXdYZ)	Zdxdyd]d^Z*	Zdxdyd_d`Z+	ZdxdydadbZ,dzd{dfdgZ-dzd{dhdiZ.dzd{djdkZ/	Z	ld|d}dpdqZ0dcS )~_StringNamespacea  Namespace for string operations on expression columns.

    This namespace provides methods for operating on string-typed columns using
    PyArrow compute functions.

    Example:
        >>> from ray.data.expressions import col
        >>> # Convert to uppercase
        >>> expr = col("name").str.upper()
        >>> # Get string length
        >>> expr = col("name").str.len()
        >>> # Check if string starts with a prefix
        >>> expr = col("name").str.starts_with("A")
    r   _exprr   	'UDFExpr'c                 C     t tjt | jS )z,Get the length of each string in characters.)r   pcutf8_lengthr   int32r   selfr   r   r   len;      z_StringNamespace.lenc                 C  r   )z'Get the length of each string in bytes.)r   r   binary_lengthr   r   r   r   r   r   r   byte_len?   r    z_StringNamespace.byte_lenc                 C  r   )zConvert strings to uppercase.)r   r   
utf8_upperr   stringr   r   r   r   r   upperD   r    z_StringNamespace.upperc                 C  r   )zConvert strings to lowercase.)r   r   
utf8_lowerr   r$   r   r   r   r   r   lowerH   r    z_StringNamespace.lowerc                 C  r   )z.Capitalize the first character of each string.)r   r   utf8_capitalizer   r$   r   r   r   r   r   
capitalizeL   r    z_StringNamespace.capitalizec                 C  r   )zConvert strings to title case.)r   r   
utf8_titler   r$   r   r   r   r   r   titleP   r    z_StringNamespace.titlec                 C  r   )z Swap the case of each character.)r   r   utf8_swapcaser   r$   r   r   r   r   r   swapcaseT   r    z_StringNamespace.swapcasec                 C  r   )z4Check if strings contain only alphabetic characters.)r   r   utf8_is_alphar   boolr   r   r   r   r   is_alphaY   r    z_StringNamespace.is_alphac                 C  r   )z6Check if strings contain only alphanumeric characters.)r   r   utf8_is_alnumr   r/   r   r   r   r   r   is_alnum]   r    z_StringNamespace.is_alnumc                 C  r   )z%Check if strings contain only digits.)r   r   utf8_is_digitr   r/   r   r   r   r   r   is_digita   r    z_StringNamespace.is_digitc                 C  r   )z1Check if strings contain only decimal characters.)r   r   utf8_is_decimalr   r/   r   r   r   r   r   
is_decimale   r    z_StringNamespace.is_decimalc                 C  r   )z1Check if strings contain only numeric characters.)r   r   utf8_is_numericr   r/   r   r   r   r   r   
is_numerici   r    z_StringNamespace.is_numericc                 C  r   )z)Check if strings contain only whitespace.)r   r   utf8_is_spacer   r/   r   r   r   r   r   is_spacem   r    z_StringNamespace.is_spacec                 C  r   )zCheck if strings are lowercase.)r   r   utf8_is_lowerr   r/   r   r   r   r   r   is_lowerq   r    z_StringNamespace.is_lowerc                 C  r   )zCheck if strings are uppercase.)r   r   utf8_is_upperr   r/   r   r   r   r   r   is_upperu   r    z_StringNamespace.is_upperc                 C  r   )z!Check if strings are title-cased.)r   r   utf8_is_titler   r/   r   r   r   r   r   is_titley   r    z_StringNamespace.is_titlec                 C  r   )z3Check if strings contain only printable characters.)r   r   utf8_is_printabler   r/   r   r   r   r   r   is_printable}   r    z_StringNamespace.is_printablec                 C  r   )z/Check if strings contain only ASCII characters.)r   r   string_is_asciir   r/   r   r   r   r   r   is_ascii   r    z_StringNamespace.is_asciipatternstrargsr   kwargsc                 O  &   t tjt | j|g|R i |S )z&Check if strings start with a pattern.)r   r   starts_withr   r/   r   r   rE   rG   rH   r   r   r   rJ         z_StringNamespace.starts_withc                 O  rI   )z$Check if strings end with a pattern.)r   r   	ends_withr   r/   r   rK   r   r   r   rM      rL   z_StringNamespace.ends_withc                 O  rI   )z%Check if strings contain a substring.)r   r   match_substringr   r/   r   rK   r   r   r   contains   rL   z_StringNamespace.containsc                 O  rI   )z)Match strings against a SQL LIKE pattern.)r   r   
match_liker   r/   r   rK   r   r   r   match   rL   z_StringNamespace.matchc                 O  rI   )z)Find the first occurrence of a substring.)r   r   find_substringr   r   r   rK   r   r   r   find   rL   z_StringNamespace.findc                 O  rI   )z!Count occurrences of a substring.)r   r   count_substringr   r   r   rK   r   r   r   count   rL   z_StringNamespace.countc                 O  rI   )z3Find the first occurrence matching a regex pattern.)r   r   find_substring_regexr   r   r   rK   r   r   r   
find_regex   rL   z_StringNamespace.find_regexc                 O  rI   )z+Count occurrences matching a regex pattern.)r   r   count_substring_regexr   r   r   rK   r   r   r   count_regex   rL   z_StringNamespace.count_regexc                 O  rI   )z'Check if strings match a regex pattern.)r   r   match_substring_regexr   r/   r   rK   r   r   r   match_regex   rL   z_StringNamespace.match_regexc                 C  r   )zReverse each string.)r   r   utf8_reverser   r$   r   r   r   r   r   reverse   r    z_StringNamespace.reversec                 O  s$   t tjt | jg|R i |S )z"Slice strings by codeunit indices.)r   r   utf8_slice_codeunitsr   r$   r   r   rG   rH   r   r   r   slice      z_StringNamespace.slicereplacementc                 O  (   t tjt | j||g|R i |S )z#Replace occurrences of a substring.)r   r   replace_substringr   r$   r   r   rE   rb   rG   rH   r   r   r   replace      z_StringNamespace.replacec                 O  rc   )z-Replace occurrences matching a regex pattern.)r   r   replace_substring_regexr   r$   r   re   r   r   r   replace_regex   rg   z_StringNamespace.replace_regexstartintstopc                 O  s*   t tjt | j|||g|R i |S )zReplace a slice with a string.)r   r   binary_replace_slicer   r$   r   )r   rj   rl   rb   rG   rH   r   r   r   replace_slice   s   
z_StringNamespace.replace_slicec                 O  &   t tjtt| j|g|R i |S )zSplit strings by a pattern.)r   r   split_patternr   objectr   rK   r   r   r   split   rL   z_StringNamespace.splitc                 O  ro   )z!Split strings by a regex pattern.)r   r   split_pattern_regexr   rq   r   rK   r   r   r   split_regex   rL   z_StringNamespace.split_regexc                 O  s$   t tjtt| jg|R i |S )zSplit strings on whitespace.)r   r   utf8_split_whitespacer   rq   r   r_   r   r   r   split_whitespace   ra   z!_StringNamespace.split_whitespacec                 O  rI   )z-Extract a substring matching a regex pattern.)r   r   extract_regexr   r$   r   rK   r   r   r   extract   rL   z_StringNamespace.extractnc                 O  rI   )zRepeat each string n times.)r   r   binary_repeatr   r$   r   )r   ry   rG   rH   r   r   r   repeat   rL   z_StringNamespace.repeat widthpaddingc                 O  rc   )z)Center strings in a field of given width.)r   r   utf8_centerr   r$   r   r   r}   r~   rG   rH   r   r   r   center   rg   z_StringNamespace.centerc                 O  rc   )zRight-align strings by padding with a given character while respecting ``width``.

        If the string is longer than the specified width, it remains intact (no truncation occurs).
        )r   r   	utf8_lpadr   r$   r   r   r   r   r   lpad     z_StringNamespace.lpadc                 O  rc   )zLeft-align strings by padding with a given character while respecting ``width``.

        If the string is longer than the specified width, it remains intact (no truncation occurs).
        )r   r   	utf8_rpadr   r$   r   r   r   r   r   rpad  r   z_StringNamespace.rpadN
characters
str | Nonec                   &   t t dd fdd}|| jS )	zRemove leading and trailing whitespace or specified characters.

        Args:
            characters: Characters to remove. If None, removes whitespace.

        Returns:
            UDFExpr that strips characters from both ends.
        r   arrpyarrow.Arrayr   c                        d u r	t | S t j|  dS Nr   )r   utf8_trim_whitespace	utf8_trimr   r   r   r   
_str_strip&     
z*_StringNamespace.strip.<locals>._str_stripNr   r   r   r   r
   r   r$   r   )r   r   r   r   r   r   strip     

z_StringNamespace.stripc                   r   )	zRemove leading whitespace or specified characters.

        Args:
            characters: Characters to remove. If None, removes whitespace.

        Returns:
            UDFExpr that strips characters from the left.
        r   r   r   r   c                   r   r   )r   utf8_ltrim_whitespace
utf8_ltrimr   r   r   r   _str_lstrip9  r   z,_StringNamespace.lstrip.<locals>._str_lstripNr   r   )r   r   r   r   r   r   lstrip/  r   z_StringNamespace.lstripc                   r   )	zRemove trailing whitespace or specified characters.

        Args:
            characters: Characters to remove. If None, removes whitespace.

        Returns:
            UDFExpr that strips characters from the right.
        r   r   r   r   c                   r   r   )r   utf8_rtrim_whitespace
utf8_rtrimr   r   r   r   _str_rstripL  r   z,_StringNamespace.rstrip.<locals>._str_rstripNr   r   )r   r   r   r   r   r   rstripB  r   z_StringNamespace.rstriprightfillcharside Literal['left', 'right', 'both']c                   s*   t t dd fdd}|| jS )	a  Pad strings to a specified width.

        Args:
            width: Target width.
            fillchar: Character to use for padding.
            side: "left", "right", or "both" for padding side.

        Returns:
            UDFExpr that pads strings.
        r   r   r   r   c                   sP   dkrt j|  dS dkrt j|  dS dkr$t j|  dS td)Nr   )r}   r~   leftbothz'side must be 'left', 'right', or 'both')r   r   r   r   
ValueErrorr   r   r   r}   r   r   _str_padg  s   z&_StringNamespace.pad.<locals>._str_padNr   r   )r   r}   r   r   r   r   r   r   padV  s   

z_StringNamespace.pad)r   r   )rE   rF   rG   r   rH   r   r   r   )rG   r   rH   r   r   r   )
rE   rF   rb   rF   rG   r   rH   r   r   r   )rj   rk   rl   rk   rb   rF   rG   r   rH   r   r   r   )ry   rk   rG   r   rH   r   r   r   )r|   )
r}   rk   r~   rF   rG   r   rH   r   r   r   )N)r   r   r   r   )r|   r   )r}   rk   r   rF   r   r   r   r   )1__name__
__module____qualname____doc____annotations__r   r"   r%   r'   r)   r+   r-   r0   r2   r4   r6   r8   r:   r<   r>   r@   rB   rD   rJ   rM   rO   rQ   rS   rU   rW   rY   r[   r]   r`   rf   ri   rn   rr   rt   rv   rx   r{   r   r   r   r   r   r   r   r   r   r   r   r   '   sh   
 




































	r   )r   r   r   r   r   r   )r   
__future__r   dataclassesr   typingr   r   r   r   pyarrowpyarrow.computecomputer   ray.data.datatyper   ray.data.expressionsr	   r
   r   r   r   r   r   r   r   r   <module>   s    
