Skip to content

Commit c39c65b

Browse files
Docs
Signed-off-by: Goutam <goutam@anyscale.com>
1 parent 9665ae3 commit c39c65b

File tree

4 files changed

+91
-251
lines changed

4 files changed

+91
-251
lines changed

doc/source/data/api/expressions.rst

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,22 @@ instantiate them directly, but you may encounter them when working with expressi
3939
BinaryExpr
4040
UnaryExpr
4141
UDFExpr
42-
StarExpr
42+
StarExpr
43+
44+
Namespace Classes
45+
-----------------
46+
47+
These classes provide specialized operations for list, string, and struct columns.
48+
You access them through properties on expressions: ``.list``, ``.str``, and ``.struct``.
49+
50+
.. autoclass:: _ListNamespace
51+
:members:
52+
:exclude-members: _expr
53+
54+
.. autoclass:: _StringNamespace
55+
:members:
56+
:exclude-members: _expr
57+
58+
.. autoclass:: _StructNamespace
59+
:members:
60+
:exclude-members: _expr

python/ray/data/expressions.py

Lines changed: 60 additions & 230 deletions
Original file line numberDiff line numberDiff line change
@@ -543,9 +543,6 @@ def _add_methods_from_config(
543543
"len": _PyArrowMethodConfig(
544544
"list_value_length", DataType.int32(), docstring="Get the length of each list."
545545
),
546-
"flatten": _PyArrowMethodConfig(
547-
"list_flatten", DataType(object), docstring="Flatten nested lists."
548-
),
549546
}
550547

551548
_STRING_METHODS = {
@@ -687,6 +684,66 @@ def _add_methods_from_config(
687684
"reverse": _PyArrowMethodConfig(
688685
"utf8_reverse", DataType.string(), docstring="Reverse each string."
689686
),
687+
"slice": _PyArrowMethodConfig(
688+
"utf8_slice_codeunits",
689+
DataType.string(),
690+
params=["start", "stop", "step"],
691+
docstring="Slice strings by codeunit indices.",
692+
),
693+
"replace": _PyArrowMethodConfig(
694+
"replace_substring",
695+
DataType.string(),
696+
params=["pattern", "replacement", "max_replacements"],
697+
docstring="Replace occurrences of a substring.",
698+
),
699+
"replace_regex": _PyArrowMethodConfig(
700+
"replace_substring_regex",
701+
DataType.string(),
702+
params=["pattern", "replacement", "max_replacements"],
703+
docstring="Replace occurrences matching a regex pattern.",
704+
),
705+
"replace_slice": _PyArrowMethodConfig(
706+
"binary_replace_slice",
707+
DataType.string(),
708+
params=["start", "stop", "replacement"],
709+
docstring="Replace a slice with a string.",
710+
),
711+
"split": _PyArrowMethodConfig(
712+
"split_pattern",
713+
DataType(object),
714+
params=["pattern", "max_splits", "reverse"],
715+
docstring="Split strings by a pattern.",
716+
),
717+
"split_regex": _PyArrowMethodConfig(
718+
"split_pattern_regex",
719+
DataType(object),
720+
params=["pattern", "max_splits", "reverse"],
721+
docstring="Split strings by a regex pattern.",
722+
),
723+
"split_whitespace": _PyArrowMethodConfig(
724+
"utf8_split_whitespace",
725+
DataType(object),
726+
params=["max_splits", "reverse"],
727+
docstring="Split strings on whitespace.",
728+
),
729+
"extract": _PyArrowMethodConfig(
730+
"extract_regex",
731+
DataType.string(),
732+
params=["pattern"],
733+
docstring="Extract a substring matching a regex pattern.",
734+
),
735+
"repeat": _PyArrowMethodConfig(
736+
"binary_repeat",
737+
DataType.string(),
738+
params=["n"],
739+
docstring="Repeat each string n times.",
740+
),
741+
"center": _PyArrowMethodConfig(
742+
"utf8_center",
743+
DataType.string(),
744+
params=["width", "padding"],
745+
docstring="Center strings in a field of given width.",
746+
),
690747
}
691748

692749

@@ -877,233 +934,6 @@ def _str_pad(arr):
877934

878935
return _str_pad(self._expr)
879936

880-
def center(self, width: int, fillchar: str = " ") -> "UDFExpr":
881-
"""Center strings in a field of given width.
882-
883-
Args:
884-
width: Target width.
885-
fillchar: Character to use for padding.
886-
887-
Returns:
888-
UDFExpr that centers strings.
889-
"""
890-
891-
@udf(return_dtype=DataType.string())
892-
def _str_center(arr):
893-
return pc.utf8_center(arr, width=width, padding=fillchar)
894-
895-
return _str_center(self._expr)
896-
897-
def slice(self, start: int, stop: int = None, step: int = 1) -> "UDFExpr":
898-
"""Slice strings by codeunit indices.
899-
900-
Args:
901-
start: Start position.
902-
stop: Stop position (exclusive). If None, slices to the end.
903-
step: Step size.
904-
905-
Returns:
906-
UDFExpr that slices each string.
907-
"""
908-
909-
@udf(return_dtype=DataType.string())
910-
def _str_slice(arr):
911-
if stop is None:
912-
return pc.utf8_slice_codeunits(arr, start=start, step=step)
913-
else:
914-
return pc.utf8_slice_codeunits(arr, start=start, stop=stop, step=step)
915-
916-
return _str_slice(self._expr)
917-
918-
# Replacement
919-
def replace(
920-
self, pattern: str, replacement: str, max_replacements: int = None
921-
) -> "UDFExpr":
922-
"""Replace occurrences of a substring.
923-
924-
Args:
925-
pattern: The substring to replace.
926-
replacement: The replacement string.
927-
max_replacements: Maximum number of replacements. None means replace all.
928-
929-
Returns:
930-
UDFExpr that replaces substrings.
931-
"""
932-
933-
@udf(return_dtype=DataType.string())
934-
def _str_replace(arr):
935-
if max_replacements is None:
936-
return pc.replace_substring(
937-
arr, pattern=pattern, replacement=replacement
938-
)
939-
else:
940-
return pc.replace_substring(
941-
arr,
942-
pattern=pattern,
943-
replacement=replacement,
944-
max_replacements=max_replacements,
945-
)
946-
947-
return _str_replace(self._expr)
948-
949-
def replace_regex(
950-
self, pattern: str, replacement: str, max_replacements: int = None
951-
) -> "UDFExpr":
952-
"""Replace occurrences matching a regex pattern.
953-
954-
Args:
955-
pattern: The regex pattern to match.
956-
replacement: The replacement string.
957-
max_replacements: Maximum number of replacements. None means replace all.
958-
959-
Returns:
960-
UDFExpr that replaces matching substrings.
961-
"""
962-
963-
@udf(return_dtype=DataType.string())
964-
def _str_replace_regex(arr):
965-
if max_replacements is None:
966-
return pc.replace_substring_regex(
967-
arr, pattern=pattern, replacement=replacement
968-
)
969-
else:
970-
return pc.replace_substring_regex(
971-
arr,
972-
pattern=pattern,
973-
replacement=replacement,
974-
max_replacements=max_replacements,
975-
)
976-
977-
return _str_replace_regex(self._expr)
978-
979-
def replace_slice(self, start: int, stop: int, replacement: str) -> "UDFExpr":
980-
"""Replace a slice with a string.
981-
982-
Args:
983-
start: Start position of slice.
984-
stop: Stop position of slice.
985-
replacement: The replacement string.
986-
987-
Returns:
988-
UDFExpr that replaces the slice.
989-
"""
990-
991-
@udf(return_dtype=DataType.string())
992-
def _str_replace_slice(arr):
993-
return pc.binary_replace_slice(
994-
arr, start=start, stop=stop, replacement=replacement
995-
)
996-
997-
return _str_replace_slice(self._expr)
998-
999-
# Splitting and joining
1000-
def split(
1001-
self, pattern: str, max_splits: int = None, reverse: bool = False
1002-
) -> "UDFExpr":
1003-
"""Split strings by a pattern.
1004-
1005-
Args:
1006-
pattern: The pattern to split on.
1007-
max_splits: Maximum number of splits. None means split all.
1008-
reverse: Whether to split from the right.
1009-
1010-
Returns:
1011-
UDFExpr that returns lists of split strings.
1012-
"""
1013-
1014-
@udf(return_dtype=DataType(object))
1015-
def _str_split(arr):
1016-
if max_splits is None:
1017-
return pc.split_pattern(arr, pattern=pattern, reverse=reverse)
1018-
else:
1019-
return pc.split_pattern(
1020-
arr, pattern=pattern, max_splits=max_splits, reverse=reverse
1021-
)
1022-
1023-
return _str_split(self._expr)
1024-
1025-
def split_regex(
1026-
self, pattern: str, max_splits: int = None, reverse: bool = False
1027-
) -> "UDFExpr":
1028-
"""Split strings by a regex pattern.
1029-
1030-
Args:
1031-
pattern: The regex pattern to split on.
1032-
max_splits: Maximum number of splits. None means split all.
1033-
reverse: Whether to split from the right.
1034-
1035-
Returns:
1036-
UDFExpr that returns lists of split strings.
1037-
"""
1038-
1039-
@udf(return_dtype=DataType(object))
1040-
def _str_split_regex(arr):
1041-
if max_splits is None:
1042-
return pc.split_pattern_regex(arr, pattern=pattern, reverse=reverse)
1043-
else:
1044-
return pc.split_pattern_regex(
1045-
arr, pattern=pattern, max_splits=max_splits, reverse=reverse
1046-
)
1047-
1048-
return _str_split_regex(self._expr)
1049-
1050-
def split_whitespace(
1051-
self, max_splits: int = None, reverse: bool = False
1052-
) -> "UDFExpr":
1053-
"""Split strings on whitespace.
1054-
1055-
Args:
1056-
max_splits: Maximum number of splits. None means split all.
1057-
reverse: Whether to split from the right.
1058-
1059-
Returns:
1060-
UDFExpr that returns lists of split strings.
1061-
"""
1062-
1063-
@udf(return_dtype=DataType(object))
1064-
def _str_split_whitespace(arr):
1065-
if max_splits is None:
1066-
return pc.utf8_split_whitespace(arr, reverse=reverse)
1067-
else:
1068-
return pc.utf8_split_whitespace(
1069-
arr, max_splits=max_splits, reverse=reverse
1070-
)
1071-
1072-
return _str_split_whitespace(self._expr)
1073-
1074-
# Regex extraction
1075-
def extract(self, pattern: str) -> "UDFExpr":
1076-
"""Extract a substring matching a regex pattern.
1077-
1078-
Args:
1079-
pattern: The regex pattern to extract.
1080-
1081-
Returns:
1082-
UDFExpr that returns the first matching substring.
1083-
"""
1084-
1085-
@udf(return_dtype=DataType.string())
1086-
def _str_extract(arr):
1087-
return pc.extract_regex(arr, pattern=pattern)
1088-
1089-
return _str_extract(self._expr)
1090-
1091-
def repeat(self, n: int) -> "UDFExpr":
1092-
"""Repeat each string n times.
1093-
1094-
Args:
1095-
n: Number of repetitions.
1096-
1097-
Returns:
1098-
UDFExpr that repeats strings.
1099-
"""
1100-
1101-
@udf(return_dtype=DataType.string())
1102-
def _str_repeat(arr):
1103-
return pc.binary_repeat(arr, n)
1104-
1105-
return _str_repeat(self._expr)
1106-
1107937

1108938
@dataclass
1109939
class _StructNamespace:

python/ray/data/expressions.pyi

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ class _ListNamespace:
146146

147147
# Auto-generated methods
148148
def len(self) -> UDFExpr: ...
149-
def flatten(self) -> UDFExpr: ...
150149

151150
class _StringNamespace:
152151
"""Namespace for string operations."""
@@ -189,18 +188,6 @@ class _StringNamespace:
189188

190189
# Auto-generated transformations
191190
def reverse(self) -> UDFExpr: ...
192-
193-
# Manual methods (complex logic)
194-
def strip(self, characters: str = None) -> UDFExpr: ...
195-
def lstrip(self, characters: str = None) -> UDFExpr: ...
196-
def rstrip(self, characters: str = None) -> UDFExpr: ...
197-
def pad(
198-
self,
199-
width: int,
200-
fillchar: str = " ",
201-
side: Literal["left", "right", "both"] = "right",
202-
) -> UDFExpr: ...
203-
def center(self, width: int, fillchar: str = " ") -> UDFExpr: ...
204191
def slice(self, start: int, stop: int = None, step: int = 1) -> UDFExpr: ...
205192
def replace(
206193
self, pattern: str, replacement: str, max_replacements: int = None
@@ -220,6 +207,18 @@ class _StringNamespace:
220207
) -> UDFExpr: ...
221208
def extract(self, pattern: str) -> UDFExpr: ...
222209
def repeat(self, n: int) -> UDFExpr: ...
210+
def center(self, width: int, padding: str = " ") -> UDFExpr: ...
211+
212+
# Manual methods (complex logic)
213+
def strip(self, characters: str = None) -> UDFExpr: ...
214+
def lstrip(self, characters: str = None) -> UDFExpr: ...
215+
def rstrip(self, characters: str = None) -> UDFExpr: ...
216+
def pad(
217+
self,
218+
width: int,
219+
fillchar: str = " ",
220+
side: Literal["left", "right", "both"] = "right",
221+
) -> UDFExpr: ...
223222

224223
class _StructNamespace:
225224
"""Namespace for struct operations."""

python/ray/data/tests/test_namespace_expressions.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,6 @@ def test_list_bracket_index(self):
7575
)
7676
assert_df_equal(result, expected)
7777

78-
@pytest.mark.skip(reason="list_flatten changes row structure in complex ways")
79-
def test_list_flatten(self):
80-
"""Test list.flatten() flattens one level of nested lists."""
81-
# Note: list_flatten is available but changes row count, making it
82-
# incompatible with with_column() which expects same row count
83-
pass
84-
8578

8679
# ──────────────────────────────────────
8780
# String Namespace Tests

0 commit comments

Comments
 (0)