@@ -543,9 +543,6 @@ def _add_methods_from_config(
543543 "len" : _PyArrowMethodConfig (
544544 "list_value_length" , DataType .int32 (), docstring = "Get the length of each list."
545545 ),
546- "flatten" : _PyArrowMethodConfig (
547- "list_flatten" , DataType (object ), docstring = "Flatten nested lists."
548- ),
549546}
550547
551548_STRING_METHODS = {
@@ -687,6 +684,66 @@ def _add_methods_from_config(
687684 "reverse" : _PyArrowMethodConfig (
688685 "utf8_reverse" , DataType .string (), docstring = "Reverse each string."
689686 ),
687+ "slice" : _PyArrowMethodConfig (
688+ "utf8_slice_codeunits" ,
689+ DataType .string (),
690+ params = ["start" , "stop" , "step" ],
691+ docstring = "Slice strings by codeunit indices." ,
692+ ),
693+ "replace" : _PyArrowMethodConfig (
694+ "replace_substring" ,
695+ DataType .string (),
696+ params = ["pattern" , "replacement" , "max_replacements" ],
697+ docstring = "Replace occurrences of a substring." ,
698+ ),
699+ "replace_regex" : _PyArrowMethodConfig (
700+ "replace_substring_regex" ,
701+ DataType .string (),
702+ params = ["pattern" , "replacement" , "max_replacements" ],
703+ docstring = "Replace occurrences matching a regex pattern." ,
704+ ),
705+ "replace_slice" : _PyArrowMethodConfig (
706+ "binary_replace_slice" ,
707+ DataType .string (),
708+ params = ["start" , "stop" , "replacement" ],
709+ docstring = "Replace a slice with a string." ,
710+ ),
711+ "split" : _PyArrowMethodConfig (
712+ "split_pattern" ,
713+ DataType (object ),
714+ params = ["pattern" , "max_splits" , "reverse" ],
715+ docstring = "Split strings by a pattern." ,
716+ ),
717+ "split_regex" : _PyArrowMethodConfig (
718+ "split_pattern_regex" ,
719+ DataType (object ),
720+ params = ["pattern" , "max_splits" , "reverse" ],
721+ docstring = "Split strings by a regex pattern." ,
722+ ),
723+ "split_whitespace" : _PyArrowMethodConfig (
724+ "utf8_split_whitespace" ,
725+ DataType (object ),
726+ params = ["max_splits" , "reverse" ],
727+ docstring = "Split strings on whitespace." ,
728+ ),
729+ "extract" : _PyArrowMethodConfig (
730+ "extract_regex" ,
731+ DataType .string (),
732+ params = ["pattern" ],
733+ docstring = "Extract a substring matching a regex pattern." ,
734+ ),
735+ "repeat" : _PyArrowMethodConfig (
736+ "binary_repeat" ,
737+ DataType .string (),
738+ params = ["n" ],
739+ docstring = "Repeat each string n times." ,
740+ ),
741+ "center" : _PyArrowMethodConfig (
742+ "utf8_center" ,
743+ DataType .string (),
744+ params = ["width" , "padding" ],
745+ docstring = "Center strings in a field of given width." ,
746+ ),
690747}
691748
692749
@@ -877,233 +934,6 @@ def _str_pad(arr):
877934
878935 return _str_pad (self ._expr )
879936
def center(self, width: int, fillchar: str = " ") -> "UDFExpr":
    """Pad each string on both sides so it sits centered in ``width``.

    Args:
        width: Total width of the output field.
        fillchar: Character used to fill the space around the string.

    Returns:
        UDFExpr that produces the centered strings.
    """
    # ``fillchar`` is forwarded to the Arrow kernel as its ``padding`` option.
    center_kwargs = {"width": width, "padding": fillchar}

    @udf(return_dtype=DataType.string())
    def _str_center(arr):
        return pc.utf8_center(arr, **center_kwargs)

    return _str_center(self._expr)
896-
def slice(self, start: int, stop: int = None, step: int = 1) -> "UDFExpr":
    """Take a ``start:stop:step`` slice of every string, by codeunit index.

    Args:
        start: Index of the first codeunit to keep.
        stop: Exclusive end index. When None, the slice runs to the end
            of each string.
        step: Stride between selected codeunits.

    Returns:
        UDFExpr that yields the sliced strings.
    """
    # ``stop`` is only forwarded when the caller supplied one, so the
    # kernel's own default applies otherwise.
    slice_kwargs = {"start": start, "step": step}
    if stop is not None:
        slice_kwargs["stop"] = stop

    @udf(return_dtype=DataType.string())
    def _str_slice(arr):
        return pc.utf8_slice_codeunits(arr, **slice_kwargs)

    return _str_slice(self._expr)
917-
918- # Replacement
def replace(
    self, pattern: str, replacement: str, max_replacements: int = None
) -> "UDFExpr":
    """Substitute a literal substring with another string.

    Args:
        pattern: Substring to search for.
        replacement: Text that takes the place of each match.
        max_replacements: Upper bound on substitutions per string. None
            leaves the count unrestricted (the option is not forwarded).

    Returns:
        UDFExpr that yields the strings after substitution.
    """
    replace_kwargs = {"pattern": pattern, "replacement": replacement}
    # Only pass the cap when one was requested.
    if max_replacements is not None:
        replace_kwargs["max_replacements"] = max_replacements

    @udf(return_dtype=DataType.string())
    def _str_replace(arr):
        return pc.replace_substring(arr, **replace_kwargs)

    return _str_replace(self._expr)
948-
def replace_regex(
    self, pattern: str, replacement: str, max_replacements: int = None
) -> "UDFExpr":
    """Substitute every regex match with a replacement string.

    Args:
        pattern: Regular expression to search for.
        replacement: Text that takes the place of each match.
        max_replacements: Upper bound on substitutions per string. None
            leaves the count unrestricted (the option is not forwarded).

    Returns:
        UDFExpr that yields the strings after substitution.
    """
    regex_kwargs = {"pattern": pattern, "replacement": replacement}
    # Only pass the cap when one was requested.
    if max_replacements is not None:
        regex_kwargs["max_replacements"] = max_replacements

    @udf(return_dtype=DataType.string())
    def _str_replace_regex(arr):
        return pc.replace_substring_regex(arr, **regex_kwargs)

    return _str_replace_regex(self._expr)
978-
def replace_slice(self, start: int, stop: int, replacement: str) -> "UDFExpr":
    """Replace the ``[start, stop)`` slice of each string with a string.

    Positions are counted in UTF-8 characters, so multi-byte characters
    are never split. For pure-ASCII input this is identical to counting
    bytes.

    Args:
        start: Start position of the slice (inclusive).
        stop: Stop position of the slice (exclusive).
        replacement: The string inserted in place of the slice.

    Returns:
        UDFExpr that replaces the slice.
    """

    @udf(return_dtype=DataType.string())
    def _str_replace_slice(arr):
        # Use the UTF-8 aware kernel: ``binary_replace_slice`` measures
        # start/stop in bytes and can cut a multi-byte character in half
        # when applied to non-ASCII text.
        return pc.utf8_replace_slice(
            arr, start=start, stop=stop, replacement=replacement
        )

    return _str_replace_slice(self._expr)
998-
999- # Splitting and joining
def split(
    self, pattern: str, max_splits: int = None, reverse: bool = False
) -> "UDFExpr":
    """Break each string into pieces around a literal separator.

    Args:
        pattern: Separator to split on.
        max_splits: Upper bound on the number of splits. None places no
            limit (the option is not forwarded).
        reverse: Whether to split starting from the right.

    Returns:
        UDFExpr that yields a list of pieces for every string.
    """
    split_kwargs = {"pattern": pattern, "reverse": reverse}
    # Only pass the cap when one was requested.
    if max_splits is not None:
        split_kwargs["max_splits"] = max_splits

    @udf(return_dtype=DataType(object))
    def _str_split(arr):
        return pc.split_pattern(arr, **split_kwargs)

    return _str_split(self._expr)
1024-
def split_regex(
    self, pattern: str, max_splits: int = None, reverse: bool = False
) -> "UDFExpr":
    """Break each string into pieces around regex matches.

    Args:
        pattern: Regular expression that marks the split points.
        max_splits: Upper bound on the number of splits. None places no
            limit (the option is not forwarded).
        reverse: Whether to split starting from the right.

    Returns:
        UDFExpr that yields a list of pieces for every string.
    """
    split_kwargs = {"pattern": pattern, "reverse": reverse}
    # Only pass the cap when one was requested.
    if max_splits is not None:
        split_kwargs["max_splits"] = max_splits

    @udf(return_dtype=DataType(object))
    def _str_split_regex(arr):
        return pc.split_pattern_regex(arr, **split_kwargs)

    return _str_split_regex(self._expr)
1049-
def split_whitespace(
    self, max_splits: int = None, reverse: bool = False
) -> "UDFExpr":
    """Break each string into pieces around whitespace.

    Args:
        max_splits: Upper bound on the number of splits. None places no
            limit (the option is not forwarded).
        reverse: Whether to split starting from the right.

    Returns:
        UDFExpr that yields a list of pieces for every string.
    """
    split_kwargs = {"reverse": reverse}
    # Only pass the cap when one was requested.
    if max_splits is not None:
        split_kwargs["max_splits"] = max_splits

    @udf(return_dtype=DataType(object))
    def _str_split_whitespace(arr):
        return pc.utf8_split_whitespace(arr, **split_kwargs)

    return _str_split_whitespace(self._expr)
1073-
1074- # Regex extraction
def extract(self, pattern: str) -> "UDFExpr":
    """Apply a regex to each string and return what it captured.

    Args:
        pattern: Regular expression handed to the Arrow extraction kernel.

    Returns:
        UDFExpr wrapping the kernel's output, declared with a string dtype.
    """
    # NOTE(review): pyarrow's ``extract_regex`` is documented to emit a
    # struct of named capture groups, not a plain string -- confirm that
    # the declared string return dtype is what callers expect.

    @udf(return_dtype=DataType.string())
    def _str_extract(arr):
        extracted = pc.extract_regex(arr, pattern=pattern)
        return extracted

    return _str_extract(self._expr)
1090-
def repeat(self, n: int) -> "UDFExpr":
    """Concatenate each string with itself ``n`` times.

    Args:
        n: How many copies of the string to emit.

    Returns:
        UDFExpr that yields the repeated strings.
    """

    @udf(return_dtype=DataType.string())
    def _str_repeat(arr):
        # The count is passed positionally, as the kernel's second argument.
        repeated = pc.binary_repeat(arr, n)
        return repeated

    return _str_repeat(self._expr)
1106-
1107937
1108938@dataclass
1109939class _StructNamespace :
0 commit comments