File tree Expand file tree Collapse file tree 2 files changed +56
-2
lines changed
py_src/tokenizers/pre_tokenizers Expand file tree Collapse file tree 2 files changed +56
-2
lines changed Original file line number Diff line number Diff line change @@ -523,7 +523,34 @@ class UnicodeScripts(PreTokenizer):
523523
524524class Whitespace (PreTokenizer ):
525525 """
526- This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
526+ This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
527+ regex pattern. It splits on runs of word characters or on runs of characters
528+ that aren't word characters or whitespace (punctuation such as hyphens, apostrophes, commas, etc.).
529+
530+ Example:
531+ Use the `Whitespace` pre-tokenizer as shown below::
532+
533+ ```python
534+ from tokenizers.pre_tokenizers import Whitespace
535+
536+ pre_tokenizer = Whitespace()
537+ text = "Hello, world! Let's try the Whitespace pre-tokenizer."
538+ pre_tokenizer.pre_tokenize_str(text)
539+ [('Hello', (0, 5)),
540+ (',', (5, 6)),
541+ ('world', (7, 12)),
542+ ('!', (12, 13)),
543+ ('Let', (14, 17)),
544+ ("'", (17, 18)),
545+ ('s', (18, 19)),
546+ ('try', (20, 23)),
547+ ('the', (24, 27)),
548+ ('Whitespace', (28, 38)),
549+ ('pre', (39, 42)),
550+ ('-', (42, 43)),
551+ ('tokenizer', (43, 52)),
552+ ('.', (52, 53))]
553+ ```
527554 """
528555 def __init__ (self ):
529556 pass
Original file line number Diff line number Diff line change @@ -341,7 +341,34 @@ impl PyByteLevel {
341341 }
342342}
343343
344- /// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
344+ /// This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
345+ /// regex pattern. It splits on runs of word characters or on runs of characters
346+ /// that aren't word characters or whitespace (punctuation such as hyphens, apostrophes, commas, etc.).
347+ ///
348+ /// Example:
349+ /// Use the `Whitespace` pre-tokenizer as shown below::
350+ ///
351+ /// ```python
352+ /// from tokenizers.pre_tokenizers import Whitespace
353+ ///
354+ /// pre_tokenizer = Whitespace()
355+ /// text = "Hello, world! Let's try the Whitespace pre-tokenizer."
356+ /// pre_tokenizer.pre_tokenize_str(text)
357+ /// [('Hello', (0, 5)),
358+ /// (',', (5, 6)),
359+ /// ('world', (7, 12)),
360+ /// ('!', (12, 13)),
361+ /// ('Let', (14, 17)),
362+ /// ("'", (17, 18)),
363+ /// ('s', (18, 19)),
364+ /// ('try', (20, 23)),
365+ /// ('the', (24, 27)),
366+ /// ('Whitespace', (28, 38)),
367+ /// ('pre', (39, 42)),
368+ /// ('-', (42, 43)),
369+ /// ('tokenizer', (43, 52)),
370+ /// ('.', (52, 53))]
371+ /// ```
345372#[ pyclass( extends=PyPreTokenizer , module = "tokenizers.pre_tokenizers" , name = "Whitespace" ) ]
346373pub struct PyWhitespace { }
347374#[ pymethods]
You can’t perform that action at this time.
0 commit comments