Skip to content

Commit 2658dd2

Browse files
authored
[docs] Whitespace (#1785)
* Update pre_tokenizers.rs * fix * fix * fix * update * fix * fix * fix
1 parent 32bbe1b commit 2658dd2

File tree

2 files changed

+56
-2
lines changed

2 files changed

+56
-2
lines changed

bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,34 @@ class UnicodeScripts(PreTokenizer):
523523

524524
class Whitespace(PreTokenizer):
525525
"""
526-
This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
526+
This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
527+
regex pattern. It splits on word characters or characters that aren't words or
528+
whitespaces (punctuation such as hyphens, apostrophes, commas, etc.).
529+
530+
Example:
531+
Use the `Whitespace` pre-tokenizer as shown below::
532+
533+
```python
534+
from tokenizers.pre_tokenizers import Whitespace
535+
536+
pre_tokenizer = Whitespace()
537+
text = "Hello, world! Let's try the Whitespace pre-tokenizer."
538+
pre_tokenizer.pre_tokenize_str(text)
539+
[('Hello', (0, 5)),
540+
(',', (5, 6)),
541+
('world', (7, 12)),
542+
('!', (12, 13)),
543+
('Let', (14, 17)),
544+
("'", (17, 18)),
545+
('s', (18, 19)),
546+
('try', (20, 23)),
547+
('the', (24, 27)),
548+
('Whitespace', (28, 38)),
549+
('pre', (39, 42)),
550+
('-', (42, 43)),
551+
('tokenizer', (43, 52)),
552+
('.', (52, 53))]
553+
```
527554
"""
528555
def __init__(self):
529556
pass

bindings/python/src/pre_tokenizers.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,34 @@ impl PyByteLevel {
341341
}
342342
}
343343

344-
/// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
344+
/// This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
345+
/// regex pattern. It splits on word characters or characters that aren't words or
346+
/// whitespaces (punctuation such as hyphens, apostrophes, commas, etc.).
347+
///
348+
/// Example:
349+
/// Use the `Whitespace` pre-tokenizer as shown below::
350+
///
351+
/// ```python
352+
/// from tokenizers.pre_tokenizers import Whitespace
353+
///
354+
/// pre_tokenizer = Whitespace()
355+
/// text = "Hello, world! Let's try the Whitespace pre-tokenizer."
356+
/// pre_tokenizer.pre_tokenize_str(text)
357+
/// [('Hello', (0, 5)),
358+
/// (',', (5, 6)),
359+
/// ('world', (7, 12)),
360+
/// ('!', (12, 13)),
361+
/// ('Let', (14, 17)),
362+
/// ("'", (17, 18)),
363+
/// ('s', (18, 19)),
364+
/// ('try', (20, 23)),
365+
/// ('the', (24, 27)),
366+
/// ('Whitespace', (28, 38)),
367+
/// ('pre', (39, 42)),
368+
/// ('-', (42, 43)),
369+
/// ('tokenizer', (43, 52)),
370+
/// ('.', (52, 53))]
371+
/// ```
345372
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")]
346373
pub struct PyWhitespace {}
347374
#[pymethods]

0 commit comments

Comments
 (0)