File tree Expand file tree Collapse file tree 2 files changed +56
-2
lines changed
py_src/tokenizers/pre_tokenizers Expand file tree Collapse file tree 2 files changed +56
-2
lines changed Original file line number Diff line number Diff line change @@ -523,7 +523,34 @@ class UnicodeScripts(PreTokenizer):
523523
524524class Whitespace (PreTokenizer ):
525525 """
526- This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
526+ This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
527+ regex pattern. It splits on runs of word characters or on runs of characters
528+ that aren't word characters or whitespace (punctuation such as hyphens, apostrophes, commas, etc.).
529+
530+ Example:
531+ Use the `Whitespace` pre-tokenizer as shown below::
532+
533+ ```python
534+ from tokenizers.pre_tokenizers import Whitespace
535+
536+ pre_tokenizer = Whitespace()
537+ text = "Hello, world! Let's try the Whitespace pre-tokenizer."
538+ pre_tokenizer.pre_tokenize_str(text)
539+ [('Hello', (0, 5)),
540+ (',', (5, 6)),
541+ ('world', (7, 12)),
542+ ('!', (12, 13)),
543+ ('Let', (14, 17)),
544+ ("'", (17, 18)),
545+ ('s', (18, 19)),
546+ ('try', (20, 23)),
547+ ('the', (24, 27)),
548+ ('Whitespace', (28, 38)),
549+ ('pre', (39, 42)),
550+ ('-', (42, 43)),
551+ ('tokenizer', (43, 52)),
552+ ('.', (52, 53))]
553+ ```
527554 """
528555 def __init__ (self ):
529556 pass
Original file line number Diff line number Diff line change @@ -341,7 +341,34 @@ impl PyByteLevel {
341341 }
342342}
343343
344- /// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
344+ /// This pre-tokenizer splits on word boundaries according to the `\w+|[^\w\s]+`
345+ /// regex pattern. It splits on runs of word characters or on runs of characters
346+ /// that aren't word characters or whitespace (punctuation such as hyphens, apostrophes, commas, etc.).
347+ ///
348+ /// Example:
349+ /// Use the `Whitespace` pre-tokenizer as shown below::
350+ ///
351+ /// ```python
352+ /// from tokenizers.pre_tokenizers import Whitespace
353+ ///
354+ /// pre_tokenizer = Whitespace()
355+ /// text = "Hello, world! Let's try the Whitespace pre-tokenizer."
356+ /// pre_tokenizer.pre_tokenize_str(text)
357+ /// [('Hello', (0, 5)),
358+ /// (',', (5, 6)),
359+ /// ('world', (7, 12)),
360+ /// ('!', (12, 13)),
361+ /// ('Let', (14, 17)),
362+ /// ("'", (17, 18)),
363+ /// ('s', (18, 19)),
364+ /// ('try', (20, 23)),
365+ /// ('the', (24, 27)),
366+ /// ('Whitespace', (28, 38)),
367+ /// ('pre', (39, 42)),
368+ /// ('-', (42, 43)),
369+ /// ('tokenizer', (43, 52)),
370+ /// ('.', (52, 53))]
371+ /// ```
345372#[ pyclass( extends=PyPreTokenizer , module = "tokenizers.pre_tokenizers" , name = "Whitespace" ) ]
346373pub struct PyWhitespace { }
347374#[ pymethods]
You can’t perform that action at this time.
0 commit comments