Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bindings/python/benches/test_tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
os.environ["RAYON_NUM_THREADS"] = str(num_threads)
num_bytes = sum(map(len, map(str.encode, documents)))
readable_size, unit = format_byte_size(num_bytes)
print(f"==============")
print("==============")
print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
mergeable_ranks = load_tiktoken_bpe(filename)
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/examples/using_the_visualizer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@
}
],
"source": [
"funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
"funnyAnnotations = [{\"startPlace\": i, \"endPlace\": i + 3, \"theTag\": str(i)} for i in range(0, 20, 4)]\n",
"funnyAnnotations"
]
},
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/py_src/tokenizers/tools/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
"""
if len(annotations) == 0:
return {}
labels = set(map(lambda x: x.label, annotations))
labels = {x.label for x in annotations}
num_labels = len(labels)
h_step = int(255 / num_labels)
if h_step < 20:
Expand Down
5 changes: 5 additions & 0 deletions bindings/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ target-version = ["py35"]
[tool.ruff]
line-length = 119
target-version = "py311"
lint.extend-select = [
"ASYNC",
"C4",
"PERF",
Comment on lines +56 to +59
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you explain this?

Copy link
Author

@cclauss cclauss Nov 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are adding these three families of rules to the ruff linter. The rules are documented in the URLs in the commit message.

https://docs.astral.sh/ruff/settings/#lint_extend-select

]
lint.ignore = [
# a == None in tests vs is None.
"E711",
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/scripts/sentencepiece_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:

# Save content
dump(vocab, vocab_f)
merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges))
merges_f.writelines(f"{x[0]} {x[1]}{linesep}" for x in merges)
finally:
# If model was downloaded from internet we need to cleanup the tmp folder.
if hasattr(args, "remote_model") and exists(args.model):
Expand Down
16 changes: 8 additions & 8 deletions bindings/python/tests/bindings/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import pytest
import numpy as np
import asyncio
from time import perf_counter
from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, Unigram
from tokenizers.pre_tokenizers import ByteLevel, Metaspace
from tokenizers.processors import RobertaProcessing, TemplateProcessing
from tokenizers.normalizers import Strip, Lowercase, Sequence
from tokenizers.decoders import ByteFallback, DecodeStream, Metaspace as DecoderMetaspace
import time

from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files

Expand Down Expand Up @@ -341,7 +341,7 @@ def test_padding(self):

# Can pad to the longest in a batch
output = tokenizer.encode_batch(["my name", "my name is john"])
assert all([len(encoding) == 4 for encoding in output])
assert all(len(encoding) == 4 for encoding in output)

# Can pad to the specified length otherwise
tokenizer.enable_padding(length=4)
Expand Down Expand Up @@ -950,21 +950,21 @@ async def encode_async(_):
# Measure sync performance with pre-initialized executor
# Warm up
await asyncio.gather(*[encode_sync_with_executor(i) for i in range(10)])
time.sleep(0.03)
await asyncio.sleep(0.03)
# Actual measurement
start = time.perf_counter()
start = perf_counter()
await asyncio.gather(*[encode_sync_with_executor(i) for i in range(n_tasks)])
sync_time = time.perf_counter() - start
sync_time = perf_counter() - start

# Measure async performance
# Warm up
await asyncio.gather(*[encode_async(i) for i in range(10)])

# Actual measurement
time.sleep(0.03)
start = time.perf_counter()
await asyncio.sleep(0.03)
start = perf_counter()
await asyncio.gather(*[encode_async(i) for i in range(n_tasks)])
async_time = time.perf_counter() - start
async_time = perf_counter() - start

# Log times
print(f"sync vs async processing times: {sync_time:.4f}s vs {async_time:.4f}s for {n_tasks} tasks")
Expand Down