
Commit 45bfaff
Python: Add ruff rules for asyncio and performance
1 parent a05b60c · commit 45bfaff

6 files changed: +17 additions, −12 deletions

bindings/python/benches/test_tiktoken.py
Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     os.environ["RAYON_NUM_THREADS"] = str(num_threads)
     num_bytes = sum(map(len, map(str.encode, documents)))
     readable_size, unit = format_byte_size(num_bytes)
-    print(f"==============")
+    print("==============")
     print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
     filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
     mergeable_ranks = load_tiktoken_bpe(filename)

bindings/python/examples/using_the_visualizer.ipynb
Lines changed: 1 addition & 1 deletion

@@ -552,7 +552,7 @@
     }
    ],
    "source": [
-    "funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
+    "funnyAnnotations = [{\"startPlace\": i, \"endPlace\": i + 3, \"theTag\": str(i)} for i in range(0, 20, 4)]\n",
     "funnyAnnotations"
    ]
   },

bindings/python/py_src/tokenizers/tools/visualizer.py
Lines changed: 1 addition & 1 deletion

@@ -164,7 +164,7 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
         """
         if len(annotations) == 0:
             return {}
-        labels = set(map(lambda x: x.label, annotations))
+        labels = {x.label for x in annotations}
         num_labels = len(labels)
         h_step = int(255 / num_labels)
         if h_step < 20:

bindings/python/pyproject.toml
Lines changed: 5 additions & 0 deletions

@@ -53,6 +53,11 @@ target-version = ["py35"]
 [tool.ruff]
 line-length = 119
 target-version = "py311"
+lint.extend-select = [
+    "ASYNC",
+    "C4",
+    "PERF",
+]
 lint.ignore = [
     # a == None in tests vs is None.
     "E711",

bindings/python/scripts/sentencepiece_extractor.py
Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:

             # Save content
             dump(vocab, vocab_f)
-            merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges))
+            merges_f.writelines((f"{x[0]} {x[1]}{linesep}" for x in merges))
         finally:
             # If model was downloaded from internet we need to cleanup the tmp folder.
             if hasattr(args, "remote_model") and exists(args.model):

bindings/python/tests/bindings/test_tokenizer.py
Lines changed: 8 additions & 8 deletions

@@ -3,14 +3,14 @@
 import pytest
 import numpy as np
 import asyncio
+from time import perf_counter
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
 from tokenizers.models import BPE, Model, Unigram
 from tokenizers.pre_tokenizers import ByteLevel, Metaspace
 from tokenizers.processors import RobertaProcessing, TemplateProcessing
 from tokenizers.normalizers import Strip, Lowercase, Sequence
 from tokenizers.decoders import ByteFallback, DecodeStream, Metaspace as DecoderMetaspace
-import time

 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files

@@ -341,7 +341,7 @@ def test_padding(self):

         # Can pad to the longest in a batch
         output = tokenizer.encode_batch(["my name", "my name is john"])
-        assert all([len(encoding) == 4 for encoding in output])
+        assert all(len(encoding) == 4 for encoding in output)

         # Can pad to the specified length otherwise
         tokenizer.enable_padding(length=4)

@@ -950,21 +950,21 @@ async def encode_async(_):
         # Measure sync performance with pre-initialized executor
         # Warm up
         await asyncio.gather(*[encode_sync_with_executor(i) for i in range(10)])
-        time.sleep(0.03)
+        asyncio.sleep(0.03)
         # Actual measurement
-        start = time.perf_counter()
+        start = perf_counter()
         await asyncio.gather(*[encode_sync_with_executor(i) for i in range(n_tasks)])
-        sync_time = time.perf_counter() - start
+        sync_time = perf_counter() - start

         # Measure async performance
         # Warm up
         await asyncio.gather(*[encode_async(i) for i in range(10)])

         # Actual measurement
-        time.sleep(0.03)
-        start = time.perf_counter()
+        asyncio.sleep(0.03)
+        start = perf_counter()
         await asyncio.gather(*[encode_async(i) for i in range(n_tasks)])
-        async_time = time.perf_counter() - start
+        async_time = perf_counter() - start

         # Log times
         print(f"sync vs async processing times: {sync_time:.4f}s vs {async_time:.4f}s for {n_tasks} tasks")
