
Commit 45bfaff
Python: Add ruff rules for asyncio and performance
1 parent a05b60c · commit 45bfaff

6 files changed: +17 additions, −12 deletions

bindings/python/benches/test_tiktoken.py
Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     os.environ["RAYON_NUM_THREADS"] = str(num_threads)
     num_bytes = sum(map(len, map(str.encode, documents)))
     readable_size, unit = format_byte_size(num_bytes)
-    print(f"==============")
+    print("==============")
     print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
     filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
     mergeable_ranks = load_tiktoken_bpe(filename)

bindings/python/examples/using_the_visualizer.ipynb
Lines changed: 1 addition & 1 deletion

@@ -552,7 +552,7 @@
     }
    ],
    "source": [
-    "funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n",
+    "funnyAnnotations = [{\"startPlace\": i, \"endPlace\": i + 3, \"theTag\": str(i)} for i in range(0, 20, 4)]\n",
     "funnyAnnotations"
    ]
   },

bindings/python/py_src/tokenizers/tools/visualizer.py
Lines changed: 1 addition & 1 deletion

@@ -164,7 +164,7 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]:
         """
         if len(annotations) == 0:
             return {}
-        labels = set(map(lambda x: x.label, annotations))
+        labels = {x.label for x in annotations}
         num_labels = len(labels)
         h_step = int(255 / num_labels)
         if h_step < 20:

bindings/python/pyproject.toml
Lines changed: 5 additions & 0 deletions

@@ -53,6 +53,11 @@ target-version = ["py35"]
 [tool.ruff]
 line-length = 119
 target-version = "py311"
+lint.extend-select = [
+    "ASYNC",
+    "C4",
+    "PERF",
+]
 lint.ignore = [
     # a == None in tests vs is None.
     "E711",

bindings/python/scripts/sentencepiece_extractor.py
Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]:

             # Save content
             dump(vocab, vocab_f)
-            merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges))
+            merges_f.writelines((f"{x[0]} {x[1]}{linesep}" for x in merges))
         finally:
             # If model was downloaded from internet we need to cleanup the tmp folder.
             if hasattr(args, "remote_model") and exists(args.model):

bindings/python/tests/bindings/test_tokenizer.py
Lines changed: 8 additions & 8 deletions

@@ -3,14 +3,14 @@
 import pytest
 import numpy as np
 import asyncio
+from time import perf_counter
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
 from tokenizers.models import BPE, Model, Unigram
 from tokenizers.pre_tokenizers import ByteLevel, Metaspace
 from tokenizers.processors import RobertaProcessing, TemplateProcessing
 from tokenizers.normalizers import Strip, Lowercase, Sequence
 from tokenizers.decoders import ByteFallback, DecodeStream, Metaspace as DecoderMetaspace
-import time

 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files

@@ -341,7 +341,7 @@ def test_padding(self):

         # Can pad to the longest in a batch
         output = tokenizer.encode_batch(["my name", "my name is john"])
-        assert all([len(encoding) == 4 for encoding in output])
+        assert all(len(encoding) == 4 for encoding in output)

         # Can pad to the specified length otherwise
         tokenizer.enable_padding(length=4)

@@ -950,21 +950,21 @@ async def encode_async(_):
         # Measure sync performance with pre-initialized executor
         # Warm up
         await asyncio.gather(*[encode_sync_with_executor(i) for i in range(10)])
-        time.sleep(0.03)
+        asyncio.sleep(0.03)
         # Actual measurement
-        start = time.perf_counter()
+        start = perf_counter()
         await asyncio.gather(*[encode_sync_with_executor(i) for i in range(n_tasks)])
-        sync_time = time.perf_counter() - start
+        sync_time = perf_counter() - start

         # Measure async performance
         # Warm up
         await asyncio.gather(*[encode_async(i) for i in range(10)])

         # Actual measurement
-        time.sleep(0.03)
-        start = time.perf_counter()
+        asyncio.sleep(0.03)
+        start = perf_counter()
         await asyncio.gather(*[encode_async(i) for i in range(n_tasks)])
-        async_time = time.perf_counter() - start
+        async_time = perf_counter() - start

         # Log times
         print(f"sync vs async processing times: {sync_time:.4f}s vs {async_time:.4f}s for {n_tasks} tasks")
