diff --git a/bindings/python/benches/test_tiktoken.py b/bindings/python/benches/test_tiktoken.py index 3fdad5daf..28538df8d 100644 --- a/bindings/python/benches/test_tiktoken.py +++ b/bindings/python/benches/test_tiktoken.py @@ -29,7 +29,7 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document os.environ["RAYON_NUM_THREADS"] = str(num_threads) num_bytes = sum(map(len, map(str.encode, documents))) readable_size, unit = format_byte_size(num_bytes) - print(f"==============") + print("==============") print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}") filename = hf_hub_download(MODEL_ID, "original/tokenizer.model") mergeable_ranks = load_tiktoken_bpe(filename) diff --git a/bindings/python/examples/using_the_visualizer.ipynb b/bindings/python/examples/using_the_visualizer.ipynb index 61d6fb845..6e0d9d18d 100644 --- a/bindings/python/examples/using_the_visualizer.ipynb +++ b/bindings/python/examples/using_the_visualizer.ipynb @@ -552,7 +552,7 @@ } ], "source": [ - "funnyAnnotations = [dict(startPlace=i, endPlace=i + 3, theTag=str(i)) for i in range(0, 20, 4)]\n", + "funnyAnnotations = [{\"startPlace\": i, \"endPlace\": i + 3, \"theTag\": str(i)} for i in range(0, 20, 4)]\n", "funnyAnnotations" ] }, diff --git a/bindings/python/py_src/tokenizers/tools/visualizer.py b/bindings/python/py_src/tokenizers/tools/visualizer.py index d8b6158c7..f890f1480 100644 --- a/bindings/python/py_src/tokenizers/tools/visualizer.py +++ b/bindings/python/py_src/tokenizers/tools/visualizer.py @@ -164,7 +164,7 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]: """ if len(annotations) == 0: return {} - labels = set(map(lambda x: x.label, annotations)) + labels = {x.label for x in annotations} num_labels = len(labels) h_step = int(255 / num_labels) if h_step < 20: diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 2fd2918db..c78e9b7a1 
100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -53,6 +53,11 @@ target-version = ["py35"] [tool.ruff] line-length = 119 target-version = "py311" +lint.extend-select = [ + "ASYNC", + "C4", + "PERF", +] lint.ignore = [ # a == None in tests vs is None. "E711", diff --git a/bindings/python/scripts/sentencepiece_extractor.py b/bindings/python/scripts/sentencepiece_extractor.py index a7bce9b49..6490292d5 100644 --- a/bindings/python/scripts/sentencepiece_extractor.py +++ b/bindings/python/scripts/sentencepiece_extractor.py @@ -138,7 +138,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: # Save content dump(vocab, vocab_f) - merges_f.writelines(map(lambda x: f"{x[0]} {x[1]}{linesep}", merges)) + merges_f.writelines((f"{x[0]} {x[1]}{linesep}" for x in merges)) finally: # If model was downloaded from internet we need to cleanup the tmp folder. if hasattr(args, "remote_model") and exists(args.model): diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index 28f6b38d4..56f57595c 100644 --- a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -3,6 +3,7 @@ import pytest import numpy as np import asyncio +from time import perf_counter from tokenizers import AddedToken, Encoding, Tokenizer from tokenizers.implementations import BertWordPieceTokenizer from tokenizers.models import BPE, Model, Unigram @@ -10,7 +11,6 @@ from tokenizers.processors import RobertaProcessing, TemplateProcessing from tokenizers.normalizers import Strip, Lowercase, Sequence from tokenizers.decoders import ByteFallback, DecodeStream, Metaspace as DecoderMetaspace -import time from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files @@ -341,7 +341,7 @@ def test_padding(self): # Can pad to the longest in a batch output = tokenizer.encode_batch(["my name", "my name is john"]) - assert all([len(encoding) == 4 for 
encoding in output]) + assert all(len(encoding) == 4 for encoding in output) # Can pad to the specified length otherwise tokenizer.enable_padding(length=4) @@ -950,21 +950,21 @@ async def encode_async(_): # Measure sync performance with pre-initialized executor # Warm up await asyncio.gather(*[encode_sync_with_executor(i) for i in range(10)]) - time.sleep(0.03) + await asyncio.sleep(0.03) # Actual measurement - start = time.perf_counter() + start = perf_counter() await asyncio.gather(*[encode_sync_with_executor(i) for i in range(n_tasks)]) - sync_time = time.perf_counter() - start + sync_time = perf_counter() - start # Measure async performance # Warm up await asyncio.gather(*[encode_async(i) for i in range(10)]) # Actual measurement - time.sleep(0.03) - start = time.perf_counter() + await asyncio.sleep(0.03) + start = perf_counter() await asyncio.gather(*[encode_async(i) for i in range(n_tasks)]) - async_time = time.perf_counter() - start + async_time = perf_counter() - start # Log times print(f"sync vs async processing times: {sync_time:.4f}s vs {async_time:.4f}s for {n_tasks} tasks")