 import os
 import re
 from pathlib import Path
-from typing import Callable, Optional
+from typing import Optional
 
 import numpy as np
 from datasets import load_dataset
@@ -13,34 +13,31 @@
 MAX_CHAR = 1000
 
 
-def create_token_estimator(
-    model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
-) -> Callable[[str], int]:
-    _tokenizer: Optional[AutoTokenizer] = None
+class TokenCounter:
+    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"):
+        self.model_name = model_name
+        self._tokenizer: Optional[AutoTokenizer] = None
 
-    def initialize() -> None:
-        nonlocal _tokenizer
-        if _tokenizer is None:
+    def _initialize_tokenizer(self) -> None:
+        if self._tokenizer is None:
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
             try:
-                _tokenizer = AutoTokenizer.from_pretrained(model_name)
+                self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
             except (OSError, ImportError, ValueError) as e:
                 raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e
 
-    def estimate_num_tokens(text: str) -> int:
-        initialize()
+    def estimate_num_tokens(self, text: str) -> int:
+        self._initialize_tokenizer()
 
-        if _tokenizer is None:
+        if self._tokenizer is None:
             return 0
 
         try:
-            encoding = _tokenizer(text, return_tensors=None)
+            encoding = self._tokenizer(text, return_tensors=None)
             return len(encoding["input_ids"])
         except (AttributeError, TypeError, RuntimeError) as e:
             raise ValueError(f"Error processing text: {e}") from e
 
-    return estimate_num_tokens
-
 
 def extract_and_save_with_filtering(file):
     """substract human prompts and apply filtering conditions"""
@@ -93,7 +90,7 @@ def extract_and_save_with_filtering(file):
     with Path(sharegpt_file).open("r", encoding="utf-8") as file:
         data = json.load(file)
 
-    estimate_tokens = create_token_estimator()
+    counter = TokenCounter()
     num_of_ids = len(data)
     data = data[: int(num_of_ids * args.parse)]
     for d in data:
@@ -102,9 +99,9 @@ def extract_and_save_with_filtering(file):
         gpt_tokens = []
         for conv in d["conversations"]:
             if conv["from"] == "human":
-                human_tokens.append(estimate_tokens(conv["value"]))
+                human_tokens.append(counter.estimate_num_tokens(conv["value"]))
             if conv["from"] == "gpt":
-                token_number = estimate_tokens(conv["value"])
+                token_number = counter.estimate_num_tokens(conv["value"])
                 conv["num_tokens"] = token_number
                 gpt_tokens.append(token_number)
         if len(human_tokens) == 0:
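
Usage note: the refactor swaps the closure returned by create_token_estimator() for a TokenCounter class, but keeps the lazy tokenizer initialization, the default Mistral model name, and the error handling. A minimal sketch of calling the new class, assuming the patched script is importable as a module named prepare_dataset (the module name and the sample prompt are illustrative, not taken from the diff):

    # Hypothetical standalone usage of the refactored class; module name is assumed.
    from prepare_dataset import TokenCounter

    counter = TokenCounter()  # defaults to "mistralai/Mistral-7B-Instruct-v0.2"
    # The Hugging Face tokenizer is only downloaded/loaded on the first call.
    n = counter.estimate_num_tokens("How many tokens is this prompt?")
    print(n)

A single counter instance is created once before the conversion loop, so the tokenizer is loaded at most once per run, same as the old closure.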