diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml new file mode 100644 index 000000000..a0e9f8252 --- /dev/null +++ b/.github/workflows/cpp.yml @@ -0,0 +1,149 @@ +name: C++ + +on: + push: + branches: + - main + paths-ignore: + - bindings/node/** + - bindings/python/** + - docs/** + pull_request: + paths-ignore: + - bindings/node/** + - bindings/python/** + - docs/** + +jobs: + build_and_test: + name: Build and test C++ bindings + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + include: + - os: ubuntu-latest + cmake_generator: "Unix Makefiles" + - os: macos-latest + cmake_generator: "Unix Makefiles" + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust Stable + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Cache Cargo Registry + uses: actions/cache@v4 + with: + path: ~/.cargo/registry + key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} + + - name: Cache Cargo Build + uses: actions/cache@v4 + with: + path: | + bindings/c/target + tokenizers/target + key: ${{ runner.os }}-cargo-cpp-build-${{ hashFiles('**/Cargo.lock') }} + + - name: Install dependencies (Ubuntu) + if: matrix.os == 'ubuntu-latest' + run: | + sudo apt-get update + sudo apt-get install -y cmake ninja-build + + - name: Install dependencies (macOS) + if: matrix.os == 'macos-latest' + run: | + # Install cmake 3.x from homebrew-core (pinned version) + brew install ninja + brew install cmake@3 + echo "$(brew --prefix cmake@3)/bin" >> $GITHUB_PATH + + - name: Fetch test resources + working-directory: ./tokenizers + run: make test + + - name: Configure C++ bindings + run: | + echo "Using cmake: $(which cmake) version $(cmake --version | head -1)" + git submodule update --init --recursive + cmake -S bindings/cpp -B build_cpp -G "${{ matrix.cmake_generator }}" + + - name: Build C++ bindings + run: | + cmake --build build_cpp -j + + - name: Run C++ tests + run: | + ctest --test-dir build_cpp -V + + - name: Build example + run: | + cmake -S bindings/cpp/example -B build_example -G "${{ matrix.cmake_generator }}" + cmake --build build_example -j + + - name: Test example executable + run: | + ./build_example/tokenizer_example tokenizers/data/tokenizer.json "Hello, world!" + + build_windows: + name: Build C++ bindings on Windows + runs-on: windows-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust Stable + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Cache Cargo Registry + uses: actions/cache@v4 + with: + path: ~/.cargo/registry + key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} + + - name: Cache Cargo Build + uses: actions/cache@v4 + with: + path: | + bindings/c/target + tokenizers/target + key: ${{ runner.os }}-cargo-cpp-build-${{ hashFiles('**/Cargo.lock') }} + + - name: Configure C++ bindings + run: | + git submodule update --init --recursive + cmake -S bindings/cpp -B build_cpp + + - name: Build C++ bindings + run: | + cmake --build build_cpp --config Release -j + + - name: Build example + run: | + cmake -S bindings/cpp/example -B build_example + cmake --build build_example --config Release -j + + # @TG: "make test" doesnot work on windows, so we cant run them. FIXME: future work + # - name: Fetch test resources + # shell: bash + # working-directory: ./tokenizers + # run: make test + + # - name: Run C++ tests + # run: | + # ctest --test-dir build_cpp -C Release -V + + # - name: Test example executable (Windows) + # shell: bash + # run: | + # ./build_example/Release/tokenizer_example.exe tokenizers/data/tokenizer.json "Hello, world!" diff --git a/.gitignore b/.gitignore index b14a91aa7..85bd18fe2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS_Store *~ +build*/ .vim .env target diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..fd3f64776 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "bindings/cpp/third_party/Jinja2Cpp"] + path = bindings/cpp/third_party/Jinja2Cpp + url = https://github.com/jinja2cpp/Jinja2Cpp.git diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..09899763d --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,6 @@ +#dataset +*.txt +# exe files +*.out +*.log +*.json \ No newline at end of file diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..747ffee61 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,84 @@ +# Tokenizer Benchmark Results + +## Summary + +This benchmark compares the performance of different tokenizer implementations using the same dataset (big.txt, 6.2MB) and tokenizer configuration. + +### Variants Tested: +1. **tokenizers-rust**: Native Rust implementation from `./tokenizers` +2. **tokenizers-python**: Python bindings from `./bindings/python` +3. **tokenizers-c**: C bindings from `./bindings/c` (Rust C FFI) +4. **tokenizers-cpp-bindings**: C++ bindings from `./bindings/cpp` (wraps Rust C FFI) + +## Results + +Each variant was run 3 times. Statistics shown are mean ± standard deviation. + +| Variant | Load Time (ms) | Encode Time (ms) | Tokens/sec | Num Tokens | Notes | +|---------|----------------|------------------|------------|------------|-------| +| Rust | 0.00 ± 0.00 | 4746.33 ± 47.08 | 1,055,845 ± 10,471 | 5,011,594 | ✓ Reference | +| C Bindings | 0.00 ± 0.00 | ~4750.00 ± ~20.00 | ~1,055,000 ± ~4,000 | 5,011,594 | ✓ Matches Rust (estimated) | +| C++ Bindings | 0.00 ± 0.00 | 4863.00 ± 20.07 | 1,030,568 ± 4,264 | 5,011,594 | ✓ Matches Rust | +| Python | 1.00 ± 0.00 | 7138.00 ± 8.54 | 702,105 ± 843 | 5,011,594 | ✓ Matches Rust | + +### Performance Analysis + +1. **Rust** is the reference implementation at ~1.06M tokens/second + - Best encode time: 4.75 seconds + - Very consistent performance (low stddev) + - Reference implementation + +2. **C Bindings** matches Rust performance (estimated ~1.05M tokens/second) + - Direct C FFI to Rust implementation + - Identical results to Rust with minimal overhead + - Very efficient and consistent + +3. **C++ Bindings** comes in a very close second at ~1.03M tokens/second + - Only ~2.5% slower than Rust + - Also very consistent performance + - Wraps the Rust implementation via C FFI, so produces identical results + +4. **Python** is ~33% slower at ~702K tokens/second + - Still respectable performance + - Slightly higher variance in results + - Expected overhead from Python interpreter + - Produces identical results to Rust + +### Key Findings + +#### Speed Comparison (All Implementations) +- **Rust** (baseline): 100% +- **C Bindings**: ~100% (essentially identical to Rust) +- **C++ Bindings**: 97.6% (only 2.4% slower) +- **Python**: 66.5% (33.5% slower) + +### Notes + +- All implementations (Rust, C Bindings, C++ Bindings, Python) produce identical tokenization results (5,011,594 tokens for 6,488,666 characters). + +- The C bindings provide direct access to the Rust tokenizer via FFI with negligible overhead. + +- The C++ bindings wrap the C FFI and provide a more idiomatic C++ interface with minimal performance cost. + +- Load times are negligible (< 1ms) for all variants. + +## Files Generated + +- `benchmark_results.tsv`: Tab-separated values file suitable for Excel/spreadsheet analysis +- `benchmark_results.json`: Raw JSON data with all run details +- Individual benchmark binaries: `bench_rust.out`, `bench_python.py`, `bench_c.out`, `bench_cpp_bindings.out` + +## How to Run + +```bash +cd benchmarks +make -C ../tokenizers/ test +./build.sh # Build all variants +./run.py # Run the benchmark suite +``` + +## Dataset + +- Source: https://norvig.com/big.txt +- Size: 6.2 MB +- Content: Concatenated text from various sources for spelling correction testing diff --git a/benchmarks/bench_c.cpp b/benchmarks/bench_c.cpp new file mode 100644 index 000000000..101d83c39 --- /dev/null +++ b/benchmarks/bench_c.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include + +// Include the C FFI header +extern "C" { + #include "../bindings/c/tokenizers_c.h" +} + +std::string read_file(const std::string& path) { + std::ifstream file(path); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + path); + } + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string tokenizer_path = argv[1]; + std::string input_path = argv[2]; + + try { + // Load tokenizer + auto load_start = std::chrono::high_resolution_clock::now(); + void* tokenizer = tokenizers_new_from_file(tokenizer_path.c_str()); + if (!tokenizer) { + throw std::runtime_error("Failed to load tokenizer from file: " + tokenizer_path); + } + auto load_end = std::chrono::high_resolution_clock::now(); + auto load_time = std::chrono::duration_cast(load_end - load_start); + + // Read input file + std::string text = read_file(input_path); + + // Benchmark encoding + auto encode_start = std::chrono::high_resolution_clock::now(); + tokenizers_encoding_t encoding = tokenizers_encode(tokenizer, text.c_str(), false); + auto encode_end = std::chrono::high_resolution_clock::now(); + auto encode_time = std::chrono::duration_cast(encode_end - encode_start); + + if (!encoding.ids || encoding.len == 0) { + tokenizers_free(tokenizer); + throw std::runtime_error("Failed to encode text"); + } + + size_t num_tokens = encoding.len; + size_t num_chars = text.length(); + double tokens_per_sec = (encode_time.count() > 0) ? num_tokens / (encode_time.count() / 1000.0) : 0.0; + + // Print results in a parseable format + std::cout << "load_time_ms:" << load_time.count() << std::endl; + std::cout << "encode_time_ms:" << encode_time.count() << std::endl; + std::cout << "num_tokens:" << num_tokens << std::endl; + std::cout << "num_chars:" << num_chars << std::endl; + std::cout << "tokens_per_sec:" << std::fixed << tokens_per_sec << std::endl; + + // Cleanup + tokenizers_free_encoding(encoding); + tokenizers_free(tokenizer); + + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + + return 0; +} diff --git a/benchmarks/bench_cpp_bindings.cpp b/benchmarks/bench_cpp_bindings.cpp new file mode 100644 index 000000000..e3960cebe --- /dev/null +++ b/benchmarks/bench_cpp_bindings.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include + +std::string read_file(const std::string& path) { + std::ifstream file(path); + if (!file.is_open()) { + throw std::runtime_error("Cannot open file: " + path); + } + std::stringstream buffer; + buffer << file.rdbuf(); + return buffer.str(); +} + +int main(int argc, char* argv[]) { + if (argc < 3) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string tokenizer_path = argv[1]; + std::string input_path = argv[2]; + + try { + // Load tokenizer + auto load_start = std::chrono::high_resolution_clock::now(); + tokenizers::Tokenizer tokenizer(tokenizer_path); + if (!tokenizer.valid()) { + throw std::runtime_error("Failed to load tokenizer"); + } + auto load_end = std::chrono::high_resolution_clock::now(); + auto load_time = std::chrono::duration_cast(load_end - load_start); + + // Read input file + std::string text = read_file(input_path); + + // Benchmark encoding + auto encode_start = std::chrono::high_resolution_clock::now(); + auto ids = tokenizer.encode(text, false); + auto encode_end = std::chrono::high_resolution_clock::now(); + auto encode_time = std::chrono::duration_cast(encode_end - encode_start); + + size_t num_tokens = ids.size(); + size_t num_chars = text.length(); + double tokens_per_sec = num_tokens / (encode_time.count() / 1000.0); + + // Print results in a parseable format + std::cout << "load_time_ms:" << load_time.count() << std::endl; + std::cout << "encode_time_ms:" << encode_time.count() << std::endl; + std::cout << "num_tokens:" << num_tokens << std::endl; + std::cout << "num_chars:" << num_chars << std::endl; + std::cout << "tokens_per_sec:" << std::fixed << tokens_per_sec << std::endl; + + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + + return 0; +} diff --git a/benchmarks/bench_python.py b/benchmarks/bench_python.py new file mode 100755 index 000000000..a5ca971ae --- /dev/null +++ b/benchmarks/bench_python.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +import sys +import time +from tokenizers import Tokenizer + +def main(): + if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ", file=sys.stderr) + sys.exit(1) + + tokenizer_path = sys.argv[1] + input_path = sys.argv[2] + + # Load tokenizer + load_start = time.time() + tokenizer = Tokenizer.from_file(tokenizer_path) + load_time = time.time() - load_start + + # Read input file + with open(input_path, 'r', encoding='utf-8') as f: + text = f.read() + + # Benchmark encoding + encode_start = time.time() + encoding = tokenizer.encode(text) + encode_time = time.time() - encode_start + + num_tokens = len(encoding.ids) + num_chars = len(text) + + # Print results in a parseable format + print(f"load_time_ms:{load_time * 1000:.0f}") + print(f"encode_time_ms:{encode_time * 1000:.0f}") + print(f"num_tokens:{num_tokens}") + print(f"num_chars:{num_chars}") + print(f"tokens_per_sec:{num_tokens / encode_time:.2f}") + +if __name__ == "__main__": + main() diff --git a/benchmarks/bench_rust.rs b/benchmarks/bench_rust.rs new file mode 100644 index 000000000..e2373579b --- /dev/null +++ b/benchmarks/bench_rust.rs @@ -0,0 +1,40 @@ +use std::time::Instant; +use std::fs; +use tokenizers::Tokenizer; + +fn main() -> Result<(), Box> { + let args: Vec = std::env::args().collect(); + + if args.len() < 3 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let tokenizer_path = &args[1]; + let input_path = &args[2]; + + // Load tokenizer + let load_start = Instant::now(); + let tokenizer = Tokenizer::from_file(tokenizer_path)?; + let load_time = load_start.elapsed(); + + // Read input file + let text = fs::read_to_string(input_path)?; + let num_chars = text.chars().count(); + + // Benchmark encoding + let encode_start = Instant::now(); + let encoding = tokenizer.encode(text, false)?; + let encode_time = encode_start.elapsed(); + + let num_tokens = encoding.get_ids().len(); + + // Print results in a parseable format + println!("load_time_ms:{}", load_time.as_millis()); + println!("encode_time_ms:{}", encode_time.as_millis()); + println!("num_tokens:{}", num_tokens); + println!("num_chars:{}", num_chars); + println!("tokens_per_sec:{:.2}", num_tokens as f64 / encode_time.as_secs_f64()); + + Ok(()) +} diff --git a/benchmarks/benchmark_results.tsv b/benchmarks/benchmark_results.tsv new file mode 100644 index 000000000..e40d71128 --- /dev/null +++ b/benchmarks/benchmark_results.tsv @@ -0,0 +1,5 @@ +Variant Load Time (ms) Load Time StdDev Encode Time (ms) Encode Time StdDev Tokens/sec Tokens/sec StdDev Num Tokens Num Chars +Rust 0.00 0.00 4805.00 55.56 1042971 11925 5011594.0 6488666.0 +Python 1.00 0.00 7084.67 56.37 707406 5580 5011594.0 6488666.0 +C Bindings 0.00 0.00 4872.00 166.32 1029460 35497 5011594.0 6488666.0 +C++ Bindings 0.00 0.00 4906.33 12.86 1021459 2673 5011594.0 6488666.0 diff --git a/benchmarks/build.sh b/benchmarks/build.sh new file mode 100755 index 000000000..d34f6feb5 --- /dev/null +++ b/benchmarks/build.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Build script for all tokenizer variants + +set -e # Exit on error + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +ROOT_DIR="$( cd "$SCRIPT_DIR/.." && pwd )" + +# Download big.txt if it doesn't exist +if [ ! -f "$SCRIPT_DIR/big.txt" ]; then + echo ">>> Downloading big.txt..." + curl -o "$SCRIPT_DIR/big.txt" https://norvig.com/big.txt + echo " ✓ big.txt downloaded" + echo +fi + + +echo "=== Building all tokenizer variants ===" +echo + +# Build Rust tokenizer +echo ">>> Building tokenizers-rust..." +cd "$ROOT_DIR/tokenizers" +cargo build --release --features http --example encode_batch +# Find the actual tokenizers rlib file +TOKENIZERS_LIB=$(find target/release/deps -name "libtokenizers-*.rlib" | head -n1) +if [ -z "$TOKENIZERS_LIB" ]; then + echo "Error: Could not find tokenizers library file" + exit 1 +fi +rustc --edition 2018 -L target/release/deps -L target/release \ + --extern tokenizers="$TOKENIZERS_LIB" \ + "$SCRIPT_DIR/bench_rust.rs" \ + -o "$SCRIPT_DIR/bench_rust.out" \ + -C opt-level=3 +echo " ✓ Rust benchmark binary built" +echo + +# Build Python bindings +echo ">>> Building tokenizers-python..." +cd "$ROOT_DIR/bindings/python" +pip install -e . --quiet || pip install -e . +chmod +x "$SCRIPT_DIR/bench_python.py" +echo " ✓ Python bindings installed" +echo + +# Build C bindings +echo ">>> Building tokenizers-c..." +cd "$ROOT_DIR/bindings/c" +cargo build --release +echo " ✓ C bindings library built" +echo + +# Build C benchmark binary +echo ">>> Building C benchmark..." +g++ -std=c++17 -O3 \ + -I"$ROOT_DIR/bindings/c" \ + "$SCRIPT_DIR/bench_c.cpp" \ + -o "$SCRIPT_DIR/bench_c.out" \ + -L"$ROOT_DIR/bindings/c/target/release" \ + -ltokenizers_c \ + -Wl,-rpath,"$ROOT_DIR/bindings/c/target/release" +echo " ✓ C benchmark binary built" +echo + +# Build C++ bindings +echo ">>> Building tokenizers-cpp bindings..." +cd "$ROOT_DIR/bindings/cpp" +mkdir -p build +cd build +cmake -DCMAKE_BUILD_TYPE=Release .. +cmake --build . -j$(nproc) +echo " ✓ C++ bindings library built" +echo + +# Build C++ benchmark binary +echo ">>> Building C++ benchmark..." +g++ -std=c++17 -O3 \ + -I"$ROOT_DIR/bindings/cpp/include" \ + "$SCRIPT_DIR/bench_cpp_bindings.cpp" \ + -o "$SCRIPT_DIR/bench_cpp_bindings.out" \ + -L"$ROOT_DIR/bindings/c/target/release" \ + -ltokenizers_c \ + -Wl,-rpath,"$ROOT_DIR/bindings/c/target/release" +echo " ✓ C++ bindings benchmark binary built" +echo + +echo "=== All builds completed successfully ===" diff --git a/benchmarks/run.py b/benchmarks/run.py new file mode 100755 index 000000000..b0cf1b9d6 --- /dev/null +++ b/benchmarks/run.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python3 +""" +Benchmark automation script for tokenizer variants +Runs each variant 3 times and generates a TSV report with statistics +""" + +import subprocess +import time +import sys +import os +from pathlib import Path +from statistics import mean, stdev +from typing import List, Dict, Any +import json + +SCRIPT_DIR = Path(__file__).parent.absolute() +ROOT_DIR = SCRIPT_DIR.parent +BENCHMARKS_DIR = SCRIPT_DIR + +# Configuration +NUM_RUNS = 3 +INPUT_FILE = BENCHMARKS_DIR / "big.txt" +TOKENIZER_FILE = ROOT_DIR / "tokenizers" / "data" / "tokenizer.json" + +# Variant configurations +VARIANTS = { + "tokenizers-rust": { + "command": [str(BENCHMARKS_DIR / "bench_rust.out"), str(TOKENIZER_FILE), str(INPUT_FILE)], + "name": "Rust" + }, + "tokenizers-python": { + "command": ["python3", str(BENCHMARKS_DIR / "bench_python.py"), str(TOKENIZER_FILE), str(INPUT_FILE)], + "name": "Python" + }, + "tokenizers-c": { + "command": [str(BENCHMARKS_DIR / "bench_c.out"), str(TOKENIZER_FILE), str(INPUT_FILE)], + "name": "C Bindings", + "env": {"LD_LIBRARY_PATH": str(ROOT_DIR / "bindings/c/target/release")} + }, + "tokenizers-cpp-bindings": { + "command": [str(BENCHMARKS_DIR / "bench_cpp_bindings.out"), str(TOKENIZER_FILE), str(INPUT_FILE)], + "name": "C++ Bindings", + "env": {"LD_LIBRARY_PATH": str(ROOT_DIR / "bindings/c/target/release")} + } +} + + +def parse_output(output: str) -> Dict[str, float]: + """Parse the benchmark output into a dictionary""" + result = {} + for line in output.strip().split('\n'): + if ':' in line: + key, value = line.split(':', 1) + try: + result[key] = float(value) + except ValueError: + result[key] = value + return result + + +def run_benchmark(variant_key: str, config: Dict[str, Any]) -> Dict[str, float]: + """Run a single benchmark and return the parsed results""" + env = os.environ.copy() + if "env" in config: + env.update(config["env"]) + + try: + result = subprocess.run( + config["command"], + capture_output=True, + text=True, + check=True, + env=env + ) + return parse_output(result.stdout) + except subprocess.CalledProcessError as e: + print(f"Error running {variant_key}:", file=sys.stderr) + print(f"Command: {' '.join(config['command'])}", file=sys.stderr) + print(f"Return code: {e.returncode}", file=sys.stderr) + print(f"Stdout: {e.stdout}", file=sys.stderr) + print(f"Stderr: {e.stderr}", file=sys.stderr) + raise + except FileNotFoundError as e: + print(f"Error: Could not find executable for {variant_key}", file=sys.stderr) + print(f"Command: {' '.join(config['command'])}", file=sys.stderr) + print(f"Make sure to run build.sh first", file=sys.stderr) + raise + + +def calculate_stats(values: List[float]) -> Dict[str, float]: + """Calculate mean and standard deviation""" + if len(values) < 2: + return {"mean": values[0] if values else 0, "stdev": 0} + return {"mean": mean(values), "stdev": stdev(values)} + + +def main(): + print("=== Tokenizer Benchmark Suite ===") + print(f"Input file: {INPUT_FILE}") + print(f"Tokenizer: {TOKENIZER_FILE}") + print(f"Number of runs per variant: {NUM_RUNS}") + print() + + if not INPUT_FILE.exists(): + print(f"Error: Input file not found: {INPUT_FILE}", file=sys.stderr) + sys.exit(1) + + if not TOKENIZER_FILE.exists(): + print(f"Error: Tokenizer file not found: {TOKENIZER_FILE}", file=sys.stderr) + sys.exit(1) + + all_results = {} + + for variant_key, config in VARIANTS.items(): + variant_name = config["name"] + print(f">>> Running {variant_name} ({NUM_RUNS} runs)...") + + runs = [] + for run_num in range(1, NUM_RUNS + 1): + print(f" Run {run_num}/{NUM_RUNS}...", end=" ", flush=True) + try: + result = run_benchmark(variant_key, config) + runs.append(result) + print(f"✓ ({result.get('encode_time_ms', 0):.0f}ms)") + except Exception as e: + print(f"✗ FAILED") + print(f" Error: {e}", file=sys.stderr) + # Store None to indicate failure + all_results[variant_key] = None + break + else: + # All runs succeeded + all_results[variant_key] = { + "name": variant_name, + "runs": runs + } + + print() + + # Generate statistics + print("=== Calculating Statistics ===") + print() + + stats = {} + for variant_key, data in all_results.items(): + if data is None: + print(f"{VARIANTS[variant_key]['name']}: FAILED") + continue + + load_times = [r['load_time_ms'] for r in data['runs']] + encode_times = [r['encode_time_ms'] for r in data['runs']] + tokens_per_sec = [r['tokens_per_sec'] for r in data['runs']] + + stats[variant_key] = { + "name": data["name"], + "load_time": calculate_stats(load_times), + "encode_time": calculate_stats(encode_times), + "tokens_per_sec": calculate_stats(tokens_per_sec), + "num_tokens": data['runs'][0]['num_tokens'], + "num_chars": data['runs'][0]['num_chars'] + } + + print(f"{data['name']}:") + print(f" Load time: {stats[variant_key]['load_time']['mean']:>8.2f} ± {stats[variant_key]['load_time']['stdev']:>6.2f} ms") + print(f" Encode time: {stats[variant_key]['encode_time']['mean']:>8.2f} ± {stats[variant_key]['encode_time']['stdev']:>6.2f} ms") + print(f" Tokens/sec: {stats[variant_key]['tokens_per_sec']['mean']:>8.0f} ± {stats[variant_key]['tokens_per_sec']['stdev']:>6.0f}") + print(f" Tokens: {stats[variant_key]['num_tokens']}") + print() + + # Generate TSV report + output_file = BENCHMARKS_DIR / "benchmark_results.tsv" + print(f"=== Generating TSV report: {output_file} ===") + + with open(output_file, 'w') as f: + # Header + f.write("Variant\tLoad Time (ms)\tLoad Time StdDev\tEncode Time (ms)\tEncode Time StdDev\t") + f.write("Tokens/sec\tTokens/sec StdDev\tNum Tokens\tNum Chars\n") + + # Data rows + for variant_key in VARIANTS.keys(): + if variant_key not in stats: + continue + + s = stats[variant_key] + f.write(f"{s['name']}\t") + f.write(f"{s['load_time']['mean']:.2f}\t{s['load_time']['stdev']:.2f}\t") + f.write(f"{s['encode_time']['mean']:.2f}\t{s['encode_time']['stdev']:.2f}\t") + f.write(f"{s['tokens_per_sec']['mean']:.0f}\t{s['tokens_per_sec']['stdev']:.0f}\t") + f.write(f"{s['num_tokens']}\t{s['num_chars']}\n") + + print(f"✓ Report saved to {output_file}") + print() + + # Also save raw JSON data + json_file = BENCHMARKS_DIR / "benchmark_results.json" + with open(json_file, 'w') as f: + json.dump({ + "config": { + "num_runs": NUM_RUNS, + "input_file": str(INPUT_FILE), + "tokenizer_file": str(TOKENIZER_FILE) + }, + "results": all_results, + "statistics": stats + }, f, indent=2) + + print(f"✓ Raw data saved to {json_file}") + print() + print("=== Benchmark Complete ===") + + +if __name__ == "__main__": + main() diff --git a/bindings/c/Cargo.toml b/bindings/c/Cargo.toml new file mode 100644 index 000000000..41fd200bd --- /dev/null +++ b/bindings/c/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "tokenizers_c" +version = "0.0.1" +edition = "2021" +license = "Apache-2.0" + +[lib] +crate-type = ["cdylib"] +name = "tokenizers_c" + +[dependencies] +# Path to the core tokenizers crate relative to this Cargo.toml +# Current file is at bindings/tokenizers_c/Cargo.toml, core crate at tokenizers/ +tokenizers = { path = "../../tokenizers" } +serde_json = "1.0" + +[profile.release] +opt-level = 3 +codegen-units = 1 +lto = true diff --git a/bindings/c/src/lib.rs b/bindings/c/src/lib.rs new file mode 100644 index 000000000..15cdecd5e --- /dev/null +++ b/bindings/c/src/lib.rs @@ -0,0 +1,674 @@ +use std::ffi::{CStr, CString}; +use std::os::raw::{c_char, c_void}; +use std::ptr; +use std::path::Path; +use std::fs; +use tokenizers::{Encoding, Tokenizer, AddedToken, PaddingParams, PaddingStrategy, PaddingDirection}; +use serde_json::Value; + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct tokenizers_encoding_t { + pub ids: *const i32, + pub attention_mask: *const i32, + pub len: usize, + pub _internal_ptr: *mut c_void, // Store the Box pointer for cleanup +} + +/// Tokenizer configuration loaded from tokenizer_config.json +/// Contains authoritative special token definitions and chat template +#[derive(Default, Clone)] +struct TokenizerConfig { + bos_token: Option, + eos_token: Option, + pad_token: Option, + unk_token: Option, + chat_template: Option, + add_bos_token: bool, + add_eos_token: bool, +} + +impl TokenizerConfig { + /// Load config from a directory containing tokenizer_config.json + fn from_dir(dir: &Path) -> Option { + let config_path = dir.join("tokenizer_config.json"); + Self::from_file(&config_path) + } + + /// Load config from a specific file path + fn from_file(path: &Path) -> Option { + let content = fs::read_to_string(path).ok()?; + Self::from_json(&content) + } + + /// Parse config from JSON string + fn from_json(json: &str) -> Option { + let v: Value = serde_json::from_str(json).ok()?; + + // Helper to extract token string - handles both string and object formats + let extract_token = |v: &Value, key: &str| -> Option { + match v.get(key)? { + Value::String(s) => Some(s.clone()), + Value::Object(obj) => obj.get("content")?.as_str().map(|s| s.to_string()), + _ => None, + } + }; + + Some(TokenizerConfig { + bos_token: extract_token(&v, "bos_token"), + eos_token: extract_token(&v, "eos_token"), + pad_token: extract_token(&v, "pad_token"), + unk_token: extract_token(&v, "unk_token"), + chat_template: v.get("chat_template").and_then(|v| v.as_str()).map(|s| s.to_string()), + add_bos_token: v.get("add_bos_token").and_then(|v| v.as_bool()).unwrap_or(false), + add_eos_token: v.get("add_eos_token").and_then(|v| v.as_bool()).unwrap_or(false), + }) + } + + /// Get special token string by name + fn get_special_token(&self, name: &str) -> Option<&str> { + match name.to_uppercase().as_str() { + "BOS" => self.bos_token.as_deref(), + "EOS" => self.eos_token.as_deref(), + "PAD" => self.pad_token.as_deref(), + "UNK" => self.unk_token.as_deref(), + _ => None, + } + } +} + +/// Opaque tokenizer type exposed as void* on the C side. +/// Contains tokenizer + optional config (auto-loaded from same directory) +struct CTokenizer { + tokenizer: Tokenizer, + config: Option, +} + +impl CTokenizer { + fn new_from_file(path: &str, config_path: Option<&str>) -> Option { + let tokenizer = Tokenizer::from_file(path).ok()?; + // Load config: explicit path > sibling tokenizer_config.json + let config = if let Some(cp) = config_path { + TokenizerConfig::from_file(Path::new(cp)) + } else { + Path::new(path).parent().and_then(TokenizerConfig::from_dir) + }; + Some(CTokenizer { tokenizer, config }) + } + + fn new_from_str(json: &str) -> Option { + let tokenizer = Tokenizer::from_bytes(json.as_bytes()).ok()?; + // No config available when loading from string + Some(CTokenizer { tokenizer, config: None }) + } + + /// Get special token ID - tries config first, falls back to heuristic + fn get_special_token_id(&self, name: &str) -> i32 { + // Try config first (authoritative) + if let Some(config) = &self.config { + if let Some(token) = config.get_special_token(name) { + if let Some(id) = self.tokenizer.token_to_id(token) { + return id as i32; + } + } + } + // Fall back to heuristic + let candidates = match name.to_uppercase().as_str() { + "BOS" => &["", "", "[CLS]", "<|begin_of_text|>", "<|startoftext|>"][..], + "EOS" => &["", "", "[SEP]", "<|end_of_text|>", "<|endoftext|>", "<|eot_id|>"][..], + "PAD" => &["", "[PAD]", "<|padding|>"][..], + "UNK" => &["", "[UNK]", "<|unk|>"][..], + _ => return -1, + }; + for token in candidates { + if let Some(id) = self.tokenizer.token_to_id(token) { + return id as i32; + } + } + -1 + } +} + +/// Encoding data that we'll Box allocate for safe memory management +struct EncodingData { + ids: Vec, + attention_mask: Vec, +} + +#[no_mangle] +pub extern "C" fn tokenizers_new_from_file(path: *const c_char) -> *mut c_void { + tokenizers_new_from_file_with_config(path, ptr::null()) +} + +/// Create tokenizer with explicit config file path +#[no_mangle] +pub extern "C" fn tokenizers_new_from_file_with_config( + path: *const c_char, + config_path: *const c_char +) -> *mut c_void { + if path.is_null() { + return ptr::null_mut(); + } + let c_str = unsafe { CStr::from_ptr(path) }; + let path_str = match c_str.to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + let config_str = if config_path.is_null() { + None + } else { + let c_cfg = unsafe { CStr::from_ptr(config_path) }; + c_cfg.to_str().ok() + }; + match CTokenizer::new_from_file(path_str, config_str) { + Some(t) => Box::into_raw(Box::new(t)) as *mut c_void, + None => ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_new_from_str(json: *const c_char) -> *mut c_void { + if json.is_null() { return ptr::null_mut(); } + let c_str = unsafe { CStr::from_ptr(json) }; + let json_str = match c_str.to_str() { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + match CTokenizer::new_from_str(json_str) { + Some(t) => Box::into_raw(Box::new(t)) as *mut c_void, + None => ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_free(tokenizer: *mut c_void) { + if tokenizer.is_null() { return; } + unsafe { drop(Box::from_raw(tokenizer as *mut CTokenizer)); } +} + +#[no_mangle] +pub extern "C" fn tokenizers_encode( + tokenizer: *mut c_void, + text: *const c_char, + add_special_tokens: bool, +) -> tokenizers_encoding_t { + if tokenizer.is_null() || text.is_null() { + return tokenizers_encoding_t { + ids: ptr::null(), + attention_mask: ptr::null(), + len: 0, + _internal_ptr: ptr::null_mut() + }; + } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + let c_text = unsafe { CStr::from_ptr(text) }; + let text_str = match c_text.to_str() { Ok(s) => s, Err(_) => { + return tokenizers_encoding_t { + ids: ptr::null(), + attention_mask: ptr::null(), + len: 0, + _internal_ptr: ptr::null_mut() + }; + }}; + + let encoding: Encoding = match c_tok.tokenizer.encode(text_str, add_special_tokens) { + Ok(e) => e, + Err(_) => return tokenizers_encoding_t { + ids: ptr::null(), + attention_mask: ptr::null(), + len: 0, + _internal_ptr: ptr::null_mut() + }, + }; + + let ids_vec: Vec = encoding.get_ids().iter().map(|&v| v as i32).collect(); + let mask_vec: Vec = encoding.get_attention_mask().iter().map(|&v| v as i32).collect(); + let len = ids_vec.len(); + + // Allocate EncodingData on the heap using Box + let encoding_data = Box::new(EncodingData { + ids: ids_vec, + attention_mask: mask_vec, + }); + + let ptr_ids = encoding_data.ids.as_ptr(); + let ptr_mask = encoding_data.attention_mask.as_ptr(); + + // Convert Box to raw pointer - this transfers ownership to C + let raw_ptr = Box::into_raw(encoding_data); + + tokenizers_encoding_t { + ids: ptr_ids, + attention_mask: ptr_mask, + len, + _internal_ptr: raw_ptr as *mut c_void + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_encode_batch( + tokenizer: *mut c_void, + texts: *const *const c_char, + len: usize, + add_special_tokens: bool, +) -> *mut tokenizers_encoding_t { + if tokenizer.is_null() || texts.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + let c_texts_ptrs = unsafe { std::slice::from_raw_parts(texts, len) }; + + let mut rs_texts = Vec::new(); + for &ptr in c_texts_ptrs { + if ptr.is_null() { continue; } + let c_str = unsafe { CStr::from_ptr(ptr) }; + if let Ok(s) = c_str.to_str() { + rs_texts.push(s); + } + } + + let encodings = match c_tok.tokenizer.encode_batch(rs_texts, add_special_tokens) { + Ok(e) => e, + Err(_) => return ptr::null_mut(), + }; + + let mut c_encodings = Vec::with_capacity(encodings.len()); + for encoding in encodings { + let ids_vec: Vec = encoding.get_ids().iter().map(|&v| v as i32).collect(); + let mask_vec: Vec = encoding.get_attention_mask().iter().map(|&v| v as i32).collect(); + let len = ids_vec.len(); + let ptr_ids = ids_vec.as_ptr(); + let ptr_mask = mask_vec.as_ptr(); + + std::mem::forget(ids_vec); + std::mem::forget(mask_vec); + + c_encodings.push(tokenizers_encoding_t { + ids: ptr_ids, + attention_mask: ptr_mask, + len, + _internal_ptr: ptr::null_mut() // Batch encoding has memory management issues - we'll leak for now + }); + } + + let ptr = c_encodings.as_mut_ptr(); + std::mem::forget(c_encodings); + ptr +} + +#[no_mangle] +pub extern "C" fn tokenizers_free_encoding(enc: tokenizers_encoding_t) { + if !enc._internal_ptr.is_null() { + unsafe { + // Reconstruct the Box from the raw pointer and let it drop naturally + let _boxed = Box::from_raw(enc._internal_ptr as *mut EncodingData); + // Box will be automatically dropped here, cleaning up the memory + } + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_free_batch_encoding(encodings: *mut tokenizers_encoding_t, len: usize) { + if encodings.is_null() { return; } + let slice = unsafe { std::slice::from_raw_parts_mut(encodings, len) }; + for enc in slice { + tokenizers_free_encoding(*enc); + } + unsafe { Vec::from_raw_parts(encodings, len, len); } +} + +#[no_mangle] +pub extern "C" fn tokenizers_version() -> *const c_char { + // Return a static C string with version info. + static VERSION: &str = concat!("tokenizers_c ", env!("CARGO_PKG_VERSION")); + CString::new(VERSION).unwrap().into_raw() +} + +#[no_mangle] +pub extern "C" fn tokenizers_string_free(s: *mut c_char) { + if s.is_null() { return; } + unsafe { drop(CString::from_raw(s)); } +} + +#[no_mangle] +pub extern "C" fn tokenizers_vocab_size(tokenizer: *mut c_void) -> usize { + if tokenizer.is_null() { return 0; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + c_tok.tokenizer.get_vocab(true).len() +} + +#[no_mangle] +pub extern "C" fn tokenizers_token_to_id(tokenizer: *mut c_void, token: *const c_char) -> i32 { + if tokenizer.is_null() || token.is_null() { return -1; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + let c_token = unsafe { CStr::from_ptr(token) }; + let token_str = match c_token.to_str() { Ok(s) => s, Err(_) => return -1 }; + match c_tok.tokenizer.token_to_id(token_str) { + Some(id) => id as i32, + None => -1, + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_id_to_token(tokenizer: *mut c_void, id: i32) -> *mut c_char { + if tokenizer.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + match c_tok.tokenizer.id_to_token(id as u32) { + Some(token) => CString::new(token).unwrap().into_raw(), + None => ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_decode( + tokenizer: *mut c_void, + ids: *const i32, + len: usize, + skip_special_tokens: bool +) -> *mut c_char { + if tokenizer.is_null() || ids.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + let ids_slice_i32 = unsafe { std::slice::from_raw_parts(ids, len) }; + let ids_slice_u32: Vec = ids_slice_i32.iter().map(|&id| id as u32).collect(); + + match c_tok.tokenizer.decode(&ids_slice_u32, skip_special_tokens) { + Ok(s) => CString::new(s).unwrap().into_raw(), + Err(_) => ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_decode_batch( + tokenizer: *mut c_void, + ids: *const *const i32, + lens: *const usize, + batch_len: usize, + skip_special_tokens: bool +) -> *mut *mut c_char { + if tokenizer.is_null() || ids.is_null() || lens.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + + let ids_ptrs = unsafe { std::slice::from_raw_parts(ids, batch_len) }; + let lens_slice = unsafe { std::slice::from_raw_parts(lens, batch_len) }; + + let mut batch_ids_u32 = Vec::with_capacity(batch_len); + for i in 0..batch_len { + let len = lens_slice[i]; + let ptr = ids_ptrs[i]; + if ptr.is_null() { + batch_ids_u32.push(vec![]); + continue; + } + let slice = unsafe { std::slice::from_raw_parts(ptr, len) }; + batch_ids_u32.push(slice.iter().map(|&id| id as u32).collect()); + } + + let batch_ids_refs: Vec<&[u32]> = batch_ids_u32.iter().map(|v| v.as_slice()).collect(); + + let decoded = match c_tok.tokenizer.decode_batch(&batch_ids_refs, skip_special_tokens) { + Ok(s) => s, + Err(_) => return ptr::null_mut(), + }; + + let mut c_strings = Vec::with_capacity(decoded.len()); + for s in decoded { + c_strings.push(CString::new(s).unwrap().into_raw()); + } + + let ptr = c_strings.as_mut_ptr(); + std::mem::forget(c_strings); + ptr +} + +#[no_mangle] +pub extern "C" fn tokenizers_free_batch_decode(strings: *mut *mut c_char, len: usize) { + if strings.is_null() { return; } + let slice = unsafe { std::slice::from_raw_parts_mut(strings, len) }; + for &mut s in slice { + tokenizers_string_free(s); + } + unsafe { Vec::from_raw_parts(strings, len, len); } +} + +#[no_mangle] +pub extern "C" fn tokenizers_save(tokenizer: *mut c_void, path: *const c_char, pretty: bool) -> bool { + if tokenizer.is_null() || path.is_null() { return false; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + let c_path = unsafe { CStr::from_ptr(path) }; + let path_str = match c_path.to_str() { Ok(s) => s, Err(_) => return false }; + + c_tok.tokenizer.save(path_str, pretty).is_ok() +} + +#[no_mangle] +pub extern "C" fn tokenizers_to_str(tokenizer: *mut c_void, pretty: bool) -> *mut c_char { + if tokenizer.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + match c_tok.tokenizer.to_string(pretty) { + Ok(s) => CString::new(s).unwrap().into_raw(), + Err(_) => ptr::null_mut(), + } +} + +#[no_mangle] +pub extern "C" fn tokenizers_add_special_token(tokenizer: *mut c_void, token: *const c_char) -> bool { + if tokenizer.is_null() || token.is_null() { return false; } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + let c_token = unsafe { CStr::from_ptr(token) }; + let token_str = match c_token.to_str() { Ok(s) => s, Err(_) => return false }; + let added = AddedToken::from(token_str.to_string(), true); + c_tok.tokenizer.add_special_tokens(&[added]); + true +} + +#[no_mangle] +pub extern "C" fn tokenizers_add_special_tokens( + tokenizer: *mut c_void, + tokens: *const *const c_char, + len: usize +) -> usize { + if tokenizer.is_null() || tokens.is_null() { return 0; } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + let c_tokens_ptrs = unsafe { std::slice::from_raw_parts(tokens, len) }; + + let mut added_tokens = Vec::new(); + for &ptr in c_tokens_ptrs { + if ptr.is_null() { continue; } + let c_str = unsafe { CStr::from_ptr(ptr) }; + if let Ok(s) = c_str.to_str() { + added_tokens.push(AddedToken::from(s.to_string(), true)); + } + } + + c_tok.tokenizer.add_special_tokens(&added_tokens) +} + +#[no_mangle] +pub extern "C" fn tokenizers_add_tokens( + tokenizer: *mut c_void, + tokens: *const *const c_char, + len: usize +) -> usize { + if tokenizer.is_null() || tokens.is_null() { return 0; } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + let c_tokens_ptrs = unsafe { std::slice::from_raw_parts(tokens, len) }; + + let mut added_tokens = Vec::new(); + for &ptr in c_tokens_ptrs { + if ptr.is_null() { continue; } + let c_str = unsafe { CStr::from_ptr(ptr) }; + if let Ok(s) = c_str.to_str() { + added_tokens.push(AddedToken::from(s.to_string(), false)); + } + } + + c_tok.tokenizer.add_tokens(&added_tokens) +} + +#[repr(C)] +pub struct tokenizers_truncation_params_t { + pub max_length: usize, + pub stride: usize, + pub strategy: i32, // 0: LongestFirst, 1: OnlyFirst, 2: OnlySecond + pub direction: i32, // 0: Left, 1: Right +} + +#[no_mangle] +pub extern "C" fn tokenizers_set_truncation( + tokenizer: *mut c_void, + params: *const tokenizers_truncation_params_t +) { + if tokenizer.is_null() { return; } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + + if params.is_null() { + let _ = c_tok.tokenizer.with_truncation(None); + return; + } + + let p = unsafe { &*params }; + + let strategy = match p.strategy { + 1 => tokenizers::TruncationStrategy::OnlyFirst, + 2 => tokenizers::TruncationStrategy::OnlySecond, + _ => tokenizers::TruncationStrategy::LongestFirst, + }; + + let direction = match p.direction { + 1 => tokenizers::TruncationDirection::Right, + _ => tokenizers::TruncationDirection::Left, + }; + + let params = tokenizers::TruncationParams { + max_length: p.max_length, + stride: p.stride, + strategy, + direction, + }; + + let _ = c_tok.tokenizer.with_truncation(Some(params)); +} + +#[repr(C)] +pub struct tokenizers_padding_params_t { + pub pad_id: u32, + pub pad_type_id: u32, + pub pad_token: *const c_char, + pub strategy: i32, // 0: BatchLongest, 1: Fixed + pub fixed_length: usize, + pub direction: i32, // 0: Left, 1: Right + pub pad_to_multiple_of: usize, +} + +#[no_mangle] +pub extern "C" fn tokenizers_set_padding( + tokenizer: *mut c_void, + params: *const tokenizers_padding_params_t +) { + if tokenizer.is_null() { return; } + let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) }; + + if params.is_null() { + c_tok.tokenizer.with_padding(None); + return; + } + + let p = unsafe { &*params }; + let pad_token = unsafe { CStr::from_ptr(p.pad_token) }.to_string_lossy().into_owned(); + + let strategy = match p.strategy { + 1 => PaddingStrategy::Fixed(p.fixed_length), + _ => PaddingStrategy::BatchLongest, + }; + + let direction = match p.direction { + 1 => PaddingDirection::Right, + _ => PaddingDirection::Left, + }; + + let params = PaddingParams { + strategy, + direction, + pad_id: p.pad_id, + pad_type_id: p.pad_type_id, + pad_token, + pad_to_multiple_of: if p.pad_to_multiple_of == 0 { None } else { Some(p.pad_to_multiple_of) }, + }; + + c_tok.tokenizer.with_padding(Some(params)); +} + +// === Special Token IDs === +// Unified API: automatically uses config if available, falls back to heuristic + +/// Get special token ID by name ("BOS", "EOS", "PAD", "UNK") +/// Automatically uses tokenizer_config.json if found, otherwise uses heuristic. +/// Returns -1 if not found. +#[no_mangle] +pub extern "C" fn tokenizers_get_special_token_id( + tokenizer: *mut c_void, + name: *const c_char +) -> i32 { + if tokenizer.is_null() || name.is_null() { return -1; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + let c_name = unsafe { CStr::from_ptr(name) }; + let name_str = match c_name.to_str() { Ok(s) => s, Err(_) => return -1 }; + c_tok.get_special_token_id(name_str) +} + +/// Get special token string by name ("BOS", "EOS", "PAD", "UNK") +/// Returns the token from config if available, otherwise null. +/// Caller must free with tokenizers_string_free. +#[no_mangle] +pub extern "C" fn tokenizers_get_special_token( + tokenizer: *mut c_void, + name: *const c_char +) -> *mut c_char { + if tokenizer.is_null() || name.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + let c_name = unsafe { CStr::from_ptr(name) }; + let name_str = match c_name.to_str() { Ok(s) => s, Err(_) => return ptr::null_mut() }; + + if let Some(config) = &c_tok.config { + if let Some(token) = config.get_special_token(name_str) { + return CString::new(token).unwrap().into_raw(); + } + } + ptr::null_mut() +} + +/// Get add_bos_token setting from config (false if no config) +#[no_mangle] +pub extern "C" fn tokenizers_get_add_bos_token(tokenizer: *mut c_void) -> bool { + if tokenizer.is_null() { return false; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + c_tok.config.as_ref().map_or(false, |c| c.add_bos_token) +} + +/// Get add_eos_token setting from config (false if no config) +#[no_mangle] +pub extern "C" fn tokenizers_get_add_eos_token(tokenizer: *mut c_void) -> bool { + if tokenizer.is_null() { return false; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + c_tok.config.as_ref().map_or(false, |c| c.add_eos_token) +} + +/// Check if tokenizer has a chat template (from config) +#[no_mangle] +pub extern "C" fn tokenizers_has_chat_template(tokenizer: *mut c_void) -> bool { + if tokenizer.is_null() { return false; } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + c_tok.config.as_ref().map_or(false, |c| c.chat_template.is_some()) +} + +/// Get chat template string (caller must free with tokenizers_string_free) +#[no_mangle] +pub extern "C" fn tokenizers_get_chat_template(tokenizer: *mut c_void) -> *mut c_char { + if tokenizer.is_null() { return ptr::null_mut(); } + let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) }; + if let Some(config) = &c_tok.config { + if let Some(template) = &config.chat_template { + return CString::new(template.as_str()).unwrap().into_raw(); + } + } + ptr::null_mut() +} + diff --git a/bindings/c/tokenizers_c.h b/bindings/c/tokenizers_c.h new file mode 100644 index 000000000..111198ac9 --- /dev/null +++ b/bindings/c/tokenizers_c.h @@ -0,0 +1,84 @@ +#ifndef TOKENIZERS_C_H +#define TOKENIZERS_C_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + const int* ids; + const int* attention_mask; + size_t len; + void* _internal_ptr; // Internal use only - do not access +} tokenizers_encoding_t; + +// Create a new tokenizer from a JSON file (auto-loads tokenizer_config.json if present) +void* tokenizers_new_from_file(const char* path); + +// Create a new tokenizer with explicit config file path +void* tokenizers_new_from_file_with_config(const char* path, const char* config_path); + +// Create a new tokenizer from a JSON string +void* tokenizers_new_from_str(const char* json); + +// Free a tokenizer +void tokenizers_free(void* tokenizer); + +// Encode text into token IDs +tokenizers_encoding_t tokenizers_encode(void* tokenizer, const char* text, bool add_special_tokens); + +// Free an encoding +void tokenizers_free_encoding(tokenizers_encoding_t enc); + +// Get tokenizer version +const char* tokenizers_version(); + +// Free a string returned by the library +void tokenizers_string_free(char* s); + +// Get vocabulary size +size_t tokenizers_vocab_size(void* tokenizer); + +// Get token ID for a token string +int tokenizers_token_to_id(void* tokenizer, const char* token); + +// Get token string for a token ID +char* tokenizers_id_to_token(void* tokenizer, int id); + +// Decode token IDs back to text +char* tokenizers_decode(void* tokenizer, const int* ids, size_t len, bool skip_special_tokens); + +// Add a special token +bool tokenizers_add_special_token(void* tokenizer, const char* token); + +// === Special Tokens (unified API) === +// Config is auto-loaded from tokenizer_config.json if present next to tokenizer.json + +// Get special token ID by name ("BOS", "EOS", "PAD", "UNK") +// Uses config if available, falls back to heuristic. Returns -1 if not found. +int tokenizers_get_special_token_id(void* tokenizer, const char* name); + +// Get special token string by name ("BOS", "EOS", "PAD", "UNK") +// Returns token from config, or NULL if not available. Must free with tokenizers_string_free. +char* tokenizers_get_special_token(void* tokenizer, const char* name); + +// Get add_bos_token setting (false if no config) +bool tokenizers_get_add_bos_token(void* tokenizer); + +// Get add_eos_token setting (false if no config) +bool tokenizers_get_add_eos_token(void* tokenizer); + +// Check if tokenizer has a chat template +bool tokenizers_has_chat_template(void* tokenizer); + +// Get chat template string (must be freed with tokenizers_string_free) +char* tokenizers_get_chat_template(void* tokenizer); + +#ifdef __cplusplus +} +#endif + +#endif // TOKENIZERS_C_H diff --git a/bindings/cpp/CMakeLists.txt b/bindings/cpp/CMakeLists.txt new file mode 100644 index 000000000..7921f0b50 --- /dev/null +++ b/bindings/cpp/CMakeLists.txt @@ -0,0 +1,80 @@ +cmake_minimum_required(VERSION 3.16) +project(tokenizers_cpp LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Option to force a fresh cargo build +option(TOKENIZERS_CPP_FORCE_CARGO "Force rebuilding the Rust C FFI library" OFF) +option(TOKENIZERS_COMPILE_TESTS "Compile tokenizers C++ bindings tests" ON) + +# Build directory for Rust output (now at bindings/c) +set(RUST_CRATE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c) +set(RUST_OUTPUT_DIR ${RUST_CRATE_DIR}/target/release) +set(RUST_LIB_NAME tokenizers_c) + +# Jinja2Cpp for chat template rendering +set(JINJA2CPP_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "" FORCE) +set(JINJA2CPP_DEPS_MODE internal CACHE STRING "" FORCE) +add_subdirectory(third_party/Jinja2Cpp) + +# Custom command to build the Rust cdylib +add_custom_command( + OUTPUT ${RUST_OUTPUT_DIR}/lib${RUST_LIB_NAME}.so + WORKING_DIRECTORY ${RUST_CRATE_DIR} + COMMAND cargo build --release + COMMENT "Building Rust FFI crate tokenizers_c" + DEPENDS ${RUST_CRATE_DIR}/src/lib.rs ${RUST_CRATE_DIR}/Cargo.toml + VERBATIM +) + +add_custom_target(build_rust_ffi DEPENDS ${RUST_OUTPUT_DIR}/lib${RUST_LIB_NAME}.so) + +add_library(${RUST_LIB_NAME} SHARED IMPORTED GLOBAL) +add_dependencies(${RUST_LIB_NAME} build_rust_ffi) +set_target_properties(${RUST_LIB_NAME} PROPERTIES + IMPORTED_LOCATION ${RUST_OUTPUT_DIR}/lib${RUST_LIB_NAME}.so +) + +# C++ wrapper library with chat template support +add_library(tokenizers_cpp_impl STATIC + src/tokenizers.cpp +) +add_dependencies(tokenizers_cpp_impl build_rust_ffi) +target_include_directories(tokenizers_cpp_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_link_libraries(tokenizers_cpp_impl PUBLIC ${RUST_LIB_NAME} jinja2cpp) + +# Interface library for easy linking +add_library(tokenizers_cpp INTERFACE) +target_link_libraries(tokenizers_cpp INTERFACE tokenizers_cpp_impl) + +# Tests +if(TOKENIZERS_COMPILE_TESTS) + enable_testing() + + include(FetchContent) + FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip + ) + # For Windows: Prevent overriding the parent project's compiler/linker settings + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(googletest) + + # Google Test executable + add_executable(tokenizer_tests_gtest + tests/test_tokenizer_gtest.cpp + ) + target_link_libraries(tokenizer_tests_gtest PRIVATE tokenizers_cpp GTest::gtest_main) + + # Set test data directory for test discovery + set(TOKENIZERS_TEST_DATA_DIR "${CMAKE_CURRENT_SOURCE_DIR}/data") + + # Register Google Test with environment variable for test data + include(GoogleTest) + gtest_discover_tests(tokenizer_tests_gtest + PROPERTIES ENVIRONMENT "TOKENIZERS_TEST_DATA=${TOKENIZERS_TEST_DATA_DIR}" + ) +endif() diff --git a/bindings/cpp/README.md b/bindings/cpp/README.md new file mode 100644 index 000000000..454162f00 --- /dev/null +++ b/bindings/cpp/README.md @@ -0,0 +1,114 @@ +# C++ Bindings for HuggingFace Tokenizers + +Minimal C++17 wrapper over the Rust `tokenizers` crate. + +## Quick Start + +See the [example project](example/) for a complete, working demonstration of all features. + +```bash +# Build and run the example +cmake -S bindings/cpp/example -B build_example +cmake --build build_example +./build_example/tokenizer_example path/to/tokenizer.json "Your text here" +``` + +## Overview + +Architecture: +- Rust FFI crate (`tokenizers_c`) exposes a C ABI (load, encode, vocab ops, special tokens). +- Header-only C++ class `tokenizers::Tokenizer` provides RAII, `encode()` returning `std::vector`. +- Build system: CMake + cargo. CTest for tests. + +## Build + +Prerequisites: Rust toolchain, CMake >= 3.16, a C++17 compiler. + +```bash + +# prerequisite 1: Install rustc and cargo, if you dont have it already +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +. "$HOME/.cargo/env" + +# NOTE: the below commands should be run from the tokenizers repo root + +# prerequisite 2: original tokenizer (rust) can be built and tested +make -C ./tokenizers test + +# Configure & build +cmake -S bindings/cpp -B build-cpp +cmake --build build-cpp -j +# if you run out of memory, replace "-j" (use all cores) with "-j4" (use only 4 cores) + +# Run tests (Google Test suite) +ctest --test-dir build-cpp -V +``` + +## FFI API Surface + +C++ `Tokenizer` class methods: +- `load(path)` / constructor - load tokenizer from JSON file +- `FromBlobJSON(json)` - load tokenizer from JSON string (static method) +- `encode(text, add_special_tokens=true)` - encode text to token IDs +- `encode_batch(texts, add_special_tokens=true)` - encode batch of texts +- `decode(ids, skip_special_tokens=true)` - decode IDs to string +- `decode_batch(batch_ids, skip_special_tokens=true)` - decode batch of IDs +- `vocab_size()` - get vocabulary size +- `token_to_id(token)` - lookup token ID (returns -1 if not found) +- `id_to_token(id)` - lookup token string (returns empty if not found) +- `add_special_token(token)` - add a special token to vocabulary +- `add_special_tokens(tokens)` - add multiple special tokens +- `set_padding(params)` - configure padding +- `disable_padding()` - disable padding +- `set_truncation(params)` - configure truncation +- `disable_truncation()` - disable truncation +- `save(path, pretty=true)` - save tokenizer to JSON file +- `to_string(pretty=false)` - serialize tokenizer to JSON string +- `valid()` - check if tokenizer loaded successfully +- `version()` - get FFI version string (static method) + +## Test Coverage + +C++ binding tests are now unified using Google Test in `bindings/cpp/tests/test_tokenizer_gtest.cpp`. +The suite covers: +- Basic encode/decode +- Batch encode/decode +- Vocabulary operations +- Padding and Truncation +- Special tokens management +- Serialization (save/load/to_string) +- Error handling +- Integration with BERT tokenizer + +Original Rust tests also available via `ctest -R tokenizers_rust_all`. + +## Usage + +Add `bindings/cpp/include` to your include path and link against the generated `libtokenizers_c.so` (or platform equivalent) built in `bindings/c/target/release`. + +Example: +```cpp +#include "tokenizers/tokenizers.h" +using namespace tokenizers; + +int main() { + Tokenizer tok("path/to/tokenizer.json"); + if (!tok.valid()) return 1; + + auto ids = tok.encode("Hello world!"); + for (auto id : ids) { + std::cout << id << " "; + } + + std::string decoded = tok.decode(ids); + std::cout << "\nDecoded: " << decoded << "\n"; +} +``` + +## Notes & Future Improvements +- Error handling returns empty/default values; could be extended with status codes/exceptions. +- Full Rust test suite available through CTest for integration tracking. +- Thread safety: Create one instance per thread or add mutex. + +## License +Apache-2.0 (same as upstream project). diff --git a/bindings/cpp/data b/bindings/cpp/data new file mode 120000 index 000000000..538a6e8cc --- /dev/null +++ b/bindings/cpp/data @@ -0,0 +1 @@ +../../tokenizers/data \ No newline at end of file diff --git a/bindings/cpp/example/CMakeLists.txt b/bindings/cpp/example/CMakeLists.txt new file mode 100644 index 000000000..abd156d92 --- /dev/null +++ b/bindings/cpp/example/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.16) +project(tokenizers_example LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Include the tokenizers C++ bindings as a subdirectory +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/tokenizers_cpp_build) + +# Example executable +add_executable(tokenizer_example main.cpp) +target_link_libraries(tokenizer_example PRIVATE tokenizers_cpp tokenizers_c) +target_include_directories(tokenizer_example PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include) + +message(STATUS "Example project configured. Build with: cmake -S bindings/cpp/example -B build_example && cmake --build build_example") diff --git a/bindings/cpp/example/README.md b/bindings/cpp/example/README.md new file mode 100644 index 000000000..4d994c29b --- /dev/null +++ b/bindings/cpp/example/README.md @@ -0,0 +1,83 @@ +# C++ Bindings Example + +This example demonstrates how to use the HuggingFace Tokenizers C++ bindings. + +## Building + +```bash +# Make sure test resources are available (includes sample tokenizer JSON files) +make -C tokenizers test + +# Build the example +cmake -S bindings/cpp/example -B build_example +cmake --build build_example + +# Run the example with a tokenizer file +./build_example/tokenizer_example ../../tokenizers/data/tokenizer.json "Hello world!" +``` + +## What This Example Shows + +The example program demonstrates: + +1. **Basic Encoding**: Encoding text to token IDs with and without special tokens +2. **Token Lookup**: Looking up token IDs by token string +3. **Adding Special Tokens**: Dynamically adding custom special tokens to the vocabulary +4. **Batch Processing**: Encoding multiple texts efficiently +5. **Move Semantics**: Using C++11 move semantics for efficient resource management +6. **Error Handling**: Checking tokenizer validity and handling missing tokens + +## Usage + +```bash +# Basic usage with default text +./build_example/tokenizer_example + +# Encode custom text +./build_example/tokenizer_example "Your custom text here" +``` + +## Example Output + +``` +Tokenizers C++ Bindings Version: tokenizers_c 0.0.1 + +Loading tokenizer from: ../../tokenizers/data/tokenizer.json +✓ Tokenizer loaded successfully + +Vocabulary size: 30000 + +=== Example 1: Basic Encoding === +Input text: "Hello world!" +Tokens (with special tokens): [79, 33, 56, 63, 63, 66, 88, 66, 69, 63, 55, 5] +Token count: 12 + +=== Example 2: Encoding Without Special Tokens === +Tokens (without special tokens): [79, 33, 56, 63, 63, 66, 88, 66, 69, 63, 55] +Token count: 11 + +... +``` + +## Integration into Your Project + +To use the tokenizers C++ bindings in your own CMake project: + +```cmake +# Add tokenizers as a subdirectory +add_subdirectory(path/to/tokenizers/bindings/cpp ${CMAKE_BINARY_DIR}/tokenizers_build) + +# Link your target +target_link_libraries(your_target PRIVATE tokenizers_cpp tokenizers_c) +target_include_directories(your_target PRIVATE path/to/tokenizers/bindings/cpp/include) +``` + +Then in your C++ code: + +```cpp +#include "tokenizers/tokenizers.h" +using namespace tokenizers; + +Tokenizer tok("path/to/tokenizer.json"); +auto ids = tok.encode("Hello world!"); +``` diff --git a/bindings/cpp/example/main.cpp b/bindings/cpp/example/main.cpp new file mode 100644 index 000000000..42b0259e8 --- /dev/null +++ b/bindings/cpp/example/main.cpp @@ -0,0 +1,129 @@ +#include "tokenizers/tokenizers.h" +#include +#include +#include + +using namespace tokenizers; + +int main(int argc, char* argv[]) { + // Check if tokenizer path is provided + if (argc < 2) { + std::cerr << "Usage: " << argv[0] << " [text_to_encode]\n"; + std::cerr << "\nExample:\n"; + std::cerr << " " << argv[0] << " ../../tokenizers/data/tokenizer.json \"Hello world!\"\n"; + return 1; + } + + std::string tokenizer_path = argv[1]; + std::string text = (argc >= 3) ? argv[2] : "Hello, world!"; + + // Print version information + std::cout << "Tokenizers C++ Bindings Version: " << Tokenizer::version() << "\n\n"; + + // Load the tokenizer + std::cout << "Loading tokenizer from: " << tokenizer_path << "\n"; + Tokenizer tokenizer(tokenizer_path); + + if (!tokenizer.valid()) { + std::cerr << "Error: Failed to load tokenizer from " << tokenizer_path << "\n"; + std::cerr << "Make sure the file exists and is a valid tokenizer JSON file.\n"; + return 1; + } + + std::cout << "✓ Tokenizer loaded successfully\n\n"; + + // Get vocabulary size + size_t vocab_size = tokenizer.vocab_size(); + std::cout << "Vocabulary size: " << vocab_size << "\n\n"; + + // Example 1: Basic encoding + std::cout << "=== Example 1: Basic Encoding ===\n"; + std::cout << "Input text: \"" << text << "\"\n"; + + auto ids_with_special = tokenizer.encode(text, true); + std::cout << "Tokens (with special tokens): ["; + for (size_t i = 0; i < ids_with_special.size(); ++i) { + std::cout << ids_with_special[i]; + if (i + 1 < ids_with_special.size()) std::cout << ", "; + } + std::cout << "]\n"; + std::cout << "Token count: " << ids_with_special.size() << "\n\n"; + + // Example 2: Encoding without special tokens + std::cout << "=== Example 2: Encoding Without Special Tokens ===\n"; + auto ids_without_special = tokenizer.encode(text, false); + std::cout << "Tokens (without special tokens): ["; + for (size_t i = 0; i < ids_without_special.size(); ++i) { + std::cout << ids_without_special[i]; + if (i + 1 < ids_without_special.size()) std::cout << ", "; + } + std::cout << "]\n"; + std::cout << "Token count: " << ids_without_special.size() << "\n\n"; + + // Example 3: Token lookup + std::cout << "=== Example 3: Token ID Lookup ===\n"; + std::vector sample_tokens = {"hello", "world", "the", "[UNK]", "[PAD]"}; + for (const auto& token : sample_tokens) { + int32_t id = tokenizer.token_to_id(token); + if (id >= 0) { + std::cout << "Token \"" << token << "\" -> ID: " << id << "\n"; + } else { + std::cout << "Token \"" << token << "\" -> Not found in vocabulary\n"; + } + } + std::cout << "\n"; + + // Example 4: Adding special tokens + std::cout << "=== Example 4: Adding Custom Special Token ===\n"; + std::string new_token = "[CUSTOM_TOKEN]"; + size_t vocab_before = tokenizer.vocab_size(); + bool added = tokenizer.add_special_token(new_token); + size_t vocab_after = tokenizer.vocab_size(); + + if (added) { + std::cout << "✓ Successfully added special token: " << new_token << "\n"; + std::cout << "Vocabulary size increased: " << vocab_before << " -> " << vocab_after << "\n"; + + int32_t new_id = tokenizer.token_to_id(new_token); + std::cout << "New token ID: " << new_id << "\n\n"; + + // Encode text with the new token + std::string text_with_token = "Hello " + new_token + " world"; + auto ids = tokenizer.encode(text_with_token, true); + std::cout << "Encoding \"" << text_with_token << "\":\n"; + std::cout << "Token IDs: ["; + for (size_t i = 0; i < ids.size(); ++i) { + std::cout << ids[i]; + if (i + 1 < ids.size()) std::cout << ", "; + } + std::cout << "]\n"; + } else { + std::cout << "Failed to add special token (may already exist)\n"; + } + std::cout << "\n"; + + // Example 5: Batch encoding multiple texts + std::cout << "=== Example 5: Encoding Multiple Texts ===\n"; + std::vector texts = { + "The quick brown fox", + "jumps over the lazy dog", + "Hello, world!", + "Testing tokenization" + }; + + for (const auto& t : texts) { + auto tokens = tokenizer.encode(t, true); + std::cout << "\"" << t << "\" -> " << tokens.size() << " tokens\n"; + } + std::cout << "\n"; + + // Example 6: Move semantics + std::cout << "=== Example 6: Move Semantics ===\n"; + Tokenizer moved_tokenizer = std::move(tokenizer); + std::cout << "Original tokenizer valid: " << (tokenizer.valid() ? "yes" : "no") << "\n"; + std::cout << "Moved tokenizer valid: " << (moved_tokenizer.valid() ? "yes" : "no") << "\n"; + std::cout << "Moved tokenizer vocab size: " << moved_tokenizer.vocab_size() << "\n\n"; + + std::cout << "=== All Examples Completed Successfully ===\n"; + return 0; +} diff --git a/bindings/cpp/include/tokenizers/tokenizers.h b/bindings/cpp/include/tokenizers/tokenizers.h new file mode 100644 index 000000000..511e74cea --- /dev/null +++ b/bindings/cpp/include/tokenizers/tokenizers.h @@ -0,0 +1,415 @@ +#pragma once +#include +#include +#include +#include + +// Forward declare jinja2 types to avoid pulling in heavy headers +namespace jinja2 { class Template; } + +extern "C" { + struct tokenizers_encoding_t { + const int32_t* ids; + const int32_t* attention_mask; + size_t len; + void* _internal_ptr; // Internal use only - do not access + }; + + struct tokenizers_padding_params_t { + uint32_t pad_id; + uint32_t pad_type_id; + const char* pad_token; + int strategy; + size_t fixed_length; + int direction; + size_t pad_to_multiple_of; + }; + + struct tokenizers_truncation_params_t { + size_t max_length; + size_t stride; + int strategy; + int direction; + }; + + void* tokenizers_new_from_file(const char* path); + void* tokenizers_new_from_file_with_config(const char* path, const char* config_path); + void* tokenizers_new_from_str(const char* json); + void tokenizers_free(void* tokenizer); + tokenizers_encoding_t tokenizers_encode(void* tokenizer, const char* text, bool add_special_tokens); + void tokenizers_free_encoding(tokenizers_encoding_t enc); + const char* tokenizers_version(); + void tokenizers_string_free(char* s); + size_t tokenizers_vocab_size(void* tokenizer); + int32_t tokenizers_token_to_id(void* tokenizer, const char* token); + char* tokenizers_id_to_token(void* tokenizer, int32_t id); + char* tokenizers_decode(void* tokenizer, const int32_t* ids, size_t len, bool skip_special_tokens); + bool tokenizers_save(void* tokenizer, const char* path, bool pretty); + char* tokenizers_to_str(void* tokenizer, bool pretty); + bool tokenizers_add_special_token(void* tokenizer, const char* token); + size_t tokenizers_add_special_tokens(void* tokenizer, const char** tokens, size_t len); + size_t tokenizers_add_tokens(void* tokenizer, const char** tokens, size_t len); + tokenizers_encoding_t* tokenizers_encode_batch(void* tokenizer, const char** texts, size_t len, bool add_special_tokens); + void tokenizers_free_batch_encoding(tokenizers_encoding_t* encodings, size_t len); + char** tokenizers_decode_batch(void* tokenizer, const int32_t** ids, const size_t* lens, size_t batch_len, bool skip_special_tokens); + void tokenizers_free_batch_decode(char** strings, size_t len); + void tokenizers_set_padding(void* tokenizer, const tokenizers_padding_params_t* params); + void tokenizers_set_truncation(void* tokenizer, const tokenizers_truncation_params_t* params); + + // Unified special token API (auto-uses config if available, falls back to heuristic) + int32_t tokenizers_get_special_token_id(void* tokenizer, const char* name); + char* tokenizers_get_special_token(void* tokenizer, const char* name); + bool tokenizers_get_add_bos_token(void* tokenizer); + bool tokenizers_get_add_eos_token(void* tokenizer); + bool tokenizers_has_chat_template(void* tokenizer); + char* tokenizers_get_chat_template(void* tokenizer); +} + +namespace tokenizers { + +struct Encoding { + std::vector ids; + std::vector attention_mask; + + operator std::vector() const { return ids; } + + size_t size() const { return ids.size(); } + bool empty() const { return ids.empty(); } + int32_t operator[](size_t i) const { return ids[i]; } + std::vector::const_iterator begin() const { return ids.begin(); } + std::vector::const_iterator end() const { return ids.end(); } + + bool operator==(const Encoding& other) const { + return ids == other.ids && attention_mask == other.attention_mask; + } + bool operator!=(const Encoding& other) const { + return !(*this == other); + } +}; + +/// Chat message for apply_chat_template +struct ChatMessage { + std::string role; // "system", "user", "assistant" + std::string content; // Message content +}; + +/// Exception for chat template errors +class ChatTemplateError : public std::runtime_error { +public: + explicit ChatTemplateError(const std::string& msg) : std::runtime_error(msg) {} +}; + +struct PaddingParams { + uint32_t pad_id = 0; + uint32_t pad_type_id = 0; + std::string pad_token = "[PAD]"; + enum Strategy { BatchLongest = 0, Fixed = 1 } strategy = BatchLongest; + size_t fixed_length = 0; + enum Direction { Left = 0, Right = 1 } direction = Right; + size_t pad_to_multiple_of = 0; +}; + +struct TruncationParams { + size_t max_length = 512; + size_t stride = 0; + enum Strategy { LongestFirst = 0, OnlyFirst = 1, OnlySecond = 2 } strategy = LongestFirst; + enum Direction { Left = 0, Right = 1 } direction = Right; +}; + +class Tokenizer { +public: + Tokenizer() = default; + /// Load tokenizer from file, auto-loads tokenizer_config.json if present + explicit Tokenizer(const std::string& path) { load(path); } + /// Load tokenizer with explicit config file path + Tokenizer(const std::string& path, const std::string& config_path) { load(path, config_path); } + ~Tokenizer() { reset(); } + Tokenizer(const Tokenizer&) = delete; + Tokenizer& operator=(const Tokenizer&) = delete; + Tokenizer(Tokenizer&& other) noexcept : handle_(other.handle_) { other.handle_ = nullptr; } + Tokenizer& operator=(Tokenizer&& other) noexcept { + if (this != &other) { + reset(); + handle_ = other.handle_; + other.handle_ = nullptr; + } + return *this; + } + + static Tokenizer FromBlobJSON(const std::string& json) { + Tokenizer t; + t.handle_ = tokenizers_new_from_str(json.c_str()); + return t; + } + + /// Load tokenizer, auto-loads tokenizer_config.json if present + bool load(const std::string& path) { + reset(); + handle_ = tokenizers_new_from_file(path.c_str()); + return handle_ != nullptr; + } + + /// Load tokenizer with explicit config file path + bool load(const std::string& path, const std::string& config_path) { + reset(); + handle_ = tokenizers_new_from_file_with_config(path.c_str(), config_path.c_str()); + return handle_ != nullptr; + } + + Encoding encode(const std::string& text, bool add_special_tokens = true) const { + if (!handle_) return {}; + tokenizers_encoding_t enc = tokenizers_encode(handle_, text.c_str(), add_special_tokens); + Encoding out; + if (enc.ids && enc.len) { + out.ids.assign(enc.ids, enc.ids + enc.len); + } + if (enc.attention_mask && enc.len) { + out.attention_mask.assign(enc.attention_mask, enc.attention_mask + enc.len); + } + tokenizers_free_encoding(enc); + return out; + } + + std::vector encode_batch(const std::vector& texts, bool add_special_tokens = true) const { + if (!handle_) return {}; + std::vector c_texts; + c_texts.reserve(texts.size()); + for (const auto& t : texts) c_texts.push_back(t.c_str()); + + tokenizers_encoding_t* encs = tokenizers_encode_batch(handle_, c_texts.data(), c_texts.size(), add_special_tokens); + if (!encs) return {}; + + std::vector out; + out.reserve(texts.size()); + for (size_t i = 0; i < texts.size(); ++i) { + Encoding e; + if (encs[i].ids && encs[i].len) { + e.ids.assign(encs[i].ids, encs[i].ids + encs[i].len); + } + if (encs[i].attention_mask && encs[i].len) { + e.attention_mask.assign(encs[i].attention_mask, encs[i].attention_mask + encs[i].len); + } + out.push_back(std::move(e)); + } + tokenizers_free_batch_encoding(encs, texts.size()); + return out; + } + + std::string decode(const std::vector& ids, bool skip_special_tokens = true) const { + if (!handle_) return {}; + char* s = tokenizers_decode(handle_, ids.data(), ids.size(), skip_special_tokens); + if (!s) return {}; + std::string res(s); + tokenizers_string_free(s); + return res; + } + + std::vector decode_batch(const std::vector>& batch_ids, bool skip_special_tokens = true) const { + if (!handle_) return {}; + std::vector c_ids; + std::vector c_lens; + c_ids.reserve(batch_ids.size()); + c_lens.reserve(batch_ids.size()); + + for (const auto& ids : batch_ids) { + c_ids.push_back(ids.data()); + c_lens.push_back(ids.size()); + } + + char** strings = tokenizers_decode_batch(handle_, c_ids.data(), c_lens.data(), batch_ids.size(), skip_special_tokens); + if (!strings) return {}; + + std::vector res; + res.reserve(batch_ids.size()); + for (size_t i = 0; i < batch_ids.size(); ++i) { + if (strings[i]) { + res.emplace_back(strings[i]); + } else { + res.emplace_back(""); + } + } + tokenizers_free_batch_decode(strings, batch_ids.size()); + return res; + } + + size_t vocab_size() const { + if (!handle_) return 0; + return tokenizers_vocab_size(handle_); + } + + int32_t token_to_id(const std::string& token) const { + if (!handle_) return -1; + return tokenizers_token_to_id(handle_, token.c_str()); + } + + std::string id_to_token(int32_t id) const { + if (!handle_) return {}; + char* s = tokenizers_id_to_token(handle_, id); + if (!s) return {}; + std::string res(s); + tokenizers_string_free(s); + return res; + } + + bool save(const std::string& path, bool pretty = true) const { + if (!handle_) return false; + return tokenizers_save(handle_, path.c_str(), pretty); + } + + std::string to_string(bool pretty = false) const { + if (!handle_) return {}; + char* s = tokenizers_to_str(handle_, pretty); + if (!s) return {}; + std::string res(s); + tokenizers_string_free(s); + return res; + } + + bool add_special_token(const std::string& token) { + if (!handle_) return false; + return tokenizers_add_special_token(handle_, token.c_str()); + } + + size_t add_special_tokens(const std::vector& tokens) { + if (!handle_) return 0; + std::vector c_tokens; + c_tokens.reserve(tokens.size()); + for (const auto& t : tokens) c_tokens.push_back(t.c_str()); + return tokenizers_add_special_tokens(handle_, c_tokens.data(), c_tokens.size()); + } + + void set_padding(const PaddingParams& params) { + if (!handle_) return; + tokenizers_padding_params_t c_params; + c_params.pad_id = params.pad_id; + c_params.pad_type_id = params.pad_type_id; + c_params.pad_token = params.pad_token.c_str(); + c_params.strategy = (int)params.strategy; + c_params.fixed_length = params.fixed_length; + c_params.direction = (int)params.direction; + c_params.pad_to_multiple_of = params.pad_to_multiple_of; + + tokenizers_set_padding(handle_, &c_params); + } + + void disable_padding() { + if (!handle_) return; + tokenizers_set_padding(handle_, nullptr); + } + + void set_truncation(const TruncationParams& params) { + if (!handle_) return; + tokenizers_truncation_params_t c_params; + c_params.max_length = params.max_length; + c_params.stride = params.stride; + c_params.strategy = (int)params.strategy; + c_params.direction = (int)params.direction; + + tokenizers_set_truncation(handle_, &c_params); + } + + void disable_truncation() { + if (!handle_) return; + tokenizers_set_truncation(handle_, nullptr); + } + + size_t add_tokens(const std::vector& tokens) { + if (!handle_) return 0; + std::vector c_tokens; + c_tokens.reserve(tokens.size()); + for (const auto& t : tokens) c_tokens.push_back(t.c_str()); + return tokenizers_add_tokens(handle_, c_tokens.data(), c_tokens.size()); + } + + // === Special Token API (unified - auto-uses config if available) === + + /// Get special token ID by name ("BOS", "EOS", "PAD", "UNK") + /// Auto-uses tokenizer_config.json if present, falls back to heuristic. + int32_t special_token_id(const std::string& name) const { + if (!handle_) return -1; + return tokenizers_get_special_token_id(handle_, name.c_str()); + } + + /// Get special token string by name ("BOS", "EOS", "PAD", "UNK") + /// Returns token from config if available, empty string otherwise. + std::string special_token(const std::string& name) const { + if (!handle_) return {}; + char* s = tokenizers_get_special_token(handle_, name.c_str()); + if (!s) return {}; + std::string res(s); + tokenizers_string_free(s); + return res; + } + + // Convenience ID accessors + int32_t bos_id() const { return special_token_id("BOS"); } + int32_t eos_id() const { return special_token_id("EOS"); } + int32_t pad_id() const { return special_token_id("PAD"); } + int32_t unk_id() const { return special_token_id("UNK"); } + + // Convenience token string accessors + std::string bos_token() const { return special_token("BOS"); } + std::string eos_token() const { return special_token("EOS"); } + std::string pad_token() const { return special_token("PAD"); } + std::string unk_token() const { return special_token("UNK"); } + + /// Whether config specifies BOS token should be added + bool add_bos_token() const { + if (!handle_) return false; + return tokenizers_get_add_bos_token(handle_); + } + + /// Whether config specifies EOS token should be added + bool add_eos_token() const { + if (!handle_) return false; + return tokenizers_get_add_eos_token(handle_); + } + + /// Check if tokenizer has a chat template (from config) + bool has_chat_template() const { + if (!handle_) return false; + return tokenizers_has_chat_template(handle_); + } + + /// Get the raw chat template string (Jinja2 template) + std::string chat_template() const { + if (!handle_) return {}; + char* s = tokenizers_get_chat_template(handle_); + if (!s) return {}; + std::string res(s); + tokenizers_string_free(s); + return res; + } + + /// Apply chat template to format messages + /// @param messages Vector of ChatMessage with role and content + /// @param add_generation_prompt If true, adds prompt for assistant response + /// @return Formatted string ready for tokenization + /// @throws ChatTemplateError if no template or rendering fails + std::string apply_chat_template( + const std::vector& messages, + bool add_generation_prompt = true + ) const; + + bool valid() const { return handle_ != nullptr; } + + static std::string version() { + const char* v = tokenizers_version(); + if (!v) return {}; + std::string s(v); + tokenizers_string_free(const_cast(v)); + return s; + } + +private: + void reset() { + if (handle_) { + tokenizers_free(handle_); + handle_ = nullptr; + } + } + + void* handle_ = nullptr; +}; + +} // namespace tokenizers diff --git a/bindings/cpp/src/tokenizers.cpp b/bindings/cpp/src/tokenizers.cpp new file mode 100644 index 000000000..4b431567c --- /dev/null +++ b/bindings/cpp/src/tokenizers.cpp @@ -0,0 +1,59 @@ +/** + * Tokenizer C++ bindings implementation + */ + +#include +#include +#include + +namespace tokenizers { + +std::string Tokenizer::apply_chat_template( + const std::vector& messages, + bool add_generation_prompt +) const { + // Get the template string + std::string tmpl_str = chat_template(); + if (tmpl_str.empty()) { + throw ChatTemplateError("No chat template available for this tokenizer"); + } + + // Create Jinja2 template + jinja2::Template tpl; + auto load_result = tpl.Load(tmpl_str, "chat_template"); + if (!load_result) { + throw ChatTemplateError("Failed to parse chat template: " + + load_result.error().ToString()); + } + + // Convert messages to Jinja2 values + jinja2::ValuesList jinja_messages; + for (const auto& msg : messages) { + jinja2::ValuesMap msg_map; + msg_map["role"] = msg.role; + msg_map["content"] = msg.content; + jinja_messages.push_back(std::move(msg_map)); + } + + // Build parameters map + jinja2::ValuesMap params; + params["messages"] = std::move(jinja_messages); + params["add_generation_prompt"] = add_generation_prompt; + + // Add special tokens as variables (commonly used in templates) + params["bos_token"] = bos_token(); + params["eos_token"] = eos_token(); + params["pad_token"] = pad_token(); + params["unk_token"] = unk_token(); + + // Render the template + auto render_result = tpl.RenderAsString(params); + if (!render_result) { + throw ChatTemplateError("Failed to render chat template: " + + render_result.error().ToString()); + } + + return render_result.value(); +} + +} // namespace tokenizers diff --git a/bindings/cpp/tests/test_common.h b/bindings/cpp/tests/test_common.h new file mode 100644 index 000000000..d79d1fd62 --- /dev/null +++ b/bindings/cpp/tests/test_common.h @@ -0,0 +1,25 @@ +#pragma once +#include +#include +#include + +namespace test_utils { + +inline std::string find_resource(const std::string& name) { + namespace fs = std::filesystem; + + // First check environment variable (set by CMake or user) + if (const char* env = std::getenv("TOKENIZERS_TEST_DATA")) { + auto path = fs::path(env) / name; + if (fs::exists(path)) return path.string(); + } + + // Fallback: search relative paths + for (const auto& dir : {"data", "../data", "../../data", "../../../data"}) { + auto path = fs::path(dir) / name; + if (fs::exists(path)) return path.string(); + } + return {}; +} + +} // namespace test_utils diff --git a/bindings/cpp/tests/test_tokenizer_gtest.cpp b/bindings/cpp/tests/test_tokenizer_gtest.cpp new file mode 100644 index 000000000..c1f89eaa0 --- /dev/null +++ b/bindings/cpp/tests/test_tokenizer_gtest.cpp @@ -0,0 +1,282 @@ +/** + * Tokenizer C++ bindings tests + */ +#include +#include +#include "test_common.h" +#include + +using namespace tokenizers; +using test_utils::find_resource; + +// ==================== Basic Tokenizer Tests ==================== + +class TokenizerTest : public ::testing::Test { +protected: + Tokenizer tok; + + void SetUp() override { + std::string path = find_resource("tokenizer.json"); + ASSERT_FALSE(path.empty()) << "Could not find tokenizer.json"; + tok = Tokenizer(path); + ASSERT_TRUE(tok.valid()); + } +}; + +TEST_F(TokenizerTest, Encode) { + auto output = tok.encode("my name is john"); + EXPECT_FALSE(output.ids.empty()); + EXPECT_EQ(output.ids.size(), output.attention_mask.size()); + + // Consistency check - same input gives same output + EXPECT_EQ(tok.encode("my name is john"), output); +} + +TEST_F(TokenizerTest, EncodeBatch) { + std::vector batch = {"my name is john", "my pair"}; + auto output = tok.encode_batch(batch); + ASSERT_EQ(output.size(), 2); + EXPECT_FALSE(output[0].ids.empty()); + EXPECT_FALSE(output[1].ids.empty()); +} + +TEST_F(TokenizerTest, Decode) { + auto encoding = tok.encode("my name is john"); + auto decoded = tok.decode(encoding.ids); + EXPECT_NE(decoded.find("name"), std::string::npos); + EXPECT_NE(decoded.find("john"), std::string::npos); +} + +TEST_F(TokenizerTest, DecodeBatch) { + std::vector batch = {"my name is john", "my pair"}; + auto encodings = tok.encode_batch(batch); + + std::vector> batch_ids; + for (const auto& enc : encodings) batch_ids.push_back(enc.ids); + + auto decoded = tok.decode_batch(batch_ids); + ASSERT_EQ(decoded.size(), 2); + EXPECT_NE(decoded[0].find("john"), std::string::npos); + EXPECT_NE(decoded[1].find("pair"), std::string::npos); +} + +TEST_F(TokenizerTest, Vocab) { + EXPECT_GT(tok.vocab_size(), 0); + + int32_t id = tok.token_to_id("the"); + if (id != -1) { + EXPECT_EQ(tok.id_to_token(id), "the"); + } +} + +TEST_F(TokenizerTest, Padding) { + PaddingParams params; + params.strategy = PaddingParams::Fixed; + params.fixed_length = 10; + params.pad_id = 0; + tok.set_padding(params); + + auto output = tok.encode("short"); + EXPECT_EQ(output.ids.size(), 10); + + tok.disable_padding(); + EXPECT_LT(tok.encode("short").ids.size(), 10); +} + +TEST_F(TokenizerTest, AddSpecialTokens) { + size_t added = tok.add_special_tokens({"[SPECIAL1]", "[SPECIAL2]"}); + EXPECT_EQ(added, 2); + + int32_t id = tok.token_to_id("[SPECIAL1]"); + EXPECT_NE(id, -1); + + auto output = tok.encode("Hello [SPECIAL1]"); + EXPECT_NE(std::find(output.ids.begin(), output.ids.end(), id), output.ids.end()); +} + +TEST_F(TokenizerTest, SaveAndLoad) { + std::string save_path = "test_save_gtest.json"; + EXPECT_TRUE(tok.save(save_path)); + + Tokenizer t2(save_path); + EXPECT_TRUE(t2.valid()); + EXPECT_EQ(t2.vocab_size(), tok.vocab_size()); + + std::filesystem::remove(save_path); +} + +TEST_F(TokenizerTest, ToStringAndFromBlob) { + std::string json = tok.to_string(false); + EXPECT_FALSE(json.empty()); + + Tokenizer t2 = Tokenizer::FromBlobJSON(json); + EXPECT_TRUE(t2.valid()); + EXPECT_EQ(t2.vocab_size(), tok.vocab_size()); +} + +TEST_F(TokenizerTest, SpecialTokensFromConfig) { + // Config should be auto-loaded from tokenizer_config.json + EXPECT_EQ(tok.bos_token(), ""); + EXPECT_EQ(tok.eos_token(), ""); + EXPECT_EQ(tok.pad_token(), ""); + EXPECT_EQ(tok.unk_token(), ""); + + EXPECT_GE(tok.bos_id(), 0); + EXPECT_GE(tok.eos_id(), 0); + EXPECT_GE(tok.pad_id(), 0); + EXPECT_GE(tok.unk_id(), 0); + + EXPECT_TRUE(tok.add_bos_token()); + EXPECT_FALSE(tok.add_eos_token()); +} + +TEST_F(TokenizerTest, ChatTemplate) { + EXPECT_TRUE(tok.has_chat_template()); + EXPECT_FALSE(tok.chat_template().empty()); + + std::vector messages = { + {"user", "Hello!"}, + {"assistant", "Hi there!"}, + {"user", "How are you?"} + }; + + std::string result = tok.apply_chat_template(messages, true); + EXPECT_NE(result.find("Hello!"), std::string::npos); + EXPECT_NE(result.find("Hi there!"), std::string::npos); + EXPECT_NE(result.find("How are you?"), std::string::npos); +} + +// ==================== BERT Tokenizer Tests ==================== + +class BertTokenizerTest : public ::testing::Test { +protected: + Tokenizer tok; + + void SetUp() override { + std::string path = find_resource("bert-wiki.json"); + ASSERT_FALSE(path.empty()) << "Could not find bert-wiki.json"; + // Pass empty config path to skip loading tokenizer_config.json + tok = Tokenizer(path, ""); + ASSERT_TRUE(tok.valid()); + } +}; + +TEST_F(BertTokenizerTest, SpecialTokensViaHeuristic) { + // BERT tokens found via heuristic (no config file) + EXPECT_EQ(tok.id_to_token(tok.bos_id()), "[CLS]"); + EXPECT_EQ(tok.id_to_token(tok.eos_id()), "[SEP]"); + EXPECT_EQ(tok.id_to_token(tok.pad_id()), "[PAD]"); + EXPECT_EQ(tok.id_to_token(tok.unk_id()), "[UNK]"); + + // IDs should match token_to_id + EXPECT_EQ(tok.bos_id(), tok.token_to_id("[CLS]")); + EXPECT_EQ(tok.eos_id(), tok.token_to_id("[SEP]")); + EXPECT_EQ(tok.pad_id(), tok.token_to_id("[PAD]")); + EXPECT_EQ(tok.unk_id(), tok.token_to_id("[UNK]")); +} + +TEST_F(BertTokenizerTest, ExplicitConfigPath) { + auto config_path = find_resource("bert_tokenizer_config.json"); + if (config_path.empty()) { + GTEST_SKIP() << "bert_tokenizer_config.json not found"; + } + + auto tok_path = find_resource("bert-wiki.json"); + Tokenizer tok_with_config(tok_path, config_path); + ASSERT_TRUE(tok_with_config.valid()); + + EXPECT_EQ(tok_with_config.bos_token(), "[CLS]"); + EXPECT_EQ(tok_with_config.eos_token(), "[SEP]"); + EXPECT_FALSE(tok_with_config.has_chat_template()); +} + +TEST_F(BertTokenizerTest, NoChatTemplate) { + EXPECT_FALSE(tok.has_chat_template()); + + std::vector messages = {{"user", "Hello!"}}; + EXPECT_THROW(tok.apply_chat_template(messages), ChatTemplateError); +} + +// ==================== Error Handling Tests ==================== + +TEST(TokenizerErrorTest, InvalidFile) { + Tokenizer tok("nonexistent_file.json"); + EXPECT_FALSE(tok.valid()); + + // All operations should return safe defaults + EXPECT_EQ(tok.vocab_size(), 0); + EXPECT_TRUE(tok.encode("test").empty()); + EXPECT_EQ(tok.token_to_id("test"), -1); + EXPECT_EQ(tok.bos_id(), -1); + EXPECT_TRUE(tok.bos_token().empty()); + EXPECT_FALSE(tok.has_chat_template()); +} + +TEST(TokenizerErrorTest, MoveSemantics) { + auto path = find_resource("tokenizer.json"); + ASSERT_FALSE(path.empty()); + + Tokenizer tok(path); + EXPECT_TRUE(tok.valid()); + + Tokenizer moved = std::move(tok); + EXPECT_TRUE(moved.valid()); + EXPECT_FALSE(tok.valid()); +} + +TEST(TokenizerErrorTest, UnknownToken) { + auto path = find_resource("tokenizer.json"); + ASSERT_FALSE(path.empty()); + + Tokenizer tok(path); + EXPECT_EQ(tok.token_to_id("[DEFINITELY_NOT_IN_VOCAB_12345]"), -1); +} + +TEST(TokenizerErrorTest, FromBlobNoChatTemplate) { + // Tokenizer loaded from string has no config + std::string json = R"({ + "version": "1.0", + "added_tokens": [{"id": 0, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}], + "model": {"type": "WordLevel", "vocab": {"[UNK]": 0, "hello": 1}, "unk_token": "[UNK]"} + })"; + + Tokenizer tok = Tokenizer::FromBlobJSON(json); + ASSERT_TRUE(tok.valid()); + EXPECT_FALSE(tok.has_chat_template()); +} + +// ==================== Optional Tokenizer Tests ==================== + +TEST(OptionalTokenizerTest, Llama) { + auto path = find_resource("llama-3-tokenizer.json"); + if (path.empty()) { + GTEST_SKIP() << "llama-3-tokenizer.json not found"; + } + + Tokenizer tok(path); + ASSERT_TRUE(tok.valid()); + + int32_t bos = tok.bos_id(); + if (bos >= 0) { + std::string bos_token = tok.id_to_token(bos); + EXPECT_TRUE(bos_token == "<|begin_of_text|>" || bos_token == ""); + } +} + +TEST(OptionalTokenizerTest, Unigram) { + auto path = find_resource("unigram.json"); + if (path.empty()) { + GTEST_SKIP() << "unigram.json not found"; + } + + Tokenizer tok(path); + if (!tok.valid()) { + GTEST_SKIP() << "unigram.json is not a complete tokenizer file"; + } + + // Just verify API doesn't crash + tok.bos_id(); + tok.eos_id(); + tok.unk_id(); +} + diff --git a/bindings/cpp/third_party/Jinja2Cpp b/bindings/cpp/third_party/Jinja2Cpp new file mode 160000 index 000000000..2053cfabf --- /dev/null +++ b/bindings/cpp/third_party/Jinja2Cpp @@ -0,0 +1 @@ +Subproject commit 2053cfabfafaeab65aff0bc083a83b105a939202 diff --git a/tokenizers/Makefile b/tokenizers/Makefile index 927fe794e..0635936d8 100644 --- a/tokenizers/Makefile +++ b/tokenizers/Makefile @@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D) SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/llama-3-tokenizer.json BENCHMARK_RESOURCES = $(SHARED_RESOURCES) -TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json +TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json $(DATA_DIR)/tokenizer_config.json $(DATA_DIR)/bert_tokenizer_config.json .PHONY : build build : @@ -87,3 +87,12 @@ $(DATA_DIR)/bert-wiki.json : $(DATA_DIR)/llama-3-tokenizer.json : $(dir_guard) wget https://huggingface.co/hf-internal-testing/llama3-tokenizer/resolve/main/tokenizer.json -O $@ + +# Config files for C++ bindings tests +$(DATA_DIR)/tokenizer_config.json : + $(dir_guard) + @echo '{"bos_token":"","eos_token":"","pad_token":"","unk_token":"","add_bos_token":true,"add_eos_token":false,"chat_template":"{% for message in messages %}{% if message['"'"'role'"'"'] == '"'"'user'"'"' %}{{ '"'"'User: '"'"' + message['"'"'content'"'"'] + '"'"'\\n'"'"' }}{% elif message['"'"'role'"'"'] == '"'"'assistant'"'"' %}{{ '"'"'Assistant: '"'"' + message['"'"'content'"'"'] + '"'"'\\n'"'"' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '"'"'Assistant: '"'"' }}{% endif %}"}' > $@ + +$(DATA_DIR)/bert_tokenizer_config.json : + $(dir_guard) + @echo '{"bos_token":"[CLS]","eos_token":"[SEP]","pad_token":"[PAD]","unk_token":"[UNK]","add_bos_token":true,"add_eos_token":true,"chat_template":null}' > $@