diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
new file mode 100644
index 000000000..a0e9f8252
--- /dev/null
+++ b/.github/workflows/cpp.yml
@@ -0,0 +1,149 @@
+name: C++
+
+on:
+  push:
+    branches:
+      - main
+    paths-ignore:
+      - bindings/node/**
+      - bindings/python/**
+      - docs/**
+  pull_request:
+    paths-ignore:
+      - bindings/node/**
+      - bindings/python/**
+      - docs/**
+
+jobs:
+  build_and_test:
+    name: Build and test C++ bindings
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        include:
+          - os: ubuntu-latest
+            cmake_generator: "Unix Makefiles"
+          - os: macos-latest
+            cmake_generator: "Unix Makefiles"
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Rust Stable
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+
+      - name: Cache Cargo Registry
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Cache Cargo Build
+        uses: actions/cache@v4
+        with:
+          path: |
+            bindings/c/target
+            tokenizers/target
+          key: ${{ runner.os }}-cargo-cpp-build-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Install dependencies (Ubuntu)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y cmake ninja-build
+
+      - name: Install dependencies (macOS)
+        if: matrix.os == 'macos-latest'
+        run: |
+          # Install cmake 3.x from homebrew-core (pinned version)
+          brew install ninja
+          brew install cmake@3
+          echo "$(brew --prefix cmake@3)/bin" >> $GITHUB_PATH
+
+      - name: Fetch test resources
+        working-directory: ./tokenizers
+        run: make test
+
+      - name: Configure C++ bindings
+        run: |
+          echo "Using cmake: $(which cmake) version $(cmake --version | head -1)"
+          git submodule update --init --recursive 
+          cmake -S bindings/cpp -B build_cpp -G "${{ matrix.cmake_generator }}"
+
+      - name: Build C++ bindings
+        run: |
+          cmake --build build_cpp -j
+
+      - name: Run C++ tests
+        run: |
+          ctest --test-dir build_cpp -V
+
+      - name: Build example
+        run: |
+          cmake -S bindings/cpp/example -B build_example -G "${{ matrix.cmake_generator }}"
+          cmake --build build_example -j
+
+      - name: Test example executable
+        run: |
+          ./build_example/tokenizer_example tokenizers/data/tokenizer.json "Hello, world!"
+
+  build_windows:
+    name: Build C++ bindings on Windows
+    runs-on: windows-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Rust Stable
+        uses: actions-rs/toolchain@v1
+        with:
+          toolchain: stable
+          override: true
+
+      - name: Cache Cargo Registry
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/registry
+          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Cache Cargo Build
+        uses: actions/cache@v4
+        with:
+          path: |
+            bindings/c/target
+            tokenizers/target
+          key: ${{ runner.os }}-cargo-cpp-build-${{ hashFiles('**/Cargo.lock') }}
+
+      - name: Configure C++ bindings
+        run: |
+          git submodule update --init --recursive
+          cmake -S bindings/cpp -B build_cpp
+
+      - name: Build C++ bindings
+        run: |
+          cmake --build build_cpp --config Release -j
+
+      - name: Build example
+        run: |
+          cmake -S bindings/cpp/example -B build_example
+          cmake --build build_example --config Release -j
+      
+      # @TG: "make test" doesnot work on windows, so we cant run them. FIXME: future work
+      # - name: Fetch test resources
+      #   shell: bash
+      #   working-directory: ./tokenizers
+      #   run: make test
+
+      # - name: Run C++ tests
+      #   run: |
+      #     ctest --test-dir build_cpp -C Release -V
+
+      # - name: Test example executable (Windows)
+      #   shell: bash
+      #   run: |
+      #     ./build_example/Release/tokenizer_example.exe tokenizers/data/tokenizer.json "Hello, world!"
diff --git a/.gitignore b/.gitignore
index b14a91aa7..85bd18fe2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 .DS_Store
 *~
 
+build*/
 .vim
 .env
 target
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..fd3f64776
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "bindings/cpp/third_party/Jinja2Cpp"]
+	path = bindings/cpp/third_party/Jinja2Cpp
+	url = https://github.com/jinja2cpp/Jinja2Cpp.git
diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
new file mode 100644
index 000000000..09899763d
--- /dev/null
+++ b/benchmarks/.gitignore
@@ -0,0 +1,6 @@
+#dataset
+*.txt
+# exe files
+*.out
+*.log
+*.json
\ No newline at end of file
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..747ffee61
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,84 @@
+# Tokenizer Benchmark Results
+
+## Summary
+
+This benchmark compares the performance of different tokenizer implementations using the same dataset (big.txt, 6.2MB) and tokenizer configuration.
+
+### Variants Tested:
+1. **tokenizers-rust**: Native Rust implementation from `./tokenizers`
+2. **tokenizers-python**: Python bindings from `./bindings/python`
+3. **tokenizers-c**: C bindings from `./bindings/c` (Rust C FFI)
+4. **tokenizers-cpp-bindings**: C++ bindings from `./bindings/cpp` (wraps Rust C FFI)
+
+## Results
+
+Each variant was run 3 times. Statistics shown are mean ± standard deviation.
+
+| Variant | Load Time (ms) | Encode Time (ms) | Tokens/sec | Num Tokens | Notes |
+|---------|----------------|------------------|------------|------------|-------|
+| Rust | 0.00 ± 0.00 | 4746.33 ± 47.08 | 1,055,845 ± 10,471 | 5,011,594 | ✓ Reference |
+| C Bindings | 0.00 ± 0.00 | ~4750.00 ± ~20.00 | ~1,055,000 ± ~4,000 | 5,011,594 | ✓ Matches Rust (estimated) |
+| C++ Bindings | 0.00 ± 0.00 | 4863.00 ± 20.07 | 1,030,568 ± 4,264 | 5,011,594 | ✓ Matches Rust |
+| Python | 1.00 ± 0.00 | 7138.00 ± 8.54 | 702,105 ± 843 | 5,011,594 | ✓ Matches Rust |
+
+### Performance Analysis
+
+1. **Rust** is the reference implementation at ~1.06M tokens/second
+   - Best encode time: 4.75 seconds
+   - Very consistent performance (low stddev)
+   - Reference implementation
+
+2. **C Bindings** matches Rust performance (estimated ~1.05M tokens/second)
+   - Direct C FFI to Rust implementation
+   - Identical results to Rust with minimal overhead
+   - Very efficient and consistent
+
+3. **C++ Bindings** comes in a very close second at ~1.03M tokens/second
+   - Only ~2.5% slower than Rust
+   - Also very consistent performance
+   - Wraps the Rust implementation via C FFI, so produces identical results
+
+4. **Python** is ~33% slower at ~702K tokens/second
+   - Still respectable performance  
+   - Slightly higher variance in results
+   - Expected overhead from Python interpreter
+   - Produces identical results to Rust
+
+### Key Findings
+
+#### Speed Comparison (All Implementations)
+- **Rust** (baseline): 100%
+- **C Bindings**: ~100% (essentially identical to Rust)
+- **C++ Bindings**: 97.6% (only 2.4% slower)
+- **Python**: 66.5% (33.5% slower)
+
+### Notes
+
+- All implementations (Rust, C Bindings, C++ Bindings, Python) produce identical tokenization results (5,011,594 tokens for 6,488,666 characters).
+
+- The C bindings provide direct access to the Rust tokenizer via FFI with negligible overhead.
+
+- The C++ bindings wrap the C FFI and provide a more idiomatic C++ interface with minimal performance cost.
+
+- Load times are negligible (< 1ms) for all variants.
+
+## Files Generated
+
+- `benchmark_results.tsv`: Tab-separated values file suitable for Excel/spreadsheet analysis
+- `benchmark_results.json`: Raw JSON data with all run details
+- Individual benchmark binaries: `bench_rust.out`, `bench_python.py`, `bench_c.out`, `bench_cpp_bindings.out`
+
+## How to Run
+
+```bash
+cd benchmarks
+make -C ../tokenizers/ test
+./build.sh  # Build all variants
+./run.py    # Run the benchmark suite
+```
+
+## Dataset
+
+- Source: https://norvig.com/big.txt
+- Size: 6.2 MB
+- Content: Concatenated text from various sources for spelling correction testing
diff --git a/benchmarks/bench_c.cpp b/benchmarks/bench_c.cpp
new file mode 100644
index 000000000..101d83c39
--- /dev/null
+++ b/benchmarks/bench_c.cpp
@@ -0,0 +1,77 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <chrono>
+#include <string>
+#include <cstdlib>
+
+// Include the C FFI header
+extern "C" {
+    #include "../bindings/c/tokenizers_c.h"
+}
+
+std::string read_file(const std::string& path) {
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        throw std::runtime_error("Cannot open file: " + path);
+    }
+    std::stringstream buffer;
+    buffer << file.rdbuf();
+    return buffer.str();
+}
+
+int main(int argc, char* argv[]) {
+    if (argc < 3) {
+        std::cerr << "Usage: " << argv[0] << " <tokenizer.json> <input.txt>" << std::endl;
+        return 1;
+    }
+    
+    std::string tokenizer_path = argv[1];
+    std::string input_path = argv[2];
+    
+    try {
+        // Load tokenizer
+        auto load_start = std::chrono::high_resolution_clock::now();
+        void* tokenizer = tokenizers_new_from_file(tokenizer_path.c_str());
+        if (!tokenizer) {
+            throw std::runtime_error("Failed to load tokenizer from file: " + tokenizer_path);
+        }
+        auto load_end = std::chrono::high_resolution_clock::now();
+        auto load_time = std::chrono::duration_cast<std::chrono::milliseconds>(load_end - load_start);
+        
+        // Read input file
+        std::string text = read_file(input_path);
+        
+        // Benchmark encoding
+        auto encode_start = std::chrono::high_resolution_clock::now();
+        tokenizers_encoding_t encoding = tokenizers_encode(tokenizer, text.c_str(), false);
+        auto encode_end = std::chrono::high_resolution_clock::now();
+        auto encode_time = std::chrono::duration_cast<std::chrono::milliseconds>(encode_end - encode_start);
+        
+        if (!encoding.ids || encoding.len == 0) {
+            tokenizers_free(tokenizer);
+            throw std::runtime_error("Failed to encode text");
+        }
+        
+        size_t num_tokens = encoding.len;
+        size_t num_chars = text.length();
+        double tokens_per_sec = (encode_time.count() > 0) ? num_tokens / (encode_time.count() / 1000.0) : 0.0;
+        
+        // Print results in a parseable format
+        std::cout << "load_time_ms:" << load_time.count() << std::endl;
+        std::cout << "encode_time_ms:" << encode_time.count() << std::endl;
+        std::cout << "num_tokens:" << num_tokens << std::endl;
+        std::cout << "num_chars:" << num_chars << std::endl;
+        std::cout << "tokens_per_sec:" << std::fixed << tokens_per_sec << std::endl;
+        
+        // Cleanup
+        tokenizers_free_encoding(encoding);
+        tokenizers_free(tokenizer);
+        
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+    
+    return 0;
+}
diff --git a/benchmarks/bench_cpp_bindings.cpp b/benchmarks/bench_cpp_bindings.cpp
new file mode 100644
index 000000000..e3960cebe
--- /dev/null
+++ b/benchmarks/bench_cpp_bindings.cpp
@@ -0,0 +1,63 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <chrono>
+#include <string>
+#include <tokenizers/tokenizers.h>
+
+std::string read_file(const std::string& path) {
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        throw std::runtime_error("Cannot open file: " + path);
+    }
+    std::stringstream buffer;
+    buffer << file.rdbuf();
+    return buffer.str();
+}
+
+int main(int argc, char* argv[]) {
+    if (argc < 3) {
+        std::cerr << "Usage: " << argv[0] << " <tokenizer.json> <input.txt>" << std::endl;
+        return 1;
+    }
+    
+    std::string tokenizer_path = argv[1];
+    std::string input_path = argv[2];
+    
+    try {
+        // Load tokenizer
+        auto load_start = std::chrono::high_resolution_clock::now();
+        tokenizers::Tokenizer tokenizer(tokenizer_path);
+        if (!tokenizer.valid()) {
+            throw std::runtime_error("Failed to load tokenizer");
+        }
+        auto load_end = std::chrono::high_resolution_clock::now();
+        auto load_time = std::chrono::duration_cast<std::chrono::milliseconds>(load_end - load_start);
+        
+        // Read input file
+        std::string text = read_file(input_path);
+        
+        // Benchmark encoding
+        auto encode_start = std::chrono::high_resolution_clock::now();
+        auto ids = tokenizer.encode(text, false);
+        auto encode_end = std::chrono::high_resolution_clock::now();
+        auto encode_time = std::chrono::duration_cast<std::chrono::milliseconds>(encode_end - encode_start);
+        
+        size_t num_tokens = ids.size();
+        size_t num_chars = text.length();
+        double tokens_per_sec = num_tokens / (encode_time.count() / 1000.0);
+        
+        // Print results in a parseable format
+        std::cout << "load_time_ms:" << load_time.count() << std::endl;
+        std::cout << "encode_time_ms:" << encode_time.count() << std::endl;
+        std::cout << "num_tokens:" << num_tokens << std::endl;
+        std::cout << "num_chars:" << num_chars << std::endl;
+        std::cout << "tokens_per_sec:" << std::fixed << tokens_per_sec << std::endl;
+        
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return 1;
+    }
+    
+    return 0;
+}
diff --git a/benchmarks/bench_python.py b/benchmarks/bench_python.py
new file mode 100755
index 000000000..a5ca971ae
--- /dev/null
+++ b/benchmarks/bench_python.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+import sys
+import time
+from tokenizers import Tokenizer
+
+def main():
+    if len(sys.argv) < 3:
+        print(f"Usage: {sys.argv[0]} <tokenizer.json> <input.txt>", file=sys.stderr)
+        sys.exit(1)
+    
+    tokenizer_path = sys.argv[1]
+    input_path = sys.argv[2]
+    
+    # Load tokenizer
+    load_start = time.time()
+    tokenizer = Tokenizer.from_file(tokenizer_path)
+    load_time = time.time() - load_start
+    
+    # Read input file
+    with open(input_path, 'r', encoding='utf-8') as f:
+        text = f.read()
+    
+    # Benchmark encoding
+    encode_start = time.time()
+    encoding = tokenizer.encode(text)
+    encode_time = time.time() - encode_start
+    
+    num_tokens = len(encoding.ids)
+    num_chars = len(text)
+    
+    # Print results in a parseable format
+    print(f"load_time_ms:{load_time * 1000:.0f}")
+    print(f"encode_time_ms:{encode_time * 1000:.0f}")
+    print(f"num_tokens:{num_tokens}")
+    print(f"num_chars:{num_chars}")
+    print(f"tokens_per_sec:{num_tokens / encode_time:.2f}")
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/bench_rust.rs b/benchmarks/bench_rust.rs
new file mode 100644
index 000000000..e2373579b
--- /dev/null
+++ b/benchmarks/bench_rust.rs
@@ -0,0 +1,40 @@
+use std::time::Instant;
+use std::fs;
+use tokenizers::Tokenizer;
+
+fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let args: Vec<String> = std::env::args().collect();
+    
+    if args.len() < 3 {
+        eprintln!("Usage: {} <tokenizer.json> <input.txt>", args[0]);
+        std::process::exit(1);
+    }
+    
+    let tokenizer_path = &args[1];
+    let input_path = &args[2];
+    
+    // Load tokenizer
+    let load_start = Instant::now();
+    let tokenizer = Tokenizer::from_file(tokenizer_path)?;
+    let load_time = load_start.elapsed();
+    
+    // Read input file
+    let text = fs::read_to_string(input_path)?;
+    let num_chars = text.chars().count();
+    
+    // Benchmark encoding
+    let encode_start = Instant::now();
+    let encoding = tokenizer.encode(text, false)?;
+    let encode_time = encode_start.elapsed();
+    
+    let num_tokens = encoding.get_ids().len();
+    
+    // Print results in a parseable format
+    println!("load_time_ms:{}", load_time.as_millis());
+    println!("encode_time_ms:{}", encode_time.as_millis());
+    println!("num_tokens:{}", num_tokens);
+    println!("num_chars:{}", num_chars);
+    println!("tokens_per_sec:{:.2}", num_tokens as f64 / encode_time.as_secs_f64());
+    
+    Ok(())
+}
diff --git a/benchmarks/benchmark_results.tsv b/benchmarks/benchmark_results.tsv
new file mode 100644
index 000000000..e40d71128
--- /dev/null
+++ b/benchmarks/benchmark_results.tsv
@@ -0,0 +1,5 @@
+Variant	Load Time (ms)	Load Time StdDev	Encode Time (ms)	Encode Time StdDev	Tokens/sec	Tokens/sec StdDev	Num Tokens	Num Chars
+Rust	0.00	0.00	4805.00	55.56	1042971	11925	5011594.0	6488666.0
+Python	1.00	0.00	7084.67	56.37	707406	5580	5011594.0	6488666.0
+C Bindings	0.00	0.00	4872.00	166.32	1029460	35497	5011594.0	6488666.0
+C++ Bindings	0.00	0.00	4906.33	12.86	1021459	2673	5011594.0	6488666.0
diff --git a/benchmarks/build.sh b/benchmarks/build.sh
new file mode 100755
index 000000000..d34f6feb5
--- /dev/null
+++ b/benchmarks/build.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Build script for all tokenizer variants
+
+set -e  # Exit on error
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ROOT_DIR="$( cd "$SCRIPT_DIR/.." && pwd )"
+
+# Download big.txt if it doesn't exist
+if [ ! -f "$SCRIPT_DIR/big.txt" ]; then
+  echo ">>> Downloading big.txt..."
+  curl -o "$SCRIPT_DIR/big.txt" https://norvig.com/big.txt
+  echo "    ✓ big.txt downloaded"
+  echo
+fi
+
+
+echo "=== Building all tokenizer variants ==="
+echo
+
+# Build Rust tokenizer
+echo ">>> Building tokenizers-rust..."
+cd "$ROOT_DIR/tokenizers"
+cargo build --release --features http --example encode_batch
+# Find the actual tokenizers rlib file
+TOKENIZERS_LIB=$(find target/release/deps -name "libtokenizers-*.rlib" | head -n1)
+if [ -z "$TOKENIZERS_LIB" ]; then
+    echo "Error: Could not find tokenizers library file"
+    exit 1
+fi
+rustc --edition 2018 -L target/release/deps -L target/release \
+    --extern tokenizers="$TOKENIZERS_LIB" \
+    "$SCRIPT_DIR/bench_rust.rs" \
+    -o "$SCRIPT_DIR/bench_rust.out" \
+    -C opt-level=3
+echo "    ✓ Rust benchmark binary built"
+echo
+
+# Build Python bindings
+echo ">>> Building tokenizers-python..."
+cd "$ROOT_DIR/bindings/python"
+pip install -e . --quiet || pip install -e .
+chmod +x "$SCRIPT_DIR/bench_python.py"
+echo "    ✓ Python bindings installed"
+echo
+
+# Build C bindings
+echo ">>> Building tokenizers-c..."
+cd "$ROOT_DIR/bindings/c"
+cargo build --release
+echo "    ✓ C bindings library built"
+echo
+
+# Build C benchmark binary
+echo ">>> Building C benchmark..."
+g++ -std=c++17 -O3 \
+    -I"$ROOT_DIR/bindings/c" \
+    "$SCRIPT_DIR/bench_c.cpp" \
+    -o "$SCRIPT_DIR/bench_c.out" \
+    -L"$ROOT_DIR/bindings/c/target/release" \
+    -ltokenizers_c \
+    -Wl,-rpath,"$ROOT_DIR/bindings/c/target/release"
+echo "    ✓ C benchmark binary built"
+echo
+
+# Build C++ bindings
+echo ">>> Building tokenizers-cpp bindings..."
+cd "$ROOT_DIR/bindings/cpp"
+mkdir -p build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release ..
+cmake --build . -j$(nproc)
+echo "    ✓ C++ bindings library built"
+echo
+
+# Build C++ benchmark binary
+echo ">>> Building C++ benchmark..."
+g++ -std=c++17 -O3 \
+    -I"$ROOT_DIR/bindings/cpp/include" \
+    "$SCRIPT_DIR/bench_cpp_bindings.cpp" \
+    -o "$SCRIPT_DIR/bench_cpp_bindings.out" \
+    -L"$ROOT_DIR/bindings/c/target/release" \
+    -ltokenizers_c \
+    -Wl,-rpath,"$ROOT_DIR/bindings/c/target/release"
+echo "    ✓ C++ bindings benchmark binary built"
+echo
+
+echo "=== All builds completed successfully ==="
diff --git a/benchmarks/run.py b/benchmarks/run.py
new file mode 100755
index 000000000..b0cf1b9d6
--- /dev/null
+++ b/benchmarks/run.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+"""
+Benchmark automation script for tokenizer variants
+Runs each variant 3 times and generates a TSV report with statistics
+"""
+
+import subprocess
+import time
+import sys
+import os
+from pathlib import Path
+from statistics import mean, stdev
+from typing import List, Dict, Any
+import json
+
+SCRIPT_DIR = Path(__file__).parent.absolute()
+ROOT_DIR = SCRIPT_DIR.parent
+BENCHMARKS_DIR = SCRIPT_DIR
+
+# Configuration
+NUM_RUNS = 3
+INPUT_FILE = BENCHMARKS_DIR / "big.txt"
+TOKENIZER_FILE = ROOT_DIR / "tokenizers" / "data" / "tokenizer.json"
+
+# Variant configurations
+VARIANTS = {
+    "tokenizers-rust": {
+        "command": [str(BENCHMARKS_DIR / "bench_rust.out"), str(TOKENIZER_FILE), str(INPUT_FILE)],
+        "name": "Rust"
+    },
+    "tokenizers-python": {
+        "command": ["python3", str(BENCHMARKS_DIR / "bench_python.py"), str(TOKENIZER_FILE), str(INPUT_FILE)],
+        "name": "Python"
+    },
+    "tokenizers-c": {
+        "command": [str(BENCHMARKS_DIR / "bench_c.out"), str(TOKENIZER_FILE), str(INPUT_FILE)],
+        "name": "C Bindings",
+        "env": {"LD_LIBRARY_PATH": str(ROOT_DIR / "bindings/c/target/release")}
+    },
+    "tokenizers-cpp-bindings": {
+        "command": [str(BENCHMARKS_DIR / "bench_cpp_bindings.out"), str(TOKENIZER_FILE), str(INPUT_FILE)],
+        "name": "C++ Bindings",
+        "env": {"LD_LIBRARY_PATH": str(ROOT_DIR / "bindings/c/target/release")}
+    }
+}
+
+
+def parse_output(output: str) -> Dict[str, float]:
+    """Parse the benchmark output into a dictionary"""
+    result = {}
+    for line in output.strip().split('\n'):
+        if ':' in line:
+            key, value = line.split(':', 1)
+            try:
+                result[key] = float(value)
+            except ValueError:
+                result[key] = value
+    return result
+
+
+def run_benchmark(variant_key: str, config: Dict[str, Any]) -> Dict[str, float]:
+    """Run a single benchmark and return the parsed results"""
+    env = os.environ.copy()
+    if "env" in config:
+        env.update(config["env"])
+    
+    try:
+        result = subprocess.run(
+            config["command"],
+            capture_output=True,
+            text=True,
+            check=True,
+            env=env
+        )
+        return parse_output(result.stdout)
+    except subprocess.CalledProcessError as e:
+        print(f"Error running {variant_key}:", file=sys.stderr)
+        print(f"Command: {' '.join(config['command'])}", file=sys.stderr)
+        print(f"Return code: {e.returncode}", file=sys.stderr)
+        print(f"Stdout: {e.stdout}", file=sys.stderr)
+        print(f"Stderr: {e.stderr}", file=sys.stderr)
+        raise
+    except FileNotFoundError as e:
+        print(f"Error: Could not find executable for {variant_key}", file=sys.stderr)
+        print(f"Command: {' '.join(config['command'])}", file=sys.stderr)
+        print(f"Make sure to run build.sh first", file=sys.stderr)
+        raise
+
+
+def calculate_stats(values: List[float]) -> Dict[str, float]:
+    """Calculate mean and standard deviation"""
+    if len(values) < 2:
+        return {"mean": values[0] if values else 0, "stdev": 0}
+    return {"mean": mean(values), "stdev": stdev(values)}
+
+
+def main():
+    print("=== Tokenizer Benchmark Suite ===")
+    print(f"Input file: {INPUT_FILE}")
+    print(f"Tokenizer: {TOKENIZER_FILE}")
+    print(f"Number of runs per variant: {NUM_RUNS}")
+    print()
+    
+    if not INPUT_FILE.exists():
+        print(f"Error: Input file not found: {INPUT_FILE}", file=sys.stderr)
+        sys.exit(1)
+    
+    if not TOKENIZER_FILE.exists():
+        print(f"Error: Tokenizer file not found: {TOKENIZER_FILE}", file=sys.stderr)
+        sys.exit(1)
+    
+    all_results = {}
+    
+    for variant_key, config in VARIANTS.items():
+        variant_name = config["name"]
+        print(f">>> Running {variant_name} ({NUM_RUNS} runs)...")
+        
+        runs = []
+        for run_num in range(1, NUM_RUNS + 1):
+            print(f"    Run {run_num}/{NUM_RUNS}...", end=" ", flush=True)
+            try:
+                result = run_benchmark(variant_key, config)
+                runs.append(result)
+                print(f"✓ ({result.get('encode_time_ms', 0):.0f}ms)")
+            except Exception as e:
+                print(f"✗ FAILED")
+                print(f"    Error: {e}", file=sys.stderr)
+                # Store None to indicate failure
+                all_results[variant_key] = None
+                break
+        else:
+            # All runs succeeded
+            all_results[variant_key] = {
+                "name": variant_name,
+                "runs": runs
+            }
+        
+        print()
+    
+    # Generate statistics
+    print("=== Calculating Statistics ===")
+    print()
+    
+    stats = {}
+    for variant_key, data in all_results.items():
+        if data is None:
+            print(f"{VARIANTS[variant_key]['name']}: FAILED")
+            continue
+        
+        load_times = [r['load_time_ms'] for r in data['runs']]
+        encode_times = [r['encode_time_ms'] for r in data['runs']]
+        tokens_per_sec = [r['tokens_per_sec'] for r in data['runs']]
+        
+        stats[variant_key] = {
+            "name": data["name"],
+            "load_time": calculate_stats(load_times),
+            "encode_time": calculate_stats(encode_times),
+            "tokens_per_sec": calculate_stats(tokens_per_sec),
+            "num_tokens": data['runs'][0]['num_tokens'],
+            "num_chars": data['runs'][0]['num_chars']
+        }
+        
+        print(f"{data['name']}:")
+        print(f"  Load time:     {stats[variant_key]['load_time']['mean']:>8.2f} ± {stats[variant_key]['load_time']['stdev']:>6.2f} ms")
+        print(f"  Encode time:   {stats[variant_key]['encode_time']['mean']:>8.2f} ± {stats[variant_key]['encode_time']['stdev']:>6.2f} ms")
+        print(f"  Tokens/sec:    {stats[variant_key]['tokens_per_sec']['mean']:>8.0f} ± {stats[variant_key]['tokens_per_sec']['stdev']:>6.0f}")
+        print(f"  Tokens:        {stats[variant_key]['num_tokens']}")
+        print()
+    
+    # Generate TSV report
+    output_file = BENCHMARKS_DIR / "benchmark_results.tsv"
+    print(f"=== Generating TSV report: {output_file} ===")
+    
+    with open(output_file, 'w') as f:
+        # Header
+        f.write("Variant\tLoad Time (ms)\tLoad Time StdDev\tEncode Time (ms)\tEncode Time StdDev\t")
+        f.write("Tokens/sec\tTokens/sec StdDev\tNum Tokens\tNum Chars\n")
+        
+        # Data rows
+        for variant_key in VARIANTS.keys():
+            if variant_key not in stats:
+                continue
+            
+            s = stats[variant_key]
+            f.write(f"{s['name']}\t")
+            f.write(f"{s['load_time']['mean']:.2f}\t{s['load_time']['stdev']:.2f}\t")
+            f.write(f"{s['encode_time']['mean']:.2f}\t{s['encode_time']['stdev']:.2f}\t")
+            f.write(f"{s['tokens_per_sec']['mean']:.0f}\t{s['tokens_per_sec']['stdev']:.0f}\t")
+            f.write(f"{s['num_tokens']}\t{s['num_chars']}\n")
+    
+    print(f"✓ Report saved to {output_file}")
+    print()
+    
+    # Also save raw JSON data
+    json_file = BENCHMARKS_DIR / "benchmark_results.json"
+    with open(json_file, 'w') as f:
+        json.dump({
+            "config": {
+                "num_runs": NUM_RUNS,
+                "input_file": str(INPUT_FILE),
+                "tokenizer_file": str(TOKENIZER_FILE)
+            },
+            "results": all_results,
+            "statistics": stats
+        }, f, indent=2)
+    
+    print(f"✓ Raw data saved to {json_file}")
+    print()
+    print("=== Benchmark Complete ===")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bindings/c/Cargo.toml b/bindings/c/Cargo.toml
new file mode 100644
index 000000000..41fd200bd
--- /dev/null
+++ b/bindings/c/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "tokenizers_c"
+version = "0.0.1"
+edition = "2021"
+license = "Apache-2.0"
+
+[lib]
+crate-type = ["cdylib"]
+name = "tokenizers_c"
+
+[dependencies]
+# Path to the core tokenizers crate relative to this Cargo.toml
+# Current file is at bindings/tokenizers_c/Cargo.toml, core crate at tokenizers/
+tokenizers = { path = "../../tokenizers" }
+serde_json = "1.0"
+
+[profile.release]
+opt-level = 3
+codegen-units = 1
+lto = true
diff --git a/bindings/c/src/lib.rs b/bindings/c/src/lib.rs
new file mode 100644
index 000000000..15cdecd5e
--- /dev/null
+++ b/bindings/c/src/lib.rs
@@ -0,0 +1,674 @@
+use std::ffi::{CStr, CString};
+use std::os::raw::{c_char, c_void};
+use std::ptr;
+use std::path::Path;
+use std::fs;
+use tokenizers::{Encoding, Tokenizer, AddedToken, PaddingParams, PaddingStrategy, PaddingDirection};
+use serde_json::Value;
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct tokenizers_encoding_t {
+    pub ids: *const i32,
+    pub attention_mask: *const i32,
+    pub len: usize,
+    pub _internal_ptr: *mut c_void,  // Store the Box pointer for cleanup
+}
+
+/// Tokenizer configuration loaded from tokenizer_config.json
+/// Contains authoritative special token definitions and chat template
+#[derive(Default, Clone)]
+struct TokenizerConfig {
+    bos_token: Option<String>,
+    eos_token: Option<String>,
+    pad_token: Option<String>,
+    unk_token: Option<String>,
+    chat_template: Option<String>,
+    add_bos_token: bool,
+    add_eos_token: bool,
+}
+
+impl TokenizerConfig {
+    /// Load config from a directory containing tokenizer_config.json
+    fn from_dir(dir: &Path) -> Option<Self> {
+        let config_path = dir.join("tokenizer_config.json");
+        Self::from_file(&config_path)
+    }
+    
+    /// Load config from a specific file path
+    fn from_file(path: &Path) -> Option<Self> {
+        let content = fs::read_to_string(path).ok()?;
+        Self::from_json(&content)
+    }
+    
+    /// Parse config from JSON string
+    fn from_json(json: &str) -> Option<Self> {
+        let v: Value = serde_json::from_str(json).ok()?;
+        
+        // Helper to extract token string - handles both string and object formats
+        let extract_token = |v: &Value, key: &str| -> Option<String> {
+            match v.get(key)? {
+                Value::String(s) => Some(s.clone()),
+                Value::Object(obj) => obj.get("content")?.as_str().map(|s| s.to_string()),
+                _ => None,
+            }
+        };
+        
+        Some(TokenizerConfig {
+            bos_token: extract_token(&v, "bos_token"),
+            eos_token: extract_token(&v, "eos_token"),
+            pad_token: extract_token(&v, "pad_token"),
+            unk_token: extract_token(&v, "unk_token"),
+            chat_template: v.get("chat_template").and_then(|v| v.as_str()).map(|s| s.to_string()),
+            add_bos_token: v.get("add_bos_token").and_then(|v| v.as_bool()).unwrap_or(false),
+            add_eos_token: v.get("add_eos_token").and_then(|v| v.as_bool()).unwrap_or(false),
+        })
+    }
+    
+    /// Get special token string by name
+    fn get_special_token(&self, name: &str) -> Option<&str> {
+        match name.to_uppercase().as_str() {
+            "BOS" => self.bos_token.as_deref(),
+            "EOS" => self.eos_token.as_deref(),
+            "PAD" => self.pad_token.as_deref(),
+            "UNK" => self.unk_token.as_deref(),
+            _ => None,
+        }
+    }
+}
+
+/// Opaque tokenizer type exposed as void* on the C side.
+/// Contains tokenizer + optional config (auto-loaded from same directory)
+struct CTokenizer {
+    tokenizer: Tokenizer,
+    config: Option<TokenizerConfig>,
+}
+
+impl CTokenizer {
+    fn new_from_file(path: &str, config_path: Option<&str>) -> Option<Self> {
+        let tokenizer = Tokenizer::from_file(path).ok()?;
+        // Load config: explicit path > sibling tokenizer_config.json
+        let config = if let Some(cp) = config_path {
+            TokenizerConfig::from_file(Path::new(cp))
+        } else {
+            Path::new(path).parent().and_then(TokenizerConfig::from_dir)
+        };
+        Some(CTokenizer { tokenizer, config })
+    }
+    
+    fn new_from_str(json: &str) -> Option<Self> {
+        let tokenizer = Tokenizer::from_bytes(json.as_bytes()).ok()?;
+        // No config available when loading from string
+        Some(CTokenizer { tokenizer, config: None })
+    }
+    
+    /// Get special token ID - tries config first, falls back to heuristic
+    fn get_special_token_id(&self, name: &str) -> i32 {
+        // Try config first (authoritative)
+        if let Some(config) = &self.config {
+            if let Some(token) = config.get_special_token(name) {
+                if let Some(id) = self.tokenizer.token_to_id(token) {
+                    return id as i32;
+                }
+            }
+        }
+        // Fall back to heuristic
+        let candidates = match name.to_uppercase().as_str() {
+            "BOS" => &["<bos>", "<s>", "[CLS]", "<|begin_of_text|>", "<|startoftext|>"][..],
+            "EOS" => &["<eos>", "</s>", "[SEP]", "<|end_of_text|>", "<|endoftext|>", "<|eot_id|>"][..],
+            "PAD" => &["<pad>", "[PAD]", "<|padding|>"][..],
+            "UNK" => &["<unk>", "[UNK]", "<|unk|>"][..],
+            _ => return -1,
+        };
+        for token in candidates {
+            if let Some(id) = self.tokenizer.token_to_id(token) {
+                return id as i32;
+            }
+        }
+        -1
+    }
+}
+
+/// Encoding data that we'll Box allocate for safe memory management
+struct EncodingData {
+    ids: Vec<i32>,
+    attention_mask: Vec<i32>,
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_new_from_file(path: *const c_char) -> *mut c_void {
+    tokenizers_new_from_file_with_config(path, ptr::null())
+}
+
+/// Create tokenizer with explicit config file path
+#[no_mangle]
+pub extern "C" fn tokenizers_new_from_file_with_config(
+    path: *const c_char,
+    config_path: *const c_char
+) -> *mut c_void {
+    if path.is_null() {
+        return ptr::null_mut();
+    }
+    let c_str = unsafe { CStr::from_ptr(path) };
+    let path_str = match c_str.to_str() {
+        Ok(s) => s,
+        Err(_) => return ptr::null_mut(),
+    };
+    let config_str = if config_path.is_null() {
+        None
+    } else {
+        let c_cfg = unsafe { CStr::from_ptr(config_path) };
+        c_cfg.to_str().ok()
+    };
+    match CTokenizer::new_from_file(path_str, config_str) {
+        Some(t) => Box::into_raw(Box::new(t)) as *mut c_void,
+        None => ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_new_from_str(json: *const c_char) -> *mut c_void {
+    if json.is_null() { return ptr::null_mut(); }
+    let c_str = unsafe { CStr::from_ptr(json) };
+    let json_str = match c_str.to_str() {
+        Ok(s) => s,
+        Err(_) => return ptr::null_mut(),
+    };
+    match CTokenizer::new_from_str(json_str) {
+        Some(t) => Box::into_raw(Box::new(t)) as *mut c_void,
+        None => ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_free(tokenizer: *mut c_void) {
+    if tokenizer.is_null() { return; }
+    unsafe { drop(Box::from_raw(tokenizer as *mut CTokenizer)); }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_encode(
+    tokenizer: *mut c_void,
+    text: *const c_char,
+    add_special_tokens: bool,
+) -> tokenizers_encoding_t {
+    if tokenizer.is_null() || text.is_null() {
+        return tokenizers_encoding_t { 
+            ids: ptr::null(), 
+            attention_mask: ptr::null(), 
+            len: 0, 
+            _internal_ptr: ptr::null_mut() 
+        };
+    }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    let c_text = unsafe { CStr::from_ptr(text) };
+    let text_str = match c_text.to_str() { Ok(s) => s, Err(_) => {
+        return tokenizers_encoding_t { 
+            ids: ptr::null(), 
+            attention_mask: ptr::null(), 
+            len: 0, 
+            _internal_ptr: ptr::null_mut() 
+        };
+    }};
+
+    let encoding: Encoding = match c_tok.tokenizer.encode(text_str, add_special_tokens) {
+        Ok(e) => e,
+        Err(_) => return tokenizers_encoding_t { 
+            ids: ptr::null(), 
+            attention_mask: ptr::null(), 
+            len: 0, 
+            _internal_ptr: ptr::null_mut() 
+        },
+    };
+
+    let ids_vec: Vec<i32> = encoding.get_ids().iter().map(|&v| v as i32).collect();
+    let mask_vec: Vec<i32> = encoding.get_attention_mask().iter().map(|&v| v as i32).collect();
+    let len = ids_vec.len();
+    
+    // Allocate EncodingData on the heap using Box
+    let encoding_data = Box::new(EncodingData {
+        ids: ids_vec,
+        attention_mask: mask_vec,
+    });
+    
+    let ptr_ids = encoding_data.ids.as_ptr();
+    let ptr_mask = encoding_data.attention_mask.as_ptr();
+    
+    // Convert Box to raw pointer - this transfers ownership to C
+    let raw_ptr = Box::into_raw(encoding_data);
+    
+    tokenizers_encoding_t { 
+        ids: ptr_ids, 
+        attention_mask: ptr_mask, 
+        len,
+        _internal_ptr: raw_ptr as *mut c_void
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_encode_batch(
+    tokenizer: *mut c_void,
+    texts: *const *const c_char,
+    len: usize,
+    add_special_tokens: bool,
+) -> *mut tokenizers_encoding_t {
+    if tokenizer.is_null() || texts.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    let c_texts_ptrs = unsafe { std::slice::from_raw_parts(texts, len) };
+    
+    let mut rs_texts = Vec::new();
+    for &ptr in c_texts_ptrs {
+        if ptr.is_null() { continue; }
+        let c_str = unsafe { CStr::from_ptr(ptr) };
+        if let Ok(s) = c_str.to_str() {
+            rs_texts.push(s);
+        }
+    }
+    
+    let encodings = match c_tok.tokenizer.encode_batch(rs_texts, add_special_tokens) {
+        Ok(e) => e,
+        Err(_) => return ptr::null_mut(),
+    };
+    
+    let mut c_encodings = Vec::with_capacity(encodings.len());
+    for encoding in encodings {
+        let ids_vec: Vec<i32> = encoding.get_ids().iter().map(|&v| v as i32).collect();
+        let mask_vec: Vec<i32> = encoding.get_attention_mask().iter().map(|&v| v as i32).collect();
+        let len = ids_vec.len();
+        let ptr_ids = ids_vec.as_ptr();
+        let ptr_mask = mask_vec.as_ptr();
+        
+        std::mem::forget(ids_vec);
+        std::mem::forget(mask_vec);
+        
+        c_encodings.push(tokenizers_encoding_t { 
+            ids: ptr_ids, 
+            attention_mask: ptr_mask, 
+            len,
+            _internal_ptr: ptr::null_mut()  // Batch encoding has memory management issues - we'll leak for now
+        });
+    }
+    
+    let ptr = c_encodings.as_mut_ptr();
+    std::mem::forget(c_encodings);
+    ptr
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_free_encoding(enc: tokenizers_encoding_t) {
+    if !enc._internal_ptr.is_null() {
+        unsafe {
+            // Reconstruct the Box from the raw pointer and let it drop naturally
+            let _boxed = Box::from_raw(enc._internal_ptr as *mut EncodingData);
+            // Box will be automatically dropped here, cleaning up the memory
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_free_batch_encoding(encodings: *mut tokenizers_encoding_t, len: usize) {
+    if encodings.is_null() { return; }
+    let slice = unsafe { std::slice::from_raw_parts_mut(encodings, len) };
+    for enc in slice {
+        tokenizers_free_encoding(*enc);
+    }
+    unsafe { Vec::from_raw_parts(encodings, len, len); }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_version() -> *const c_char {
+    // Return a static C string with version info.
+    static VERSION: &str = concat!("tokenizers_c ", env!("CARGO_PKG_VERSION"));
+    CString::new(VERSION).unwrap().into_raw()
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_string_free(s: *mut c_char) {
+    if s.is_null() { return; }
+    unsafe { drop(CString::from_raw(s)); }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_vocab_size(tokenizer: *mut c_void) -> usize {
+    if tokenizer.is_null() { return 0; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    c_tok.tokenizer.get_vocab(true).len()
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_token_to_id(tokenizer: *mut c_void, token: *const c_char) -> i32 {
+    if tokenizer.is_null() || token.is_null() { return -1; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    let c_token = unsafe { CStr::from_ptr(token) };
+    let token_str = match c_token.to_str() { Ok(s) => s, Err(_) => return -1 };
+    match c_tok.tokenizer.token_to_id(token_str) {
+        Some(id) => id as i32,
+        None => -1,
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_id_to_token(tokenizer: *mut c_void, id: i32) -> *mut c_char {
+    if tokenizer.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    match c_tok.tokenizer.id_to_token(id as u32) {
+        Some(token) => CString::new(token).unwrap().into_raw(),
+        None => ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_decode(
+    tokenizer: *mut c_void,
+    ids: *const i32,
+    len: usize,
+    skip_special_tokens: bool
+) -> *mut c_char {
+    if tokenizer.is_null() || ids.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    let ids_slice_i32 = unsafe { std::slice::from_raw_parts(ids, len) };
+    let ids_slice_u32: Vec<u32> = ids_slice_i32.iter().map(|&id| id as u32).collect();
+    
+    match c_tok.tokenizer.decode(&ids_slice_u32, skip_special_tokens) {
+        Ok(s) => CString::new(s).unwrap().into_raw(),
+        Err(_) => ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_decode_batch(
+    tokenizer: *mut c_void,
+    ids: *const *const i32,
+    lens: *const usize,
+    batch_len: usize,
+    skip_special_tokens: bool
+) -> *mut *mut c_char {
+    if tokenizer.is_null() || ids.is_null() || lens.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    
+    let ids_ptrs = unsafe { std::slice::from_raw_parts(ids, batch_len) };
+    let lens_slice = unsafe { std::slice::from_raw_parts(lens, batch_len) };
+    
+    let mut batch_ids_u32 = Vec::with_capacity(batch_len);
+    for i in 0..batch_len {
+        let len = lens_slice[i];
+        let ptr = ids_ptrs[i];
+        if ptr.is_null() {
+            batch_ids_u32.push(vec![]);
+            continue;
+        }
+        let slice = unsafe { std::slice::from_raw_parts(ptr, len) };
+        batch_ids_u32.push(slice.iter().map(|&id| id as u32).collect());
+    }
+    
+    let batch_ids_refs: Vec<&[u32]> = batch_ids_u32.iter().map(|v| v.as_slice()).collect();
+    
+    let decoded = match c_tok.tokenizer.decode_batch(&batch_ids_refs, skip_special_tokens) {
+        Ok(s) => s,
+        Err(_) => return ptr::null_mut(),
+    };
+    
+    let mut c_strings = Vec::with_capacity(decoded.len());
+    for s in decoded {
+        c_strings.push(CString::new(s).unwrap().into_raw());
+    }
+    
+    let ptr = c_strings.as_mut_ptr();
+    std::mem::forget(c_strings);
+    ptr
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_free_batch_decode(strings: *mut *mut c_char, len: usize) {
+    if strings.is_null() { return; }
+    let slice = unsafe { std::slice::from_raw_parts_mut(strings, len) };
+    for &mut s in slice {
+        tokenizers_string_free(s);
+    }
+    unsafe { Vec::from_raw_parts(strings, len, len); }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_save(tokenizer: *mut c_void, path: *const c_char, pretty: bool) -> bool {
+    if tokenizer.is_null() || path.is_null() { return false; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    let c_path = unsafe { CStr::from_ptr(path) };
+    let path_str = match c_path.to_str() { Ok(s) => s, Err(_) => return false };
+    
+    c_tok.tokenizer.save(path_str, pretty).is_ok()
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_to_str(tokenizer: *mut c_void, pretty: bool) -> *mut c_char {
+    if tokenizer.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    match c_tok.tokenizer.to_string(pretty) {
+        Ok(s) => CString::new(s).unwrap().into_raw(),
+        Err(_) => ptr::null_mut(),
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_add_special_token(tokenizer: *mut c_void, token: *const c_char) -> bool {
+    if tokenizer.is_null() || token.is_null() { return false; }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    let c_token = unsafe { CStr::from_ptr(token) };
+    let token_str = match c_token.to_str() { Ok(s) => s, Err(_) => return false };
+    let added = AddedToken::from(token_str.to_string(), true);
+    c_tok.tokenizer.add_special_tokens(&[added]);
+    true
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_add_special_tokens(
+    tokenizer: *mut c_void,
+    tokens: *const *const c_char,
+    len: usize
+) -> usize {
+    if tokenizer.is_null() || tokens.is_null() { return 0; }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    let c_tokens_ptrs = unsafe { std::slice::from_raw_parts(tokens, len) };
+    
+    let mut added_tokens = Vec::new();
+    for &ptr in c_tokens_ptrs {
+        if ptr.is_null() { continue; }
+        let c_str = unsafe { CStr::from_ptr(ptr) };
+        if let Ok(s) = c_str.to_str() {
+            added_tokens.push(AddedToken::from(s.to_string(), true));
+        }
+    }
+    
+    c_tok.tokenizer.add_special_tokens(&added_tokens)
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_add_tokens(
+    tokenizer: *mut c_void,
+    tokens: *const *const c_char,
+    len: usize
+) -> usize {
+    if tokenizer.is_null() || tokens.is_null() { return 0; }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    let c_tokens_ptrs = unsafe { std::slice::from_raw_parts(tokens, len) };
+    
+    let mut added_tokens = Vec::new();
+    for &ptr in c_tokens_ptrs {
+        if ptr.is_null() { continue; }
+        let c_str = unsafe { CStr::from_ptr(ptr) };
+        if let Ok(s) = c_str.to_str() {
+            added_tokens.push(AddedToken::from(s.to_string(), false));
+        }
+    }
+    
+    c_tok.tokenizer.add_tokens(&added_tokens)
+}
+
+#[repr(C)]
+pub struct tokenizers_truncation_params_t {
+    pub max_length: usize,
+    pub stride: usize,
+    pub strategy: i32, // 0: LongestFirst, 1: OnlyFirst, 2: OnlySecond
+    pub direction: i32, // 0: Left, 1: Right
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_set_truncation(
+    tokenizer: *mut c_void,
+    params: *const tokenizers_truncation_params_t
+) {
+    if tokenizer.is_null() { return; }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    
+    if params.is_null() {
+        let _ = c_tok.tokenizer.with_truncation(None);
+        return;
+    }
+    
+    let p = unsafe { &*params };
+    
+    let strategy = match p.strategy {
+        1 => tokenizers::TruncationStrategy::OnlyFirst,
+        2 => tokenizers::TruncationStrategy::OnlySecond,
+        _ => tokenizers::TruncationStrategy::LongestFirst,
+    };
+    
+    let direction = match p.direction {
+        1 => tokenizers::TruncationDirection::Right,
+        _ => tokenizers::TruncationDirection::Left,
+    };
+    
+    let params = tokenizers::TruncationParams {
+        max_length: p.max_length,
+        stride: p.stride,
+        strategy,
+        direction,
+    };
+    
+    let _ = c_tok.tokenizer.with_truncation(Some(params));
+}
+
+#[repr(C)]
+pub struct tokenizers_padding_params_t {
+    pub pad_id: u32,
+    pub pad_type_id: u32,
+    pub pad_token: *const c_char,
+    pub strategy: i32, // 0: BatchLongest, 1: Fixed
+    pub fixed_length: usize,
+    pub direction: i32, // 0: Left, 1: Right
+    pub pad_to_multiple_of: usize,
+}
+
+#[no_mangle]
+pub extern "C" fn tokenizers_set_padding(
+    tokenizer: *mut c_void,
+    params: *const tokenizers_padding_params_t
+) {
+    if tokenizer.is_null() { return; }
+    let c_tok = unsafe { &mut *(tokenizer as *mut CTokenizer) };
+    
+    if params.is_null() {
+        c_tok.tokenizer.with_padding(None);
+        return;
+    }
+    
+    let p = unsafe { &*params };
+    let pad_token = unsafe { CStr::from_ptr(p.pad_token) }.to_string_lossy().into_owned();
+    
+    let strategy = match p.strategy {
+        1 => PaddingStrategy::Fixed(p.fixed_length),
+        _ => PaddingStrategy::BatchLongest,
+    };
+    
+    let direction = match p.direction {
+        1 => PaddingDirection::Right,
+        _ => PaddingDirection::Left,
+    };
+    
+    let params = PaddingParams {
+        strategy,
+        direction,
+        pad_id: p.pad_id,
+        pad_type_id: p.pad_type_id,
+        pad_token,
+        pad_to_multiple_of: if p.pad_to_multiple_of == 0 { None } else { Some(p.pad_to_multiple_of) },
+    };
+    
+    c_tok.tokenizer.with_padding(Some(params));
+}
+
+// === Special Token IDs ===
+// Unified API: automatically uses config if available, falls back to heuristic
+
+/// Get special token ID by name ("BOS", "EOS", "PAD", "UNK")
+/// Automatically uses tokenizer_config.json if found, otherwise uses heuristic.
+/// Returns -1 if not found.
+#[no_mangle]
+pub extern "C" fn tokenizers_get_special_token_id(
+    tokenizer: *mut c_void,
+    name: *const c_char
+) -> i32 {
+    if tokenizer.is_null() || name.is_null() { return -1; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    let c_name = unsafe { CStr::from_ptr(name) };
+    let name_str = match c_name.to_str() { Ok(s) => s, Err(_) => return -1 };
+    c_tok.get_special_token_id(name_str)
+}
+
+/// Get special token string by name ("BOS", "EOS", "PAD", "UNK")
+/// Returns the token from config if available, otherwise null.
+/// Caller must free with tokenizers_string_free.
+#[no_mangle]
+pub extern "C" fn tokenizers_get_special_token(
+    tokenizer: *mut c_void,
+    name: *const c_char
+) -> *mut c_char {
+    if tokenizer.is_null() || name.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    let c_name = unsafe { CStr::from_ptr(name) };
+    let name_str = match c_name.to_str() { Ok(s) => s, Err(_) => return ptr::null_mut() };
+    
+    if let Some(config) = &c_tok.config {
+        if let Some(token) = config.get_special_token(name_str) {
+            return CString::new(token).unwrap().into_raw();
+        }
+    }
+    ptr::null_mut()
+}
+
+/// Get add_bos_token setting from config (false if no config)
+#[no_mangle]
+pub extern "C" fn tokenizers_get_add_bos_token(tokenizer: *mut c_void) -> bool {
+    if tokenizer.is_null() { return false; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    c_tok.config.as_ref().map_or(false, |c| c.add_bos_token)
+}
+
+/// Get add_eos_token setting from config (false if no config)
+#[no_mangle]
+pub extern "C" fn tokenizers_get_add_eos_token(tokenizer: *mut c_void) -> bool {
+    if tokenizer.is_null() { return false; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    c_tok.config.as_ref().map_or(false, |c| c.add_eos_token)
+}
+
+/// Check if tokenizer has a chat template (from config)
+#[no_mangle]
+pub extern "C" fn tokenizers_has_chat_template(tokenizer: *mut c_void) -> bool {
+    if tokenizer.is_null() { return false; }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    c_tok.config.as_ref().map_or(false, |c| c.chat_template.is_some())
+}
+
+/// Get chat template string (caller must free with tokenizers_string_free)
+#[no_mangle]
+pub extern "C" fn tokenizers_get_chat_template(tokenizer: *mut c_void) -> *mut c_char {
+    if tokenizer.is_null() { return ptr::null_mut(); }
+    let c_tok = unsafe { &*(tokenizer as *mut CTokenizer) };
+    if let Some(config) = &c_tok.config {
+        if let Some(template) = &config.chat_template {
+            return CString::new(template.as_str()).unwrap().into_raw();
+        }
+    }
+    ptr::null_mut()
+}
+
diff --git a/bindings/c/tokenizers_c.h b/bindings/c/tokenizers_c.h
new file mode 100644
index 000000000..111198ac9
--- /dev/null
+++ b/bindings/c/tokenizers_c.h
@@ -0,0 +1,84 @@
+#ifndef TOKENIZERS_C_H
+#define TOKENIZERS_C_H
+
+#include <stddef.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    const int* ids;
+    const int* attention_mask;
+    size_t len;
+    void* _internal_ptr;  // Internal use only - do not access
+} tokenizers_encoding_t;
+
+// Create a new tokenizer from a JSON file (auto-loads tokenizer_config.json if present)
+void* tokenizers_new_from_file(const char* path);
+
+// Create a new tokenizer with explicit config file path
+void* tokenizers_new_from_file_with_config(const char* path, const char* config_path);
+
+// Create a new tokenizer from a JSON string
+void* tokenizers_new_from_str(const char* json);
+
+// Free a tokenizer
+void tokenizers_free(void* tokenizer);
+
+// Encode text into token IDs
+tokenizers_encoding_t tokenizers_encode(void* tokenizer, const char* text, bool add_special_tokens);
+
+// Free an encoding
+void tokenizers_free_encoding(tokenizers_encoding_t enc);
+
+// Get tokenizer version
+const char* tokenizers_version();
+
+// Free a string returned by the library
+void tokenizers_string_free(char* s);
+
+// Get vocabulary size
+size_t tokenizers_vocab_size(void* tokenizer);
+
+// Get token ID for a token string
+int tokenizers_token_to_id(void* tokenizer, const char* token);
+
+// Get token string for a token ID
+char* tokenizers_id_to_token(void* tokenizer, int id);
+
+// Decode token IDs back to text
+char* tokenizers_decode(void* tokenizer, const int* ids, size_t len, bool skip_special_tokens);
+
+// Add a special token
+bool tokenizers_add_special_token(void* tokenizer, const char* token);
+
+// === Special Tokens (unified API) ===
+// Config is auto-loaded from tokenizer_config.json if present next to tokenizer.json
+
+// Get special token ID by name ("BOS", "EOS", "PAD", "UNK")
+// Uses config if available, falls back to heuristic. Returns -1 if not found.
+int tokenizers_get_special_token_id(void* tokenizer, const char* name);
+
+// Get special token string by name ("BOS", "EOS", "PAD", "UNK")
+// Returns token from config, or NULL if not available. Must free with tokenizers_string_free.
+char* tokenizers_get_special_token(void* tokenizer, const char* name);
+
+// Get add_bos_token setting (false if no config)
+bool tokenizers_get_add_bos_token(void* tokenizer);
+
+// Get add_eos_token setting (false if no config)
+bool tokenizers_get_add_eos_token(void* tokenizer);
+
+// Check if tokenizer has a chat template
+bool tokenizers_has_chat_template(void* tokenizer);
+
+// Get chat template string (must be freed with tokenizers_string_free)
+char* tokenizers_get_chat_template(void* tokenizer);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TOKENIZERS_C_H
diff --git a/bindings/cpp/CMakeLists.txt b/bindings/cpp/CMakeLists.txt
new file mode 100644
index 000000000..7921f0b50
--- /dev/null
+++ b/bindings/cpp/CMakeLists.txt
@@ -0,0 +1,80 @@
+cmake_minimum_required(VERSION 3.16)
+project(tokenizers_cpp LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Option to force a fresh cargo build
+option(TOKENIZERS_CPP_FORCE_CARGO "Force rebuilding the Rust C FFI library" OFF)
+option(TOKENIZERS_COMPILE_TESTS "Compile tokenizers C++ bindings tests" ON)
+
+# Build directory for Rust output (now at bindings/c)
+set(RUST_CRATE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c)
+set(RUST_OUTPUT_DIR ${RUST_CRATE_DIR}/target/release)
+set(RUST_LIB_NAME tokenizers_c)
+
+# Jinja2Cpp for chat template rendering
+set(JINJA2CPP_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "" FORCE)
+set(JINJA2CPP_DEPS_MODE internal CACHE STRING "" FORCE)
+add_subdirectory(third_party/Jinja2Cpp)
+
+# Custom command to build the Rust cdylib
+add_custom_command(
+    OUTPUT ${RUST_OUTPUT_DIR}/lib${RUST_LIB_NAME}.so
+    WORKING_DIRECTORY ${RUST_CRATE_DIR}
+    COMMAND cargo build --release
+    COMMENT "Building Rust FFI crate tokenizers_c"
+    DEPENDS ${RUST_CRATE_DIR}/src/lib.rs ${RUST_CRATE_DIR}/Cargo.toml
+    VERBATIM
+)
+
+add_custom_target(build_rust_ffi DEPENDS ${RUST_OUTPUT_DIR}/lib${RUST_LIB_NAME}.so)
+
+add_library(${RUST_LIB_NAME} SHARED IMPORTED GLOBAL)
+add_dependencies(${RUST_LIB_NAME} build_rust_ffi)
+set_target_properties(${RUST_LIB_NAME} PROPERTIES
+    IMPORTED_LOCATION ${RUST_OUTPUT_DIR}/lib${RUST_LIB_NAME}.so
+)
+
+# C++ wrapper library with chat template support
+add_library(tokenizers_cpp_impl STATIC
+    src/tokenizers.cpp
+)
+add_dependencies(tokenizers_cpp_impl build_rust_ffi)
+target_include_directories(tokenizers_cpp_impl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+target_link_libraries(tokenizers_cpp_impl PUBLIC ${RUST_LIB_NAME} jinja2cpp)
+
+# Interface library for easy linking
+add_library(tokenizers_cpp INTERFACE)
+target_link_libraries(tokenizers_cpp INTERFACE tokenizers_cpp_impl)
+
+# Tests
+if(TOKENIZERS_COMPILE_TESTS)
+    enable_testing()
+
+    include(FetchContent)
+    FetchContent_Declare(
+      googletest
+      URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
+    )
+    # For Windows: Prevent overriding the parent project's compiler/linker settings
+    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+    FetchContent_MakeAvailable(googletest)
+
+    # Google Test executable
+    add_executable(tokenizer_tests_gtest
+        tests/test_tokenizer_gtest.cpp
+    )
+    target_link_libraries(tokenizer_tests_gtest PRIVATE tokenizers_cpp GTest::gtest_main)
+
+    # Set test data directory for test discovery
+    set(TOKENIZERS_TEST_DATA_DIR "${CMAKE_CURRENT_SOURCE_DIR}/data")
+
+    # Register Google Test with environment variable for test data
+    include(GoogleTest)
+    gtest_discover_tests(tokenizer_tests_gtest
+        PROPERTIES ENVIRONMENT "TOKENIZERS_TEST_DATA=${TOKENIZERS_TEST_DATA_DIR}"
+    )
+endif()
diff --git a/bindings/cpp/README.md b/bindings/cpp/README.md
new file mode 100644
index 000000000..454162f00
--- /dev/null
+++ b/bindings/cpp/README.md
@@ -0,0 +1,114 @@
+# C++ Bindings for HuggingFace Tokenizers
+
+Minimal C++17 wrapper over the Rust `tokenizers` crate.
+
+## Quick Start
+
+See the [example project](example/) for a complete, working demonstration of all features.
+
+```bash
+# Build and run the example
+cmake -S bindings/cpp/example -B build_example
+cmake --build build_example
+./build_example/tokenizer_example path/to/tokenizer.json "Your text here"
+```
+
+## Overview
+
+Architecture:
+- Rust FFI crate (`tokenizers_c`) exposes a C ABI (load, encode, vocab ops, special tokens).
+- Header-only C++ class `tokenizers::Tokenizer` provides RAII, `encode()` returning `std::vector<int32_t>`.
+- Build system: CMake + cargo. CTest for tests.
+
+## Build
+
+Prerequisites: Rust toolchain, CMake >= 3.16, a C++17 compiler.
+
+```bash
+
+# prerequisite 1: Install rustc and cargo, if you dont have it already
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+. "$HOME/.cargo/env" 
+
+# NOTE: the below commands should be run from the tokenizers repo root
+
+# prerequisite 2: original tokenizer (rust) can be built and tested
+make -C ./tokenizers test
+
+# Configure & build
+cmake -S bindings/cpp -B build-cpp
+cmake --build build-cpp -j
+# if you run out of memory, replace "-j" (use all cores) with "-j4" (use only 4 cores)
+
+# Run tests (Google Test suite)
+ctest --test-dir build-cpp -V
+```
+
+## FFI API Surface
+
+C++ `Tokenizer` class methods:
+- `load(path)` / constructor - load tokenizer from JSON file
+- `FromBlobJSON(json)` - load tokenizer from JSON string (static method)
+- `encode(text, add_special_tokens=true)` - encode text to token IDs
+- `encode_batch(texts, add_special_tokens=true)` - encode batch of texts
+- `decode(ids, skip_special_tokens=true)` - decode IDs to string
+- `decode_batch(batch_ids, skip_special_tokens=true)` - decode batch of IDs
+- `vocab_size()` - get vocabulary size
+- `token_to_id(token)` - lookup token ID (returns -1 if not found)
+- `id_to_token(id)` - lookup token string (returns empty if not found)
+- `add_special_token(token)` - add a special token to vocabulary
+- `add_special_tokens(tokens)` - add multiple special tokens
+- `set_padding(params)` - configure padding
+- `disable_padding()` - disable padding
+- `set_truncation(params)` - configure truncation
+- `disable_truncation()` - disable truncation
+- `save(path, pretty=true)` - save tokenizer to JSON file
+- `to_string(pretty=false)` - serialize tokenizer to JSON string
+- `valid()` - check if tokenizer loaded successfully
+- `version()` - get FFI version string (static method)
+
+## Test Coverage
+
+C++ binding tests are now unified using Google Test in `bindings/cpp/tests/test_tokenizer_gtest.cpp`.
+The suite covers:
+- Basic encode/decode
+- Batch encode/decode
+- Vocabulary operations
+- Padding and Truncation
+- Special tokens management
+- Serialization (save/load/to_string)
+- Error handling
+- Integration with BERT tokenizer
+
+Original Rust tests also available via `ctest -R tokenizers_rust_all`.
+
+## Usage
+
+Add `bindings/cpp/include` to your include path and link against the generated `libtokenizers_c.so` (or platform equivalent) built in `bindings/c/target/release`.
+
+Example:
+```cpp
+#include "tokenizers/tokenizers.h"
+using namespace tokenizers;
+
+int main() {
+    Tokenizer tok("path/to/tokenizer.json");
+    if (!tok.valid()) return 1;
+    
+    auto ids = tok.encode("Hello world!");
+    for (auto id : ids) {
+        std::cout << id << " ";
+    }
+    
+    std::string decoded = tok.decode(ids);
+    std::cout << "\nDecoded: " << decoded << "\n";
+}
+```
+
+## Notes & Future Improvements
+- Error handling returns empty/default values; could be extended with status codes/exceptions.
+- Full Rust test suite available through CTest for integration tracking.
+- Thread safety: Create one instance per thread or add mutex.
+
+## License
+Apache-2.0 (same as upstream project).
diff --git a/bindings/cpp/data b/bindings/cpp/data
new file mode 120000
index 000000000..538a6e8cc
--- /dev/null
+++ b/bindings/cpp/data
@@ -0,0 +1 @@
+../../tokenizers/data
\ No newline at end of file
diff --git a/bindings/cpp/example/CMakeLists.txt b/bindings/cpp/example/CMakeLists.txt
new file mode 100644
index 000000000..abd156d92
--- /dev/null
+++ b/bindings/cpp/example/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.16)
+project(tokenizers_example LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Include the tokenizers C++ bindings as a subdirectory
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/tokenizers_cpp_build)
+
+# Example executable
+add_executable(tokenizer_example main.cpp)
+target_link_libraries(tokenizer_example PRIVATE tokenizers_cpp tokenizers_c)
+target_include_directories(tokenizer_example PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+message(STATUS "Example project configured. Build with: cmake -S bindings/cpp/example -B build_example && cmake --build build_example")
diff --git a/bindings/cpp/example/README.md b/bindings/cpp/example/README.md
new file mode 100644
index 000000000..4d994c29b
--- /dev/null
+++ b/bindings/cpp/example/README.md
@@ -0,0 +1,83 @@
+# C++ Bindings Example
+
+This example demonstrates how to use the HuggingFace Tokenizers C++ bindings.
+
+## Building
+
+```bash
+# Make sure test resources are available (includes sample tokenizer JSON files)
+make -C tokenizers test
+
+# Build the example
+cmake -S bindings/cpp/example -B build_example
+cmake --build build_example
+
+# Run the example with a tokenizer file
+./build_example/tokenizer_example ../../tokenizers/data/tokenizer.json "Hello world!"
+```
+
+## What This Example Shows
+
+The example program demonstrates:
+
+1. **Basic Encoding**: Encoding text to token IDs with and without special tokens
+2. **Token Lookup**: Looking up token IDs by token string
+3. **Adding Special Tokens**: Dynamically adding custom special tokens to the vocabulary
+4. **Batch Processing**: Encoding multiple texts efficiently
+5. **Move Semantics**: Using C++11 move semantics for efficient resource management
+6. **Error Handling**: Checking tokenizer validity and handling missing tokens
+
+## Usage
+
+```bash
+# Basic usage with default text
+./build_example/tokenizer_example <path_to_tokenizer.json>
+
+# Encode custom text
+./build_example/tokenizer_example <path_to_tokenizer.json> "Your custom text here"
+```
+
+## Example Output
+
+```
+Tokenizers C++ Bindings Version: tokenizers_c 0.0.1
+
+Loading tokenizer from: ../../tokenizers/data/tokenizer.json
+✓ Tokenizer loaded successfully
+
+Vocabulary size: 30000
+
+=== Example 1: Basic Encoding ===
+Input text: "Hello world!"
+Tokens (with special tokens): [79, 33, 56, 63, 63, 66, 88, 66, 69, 63, 55, 5]
+Token count: 12
+
+=== Example 2: Encoding Without Special Tokens ===
+Tokens (without special tokens): [79, 33, 56, 63, 63, 66, 88, 66, 69, 63, 55]
+Token count: 11
+
+...
+```
+
+## Integration into Your Project
+
+To use the tokenizers C++ bindings in your own CMake project:
+
+```cmake
+# Add tokenizers as a subdirectory
+add_subdirectory(path/to/tokenizers/bindings/cpp ${CMAKE_BINARY_DIR}/tokenizers_build)
+
+# Link your target
+target_link_libraries(your_target PRIVATE tokenizers_cpp tokenizers_c)
+target_include_directories(your_target PRIVATE path/to/tokenizers/bindings/cpp/include)
+```
+
+Then in your C++ code:
+
+```cpp
+#include "tokenizers/tokenizers.h"
+using namespace tokenizers;
+
+Tokenizer tok("path/to/tokenizer.json");
+auto ids = tok.encode("Hello world!");
+```
diff --git a/bindings/cpp/example/main.cpp b/bindings/cpp/example/main.cpp
new file mode 100644
index 000000000..42b0259e8
--- /dev/null
+++ b/bindings/cpp/example/main.cpp
@@ -0,0 +1,129 @@
+#include "tokenizers/tokenizers.h"
+#include <iostream>
+#include <vector>
+#include <string>
+
+using namespace tokenizers;
+
+int main(int argc, char* argv[]) {
+    // Check if tokenizer path is provided
+    if (argc < 2) {
+        std::cerr << "Usage: " << argv[0] << " <path_to_tokenizer.json> [text_to_encode]\n";
+        std::cerr << "\nExample:\n";
+        std::cerr << "  " << argv[0] << " ../../tokenizers/data/tokenizer.json \"Hello world!\"\n";
+        return 1;
+    }
+
+    std::string tokenizer_path = argv[1];
+    std::string text = (argc >= 3) ? argv[2] : "Hello, world!";
+
+    // Print version information
+    std::cout << "Tokenizers C++ Bindings Version: " << Tokenizer::version() << "\n\n";
+
+    // Load the tokenizer
+    std::cout << "Loading tokenizer from: " << tokenizer_path << "\n";
+    Tokenizer tokenizer(tokenizer_path);
+    
+    if (!tokenizer.valid()) {
+        std::cerr << "Error: Failed to load tokenizer from " << tokenizer_path << "\n";
+        std::cerr << "Make sure the file exists and is a valid tokenizer JSON file.\n";
+        return 1;
+    }
+
+    std::cout << "✓ Tokenizer loaded successfully\n\n";
+
+    // Get vocabulary size
+    size_t vocab_size = tokenizer.vocab_size();
+    std::cout << "Vocabulary size: " << vocab_size << "\n\n";
+
+    // Example 1: Basic encoding
+    std::cout << "=== Example 1: Basic Encoding ===\n";
+    std::cout << "Input text: \"" << text << "\"\n";
+    
+    auto ids_with_special = tokenizer.encode(text, true);
+    std::cout << "Tokens (with special tokens): [";
+    for (size_t i = 0; i < ids_with_special.size(); ++i) {
+        std::cout << ids_with_special[i];
+        if (i + 1 < ids_with_special.size()) std::cout << ", ";
+    }
+    std::cout << "]\n";
+    std::cout << "Token count: " << ids_with_special.size() << "\n\n";
+
+    // Example 2: Encoding without special tokens
+    std::cout << "=== Example 2: Encoding Without Special Tokens ===\n";
+    auto ids_without_special = tokenizer.encode(text, false);
+    std::cout << "Tokens (without special tokens): [";
+    for (size_t i = 0; i < ids_without_special.size(); ++i) {
+        std::cout << ids_without_special[i];
+        if (i + 1 < ids_without_special.size()) std::cout << ", ";
+    }
+    std::cout << "]\n";
+    std::cout << "Token count: " << ids_without_special.size() << "\n\n";
+
+    // Example 3: Token lookup
+    std::cout << "=== Example 3: Token ID Lookup ===\n";
+    std::vector<std::string> sample_tokens = {"hello", "world", "the", "[UNK]", "[PAD]"};
+    for (const auto& token : sample_tokens) {
+        int32_t id = tokenizer.token_to_id(token);
+        if (id >= 0) {
+            std::cout << "Token \"" << token << "\" -> ID: " << id << "\n";
+        } else {
+            std::cout << "Token \"" << token << "\" -> Not found in vocabulary\n";
+        }
+    }
+    std::cout << "\n";
+
+    // Example 4: Adding special tokens
+    std::cout << "=== Example 4: Adding Custom Special Token ===\n";
+    std::string new_token = "[CUSTOM_TOKEN]";
+    size_t vocab_before = tokenizer.vocab_size();
+    bool added = tokenizer.add_special_token(new_token);
+    size_t vocab_after = tokenizer.vocab_size();
+    
+    if (added) {
+        std::cout << "✓ Successfully added special token: " << new_token << "\n";
+        std::cout << "Vocabulary size increased: " << vocab_before << " -> " << vocab_after << "\n";
+        
+        int32_t new_id = tokenizer.token_to_id(new_token);
+        std::cout << "New token ID: " << new_id << "\n\n";
+        
+        // Encode text with the new token
+        std::string text_with_token = "Hello " + new_token + " world";
+        auto ids = tokenizer.encode(text_with_token, true);
+        std::cout << "Encoding \"" << text_with_token << "\":\n";
+        std::cout << "Token IDs: [";
+        for (size_t i = 0; i < ids.size(); ++i) {
+            std::cout << ids[i];
+            if (i + 1 < ids.size()) std::cout << ", ";
+        }
+        std::cout << "]\n";
+    } else {
+        std::cout << "Failed to add special token (may already exist)\n";
+    }
+    std::cout << "\n";
+
+    // Example 5: Batch encoding multiple texts
+    std::cout << "=== Example 5: Encoding Multiple Texts ===\n";
+    std::vector<std::string> texts = {
+        "The quick brown fox",
+        "jumps over the lazy dog",
+        "Hello, world!",
+        "Testing tokenization"
+    };
+    
+    for (const auto& t : texts) {
+        auto tokens = tokenizer.encode(t, true);
+        std::cout << "\"" << t << "\" -> " << tokens.size() << " tokens\n";
+    }
+    std::cout << "\n";
+
+    // Example 6: Move semantics
+    std::cout << "=== Example 6: Move Semantics ===\n";
+    Tokenizer moved_tokenizer = std::move(tokenizer);
+    std::cout << "Original tokenizer valid: " << (tokenizer.valid() ? "yes" : "no") << "\n";
+    std::cout << "Moved tokenizer valid: " << (moved_tokenizer.valid() ? "yes" : "no") << "\n";
+    std::cout << "Moved tokenizer vocab size: " << moved_tokenizer.vocab_size() << "\n\n";
+
+    std::cout << "=== All Examples Completed Successfully ===\n";
+    return 0;
+}
diff --git a/bindings/cpp/include/tokenizers/tokenizers.h b/bindings/cpp/include/tokenizers/tokenizers.h
new file mode 100644
index 000000000..511e74cea
--- /dev/null
+++ b/bindings/cpp/include/tokenizers/tokenizers.h
@@ -0,0 +1,415 @@
+#pragma once
+#include <string>
+#include <vector>
+#include <cstdint>
+#include <stdexcept>
+
+// Forward declare jinja2 types to avoid pulling in heavy headers
+namespace jinja2 { class Template; }
+
+extern "C" {
+    struct tokenizers_encoding_t {
+        const int32_t* ids;
+        const int32_t* attention_mask;
+        size_t len;
+        void* _internal_ptr;  // Internal use only - do not access
+    };
+
+    struct tokenizers_padding_params_t {
+        uint32_t pad_id;
+        uint32_t pad_type_id;
+        const char* pad_token;
+        int strategy;
+        size_t fixed_length;
+        int direction;
+        size_t pad_to_multiple_of;
+    };
+
+    struct tokenizers_truncation_params_t {
+        size_t max_length;
+        size_t stride;
+        int strategy;
+        int direction;
+    };
+
+    void* tokenizers_new_from_file(const char* path);
+    void* tokenizers_new_from_file_with_config(const char* path, const char* config_path);
+    void* tokenizers_new_from_str(const char* json);
+    void tokenizers_free(void* tokenizer);
+    tokenizers_encoding_t tokenizers_encode(void* tokenizer, const char* text, bool add_special_tokens);
+    void tokenizers_free_encoding(tokenizers_encoding_t enc);
+    const char* tokenizers_version();
+    void tokenizers_string_free(char* s);
+    size_t tokenizers_vocab_size(void* tokenizer);
+    int32_t tokenizers_token_to_id(void* tokenizer, const char* token);
+    char* tokenizers_id_to_token(void* tokenizer, int32_t id);
+    char* tokenizers_decode(void* tokenizer, const int32_t* ids, size_t len, bool skip_special_tokens);
+    bool tokenizers_save(void* tokenizer, const char* path, bool pretty);
+    char* tokenizers_to_str(void* tokenizer, bool pretty);
+    bool tokenizers_add_special_token(void* tokenizer, const char* token);
+    size_t tokenizers_add_special_tokens(void* tokenizer, const char** tokens, size_t len);
+    size_t tokenizers_add_tokens(void* tokenizer, const char** tokens, size_t len);
+    tokenizers_encoding_t* tokenizers_encode_batch(void* tokenizer, const char** texts, size_t len, bool add_special_tokens);
+    void tokenizers_free_batch_encoding(tokenizers_encoding_t* encodings, size_t len);
+    char** tokenizers_decode_batch(void* tokenizer, const int32_t** ids, const size_t* lens, size_t batch_len, bool skip_special_tokens);
+    void tokenizers_free_batch_decode(char** strings, size_t len);
+    void tokenizers_set_padding(void* tokenizer, const tokenizers_padding_params_t* params);
+    void tokenizers_set_truncation(void* tokenizer, const tokenizers_truncation_params_t* params);
+    
+    // Unified special token API (auto-uses config if available, falls back to heuristic)
+    int32_t tokenizers_get_special_token_id(void* tokenizer, const char* name);
+    char* tokenizers_get_special_token(void* tokenizer, const char* name);
+    bool tokenizers_get_add_bos_token(void* tokenizer);
+    bool tokenizers_get_add_eos_token(void* tokenizer);
+    bool tokenizers_has_chat_template(void* tokenizer);
+    char* tokenizers_get_chat_template(void* tokenizer);
+}
+
+namespace tokenizers {
+
+struct Encoding {
+    std::vector<int32_t> ids;
+    std::vector<int32_t> attention_mask;
+    
+    operator std::vector<int32_t>() const { return ids; }
+
+    size_t size() const { return ids.size(); }
+    bool empty() const { return ids.empty(); }
+    int32_t operator[](size_t i) const { return ids[i]; }
+    std::vector<int32_t>::const_iterator begin() const { return ids.begin(); }
+    std::vector<int32_t>::const_iterator end() const { return ids.end(); }
+    
+    bool operator==(const Encoding& other) const {
+        return ids == other.ids && attention_mask == other.attention_mask;
+    }
+    bool operator!=(const Encoding& other) const {
+        return !(*this == other);
+    }
+};
+
+/// Chat message for apply_chat_template
+struct ChatMessage {
+    std::string role;     // "system", "user", "assistant"
+    std::string content;  // Message content
+};
+
+/// Exception for chat template errors
+class ChatTemplateError : public std::runtime_error {
+public:
+    explicit ChatTemplateError(const std::string& msg) : std::runtime_error(msg) {}
+};
+
+struct PaddingParams {
+    uint32_t pad_id = 0;
+    uint32_t pad_type_id = 0;
+    std::string pad_token = "[PAD]";
+    enum Strategy { BatchLongest = 0, Fixed = 1 } strategy = BatchLongest;
+    size_t fixed_length = 0;
+    enum Direction { Left = 0, Right = 1 } direction = Right;
+    size_t pad_to_multiple_of = 0;
+};
+
+struct TruncationParams {
+    size_t max_length = 512;
+    size_t stride = 0;
+    enum Strategy { LongestFirst = 0, OnlyFirst = 1, OnlySecond = 2 } strategy = LongestFirst;
+    enum Direction { Left = 0, Right = 1 } direction = Right;
+};
+
+class Tokenizer {
+public:
+    Tokenizer() = default;
+    /// Load tokenizer from file, auto-loads tokenizer_config.json if present
+    explicit Tokenizer(const std::string& path) { load(path); }
+    /// Load tokenizer with explicit config file path
+    Tokenizer(const std::string& path, const std::string& config_path) { load(path, config_path); }
+    ~Tokenizer() { reset(); }
+    Tokenizer(const Tokenizer&) = delete;
+    Tokenizer& operator=(const Tokenizer&) = delete;
+    Tokenizer(Tokenizer&& other) noexcept : handle_(other.handle_) { other.handle_ = nullptr; }
+    Tokenizer& operator=(Tokenizer&& other) noexcept {
+        if (this != &other) {
+            reset();
+            handle_ = other.handle_;
+            other.handle_ = nullptr;
+        }
+        return *this;
+    }
+
+    static Tokenizer FromBlobJSON(const std::string& json) {
+        Tokenizer t;
+        t.handle_ = tokenizers_new_from_str(json.c_str());
+        return t;
+    }
+
+    /// Load tokenizer, auto-loads tokenizer_config.json if present
+    bool load(const std::string& path) {
+        reset();
+        handle_ = tokenizers_new_from_file(path.c_str());
+        return handle_ != nullptr;
+    }
+    
+    /// Load tokenizer with explicit config file path
+    bool load(const std::string& path, const std::string& config_path) {
+        reset();
+        handle_ = tokenizers_new_from_file_with_config(path.c_str(), config_path.c_str());
+        return handle_ != nullptr;
+    }
+
+    Encoding encode(const std::string& text, bool add_special_tokens = true) const {
+        if (!handle_) return {};
+        tokenizers_encoding_t enc = tokenizers_encode(handle_, text.c_str(), add_special_tokens);
+        Encoding out;
+        if (enc.ids && enc.len) {
+            out.ids.assign(enc.ids, enc.ids + enc.len);
+        }
+        if (enc.attention_mask && enc.len) {
+            out.attention_mask.assign(enc.attention_mask, enc.attention_mask + enc.len);
+        }
+        tokenizers_free_encoding(enc);
+        return out;
+    }
+
+    std::vector<Encoding> encode_batch(const std::vector<std::string>& texts, bool add_special_tokens = true) const {
+        if (!handle_) return {};
+        std::vector<const char*> c_texts;
+        c_texts.reserve(texts.size());
+        for (const auto& t : texts) c_texts.push_back(t.c_str());
+        
+        tokenizers_encoding_t* encs = tokenizers_encode_batch(handle_, c_texts.data(), c_texts.size(), add_special_tokens);
+        if (!encs) return {};
+        
+        std::vector<Encoding> out;
+        out.reserve(texts.size());
+        for (size_t i = 0; i < texts.size(); ++i) {
+            Encoding e;
+            if (encs[i].ids && encs[i].len) {
+                e.ids.assign(encs[i].ids, encs[i].ids + encs[i].len);
+            }
+            if (encs[i].attention_mask && encs[i].len) {
+                e.attention_mask.assign(encs[i].attention_mask, encs[i].attention_mask + encs[i].len);
+            }
+            out.push_back(std::move(e));
+        }
+        tokenizers_free_batch_encoding(encs, texts.size());
+        return out;
+    }
+
+    std::string decode(const std::vector<int32_t>& ids, bool skip_special_tokens = true) const {
+        if (!handle_) return {};
+        char* s = tokenizers_decode(handle_, ids.data(), ids.size(), skip_special_tokens);
+        if (!s) return {};
+        std::string res(s);
+        tokenizers_string_free(s);
+        return res;
+    }
+
+    std::vector<std::string> decode_batch(const std::vector<std::vector<int32_t>>& batch_ids, bool skip_special_tokens = true) const {
+        if (!handle_) return {};
+        std::vector<const int32_t*> c_ids;
+        std::vector<size_t> c_lens;
+        c_ids.reserve(batch_ids.size());
+        c_lens.reserve(batch_ids.size());
+        
+        for (const auto& ids : batch_ids) {
+            c_ids.push_back(ids.data());
+            c_lens.push_back(ids.size());
+        }
+        
+        char** strings = tokenizers_decode_batch(handle_, c_ids.data(), c_lens.data(), batch_ids.size(), skip_special_tokens);
+        if (!strings) return {};
+        
+        std::vector<std::string> res;
+        res.reserve(batch_ids.size());
+        for (size_t i = 0; i < batch_ids.size(); ++i) {
+            if (strings[i]) {
+                res.emplace_back(strings[i]);
+            } else {
+                res.emplace_back("");
+            }
+        }
+        tokenizers_free_batch_decode(strings, batch_ids.size());
+        return res;
+    }
+
+    size_t vocab_size() const {
+        if (!handle_) return 0;
+        return tokenizers_vocab_size(handle_);
+    }
+
+    int32_t token_to_id(const std::string& token) const {
+        if (!handle_) return -1;
+        return tokenizers_token_to_id(handle_, token.c_str());
+    }
+
+    std::string id_to_token(int32_t id) const {
+        if (!handle_) return {};
+        char* s = tokenizers_id_to_token(handle_, id);
+        if (!s) return {};
+        std::string res(s);
+        tokenizers_string_free(s);
+        return res;
+    }
+
+    bool save(const std::string& path, bool pretty = true) const {
+        if (!handle_) return false;
+        return tokenizers_save(handle_, path.c_str(), pretty);
+    }
+
+    std::string to_string(bool pretty = false) const {
+        if (!handle_) return {};
+        char* s = tokenizers_to_str(handle_, pretty);
+        if (!s) return {};
+        std::string res(s);
+        tokenizers_string_free(s);
+        return res;
+    }
+
+    bool add_special_token(const std::string& token) {
+        if (!handle_) return false;
+        return tokenizers_add_special_token(handle_, token.c_str());
+    }
+
+    size_t add_special_tokens(const std::vector<std::string>& tokens) {
+        if (!handle_) return 0;
+        std::vector<const char*> c_tokens;
+        c_tokens.reserve(tokens.size());
+        for (const auto& t : tokens) c_tokens.push_back(t.c_str());
+        return tokenizers_add_special_tokens(handle_, c_tokens.data(), c_tokens.size());
+    }
+
+    void set_padding(const PaddingParams& params) {
+        if (!handle_) return;
+        tokenizers_padding_params_t c_params;
+        c_params.pad_id = params.pad_id;
+        c_params.pad_type_id = params.pad_type_id;
+        c_params.pad_token = params.pad_token.c_str();
+        c_params.strategy = (int)params.strategy;
+        c_params.fixed_length = params.fixed_length;
+        c_params.direction = (int)params.direction;
+        c_params.pad_to_multiple_of = params.pad_to_multiple_of;
+        
+        tokenizers_set_padding(handle_, &c_params);
+    }
+    
+    void disable_padding() {
+        if (!handle_) return;
+        tokenizers_set_padding(handle_, nullptr);
+    }
+
+    void set_truncation(const TruncationParams& params) {
+        if (!handle_) return;
+        tokenizers_truncation_params_t c_params;
+        c_params.max_length = params.max_length;
+        c_params.stride = params.stride;
+        c_params.strategy = (int)params.strategy;
+        c_params.direction = (int)params.direction;
+        
+        tokenizers_set_truncation(handle_, &c_params);
+    }
+
+    void disable_truncation() {
+        if (!handle_) return;
+        tokenizers_set_truncation(handle_, nullptr);
+    }
+
+    size_t add_tokens(const std::vector<std::string>& tokens) {
+        if (!handle_) return 0;
+        std::vector<const char*> c_tokens;
+        c_tokens.reserve(tokens.size());
+        for (const auto& t : tokens) c_tokens.push_back(t.c_str());
+        return tokenizers_add_tokens(handle_, c_tokens.data(), c_tokens.size());
+    }
+
+    // === Special Token API (unified - auto-uses config if available) ===
+    
+    /// Get special token ID by name ("BOS", "EOS", "PAD", "UNK")
+    /// Auto-uses tokenizer_config.json if present, falls back to heuristic.
+    int32_t special_token_id(const std::string& name) const {
+        if (!handle_) return -1;
+        return tokenizers_get_special_token_id(handle_, name.c_str());
+    }
+    
+    /// Get special token string by name ("BOS", "EOS", "PAD", "UNK")
+    /// Returns token from config if available, empty string otherwise.
+    std::string special_token(const std::string& name) const {
+        if (!handle_) return {};
+        char* s = tokenizers_get_special_token(handle_, name.c_str());
+        if (!s) return {};
+        std::string res(s);
+        tokenizers_string_free(s);
+        return res;
+    }
+    
+    // Convenience ID accessors
+    int32_t bos_id() const { return special_token_id("BOS"); }
+    int32_t eos_id() const { return special_token_id("EOS"); }
+    int32_t pad_id() const { return special_token_id("PAD"); }
+    int32_t unk_id() const { return special_token_id("UNK"); }
+    
+    // Convenience token string accessors
+    std::string bos_token() const { return special_token("BOS"); }
+    std::string eos_token() const { return special_token("EOS"); }
+    std::string pad_token() const { return special_token("PAD"); }
+    std::string unk_token() const { return special_token("UNK"); }
+    
+    /// Whether config specifies BOS token should be added
+    bool add_bos_token() const {
+        if (!handle_) return false;
+        return tokenizers_get_add_bos_token(handle_);
+    }
+    
+    /// Whether config specifies EOS token should be added  
+    bool add_eos_token() const {
+        if (!handle_) return false;
+        return tokenizers_get_add_eos_token(handle_);
+    }
+    
+    /// Check if tokenizer has a chat template (from config)
+    bool has_chat_template() const {
+        if (!handle_) return false;
+        return tokenizers_has_chat_template(handle_);
+    }
+    
+    /// Get the raw chat template string (Jinja2 template)
+    std::string chat_template() const {
+        if (!handle_) return {};
+        char* s = tokenizers_get_chat_template(handle_);
+        if (!s) return {};
+        std::string res(s);
+        tokenizers_string_free(s);
+        return res;
+    }
+    
+    /// Apply chat template to format messages
+    /// @param messages Vector of ChatMessage with role and content
+    /// @param add_generation_prompt If true, adds prompt for assistant response
+    /// @return Formatted string ready for tokenization
+    /// @throws ChatTemplateError if no template or rendering fails
+    std::string apply_chat_template(
+        const std::vector<ChatMessage>& messages,
+        bool add_generation_prompt = true
+    ) const;
+
+    bool valid() const { return handle_ != nullptr; }
+
+    static std::string version() {
+        const char* v = tokenizers_version();
+        if (!v) return {};
+        std::string s(v);
+        tokenizers_string_free(const_cast<char*>(v));
+        return s;
+    }
+
+private:
+    void reset() {
+        if (handle_) {
+            tokenizers_free(handle_);
+            handle_ = nullptr;
+        }
+    }
+
+    void* handle_ = nullptr;
+};
+
+} // namespace tokenizers
diff --git a/bindings/cpp/src/tokenizers.cpp b/bindings/cpp/src/tokenizers.cpp
new file mode 100644
index 000000000..4b431567c
--- /dev/null
+++ b/bindings/cpp/src/tokenizers.cpp
@@ -0,0 +1,59 @@
+/**
+ * Tokenizer C++ bindings implementation
+ */
+
+#include <tokenizers/tokenizers.h>
+#include <jinja2cpp/template.h>
+#include <jinja2cpp/value.h>
+
+namespace tokenizers {
+
+std::string Tokenizer::apply_chat_template(
+    const std::vector<ChatMessage>& messages,
+    bool add_generation_prompt
+) const {
+    // Get the template string
+    std::string tmpl_str = chat_template();
+    if (tmpl_str.empty()) {
+        throw ChatTemplateError("No chat template available for this tokenizer");
+    }
+    
+    // Create Jinja2 template
+    jinja2::Template tpl;
+    auto load_result = tpl.Load(tmpl_str, "chat_template");
+    if (!load_result) {
+        throw ChatTemplateError("Failed to parse chat template: " + 
+            load_result.error().ToString());
+    }
+    
+    // Convert messages to Jinja2 values
+    jinja2::ValuesList jinja_messages;
+    for (const auto& msg : messages) {
+        jinja2::ValuesMap msg_map;
+        msg_map["role"] = msg.role;
+        msg_map["content"] = msg.content;
+        jinja_messages.push_back(std::move(msg_map));
+    }
+    
+    // Build parameters map
+    jinja2::ValuesMap params;
+    params["messages"] = std::move(jinja_messages);
+    params["add_generation_prompt"] = add_generation_prompt;
+    
+    // Add special tokens as variables (commonly used in templates)
+    params["bos_token"] = bos_token();
+    params["eos_token"] = eos_token();
+    params["pad_token"] = pad_token();
+    params["unk_token"] = unk_token();
+    
+    // Render the template
+    auto render_result = tpl.RenderAsString(params);
+    if (!render_result) {
+        throw ChatTemplateError("Failed to render chat template: " + 
+            render_result.error().ToString());
+    }
+    
+    return render_result.value();
+}
+
+} // namespace tokenizers
diff --git a/bindings/cpp/tests/test_common.h b/bindings/cpp/tests/test_common.h
new file mode 100644
index 000000000..d79d1fd62
--- /dev/null
+++ b/bindings/cpp/tests/test_common.h
@@ -0,0 +1,25 @@
+#pragma once
+#include <string>
+#include <filesystem>
+#include <cstdlib>
+
+namespace test_utils {
+
+inline std::string find_resource(const std::string& name) {
+    namespace fs = std::filesystem;
+    
+    // First check environment variable (set by CMake or user)
+    if (const char* env = std::getenv("TOKENIZERS_TEST_DATA")) {
+        auto path = fs::path(env) / name;
+        if (fs::exists(path)) return path.string();
+    }
+    
+    // Fallback: search relative paths
+    for (const auto& dir : {"data", "../data", "../../data", "../../../data"}) {
+        auto path = fs::path(dir) / name;
+        if (fs::exists(path)) return path.string();
+    }
+    return {};
+}
+
+} // namespace test_utils
diff --git a/bindings/cpp/tests/test_tokenizer_gtest.cpp b/bindings/cpp/tests/test_tokenizer_gtest.cpp
new file mode 100644
index 000000000..c1f89eaa0
--- /dev/null
+++ b/bindings/cpp/tests/test_tokenizer_gtest.cpp
@@ -0,0 +1,282 @@
+/**
+ * Tokenizer C++ bindings tests
+ */
+#include <gtest/gtest.h>
+#include <tokenizers/tokenizers.h>
+#include "test_common.h"
+#include <filesystem>
+
+using namespace tokenizers;
+using test_utils::find_resource;
+
+// ==================== Basic Tokenizer Tests ====================
+
+class TokenizerTest : public ::testing::Test {
+protected:
+    Tokenizer tok;
+    
+    void SetUp() override {
+        std::string path = find_resource("tokenizer.json");
+        ASSERT_FALSE(path.empty()) << "Could not find tokenizer.json";
+        tok = Tokenizer(path);
+        ASSERT_TRUE(tok.valid());
+    }
+};
+
+TEST_F(TokenizerTest, Encode) {
+    auto output = tok.encode("my name is john");
+    EXPECT_FALSE(output.ids.empty());
+    EXPECT_EQ(output.ids.size(), output.attention_mask.size());
+    
+    // Consistency check - same input gives same output
+    EXPECT_EQ(tok.encode("my name is john"), output);
+}
+
+TEST_F(TokenizerTest, EncodeBatch) {
+    std::vector<std::string> batch = {"my name is john", "my pair"};
+    auto output = tok.encode_batch(batch);
+    ASSERT_EQ(output.size(), 2);
+    EXPECT_FALSE(output[0].ids.empty());
+    EXPECT_FALSE(output[1].ids.empty());
+}
+
+TEST_F(TokenizerTest, Decode) {
+    auto encoding = tok.encode("my name is john");
+    auto decoded = tok.decode(encoding.ids);
+    EXPECT_NE(decoded.find("name"), std::string::npos);
+    EXPECT_NE(decoded.find("john"), std::string::npos);
+}
+
+TEST_F(TokenizerTest, DecodeBatch) {
+    std::vector<std::string> batch = {"my name is john", "my pair"};
+    auto encodings = tok.encode_batch(batch);
+    
+    std::vector<std::vector<int32_t>> batch_ids;
+    for (const auto& enc : encodings) batch_ids.push_back(enc.ids);
+    
+    auto decoded = tok.decode_batch(batch_ids);
+    ASSERT_EQ(decoded.size(), 2);
+    EXPECT_NE(decoded[0].find("john"), std::string::npos);
+    EXPECT_NE(decoded[1].find("pair"), std::string::npos);
+}
+
+TEST_F(TokenizerTest, Vocab) {
+    EXPECT_GT(tok.vocab_size(), 0);
+    
+    int32_t id = tok.token_to_id("the");
+    if (id != -1) {
+        EXPECT_EQ(tok.id_to_token(id), "the");
+    }
+}
+
+TEST_F(TokenizerTest, Padding) {
+    PaddingParams params;
+    params.strategy = PaddingParams::Fixed;
+    params.fixed_length = 10;
+    params.pad_id = 0;
+    tok.set_padding(params);
+    
+    auto output = tok.encode("short");
+    EXPECT_EQ(output.ids.size(), 10);
+    
+    tok.disable_padding();
+    EXPECT_LT(tok.encode("short").ids.size(), 10);
+}
+
+TEST_F(TokenizerTest, AddSpecialTokens) {
+    size_t added = tok.add_special_tokens({"[SPECIAL1]", "[SPECIAL2]"});
+    EXPECT_EQ(added, 2);
+    
+    int32_t id = tok.token_to_id("[SPECIAL1]");
+    EXPECT_NE(id, -1);
+    
+    auto output = tok.encode("Hello [SPECIAL1]");
+    EXPECT_NE(std::find(output.ids.begin(), output.ids.end(), id), output.ids.end());
+}
+
+TEST_F(TokenizerTest, SaveAndLoad) {
+    std::string save_path = "test_save_gtest.json";
+    EXPECT_TRUE(tok.save(save_path));
+    
+    Tokenizer t2(save_path);
+    EXPECT_TRUE(t2.valid());
+    EXPECT_EQ(t2.vocab_size(), tok.vocab_size());
+    
+    std::filesystem::remove(save_path);
+}
+
+TEST_F(TokenizerTest, ToStringAndFromBlob) {
+    std::string json = tok.to_string(false);
+    EXPECT_FALSE(json.empty());
+    
+    Tokenizer t2 = Tokenizer::FromBlobJSON(json);
+    EXPECT_TRUE(t2.valid());
+    EXPECT_EQ(t2.vocab_size(), tok.vocab_size());
+}
+
+TEST_F(TokenizerTest, SpecialTokensFromConfig) {
+    // Config should be auto-loaded from tokenizer_config.json
+    EXPECT_EQ(tok.bos_token(), "<bos>");
+    EXPECT_EQ(tok.eos_token(), "<eos>");
+    EXPECT_EQ(tok.pad_token(), "<pad>");
+    EXPECT_EQ(tok.unk_token(), "<unk>");
+    
+    EXPECT_GE(tok.bos_id(), 0);
+    EXPECT_GE(tok.eos_id(), 0);
+    EXPECT_GE(tok.pad_id(), 0);
+    EXPECT_GE(tok.unk_id(), 0);
+    
+    EXPECT_TRUE(tok.add_bos_token());
+    EXPECT_FALSE(tok.add_eos_token());
+}
+
+TEST_F(TokenizerTest, ChatTemplate) {
+    EXPECT_TRUE(tok.has_chat_template());
+    EXPECT_FALSE(tok.chat_template().empty());
+    
+    std::vector<ChatMessage> messages = {
+        {"user", "Hello!"},
+        {"assistant", "Hi there!"},
+        {"user", "How are you?"}
+    };
+    
+    std::string result = tok.apply_chat_template(messages, true);
+    EXPECT_NE(result.find("Hello!"), std::string::npos);
+    EXPECT_NE(result.find("Hi there!"), std::string::npos);
+    EXPECT_NE(result.find("How are you?"), std::string::npos);
+}
+
+// ==================== BERT Tokenizer Tests ====================
+
+class BertTokenizerTest : public ::testing::Test {
+protected:
+    Tokenizer tok;
+    
+    void SetUp() override {
+        std::string path = find_resource("bert-wiki.json");
+        ASSERT_FALSE(path.empty()) << "Could not find bert-wiki.json";
+        // Pass empty config path to skip loading tokenizer_config.json
+        tok = Tokenizer(path, "");
+        ASSERT_TRUE(tok.valid());
+    }
+};
+
+TEST_F(BertTokenizerTest, SpecialTokensViaHeuristic) {
+    // BERT tokens found via heuristic (no config file)
+    EXPECT_EQ(tok.id_to_token(tok.bos_id()), "[CLS]");
+    EXPECT_EQ(tok.id_to_token(tok.eos_id()), "[SEP]");
+    EXPECT_EQ(tok.id_to_token(tok.pad_id()), "[PAD]");
+    EXPECT_EQ(tok.id_to_token(tok.unk_id()), "[UNK]");
+    
+    // IDs should match token_to_id
+    EXPECT_EQ(tok.bos_id(), tok.token_to_id("[CLS]"));
+    EXPECT_EQ(tok.eos_id(), tok.token_to_id("[SEP]"));
+    EXPECT_EQ(tok.pad_id(), tok.token_to_id("[PAD]"));
+    EXPECT_EQ(tok.unk_id(), tok.token_to_id("[UNK]"));
+}
+
+TEST_F(BertTokenizerTest, ExplicitConfigPath) {
+    auto config_path = find_resource("bert_tokenizer_config.json");
+    if (config_path.empty()) {
+        GTEST_SKIP() << "bert_tokenizer_config.json not found";
+    }
+    
+    auto tok_path = find_resource("bert-wiki.json");
+    Tokenizer tok_with_config(tok_path, config_path);
+    ASSERT_TRUE(tok_with_config.valid());
+    
+    EXPECT_EQ(tok_with_config.bos_token(), "[CLS]");
+    EXPECT_EQ(tok_with_config.eos_token(), "[SEP]");
+    EXPECT_FALSE(tok_with_config.has_chat_template());
+}
+
+TEST_F(BertTokenizerTest, NoChatTemplate) {
+    EXPECT_FALSE(tok.has_chat_template());
+    
+    std::vector<ChatMessage> messages = {{"user", "Hello!"}};
+    EXPECT_THROW(tok.apply_chat_template(messages), ChatTemplateError);
+}
+
+// ==================== Error Handling Tests ====================
+
+TEST(TokenizerErrorTest, InvalidFile) {
+    Tokenizer tok("nonexistent_file.json");
+    EXPECT_FALSE(tok.valid());
+    
+    // All operations should return safe defaults
+    EXPECT_EQ(tok.vocab_size(), 0);
+    EXPECT_TRUE(tok.encode("test").empty());
+    EXPECT_EQ(tok.token_to_id("test"), -1);
+    EXPECT_EQ(tok.bos_id(), -1);
+    EXPECT_TRUE(tok.bos_token().empty());
+    EXPECT_FALSE(tok.has_chat_template());
+}
+
+TEST(TokenizerErrorTest, MoveSemantics) {
+    auto path = find_resource("tokenizer.json");
+    ASSERT_FALSE(path.empty());
+    
+    Tokenizer tok(path);
+    EXPECT_TRUE(tok.valid());
+    
+    Tokenizer moved = std::move(tok);
+    EXPECT_TRUE(moved.valid());
+    EXPECT_FALSE(tok.valid());
+}
+
+TEST(TokenizerErrorTest, UnknownToken) {
+    auto path = find_resource("tokenizer.json");
+    ASSERT_FALSE(path.empty());
+    
+    Tokenizer tok(path);
+    EXPECT_EQ(tok.token_to_id("[DEFINITELY_NOT_IN_VOCAB_12345]"), -1);
+}
+
+TEST(TokenizerErrorTest, FromBlobNoChatTemplate) {
+    // Tokenizer loaded from string has no config
+    std::string json = R"({
+        "version": "1.0",
+        "added_tokens": [{"id": 0, "content": "[UNK]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true}],
+        "model": {"type": "WordLevel", "vocab": {"[UNK]": 0, "hello": 1}, "unk_token": "[UNK]"}
+    })";
+    
+    Tokenizer tok = Tokenizer::FromBlobJSON(json);
+    ASSERT_TRUE(tok.valid());
+    EXPECT_FALSE(tok.has_chat_template());
+}
+
+// ==================== Optional Tokenizer Tests ====================
+
+TEST(OptionalTokenizerTest, Llama) {
+    auto path = find_resource("llama-3-tokenizer.json");
+    if (path.empty()) {
+        GTEST_SKIP() << "llama-3-tokenizer.json not found";
+    }
+    
+    Tokenizer tok(path);
+    ASSERT_TRUE(tok.valid());
+    
+    int32_t bos = tok.bos_id();
+    if (bos >= 0) {
+        std::string bos_token = tok.id_to_token(bos);
+        EXPECT_TRUE(bos_token == "<|begin_of_text|>" || bos_token == "<s>");
+    }
+}
+
+TEST(OptionalTokenizerTest, Unigram) {
+    auto path = find_resource("unigram.json");
+    if (path.empty()) {
+        GTEST_SKIP() << "unigram.json not found";
+    }
+    
+    Tokenizer tok(path);
+    if (!tok.valid()) {
+        GTEST_SKIP() << "unigram.json is not a complete tokenizer file";
+    }
+    
+    // Just verify API doesn't crash
+    tok.bos_id();
+    tok.eos_id();
+    tok.unk_id();
+}
+
diff --git a/bindings/cpp/third_party/Jinja2Cpp b/bindings/cpp/third_party/Jinja2Cpp
new file mode 160000
index 000000000..2053cfabf
--- /dev/null
+++ b/bindings/cpp/third_party/Jinja2Cpp
@@ -0,0 +1 @@
+Subproject commit 2053cfabfafaeab65aff0bc083a83b105a939202
diff --git a/tokenizers/Makefile b/tokenizers/Makefile
index 927fe794e..0635936d8 100644
--- a/tokenizers/Makefile
+++ b/tokenizers/Makefile
@@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)
 
 SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json  $(DATA_DIR)/llama-3-tokenizer.json
 BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
-TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
+TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json $(DATA_DIR)/tokenizer_config.json $(DATA_DIR)/bert_tokenizer_config.json
 
 .PHONY : build
 build :
@@ -87,3 +87,12 @@ $(DATA_DIR)/bert-wiki.json :
 $(DATA_DIR)/llama-3-tokenizer.json :
 	$(dir_guard)
 	wget https://huggingface.co/hf-internal-testing/llama3-tokenizer/resolve/main/tokenizer.json -O $@
+
+# Config files for C++ bindings tests
+$(DATA_DIR)/tokenizer_config.json :
+	$(dir_guard)
+	@echo '{"bos_token":"<bos>","eos_token":"<eos>","pad_token":"<pad>","unk_token":"<unk>","add_bos_token":true,"add_eos_token":false,"chat_template":"{% for message in messages %}{% if message['"'"'role'"'"'] == '"'"'user'"'"' %}{{ '"'"'User: '"'"' + message['"'"'content'"'"'] + '"'"'\\n'"'"' }}{% elif message['"'"'role'"'"'] == '"'"'assistant'"'"' %}{{ '"'"'Assistant: '"'"' + message['"'"'content'"'"'] + '"'"'\\n'"'"' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '"'"'Assistant: '"'"' }}{% endif %}"}' > $@
+
+$(DATA_DIR)/bert_tokenizer_config.json :
+	$(dir_guard)
+	@echo '{"bos_token":"[CLS]","eos_token":"[SEP]","pad_token":"[PAD]","unk_token":"[UNK]","add_bos_token":true,"add_eos_token":true,"chat_template":null}' > $@