Skip to content

Commit a41ce7c

Browse files
committed
feat: use Qwen2.5-Coder tokenizer for accurate semantic chunking
Replaced the simple character approximation with the proper Qwen2.5-Coder tokenizer for accurate token counting during semantic chunking.

Changes:
- Added tokenizers = "0.20" dependency to codegraph-mcp
- Load the qwen2.5-coder.json tokenizer from codegraph-vector/tokenizers/
- Use tokenizer.encode() for accurate token counting
- Wrap the tokenizer in an Arc for use in the closure
- Fall back to the character approximation if the tokenizer fails to load

Benefits:
- Accurate token counts matching the actual embedding model's tokenization
- Better chunk boundaries that preserve code semantics
- Consistent with Qwen-based embedding models

Environment variables:
- CODEGRAPH_MAX_CHUNK_TOKENS=512   # Max tokens per chunk (default)
- CODEGRAPH_CHUNK_OVERLAP_TOKENS=50   # Not yet implemented

Note: Overlap support will be added in a future commit, once the semchunk-rs chunker API supports it.
1 parent ef0a5e8 commit a41ce7c

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/codegraph-mcp/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,8 @@ codegraph-graph = { workspace = true, features = ["surrealdb"] }
5454
codegraph-vector = { workspace = true, optional = true, default-features = false }
5555
codegraph-ai = { workspace = true, optional = true } # Re-enabled for AI-powered symbol resolution
5656
faiss = { workspace = true, optional = true }
57+
semchunk-rs = "0.1" # Semantic chunking for long code nodes
58+
tokenizers = "0.20" # Qwen2.5-Coder tokenizer for accurate token counting
5759
# codegraph-api not used directly here; avoid pulling heavy deps
5860
## core-rag-mcp-server intentionally not linked to keep binary lean
5961

crates/codegraph-mcp/src/indexer.rs

Lines changed: 46 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2814,12 +2814,53 @@ pub fn prepare_node_text(node: &CodeNode) -> String {
28142814
text.push(' ');
28152815
text.push_str(c);
28162816
}
2817-
if text.len() > 2048 {
2818-
let mut new_len = 2048.min(text.len());
2819-
while new_len > 0 && !text.is_char_boundary(new_len) {
2820-
new_len -= 1;
2817+
2818+
// Semantic chunking with environment variable support
2819+
let max_chunk_tokens = std::env::var("CODEGRAPH_MAX_CHUNK_TOKENS")
2820+
.ok()
2821+
.and_then(|v| v.parse::<usize>().ok())
2822+
.unwrap_or(512); // Default 512 tokens
2823+
2824+
// Approximate character limit for quick check (1 token ≈ 4 chars)
2825+
let approx_max_chars = max_chunk_tokens * 4;
2826+
2827+
if text.len() > approx_max_chars {
2828+
// Load Qwen2.5-Coder tokenizer for accurate token counting
2829+
let tokenizer_path = std::path::PathBuf::from(concat!(
2830+
env!("CARGO_MANIFEST_DIR"),
2831+
"/../codegraph-vector/tokenizers/qwen2.5-coder.json"
2832+
));
2833+
2834+
if let Ok(tokenizer) = tokenizers::Tokenizer::from_file(&tokenizer_path) {
2835+
// Proper token-based chunking with Qwen2.5-Coder tokenizer
2836+
let tok = std::sync::Arc::new(tokenizer);
2837+
let token_counter = move |s: &str| -> usize {
2838+
tok.encode(s, false)
2839+
.map(|enc| enc.len())
2840+
.unwrap_or_else(|_| (s.len() + 3) / 4)
2841+
};
2842+
2843+
let chunker = semchunk_rs::Chunker::new(max_chunk_tokens, token_counter);
2844+
let chunks = chunker.chunk_text(&text);
2845+
2846+
if let Some(first_chunk) = chunks.first() {
2847+
text = first_chunk.clone();
2848+
} else {
2849+
// Fallback to character truncation
2850+
let mut new_len = approx_max_chars.min(text.len());
2851+
while new_len > 0 && !text.is_char_boundary(new_len) {
2852+
new_len -= 1;
2853+
}
2854+
text.truncate(new_len);
2855+
}
2856+
} else {
2857+
// Tokenizer not available - fallback to character truncation
2858+
let mut new_len = approx_max_chars.min(text.len());
2859+
while new_len > 0 && !text.is_char_boundary(new_len) {
2860+
new_len -= 1;
2861+
}
2862+
text.truncate(new_len);
28212863
}
2822-
text.truncate(new_len);
28232864
}
28242865
text
28252866
}

0 commit comments

Comments (0)