Skip to content

Commit a41ce7c

Browse files
committed
feat: use Qwen2.5-Coder tokenizer for accurate semantic chunking
Replaced the simple character approximation with the proper Qwen2.5-Coder tokenizer for accurate token counting during semantic chunking.

Changes:
- Added tokenizers = "0.20" dependency to codegraph-mcp
- Load the qwen2.5-coder.json tokenizer from codegraph-vector/tokenizers/
- Use tokenizer.encode() for accurate token counting
- Wrap the tokenizer in an Arc for use in the closure
- Fall back to the character approximation if the tokenizer fails to load

Benefits:
- Accurate token counts matching the actual embedding model's tokenization
- Better chunk boundaries that preserve code semantics
- Consistent with Qwen-based embedding models

Environment variables:
- CODEGRAPH_MAX_CHUNK_TOKENS=512   # Max tokens per chunk (default)
- CODEGRAPH_CHUNK_OVERLAP_TOKENS=50   # Not yet implemented

Note: Overlap support will be added in a future commit, once the semchunk-rs chunker API supports it.
1 parent ef0a5e8 commit a41ce7c

File tree

3 files changed

+50
-5
lines changed

3 files changed

+50
-5
lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/codegraph-mcp/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,8 @@ codegraph-graph = { workspace = true, features = ["surrealdb"] }
5454
codegraph-vector = { workspace = true, optional = true, default-features = false }
5555
codegraph-ai = { workspace = true, optional = true } # Re-enabled for AI-powered symbol resolution
5656
faiss = { workspace = true, optional = true }
57+
semchunk-rs = "0.1" # Semantic chunking for long code nodes
58+
tokenizers = "0.20" # Qwen2.5-Coder tokenizer for accurate token counting
5759
# codegraph-api not used directly here; avoid pulling heavy deps
5860
## core-rag-mcp-server intentionally not linked to keep binary lean
5961

crates/codegraph-mcp/src/indexer.rs

Lines changed: 46 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2814,12 +2814,53 @@ pub fn prepare_node_text(node: &CodeNode) -> String {
28142814
text.push(' ');
28152815
text.push_str(c);
28162816
}
2817-
if text.len() > 2048 {
2818-
let mut new_len = 2048.min(text.len());
2819-
while new_len > 0 && !text.is_char_boundary(new_len) {
2820-
new_len -= 1;
2817+
2818+
// Semantic chunking with environment variable support
2819+
let max_chunk_tokens = std::env::var("CODEGRAPH_MAX_CHUNK_TOKENS")
2820+
.ok()
2821+
.and_then(|v| v.parse::<usize>().ok())
2822+
.unwrap_or(512); // Default 512 tokens
2823+
2824+
// Approximate character limit for quick check (1 token ≈ 4 chars)
2825+
let approx_max_chars = max_chunk_tokens * 4;
2826+
2827+
if text.len() > approx_max_chars {
2828+
// Load Qwen2.5-Coder tokenizer for accurate token counting
2829+
let tokenizer_path = std::path::PathBuf::from(concat!(
2830+
env!("CARGO_MANIFEST_DIR"),
2831+
"/../codegraph-vector/tokenizers/qwen2.5-coder.json"
2832+
));
2833+
2834+
if let Ok(tokenizer) = tokenizers::Tokenizer::from_file(&tokenizer_path) {
2835+
// Proper token-based chunking with Qwen2.5-Coder tokenizer
2836+
let tok = std::sync::Arc::new(tokenizer);
2837+
let token_counter = move |s: &str| -> usize {
2838+
tok.encode(s, false)
2839+
.map(|enc| enc.len())
2840+
.unwrap_or_else(|_| (s.len() + 3) / 4)
2841+
};
2842+
2843+
let chunker = semchunk_rs::Chunker::new(max_chunk_tokens, token_counter);
2844+
let chunks = chunker.chunk_text(&text);
2845+
2846+
if let Some(first_chunk) = chunks.first() {
2847+
text = first_chunk.clone();
2848+
} else {
2849+
// Fallback to character truncation
2850+
let mut new_len = approx_max_chars.min(text.len());
2851+
while new_len > 0 && !text.is_char_boundary(new_len) {
2852+
new_len -= 1;
2853+
}
2854+
text.truncate(new_len);
2855+
}
2856+
} else {
2857+
// Tokenizer not available - fallback to character truncation
2858+
let mut new_len = approx_max_chars.min(text.len());
2859+
while new_len > 0 && !text.is_char_boundary(new_len) {
2860+
new_len -= 1;
2861+
}
2862+
text.truncate(new_len);
28212863
}
2822-
text.truncate(new_len);
28232864
}
28242865
text
28252866
}

0 commit comments

Comments (0)