diff --git a/README.md b/README.md
index 6146e2c..3fd6322 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,36 @@ MCP_TRANSPORT=streamable-http btmcp-server  # Runs on http://127.0.0.1:8000
 - `MCP_HOST` - Host for HTTP mode (default: `127.0.0.1`)
 - `MCP_PORT` - Port for HTTP mode (default: `8000`)
 - `BTMCP_CACHE_DIR` - Custom cache directory (default: `/.cache/`)
+- `BTMCP_ENABLE_SEMANTIC` - Enable semantic search (`true` or `false`, default: `true`)
+- `BTMCP_MODEL_NAME` - Sentence-transformers model name (default: `all-MiniLM-L6-v2`)
+
+### Customizing the Semantic Model
+
+You can configure which sentence-transformers model to use for semantic search:
+
+```bash
+# Use a larger, more accurate model (requires more memory and disk space)
+BTMCP_MODEL_NAME="all-mpnet-base-v2" btmcp-server
+
+# Use a smaller, faster model (requires less memory and disk space)
+BTMCP_MODEL_NAME="all-MiniLM-L12-v2" btmcp-server
+
+# Disable semantic search entirely (BM25-only mode)
+BTMCP_ENABLE_SEMANTIC=false btmcp-server
+```
+
+**Popular model options:**
+
+| Model | Embedding Size | Model Size | Speed | Accuracy |
+|-------|----------------|------------|-------|----------|
+| `all-MiniLM-L6-v2` (default) | 384 | ~80MB | Fast | Good |
+| `all-MiniLM-L12-v2` | 384 | ~120MB | Medium | Better |
+| `all-mpnet-base-v2` | 768 | ~420MB | Slower | Best |
+| `multi-qa-MiniLM-L6-cos-v1` | 384 | ~80MB | Fast | Good (Q&A optimized) |
+
+**Note:** When you change the model, the cache will be invalidated and PDFs will be re-indexed with the new model's embeddings.
+
+For a complete list of available models, see the [Sentence Transformers documentation](https://www.sbert.net/docs/pretrained_models.html).
 
 ---
 
@@ -547,7 +577,7 @@ PDF Files (specs/)
   • Tables & Figures
 → Indexer:
   • BM25 tokenization
-  • Semantic embeddings (all-MiniLM-L6-v2)
+  • Semantic embeddings (configurable model, default: all-MiniLM-L6-v2)
 → Cache (pickle):
   • BM25 index
   • Embeddings
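A quick sanity check on the README table above: the "Embedding Size" column is just the model's output dimensionality, which sentence-transformers reports directly. A minimal sketch, assuming `sentence-transformers` is installed; the first call downloads the model (~80MB for the default):

```python
from sentence_transformers import SentenceTransformer

# Downloads the model on first use (~80MB for all-MiniLM-L6-v2).
model = SentenceTransformer("all-MiniLM-L6-v2")

# One query vector; its length matches the table's embedding size.
embedding = model.encode("GATT characteristic properties")
print(embedding.shape)                           # (384,)
print(model.get_sentence_embedding_dimension())  # 384
```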
diff --git a/src/btmcp/indexer.py b/src/btmcp/indexer.py
index 6c4eb53..12f00c0 100644
--- a/src/btmcp/indexer.py
+++ b/src/btmcp/indexer.py
@@ -29,11 +29,17 @@
 class Indexer:
     """Index PDF content with BM25 keyword search and semantic search."""
 
-    def __init__(self, enable_semantic: bool = True):
+    def __init__(
+        self,
+        enable_semantic: bool = True,
+        model_name: str = "all-MiniLM-L6-v2",
+    ):
         """Initialize indexer.
 
         :param enable_semantic: Enable semantic search with embeddings (if available)
         :type enable_semantic: bool
+        :param model_name: Name of the sentence-transformers model to use
+        :type model_name: str
         """
         self.chunks: list[dict[str, Any]] = []
         self.bm25: BM25Okapi | None = None
@@ -43,15 +49,16 @@ def __init__(self, enable_semantic: bool = True):
         self.enable_semantic = enable_semantic and SEMANTIC_AVAILABLE
         self.embeddings: np.ndarray | None = None
         self.embedding_model: SentenceTransformer | None = None
+        self.model_name = model_name
 
         # Metadata extraction
         self.metadata_extractor = MetadataExtractor()
 
         if self.enable_semantic:
             if SEMANTIC_AVAILABLE and SentenceTransformer is not None:
-                # Load lightweight model (80MB, 384 dimensions)
-                self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-                logger.info("Semantic search enabled with all-MiniLM-L6-v2")
+                # Load the specified model
+                self.embedding_model = SentenceTransformer(model_name)
+                logger.info(f"Semantic search enabled with {model_name}")
             else:
                 logger.warning(
                     "Semantic search requested but sentence-transformers not available"
@@ -547,6 +554,7 @@ def save_cache(self, cache_path: Path | str) -> None:
             "bm25": self.bm25,
             "enable_semantic": self.enable_semantic,
             "embeddings": self.embeddings,
+            "model_name": self.model_name,
         }
 
         with cache_path.open("wb") as f:
@@ -576,14 +584,17 @@ def load_cache(self, cache_path: Path | str) -> None:
         if "enable_semantic" in cache_data:
             self.enable_semantic = cache_data["enable_semantic"]
             print(f"  Semantic search: {self.enable_semantic}")
+        if "model_name" in cache_data:
+            self.model_name = cache_data["model_name"]
+            print(f"  Model name: {self.model_name}")
         if "embeddings" in cache_data:
             self.embeddings = cache_data["embeddings"]
             if self.embeddings is not None:
                 print(f"  Loaded embeddings: {self.embeddings.shape}")
         # Reinitialize embedding model if we have embeddings
         if self.embeddings is not None and self.embedding_model is None:
-            print("  Loading embedding model...")
-            self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+            print(f"  Loading embedding model: {self.model_name}...")
+            self.embedding_model = SentenceTransformer(self.model_name)
 
     def is_cache_fresh(
         self, cache_path: Path | str, pdf_paths: Sequence[Path | str]
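The cache change in `save_cache`/`load_cache` amounts to one extra key in the pickled dictionary, and `load_cache` only reads it when present, so caches written before this change still load (they simply keep the constructor's model name). A stripped-down sketch of that round-trip; the field names mirror the diff, but the `Indexer` internals are omitted:

```python
import pickle
from pathlib import Path

# Cache payload shape after this change (chunks and bm25 omitted here).
cache_data = {
    "enable_semantic": True,
    "embeddings": None,  # an np.ndarray in the real indexer
    "model_name": "all-MiniLM-L12-v2",
}

cache_path = Path("index.cache")
with cache_path.open("wb") as f:
    pickle.dump(cache_data, f)

with cache_path.open("rb") as f:
    restored = pickle.load(f)

# Mirrors load_cache(): only overwrite when the key exists, so
# pre-change caches without "model_name" remain loadable.
if "model_name" in restored:
    print(f"Model name: {restored['model_name']}")
```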
diff --git a/src/btmcp/server.py b/src/btmcp/server.py
index 8909fc4..4a9b151 100644
--- a/src/btmcp/server.py
+++ b/src/btmcp/server.py
@@ -52,6 +52,23 @@ def get_cache_path() -> Path:
     return cache_dir / "index.cache"
 
 
+def get_semantic_config() -> dict[str, str | bool]:
+    """Get semantic search configuration from environment variables.
+
+    :return: Configuration dictionary with enable_semantic and model_name
+    :rtype: dict[str, str | bool]
+    """
+    enable_semantic_env = os.getenv("BTMCP_ENABLE_SEMANTIC", "true").lower()
+    enable_semantic = enable_semantic_env in ("true", "1", "yes")
+
+    model_name = os.getenv("BTMCP_MODEL_NAME", "all-MiniLM-L6-v2")
+
+    return {
+        "enable_semantic": enable_semantic,
+        "model_name": model_name,
+    }
+
+
 def get_security_settings() -> TransportSecuritySettings:
     """Get transport security settings for DNS rebinding protection.
 
@@ -105,6 +122,7 @@ def validate_transport(transport: str) -> Literal["stdio", "sse", "streamable-http"]:
 
 # Get server configuration
 config = get_server_config()
+semantic_config = get_semantic_config()
 
 # Create MCP server instance with security settings and configured host/port
 mcp = FastMCP(
@@ -114,8 +132,11 @@ def validate_transport(transport: str) -> Literal["stdio", "sse", "streamable-http"]:
     transport_security=get_security_settings(),
 )
 
-# Initialize spec server
-spec_server = SpecServer()
+# Initialize spec server with semantic configuration
+spec_server = SpecServer(
+    enable_semantic=bool(semantic_config["enable_semantic"]),
+    model_name=str(semantic_config["model_name"]),
+)
 
 
 @mcp.tool()
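One detail of `get_semantic_config()` worth calling out: only `true`, `1`, and `yes` (case-insensitive) enable semantic search, so any other value, typos included, silently disables it. A small standalone illustration of the same check:

```python
import os

def semantic_enabled() -> bool:
    # Same membership test as get_semantic_config().
    value = os.getenv("BTMCP_ENABLE_SEMANTIC", "true").lower()
    return value in ("true", "1", "yes")

os.environ["BTMCP_ENABLE_SEMANTIC"] = "YES"
print(semantic_enabled())  # True (case-insensitive)

os.environ["BTMCP_ENABLE_SEMANTIC"] = "off"
print(semantic_enabled())  # False -- unrecognized values disable the feature
```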
diff --git a/src/btmcp/spec_server.py b/src/btmcp/spec_server.py
index 0dd313a..f60a5ef 100644
--- a/src/btmcp/spec_server.py
+++ b/src/btmcp/spec_server.py
@@ -22,11 +22,25 @@ class PdfInfo(TypedDict):
 class SpecServer:
     """Server for loading and searching Bluetooth specifications."""
 
-    def __init__(self) -> None:
-        """Initialize spec server with indexer."""
-        self.indexer = Indexer()
+    def __init__(
+        self,
+        enable_semantic: bool = True,
+        model_name: str = "all-MiniLM-L6-v2",
+    ) -> None:
+        """Initialize spec server with indexer.
+
+        :param enable_semantic: Enable semantic search with embeddings (if available)
+        :type enable_semantic: bool
+        :param model_name: Name of the sentence-transformers model to use
+        :type model_name: str
+        """
+        self.indexer = Indexer(
+            enable_semantic=enable_semantic, model_name=model_name
+        )
         self.pdf_loader = PDFLoader()
         self.specs_dir: Path | None = None
+        self.enable_semantic = enable_semantic
+        self.model_name = model_name
 
     def load_pages(self, pages: list[dict[str, Any]], pdf_name: str) -> None:
         """Load PDF pages and build search index.
@@ -153,7 +167,9 @@ def rebuild_index(self) -> str:
             return "No PDF files found in specs directory"
 
         # Clear current index
-        self.indexer = Indexer()
+        self.indexer = Indexer(
+            enable_semantic=self.enable_semantic, model_name=self.model_name
+        )
 
         # Reload all PDFs
         total_pages = 0
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index 14c44e6..727ab04 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -847,3 +847,72 @@ def test_indexer_handles_pages_without_section_headers():
 
     # Second chunk has section
     assert chunks[1]["metadata"]["section"] == "4.2.1"
+
+
+def test_indexer_custom_model_name():
+    """Test that custom model name can be specified."""
+    # Use a different smaller model for testing
+    custom_model = "all-MiniLM-L12-v2"
+    indexer = Indexer(enable_semantic=True, model_name=custom_model)
+
+    assert indexer.model_name == custom_model
+    assert indexer.enable_semantic is True
+    # Model should be loaded with custom name
+    assert indexer.embedding_model is not None
+
+
+def test_indexer_default_model_name():
+    """Test that default model name is used when not specified."""
+    indexer = Indexer(enable_semantic=True)
+
+    assert indexer.model_name == "all-MiniLM-L6-v2"
+    assert indexer.embedding_model is not None
+
+
+def test_indexer_model_name_persists_in_cache(tmp_path):
+    """Test that model name is saved to and loaded from cache."""
+    custom_model = "all-MiniLM-L12-v2"
+    indexer1 = Indexer(enable_semantic=True, model_name=custom_model)
+
+    pages = [
+        {"page": 1, "text": "GATT Service documentation."},
+    ]
+
+    chunks = indexer1.create_chunks(pages, pdf_name="test.pdf")
+    indexer1.build_index(chunks)
+
+    # Save cache
+    cache_file = tmp_path / "index.cache"
+    indexer1.save_cache(cache_file)
+
+    # Load into a new indexer constructed with a different (real) model name
+    indexer2 = Indexer(enable_semantic=True, model_name="all-MiniLM-L6-v2")
+    indexer2.load_cache(cache_file)
+
+    # Model name should be restored from cache
+    assert indexer2.model_name == custom_model
+
+
+def test_indexer_model_name_used_when_loading_embeddings(tmp_path):
+    """Test that correct model is loaded when restoring embeddings from cache."""
+    custom_model = "all-MiniLM-L12-v2"
+    indexer1 = Indexer(enable_semantic=True, model_name=custom_model)
+
+    pages = [
+        {"page": 1, "text": "GATT Service documentation."},
+    ]
+
+    chunks = indexer1.create_chunks(pages, pdf_name="test.pdf")
+    indexer1.build_index(chunks)
+
+    # Save cache
+    cache_file = tmp_path / "index.cache"
+    indexer1.save_cache(cache_file)
+
+    # Load into new indexer without initializing the model first
+    indexer2 = Indexer(enable_semantic=False)  # Start with semantic disabled
+    indexer2.load_cache(cache_file)
+
+    # Model should be loaded with the cached model name
+    assert indexer2.model_name == custom_model
+    assert indexer2.embedding_model is not None
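Putting the pieces together, this is roughly how the configuration flows at runtime. A hedged sketch, assuming the import path from this diff (`btmcp.spec_server.SpecServer`); it mirrors the wiring `server.py` performs from `BTMCP_ENABLE_SEMANTIC` and `BTMCP_MODEL_NAME`:

```python
from btmcp.spec_server import SpecServer

# Equivalent to what server.py builds from the environment variables.
server = SpecServer(enable_semantic=True, model_name="all-MiniLM-L12-v2")

# The settings survive rebuilds: rebuild_index() recreates the Indexer
# with the stored enable_semantic and model_name, not the defaults.
status = server.rebuild_index()
print(status)  # e.g. "No PDF files found in specs directory" before loading specs
```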