From cf8a2be632faf11fe11fc74f63d97c52e4159b07 Mon Sep 17 00:00:00 2001 From: bkellam Date: Tue, 25 Nov 2025 23:15:29 -0800 Subject: [PATCH 1/3] allow list --- packages/backend/src/zoekt.ts | 5 ++++- packages/shared/src/env.server.ts | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts index 9f65f473..68af1160 100644 --- a/packages/backend/src/zoekt.ts +++ b/packages/backend/src/zoekt.ts @@ -1,5 +1,5 @@ import { Repo } from "@sourcebot/db"; -import { createLogger } from "@sourcebot/shared"; +import { createLogger, env } from "@sourcebot/shared"; import { exec } from "child_process"; import { INDEX_CACHE_DIR } from "./constants.js"; import { Settings } from "./types.js"; @@ -11,6 +11,8 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio const { path: repoPath } = getRepoPath(repo); const shardPrefix = getShardPrefix(repo.orgId, repo.id); + const largeFileGlobPatterns = env.ALWAYS_INDEX_FILE_PATTERNS?.split(',').map(pattern => pattern.trim()) ?? []; + const command = [ 'zoekt-git-index', '-allow_missing_branches', @@ -21,6 +23,7 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio `-tenant_id ${repo.orgId}`, `-repo_id ${repo.id}`, `-shard_prefix ${shardPrefix}`, + ...largeFileGlobPatterns.map((pattern) => `-large_file ${pattern}`), repoPath ].join(' '); diff --git a/packages/shared/src/env.server.ts b/packages/shared/src/env.server.ts index 919a5884..f9c63f9c 100644 --- a/packages/shared/src/env.server.ts +++ b/packages/shared/src/env.server.ts @@ -219,6 +219,9 @@ export const env = createEnv({ // Configure the default maximum number of search results to return by default. DEFAULT_MAX_MATCH_COUNT: numberSchema.default(10_000), + + // A comma separated list of glob patterns that should always be indexed regardless of their size. 
+ ALWAYS_INDEX_FILE_PATTERNS: z.string().optional(), }, runtimeEnv, emptyStringAsUndefined: true, From feb8eceee8c03782c2edd91f1fbeb88e98c91843 Mon Sep 17 00:00:00 2001 From: bkellam Date: Tue, 25 Nov 2025 23:18:03 -0800 Subject: [PATCH 2/3] changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5795487..a213565c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added `ALWAYS_INDEX_FILE_PATTERNS` environment variable to allow specifying a comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or # of trigrams. [#631](https://github.com/sourcebot-dev/sourcebot/pull/631) + ### Fixed - Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629) From 7ce65cb66a2dec6257ce5f2b5c9a22f2568d579b Mon Sep 17 00:00:00 2001 From: bkellam Date: Tue, 25 Nov 2025 23:36:53 -0800 Subject: [PATCH 3/3] docs --- .../configuration/environment-variables.mdx | 1 + docs/docs/connections/overview.mdx | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/docs/docs/configuration/environment-variables.mdx b/docs/docs/configuration/environment-variables.mdx index 87167858..e29fb88f 100644 --- a/docs/docs/configuration/environment-variables.mdx +++ b/docs/docs/configuration/environment-variables.mdx @@ -35,6 +35,7 @@ The following environment variables allow you to configure your Sourcebot deploy | `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - |

Optional file to log to if structured logging is enabled

| | `SOURCEBOT_TELEMETRY_DISABLED` | `false` |

Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.

| | `DEFAULT_MAX_MATCH_COUNT` | `10000` |

The default maximum number of search results to return when using search in the web app.

| +| `ALWAYS_INDEX_FILE_PATTERNS` | - |

A comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams.

| ### Enterprise Environment Variables | Variable | Default | Description | diff --git a/docs/docs/connections/overview.mdx b/docs/docs/connections/overview.mdx index ab9f8ffc..cb3b1432 100644 --- a/docs/docs/connections/overview.mdx +++ b/docs/docs/connections/overview.mdx @@ -69,6 +69,26 @@ To learn more about how to create a connection for a specific code host, check o Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md). +## Indexing Large Files + +By default, Sourcebot will skip indexing files that are larger than 2MB or have more than 20,000 trigrams. You can configure this by setting the `maxFileSize` and `maxTrigramCount` [settings](/docs/configuration/config-file#settings). + +These limits can be ignored for specific files by passing in a comma separated list of glob patterns matching file paths to the `ALWAYS_INDEX_FILE_PATTERNS` environment variable. For example: + +```bash +# Always index all .sum and .lock files +ALWAYS_INDEX_FILE_PATTERNS=**/*.sum,**/*.lock +``` + +Files that have been skipped are assigned the `skipped` language. You can view a list of all skipped files by using the following query: +``` +lang:skipped +``` + +## Indexing Binary Files + +Binary files cannot be indexed by Sourcebot. See [#575](https://github.com/sourcebot-dev/sourcebot/issues/575) for more information. + ## Schema reference ---