diff --git a/CHANGELOG.md b/CHANGELOG.md
index f5795487..a213565c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
+### Added
+- Added `ALWAYS_INDEX_FILE_PATTERNS` environment variable to allow specifying a comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams. [#631](https://github.com/sourcebot-dev/sourcebot/pull/631)
+
### Fixed
- Fixed issue where single quotes could not be used in search queries. [#629](https://github.com/sourcebot-dev/sourcebot/pull/629)
diff --git a/docs/docs/configuration/environment-variables.mdx b/docs/docs/configuration/environment-variables.mdx
index 87167858..e29fb88f 100644
--- a/docs/docs/configuration/environment-variables.mdx
+++ b/docs/docs/configuration/environment-variables.mdx
@@ -35,6 +35,7 @@ The following environment variables allow you to configure your Sourcebot deploy
| `SOURCEBOT_STRUCTURED_LOGGING_FILE` | - |
Optional file to log to if structured logging is enabled
|
| `SOURCEBOT_TELEMETRY_DISABLED` | `false` | Enables/disables telemetry collection in Sourcebot. See [this doc](/docs/overview.mdx#telemetry) for more info.
|
| `DEFAULT_MAX_MATCH_COUNT` | `10000` | The default maximum number of search results to return when using search in the web app.
|
+| `ALWAYS_INDEX_FILE_PATTERNS` | - | A comma separated list of glob patterns matching file paths that should always be indexed, regardless of size or number of trigrams.
|
### Enterprise Environment Variables
| Variable | Default | Description |
diff --git a/docs/docs/connections/overview.mdx b/docs/docs/connections/overview.mdx
index ab9f8ffc..cb3b1432 100644
--- a/docs/docs/connections/overview.mdx
+++ b/docs/docs/connections/overview.mdx
@@ -69,6 +69,26 @@ To learn more about how to create a connection for a specific code host, check o
Missing your code host? [Submit a feature request on GitHub](https://github.com/sourcebot-dev/sourcebot/issues/new?template=feature_request.md).
+## Indexing Large Files
+
+By default, Sourcebot will skip indexing files that are larger than 2MB or have more than 20,000 trigrams. You can configure this by setting the `maxFileSize` and `maxTrigramCount` [settings](/docs/configuration/config-file#settings).
+
+These limits can be ignored for specific files by passing in a comma separated list of glob patterns matching file paths to the `ALWAYS_INDEX_FILE_PATTERNS` environment variable. For example:
+
+```bash
+# Always index all .sum and .lock files
+ALWAYS_INDEX_FILE_PATTERNS=**/*.sum,**/*.lock
+```
+
+Files that have been skipped are assigned the `skipped` language. You can view a list of all skipped files by using the following query:
+```
+lang:skipped
+```
+
+## Indexing Binary Files
+
+Binary files cannot be indexed by Sourcebot. See [#575](https://github.com/sourcebot-dev/sourcebot/issues/575) for more information.
+
## Schema reference
---
diff --git a/packages/backend/src/zoekt.ts b/packages/backend/src/zoekt.ts
index 9f65f473..68af1160 100644
--- a/packages/backend/src/zoekt.ts
+++ b/packages/backend/src/zoekt.ts
@@ -1,5 +1,5 @@
import { Repo } from "@sourcebot/db";
-import { createLogger } from "@sourcebot/shared";
+import { createLogger, env } from "@sourcebot/shared";
import { exec } from "child_process";
import { INDEX_CACHE_DIR } from "./constants.js";
import { Settings } from "./types.js";
@@ -11,6 +11,8 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
const { path: repoPath } = getRepoPath(repo);
const shardPrefix = getShardPrefix(repo.orgId, repo.id);
+ const largeFileGlobPatterns = env.ALWAYS_INDEX_FILE_PATTERNS?.split(',').map(pattern => pattern.trim()) ?? [];
+
const command = [
'zoekt-git-index',
'-allow_missing_branches',
@@ -21,6 +23,7 @@ export const indexGitRepository = async (repo: Repo, settings: Settings, revisio
`-tenant_id ${repo.orgId}`,
`-repo_id ${repo.id}`,
`-shard_prefix ${shardPrefix}`,
+ ...largeFileGlobPatterns.map((pattern) => `-large_file ${pattern}`),
repoPath
].join(' ');
diff --git a/packages/shared/src/env.server.ts b/packages/shared/src/env.server.ts
index 919a5884..f9c63f9c 100644
--- a/packages/shared/src/env.server.ts
+++ b/packages/shared/src/env.server.ts
@@ -219,6 +219,9 @@ export const env = createEnv({
// Configure the default maximum number of search results to return by default.
DEFAULT_MAX_MATCH_COUNT: numberSchema.default(10_000),
+
+ // A comma separated list of glob patterns that should always be indexed regardless of their size.
+ ALWAYS_INDEX_FILE_PATTERNS: z.string().optional(),
},
runtimeEnv,
emptyStringAsUndefined: true,