Skip to content

Commit df241f5

Browse files
authored
feat(artifact): add file UID param in retrieval task (#1046)
Because

- Filename isn't a unique identifier in catalogs anymore.

This commit

- Adds a file UID param in the retrieval task in the Instill Artifact component.
- The filename param is kept temporarily for backwards compatibility while clients update their recipes.
- Adds a file UID field in the retrieved chunk object.
- Removes the database schema version from the config.
1 parent 349ccd3 commit df241f5

File tree

11 files changed

+67
-42
lines changed

11 files changed

+67
-42
lines changed

cmd/migration/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ func main() {
107107
codeMigrator, cleanup := initCodeMigrator(ctx)
108108
defer cleanup()
109109

110-
if err := runMigration(dsn, databaseConfig.Version, codeMigrator.Migrate); err != nil {
110+
if err := runMigration(dsn, database.TargetSchemaVersion, codeMigrator.Migrate); err != nil {
111111
log.With(zap.Error(err)).Fatal("Running migration")
112112
}
113113
}

config/config.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ type DatabaseConfig struct {
120120
ReplicationTimeFrame int `koanf:"replicationtimeframe"` // in seconds
121121
} `koanf:"replica"`
122122
Name string `koanf:"name"`
123-
Version uint `koanf:"version"`
124123
TimeZone string `koanf:"timezone"`
125124
Pool struct {
126125
IdleConnections int `koanf:"idleconnections"`

config/config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ database:
2525
host: pg-sql
2626
port: 5432
2727
name: pipeline
28-
version: 40
2928
timezone: Etc/UTC
3029
pool:
3130
idleconnections: 5

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ require (
4545
github.com/h2non/filetype v1.1.3
4646
github.com/iancoleman/strcase v0.3.0
4747
github.com/influxdata/influxdb-client-go/v2 v2.14.0
48-
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20250626141501-c8e22cc2e0b6
48+
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20250707160902-77023eb2f033
4949
github.com/instill-ai/usage-client v0.4.0
5050
github.com/instill-ai/x v0.8.0-alpha.0.20250522164415-9172edd336bb
5151
github.com/itchyny/gojq v0.12.17

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,8 +460,8 @@ github.com/influxdata/influxdb-client-go/v2 v2.14.0 h1:AjbBfJuq+QoaXNcrova8smSjw
460460
github.com/influxdata/influxdb-client-go/v2 v2.14.0/go.mod h1:Ahpm3QXKMJslpXl3IftVLVezreAUtBOTZssDrjZEFHI=
461461
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf h1:7JTmneyiNEwVBOHSjoMxiWAqB992atOeepeFYegn5RU=
462462
github.com/influxdata/line-protocol v0.0.0-20210922203350-b1ad95c89adf/go.mod h1:xaLFMmpvUxqXtVkUJfg9QmT88cDaCJ3ZKgdZ78oO8Qo=
463-
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20250626141501-c8e22cc2e0b6 h1:5o4fBRte53mNXcjengF2TtlGaY4vUiM49iWT2KoOa5Q=
464-
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20250626141501-c8e22cc2e0b6/go.mod h1:bCnBosofpaUxKBuTTJM3/I3thAK37kvfBnKByjnLsl4=
463+
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20250707160902-77023eb2f033 h1:jhP9Gz7tw57rTTHQ7WhHOWf7z/worMcfr/n3NAcjbD4=
464+
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20250707160902-77023eb2f033/go.mod h1:bCnBosofpaUxKBuTTJM3/I3thAK37kvfBnKByjnLsl4=
465465
github.com/instill-ai/usage-client v0.4.0 h1:xf1hAlO4a8lZwZzz9bprZOJqU3ghIcIsavUUB7UURyg=
466466
github.com/instill-ai/usage-client v0.4.0/go.mod h1:zZ9LRoXps2u63ARYPAbR2YvqTib3dWJLObZn+9YqhF0=
467467
github.com/instill-ai/x v0.8.0-alpha.0.20250522164415-9172edd336bb h1:lrHTet3MB9ctNTvY/H8qcyiHd1vdTRqzUMkGsh5/Pp8=

pkg/component/data/instillartifact/v0/config/definition.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,6 @@ description: Access files and perform RAG-based search and retrieval through cat
1616
tombstone: false
1717
type: COMPONENT_TYPE_DATA
1818
uid: 6ec46048-f82f-4452-ba19-79698af9186e
19-
version: 0.1.0
19+
version: 0.1.1
2020
sourceUrl: https://github.com/instill-ai/pipeline-backend/blob/main/pkg/component/data/instillartifact/v0
2121
releaseStage: RELEASE_STAGE_ALPHA

pkg/component/data/instillartifact/v0/config/tasks.yaml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,22 @@ $defs:
6969
uiOrder: 2
7070
title: Text Content
7171
type: string
72+
source-file-uid:
73+
description: The UID of the source file.
74+
uiOrder: 3
75+
title: Source File UID
76+
type: string
7277
source-file-name:
7378
description: The name of the source file.
74-
uiOrder: 3
79+
uiOrder: 4
7580
title: Source File Name
7681
type: string
7782
required:
7883
- chunk-uid
7984
- similarity-score
8085
- text-content
8186
- source-file-name
87+
- source-file-uid
8288
title: Chunk
8389
type: object
8490
namespace:
@@ -582,11 +588,11 @@ TASK_RETRIEVE:
582588
uiOrder: 2
583589
type: integer
584590
title: Top K
585-
filename:
586-
description: File name to filter, empty for all.
591+
file-uid:
592+
description: Optional filter by file.
587593
uiOrder: 3
588594
type: string
589-
title: Filename
595+
title: File UID
590596
file-media-type:
591597
description: The media type to filter, empty for all.
592598
uiOrder: 4
@@ -606,6 +612,14 @@ TASK_RETRIEVE:
606612
- summary
607613
- augmented
608614
title: Content type
615+
filename:
616+
description: |-
617+
File name to filter, empty for all. This field is deprecated and the
618+
file UID should be used instead. The filename isn't unique by catalog
619+
and therefore the filter might produce unexpected results.
620+
uiOrder: 6
621+
type: string
622+
title: Filename
609623
required:
610624
- namespace
611625
- catalog-id

pkg/component/data/instillartifact/v0/io.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,16 @@ type SearchChunksInput struct {
164164
TextPrompt string `json:"text-prompt"`
165165
// TopK for searching chunks
166166
TopK uint32 `json:"top-k"`
167-
// File name to filter
168-
Filename string `json:"filename"`
167+
// File filter
168+
FileUID string `json:"file-uid"`
169169
// The media type to filter
170170
FileMediaType string `json:"file-media-type"`
171171
// The content type to filter
172-
ContetType string `json:"content-type"`
172+
ContentType string `json:"content-type"`
173+
174+
// File name to filter
175+
// Deprecated, use FileUID instead
176+
Filename string `json:"filename"`
173177
}
174178

175179
// SearchChunksOutput is the output for searching chunks
@@ -186,6 +190,8 @@ type SimilarityChunk struct {
186190
SimilarityScore float32 `json:"similarity-score"`
187191
// Text content of the chunk
188192
TextContent string `json:"text-content"`
193+
// Source file UID
194+
SourceFileUID string `json:"source-file-uid"`
189195
// Source file name
190196
SourceFileName string `json:"source-file-name"`
191197
// Content type

pkg/component/data/instillartifact/v0/task_query.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,16 @@ func (e *execution) query(input *structpb.Struct) (*structpb.Struct, error) {
3939

4040
output := QueryOutput{
4141
Answer: queryRes.Answer,
42-
Chunks: []SimilarityChunk{},
42+
Chunks: make([]SimilarityChunk, 0, len(queryRes.GetSimilarChunks())),
4343
}
4444

45-
for _, chunkPB := range queryRes.SimilarChunks {
45+
for _, chunkPB := range queryRes.GetSimilarChunks() {
4646
output.Chunks = append(output.Chunks, SimilarityChunk{
47-
ChunkUID: chunkPB.ChunkUid,
48-
SimilarityScore: chunkPB.SimilarityScore,
49-
TextContent: chunkPB.TextContent,
50-
SourceFileName: chunkPB.SourceFile,
47+
ChunkUID: chunkPB.GetChunkUid(),
48+
SimilarityScore: chunkPB.GetSimilarityScore(),
49+
TextContent: chunkPB.GetTextContent(),
50+
SourceFileName: chunkPB.GetSourceFile(),
51+
SourceFileUID: chunkPB.GetChunkMetadata().GetOriginalFileUid(),
5152
})
5253
}
5354

pkg/component/data/instillartifact/v0/task_search_chunks.go

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,10 @@ import (
1010

1111
"github.com/instill-ai/pipeline-backend/pkg/component/base"
1212

13-
artifactPB "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha"
13+
artifactpb "github.com/instill-ai/protogen-go/artifact/artifact/v1alpha"
1414
)
1515

1616
func (e *execution) searchChunks(input *structpb.Struct) (*structpb.Struct, error) {
17-
1817
inputStruct := SearchChunksInput{}
1918
err := base.ConvertFromStructpb(input, &inputStruct)
2019
if err != nil {
@@ -27,56 +26,60 @@ func (e *execution) searchChunks(input *structpb.Struct) (*structpb.Struct, erro
2726
defer cancel()
2827
ctx = metadata.NewOutgoingContext(ctx, getRequestMetadata(e.SystemVariables))
2928

30-
var fileMediaType artifactPB.FileMediaType
31-
var contentType artifactPB.ContentType
29+
var fileMediaType artifactpb.FileMediaType
30+
var contentType artifactpb.ContentType
3231

3332
switch inputStruct.FileMediaType {
3433
case "document":
35-
fileMediaType = artifactPB.FileMediaType_FILE_MEDIA_TYPE_DOCUMENT
34+
fileMediaType = artifactpb.FileMediaType_FILE_MEDIA_TYPE_DOCUMENT
3635
case "image":
37-
fileMediaType = artifactPB.FileMediaType_FILE_MEDIA_TYPE_IMAGE
36+
fileMediaType = artifactpb.FileMediaType_FILE_MEDIA_TYPE_IMAGE
3837
case "audio":
39-
fileMediaType = artifactPB.FileMediaType_FILE_MEDIA_TYPE_AUDIO
38+
fileMediaType = artifactpb.FileMediaType_FILE_MEDIA_TYPE_AUDIO
4039
case "video":
41-
fileMediaType = artifactPB.FileMediaType_FILE_MEDIA_TYPE_VIDEO
40+
fileMediaType = artifactpb.FileMediaType_FILE_MEDIA_TYPE_VIDEO
4241
default:
43-
fileMediaType = artifactPB.FileMediaType_FILE_MEDIA_TYPE_UNSPECIFIED
42+
fileMediaType = artifactpb.FileMediaType_FILE_MEDIA_TYPE_UNSPECIFIED
4443
}
4544

46-
switch inputStruct.ContetType {
45+
switch inputStruct.ContentType {
4746
case "chunk":
48-
contentType = artifactPB.ContentType_CONTENT_TYPE_CHUNK
47+
contentType = artifactpb.ContentType_CONTENT_TYPE_CHUNK
4948
case "summary":
50-
contentType = artifactPB.ContentType_CONTENT_TYPE_SUMMARY
49+
contentType = artifactpb.ContentType_CONTENT_TYPE_SUMMARY
5150
case "augmented":
52-
contentType = artifactPB.ContentType_CONTENT_TYPE_AUGMENTED
51+
contentType = artifactpb.ContentType_CONTENT_TYPE_AUGMENTED
5352
default:
54-
contentType = artifactPB.ContentType_CONTENT_TYPE_UNSPECIFIED
53+
contentType = artifactpb.ContentType_CONTENT_TYPE_UNSPECIFIED
5554
}
5655

57-
searchRes, err := artifactClient.SimilarityChunksSearch(ctx, &artifactPB.SimilarityChunksSearchRequest{
56+
searchRes, err := artifactClient.SimilarityChunksSearch(ctx, &artifactpb.SimilarityChunksSearchRequest{
5857
NamespaceId: inputStruct.Namespace,
5958
CatalogId: inputStruct.CatalogID,
6059
TextPrompt: inputStruct.TextPrompt,
6160
TopK: inputStruct.TopK,
62-
FileName: inputStruct.Filename,
61+
FileUid: inputStruct.FileUID,
6362
FileMediaType: fileMediaType,
6463
ContentType: contentType,
64+
65+
// Deprecated: we keep using it for backwards compatibility.
66+
FileName: inputStruct.Filename,
6567
})
6668
if err != nil {
6769
return nil, fmt.Errorf("failed to search chunks: %w", err)
6870
}
6971

7072
output := SearchChunksOutput{
71-
Chunks: []SimilarityChunk{},
73+
Chunks: make([]SimilarityChunk, 0, len(searchRes.GetSimilarChunks())),
7274
}
7375

74-
for _, chunkPB := range searchRes.SimilarChunks {
76+
for _, chunkPB := range searchRes.GetSimilarChunks() {
7577
output.Chunks = append(output.Chunks, SimilarityChunk{
76-
ChunkUID: chunkPB.ChunkUid,
77-
SimilarityScore: chunkPB.SimilarityScore,
78-
TextContent: chunkPB.TextContent,
79-
SourceFileName: chunkPB.SourceFile,
78+
ChunkUID: chunkPB.GetChunkUid(),
79+
SimilarityScore: chunkPB.GetSimilarityScore(),
80+
TextContent: chunkPB.GetTextContent(),
81+
SourceFileName: chunkPB.GetSourceFile(),
82+
SourceFileUID: chunkPB.GetChunkMetadata().GetOriginalFileUid(),
8083
ContentType: chunkPB.GetChunkMetadata().GetContentType().String(),
8184
})
8285
}

0 commit comments

Comments
 (0)