Skip to content

Commit 291b379

Browse files
authored
feat(component,ai,gemini): add multimedia support with unified format… (#1114)
Because

- The Gemini component only supported text and image inputs, limiting its capabilities for audio, video, and document processing
- Format validation logic was scattered and repetitive across different media types
- Error messages for unsupported formats were not helpful in guiding users toward conversion solutions
- The component didn't leverage Gemini API's native support for remote file URIs, causing unnecessary data transfer
- Test coverage was incomplete for multimedia processing functions

This commit

- Adds comprehensive audio and video support to the Gemini chat component, enabling multimedia understanding capabilities
- Implements unified format validation system using `formatSupport` struct to centralize Gemini API and Instill Core format definitions
- Enhances error messages to provide specific conversion guidance (e.g., `:png`, `:wav`, `:mp4` syntax) for unsupported formats
- Refactors `buildReqParts` function into focused helper functions (`processImageParts`, `processAudioParts`, `processVideoParts`, `processDocumentParts`) for better maintainability
- Optimizes remote file handling by using `genai.NewPartFromURI` for HTTP/HTTPS URLs and `external.BinaryFetcher` for data URIs
- Adds comprehensive unit tests for all processing functions, including streaming/non-streaming logic and edge cases
- Aligns format support lists with actual constants defined in `pkg/data` packages while maintaining complete Gemini API format coverage
- Implements document processing modes (visual for PDF, text for plain text formats, conversion guidance for office documents)
1 parent 793bd1d commit 291b379

File tree

5 files changed

+1130
-808
lines changed

5 files changed

+1130
-808
lines changed
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
package gemini
2+
3+
import (
4+
"fmt"
5+
"slices"
6+
"strings"
7+
8+
"github.com/instill-ai/pipeline-backend/pkg/data"
9+
)
10+
// formatSupport pairs, for one media type, the content types that can be sent
// to the Gemini API as-is with the content types that Instill Core knows about.
// validateFormat consults both lists to decide between accepting a format,
// suggesting a conversion, or rejecting it outright.
type formatSupport struct {
	// gemini holds the MIME types checked first: a match means the content
	// is supported by the Gemini API without conversion.
	gemini []string
	// instill holds the MIME types known to Instill Core; a match that is
	// not also in gemini is treated as a known format needing conversion.
	instill []string
}
16+
17+
var (
18+
imageFormats = formatSupport{
19+
gemini: []string{
20+
data.PNG, // PNG
21+
data.JPEG, // JPEG
22+
data.WEBP, // WEBP
23+
"image/heic", // HEIC
24+
"image/heif", // HEIF
25+
},
26+
instill: []string{
27+
data.PNG, // PNG
28+
data.JPEG, // JPEG
29+
data.WEBP, // WEBP
30+
data.GIF, // GIF
31+
data.BMP, // BMP
32+
data.TIFF, // TIFF
33+
},
34+
}
35+
36+
audioFormats = formatSupport{
37+
gemini: []string{
38+
data.WAV, // WAV
39+
"audio/mp3", // MP3
40+
data.MP3, // MP3 (audio/mpeg - alternative MIME type)
41+
data.AIFF, // AIFF
42+
data.AAC, // AAC
43+
data.OGG, // OGG Vorbis
44+
data.FLAC, // FLAC
45+
},
46+
instill: []string{
47+
data.MP3, // MP3 (audio/mpeg)
48+
data.WAV, // WAV
49+
data.AAC, // AAC
50+
data.OGG, // OGG
51+
data.FLAC, // FLAC
52+
data.M4A, // M4A (audio/mp4)
53+
data.WMA, // WMA (audio/x-ms-wma)
54+
data.AIFF, // AIFF
55+
},
56+
}
57+
58+
videoFormats = formatSupport{
59+
gemini: []string{
60+
data.MP4, // MP4
61+
data.MPEG, // MPEG
62+
data.MOV, // MOV (video/quicktime)
63+
"video/mov", // MOV (standard MIME type)
64+
data.AVI, // AVI (video/x-msvideo)
65+
"video/avi", // AVI (standard MIME type)
66+
data.FLV, // FLV (video/x-flv)
67+
"video/mpg", // MPG - supported by Gemini but not defined in video.go
68+
data.WEBM, // WEBM
69+
data.WMV, // WMV (video/x-ms-wmv)
70+
"video/wmv", // WMV (standard MIME type)
71+
"video/3gpp", // 3GPP - supported by Gemini but not defined in video.go
72+
},
73+
instill: []string{
74+
data.MP4, // MP4
75+
data.AVI, // AVI (video/x-msvideo)
76+
data.MOV, // MOV (video/quicktime)
77+
data.WEBM, // WEBM
78+
data.MKV, // MKV (video/x-matroska)
79+
data.FLV, // FLV (video/x-flv)
80+
data.WMV, // WMV (video/x-ms-wmv)
81+
data.MPEG, // MPEG
82+
},
83+
}
84+
85+
documentFormats = formatSupport{
86+
gemini: []string{
87+
data.PDF, // PDF - only visual document format supported by Gemini
88+
},
89+
instill: []string{
90+
data.PDF, // PDF
91+
data.DOC, // DOC
92+
data.DOCX, // DOCX
93+
data.PPT, // PPT
94+
data.PPTX, // PPTX
95+
data.XLS, // XLS
96+
data.XLSX, // XLSX
97+
data.HTML, // HTML
98+
data.MARKDOWN, // Markdown
99+
data.TEXT, // Plain text
100+
data.PLAIN, // Plain text
101+
data.CSV, // CSV
102+
},
103+
}
104+
)
105+
106+
// validateFormat checks if a format is supported and returns appropriate error messages
107+
// For documents, it also returns the processing mode ("visual", "text", or "" for error)
108+
func validateFormat(contentType, mediaType string, formats formatSupport, convertibleFormats, supportedTargets, examples string) (string, error) {
109+
// Check if the format is supported by Gemini API
110+
if slices.Contains(formats.gemini, contentType) {
111+
// Special handling for documents to determine processing mode
112+
if mediaType == "document" {
113+
if contentType == data.PDF {
114+
return "visual", nil // PDF supports visual processing
115+
}
116+
// Text-based documents supported by Gemini (currently none, but future-proof)
117+
return "text", nil
118+
}
119+
return "", nil // Other media types don't need mode
120+
}
121+
122+
// Check if it's a known Instill Core format that can be converted
123+
if slices.Contains(formats.instill, contentType) {
124+
// Special handling for documents
125+
if mediaType == "document" {
126+
// Text-based documents: Process as plain text
127+
textBasedTypes := []string{data.HTML, data.MARKDOWN, data.TEXT, data.PLAIN, data.CSV}
128+
if slices.Contains(textBasedTypes, contentType) || strings.HasPrefix(contentType, "text/") {
129+
return "text", nil
130+
}
131+
132+
// Office documents: Need PDF conversion for visual elements
133+
officeTypes := []string{data.DOC, data.DOCX, data.PPT, data.PPTX, data.XLS, data.XLSX}
134+
if slices.Contains(officeTypes, contentType) {
135+
return "", fmt.Errorf("document format %s will be processed as text only, losing visual elements like charts and formatting. Use \":pdf\" syntax to convert to PDF for document vision capabilities", contentType)
136+
}
137+
138+
// Other known document formats
139+
return "", fmt.Errorf("document format %s is not supported by Gemini API. Use \":pdf\" syntax to convert DOC, DOCX, PPT, PPTX, XLS, XLSX to PDF (supported by both Gemini and Instill Core), such as \":pdf\"", contentType)
140+
}
141+
142+
// Non-document media types
143+
return "", fmt.Errorf("%s format %s is not supported by Gemini API. Use \":\" syntax to convert %s to %s (supported by both Gemini and Instill Core), such as \"%s\"", mediaType, contentType, convertibleFormats, supportedTargets, examples)
144+
}
145+
146+
// Unknown format - can't be processed at all
147+
return "", fmt.Errorf("%s format %s is not supported and cannot be processed", mediaType, contentType)
148+
}

pkg/component/ai/gemini/v0/config/tasks.yaml

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,8 +1513,22 @@ TASK_CHAT:
15131513
type: array
15141514
items:
15151515
type: string
1516-
documents:
1516+
audio:
15171517
uiOrder: 4
1518+
title: Audio
1519+
description: URI references or base64 content of input audio.
1520+
type: array
1521+
items:
1522+
type: string
1523+
videos:
1524+
uiOrder: 5
1525+
title: Videos
1526+
description: URI references or base64 content of input videos.
1527+
type: array
1528+
items:
1529+
type: string
1530+
documents:
1531+
uiOrder: 6
15181532
title: Documents
15191533
description: >-
15201534
URI references or base64 content of input documents. Different vendors might have different constraints on the document format. For example,
@@ -1523,32 +1537,32 @@ TASK_CHAT:
15231537
items:
15241538
type: string
15251539
system-message:
1526-
uiOrder: 5
1540+
uiOrder: 7
15271541
title: System Message
15281542
description: Instruction to set the assistant's behavior, tone, or persona. Different vendors might name this field differently.
15291543
type: string
15301544
chat-history:
1531-
uiOrder: 6
1545+
uiOrder: 8
15321546
title: Chat History
15331547
description: Conversation history, each message includes a role and content.
15341548
type: array
15351549
items:
15361550
$ref: "#/$defs/content"
15371551
max-output-tokens:
1538-
uiOrder: 7
1552+
uiOrder: 9
15391553
title: Max Output Token
15401554
description: The maximum number of tokens to generate in the model output.
15411555
type: integer
15421556
temperature:
1543-
uiOrder: 8
1557+
uiOrder: 10
15441558
title: Temperature
15451559
description: >-
15461560
A parameter that controls the randomness and creativity of a large language model's output by adjusting the probability of the next
15471561
word it chooses. A low temperature (e.g., near 0) produces more deterministic, focused, and consistent text, while a high temperature (e.g., near
15481562
1) leads to more creative, random, and varied output.
15491563
type: number
15501564
top-k:
1551-
uiOrder: 9
1565+
uiOrder: 11
15521566
title: Top-K
15531567
description: >-
15541568
A text generation parameter that limits the selection of the next token to the K most probable tokens, discarding the rest to control randomness
@@ -1568,44 +1582,44 @@ TASK_CHAT:
15681582
choice to a smaller, more focused set of highly probable words, resulting in more factual and conservative output.
15691583
type: number
15701584
seed:
1571-
uiOrder: 11
1585+
uiOrder: 12
15721586
title: Seed
15731587
description: A random seed used to control the stochasticity of text generation to produce repeatable outputs
15741588
type: integer
15751589
contents:
1576-
uiOrder: 12
1590+
uiOrder: 13
15771591
title: Contents
15781592
description: The input contents to the model. Each item represents a user or model turn composed of parts (text or images).
15791593
type: array
15801594
items:
15811595
$ref: "#/$defs/content"
15821596
tools:
1583-
uiOrder: 13
1597+
uiOrder: 14
15841598
title: Tools
15851599
description: Tools available to the model, e.g., function declarations.
15861600
type: array
15871601
items:
15881602
$ref: "#/$defs/tool"
15891603
tool-config:
1590-
uiOrder: 14
1604+
uiOrder: 15
15911605
$ref: "#/$defs/tool-config"
15921606
safety-settings:
1593-
uiOrder: 15
1607+
uiOrder: 16
15941608
title: Safety Settings
15951609
description: Safety settings for content filtering.
15961610
type: array
15971611
items:
15981612
$ref: "#/$defs/safety-setting"
15991613
system-instruction:
1600-
uiOrder: 16
1614+
uiOrder: 17
16011615
title: System Instruction
16021616
description: A system instruction to guide the model behavior.
16031617
$ref: "#/$defs/content"
16041618
generation-config:
1605-
uiOrder: 17
1619+
uiOrder: 18
16061620
$ref: "#/$defs/generation-config"
16071621
cached-content:
1608-
uiOrder: 18
1622+
uiOrder: 19
16091623
title: Cached Content
16101624
description: "The name of a cached content to use as context. Format: cachedContents/{cachedContent}."
16111625
type: string

pkg/component/ai/gemini/v0/io.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ type TaskChatInput struct {
1111
Stream *bool `instill:"stream"`
1212
Prompt *string `instill:"prompt"`
1313
Images []format.Image `instill:"images"`
14+
Audio []format.Audio `instill:"audio"`
15+
Videos []format.Video `instill:"videos"`
1416
Documents []format.Document `instill:"documents"`
1517
SystemMessage *string `instill:"system-message"`
1618
ChatHistory []*genai.Content `instill:"chat-history"`

0 commit comments

Comments
 (0)