Skip to content

Commit 3117046

Browse files
authored
fix(component,ai,gemini): unify InlineData processing and enable images in streaming responses (#1125)
Because

- InlineData wasn't properly cleaned up in streaming mode, causing binary data exposure in JSON outputs
- Streaming responses didn't show images while non-streaming responses did, creating an inconsistent user experience
- Image extraction and InlineData cleanup logic was duplicated across different functions

This commit

- Creates a unified `processInlineDataInCandidates` function that handles both image extraction and InlineData cleanup with configurable behavior
- Fixes InlineData cleanup in streaming mode by ensuring binary data is removed from all intermediate responses
- Enables image extraction in streaming responses, making the behavior consistent with non-streaming mode
- Replaces duplicated cleanup logic in `renderFinal` and `buildStreamOutput` with the unified approach
- Adds comprehensive tests covering streaming cleanup, image extraction, and consistency between streaming and final modes
- Maintains backward compatibility while improving memory efficiency and user experience
1 parent 904992d commit 3117046

File tree

2 files changed

+355
-27
lines changed

2 files changed

+355
-27
lines changed

pkg/component/ai/gemini/v0/task_chat.go

Lines changed: 39 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,6 @@ func (e *execution) mergeResponseChunk(r *genai.GenerateContentResponse, finalRe
263263
func (e *execution) buildStreamOutput(texts []string, finalResp *genai.GenerateContentResponse) TaskChatOutput {
264264
streamOutput := TaskChatOutput{
265265
Texts: texts,
266-
Images: []format.Image{},
267266
Usage: map[string]any{},
268267
Candidates: []*genai.Candidate{},
269268
UsageMetadata: nil,
@@ -285,9 +284,9 @@ func (e *execution) buildStreamOutput(texts []string, finalResp *genai.GenerateC
285284
streamOutput.ResponseID = &ri
286285
}
287286

288-
// Note: Image extraction and InlineData cleanup is deferred to renderFinal()
289-
// to avoid processing the same images multiple times during streaming.
290-
// Streaming responses will have empty Images array until the final response.
287+
// Extract images and clean up InlineData in streaming responses
288+
// This ensures users see images in streaming responses while preventing binary data exposure
289+
streamOutput.Images = processInlineDataInCandidates(finalResp.Candidates, true)
291290

292291
// Build usage map from UsageMetadata if available
293292
if finalResp.UsageMetadata != nil {
@@ -298,15 +297,49 @@ func (e *execution) buildStreamOutput(texts []string, finalResp *genai.GenerateC
298297
return streamOutput
299298
}
300299

300+
// processInlineDataInCandidates handles InlineData processing in candidates with unified logic.
301+
// If extractImages is true, it extracts image data and converts to format.Image.
302+
// Always cleans up InlineData to prevent binary data exposure in JSON output.
303+
func processInlineDataInCandidates(candidates []*genai.Candidate, extractImages bool) []format.Image {
304+
var images []format.Image
305+
if extractImages {
306+
images = make([]format.Image, 0)
307+
}
308+
309+
for _, c := range candidates {
310+
if c != nil && c.Content != nil {
311+
for _, p := range c.Content.Parts {
312+
if p != nil && p.InlineData != nil {
313+
// Extract image if requested and the data is an image
314+
if extractImages && strings.Contains(strings.ToLower(p.InlineData.MIMEType), "image") {
315+
// Convert blob data to format.Image using the standard data package approach
316+
// Normalize MIME type and use the existing NewImageFromBytes function
317+
normalizedMimeType := strings.ToLower(strings.TrimSpace(strings.Split(p.InlineData.MIMEType, ";")[0]))
318+
img, err := data.NewImageFromBytes(p.InlineData.Data, normalizedMimeType, "", true)
319+
if err == nil {
320+
images = append(images, img)
321+
}
322+
}
323+
// Always clean up InlineData to prevent raw binary data from being exposed in JSON output
324+
// The binary data is already extracted and converted to format.Image above (if requested)
325+
p.InlineData = nil
326+
}
327+
}
328+
}
329+
}
330+
331+
return images
332+
}
333+
301334
// buildUsageMap creates a usage map from UsageMetadata with kebab-case keys
302335
func buildUsageMap(metadata *genai.GenerateContentResponseUsageMetadata) map[string]any {
303336
usage := make(map[string]any)
304337
usage["prompt-token-count"] = metadata.PromptTokenCount
305338
usage["cached-content-token-count"] = metadata.CachedContentTokenCount
306339
usage["candidates-token-count"] = metadata.CandidatesTokenCount
307-
usage["total-token-count"] = metadata.TotalTokenCount
308340
usage["tool-use-prompt-token-count"] = metadata.ToolUsePromptTokenCount
309341
usage["thoughts-token-count"] = metadata.ThoughtsTokenCount
342+
usage["total-token-count"] = metadata.TotalTokenCount
310343

311344
if len(metadata.PromptTokensDetails) > 0 {
312345
arr := make([]map[string]any, 0, len(metadata.PromptTokensDetails))
@@ -396,28 +429,7 @@ func renderFinal(resp *genai.GenerateContentResponse, texts []string) TaskChatOu
396429
}
397430

398431
// Extract generated images from candidates and clean up InlineData to prevent raw binary exposure
399-
if len(resp.Candidates) > 0 {
400-
images := make([]format.Image, 0)
401-
for _, c := range resp.Candidates {
402-
if c.Content != nil {
403-
for _, p := range c.Content.Parts {
404-
if p != nil && p.InlineData != nil && strings.Contains(strings.ToLower(p.InlineData.MIMEType), "image") {
405-
// Convert blob data to format.Image using the standard data package approach
406-
// Normalize MIME type and use the existing NewImageFromBytes function
407-
normalizedMimeType := strings.ToLower(strings.TrimSpace(strings.Split(p.InlineData.MIMEType, ";")[0]))
408-
img, err := data.NewImageFromBytes(p.InlineData.Data, normalizedMimeType, "", true)
409-
if err == nil {
410-
images = append(images, img)
411-
}
412-
// Clean up InlineData to prevent raw binary data from being exposed in JSON output
413-
// The binary data is already extracted and converted to format.Image above
414-
p.InlineData = nil
415-
}
416-
}
417-
}
418-
}
419-
out.Images = images
420-
}
432+
out.Images = processInlineDataInCandidates(resp.Candidates, true)
421433

422434
if resp.UsageMetadata != nil {
423435
out.Usage = buildUsageMap(resp.UsageMetadata)

0 commit comments

Comments
 (0)